diff --git a/docs/Model Support.md b/docs/Model Support.md index 36384a54e..30396c317 100644 --- a/docs/Model Support.md +++ b/docs/Model Support.md @@ -562,6 +562,23 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended - **Sampler:** Defaults to `ER-SDE-Solver`, but all common samplers work. They officially recommend also trying out `Euler Ancestral` or `DPM++ 2M SDE` - **Scheduler:** Default is fine (`Simple`), or you can experiment at will. The model is adaptable. +# Ernie + +- [Ernie and Ernie Turbo]() are supported in SwarmUI! +- It is an 8B model, available as both a strong Base model and an official Turbo model that is designed to run extremely fast while competing at the top level of image models. + - The "Turbo" model can be downloaded here: [Comfy-Org/ERNIE-Image](https://huggingface.co/Comfy-Org/ERNIE-Image) + - Or the BF16 fat version: [Comfy-Org/ERNIE-Image](https://huggingface.co/Comfy-Org/ERNIE-Image) + - Save in `diffusion_models` +- Uses the Flux.2 VAE, which will be downloaded and handled automatically. +- Uses the Ministral 3 3B text encoder, which will be downloaded and handled automatically. +- **Parameters:** + - **Prompt:** Supports general prompting in any format just fine. Understands both English and Chinese well. + - **Sampler:** Default is fine. + - **Scheduler:** Default is fine. + - **CFG Scale:** For Turbo, use `1`. For Base, use normal CFG ranges (e.g. 4 or 7). + - **Steps:** For Turbo, `8` is recommended. For Base, 20+ steps as normal. + - **Resolution:** Side length `1024` is the standard. + # Video Models - Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md). 
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs index 1b1e5182a..7c739de54 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs @@ -830,7 +830,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent latent = [srCond, 2]; } } - else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma()) + else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie()) { defscheduler ??= "simple"; } diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs index 42ccc7c4d..144d3474f 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs @@ -63,6 +63,9 @@ public bool IsSD3() /// Returns true if the current model is any Black Forest Labs' Flux.2 variant. public bool IsAnyFlux2() => IsFlux2Dev() || IsFlux2Klein4B() || IsFlux2Klein9B(); + /// Returns true if the current model is Ernie Image. + public bool IsErnie() => IsModelCompatClass(T2IModelClassSorter.CompatErnieImage); + /// Returns true if the current model is AuraFlow. 
public bool IsAuraFlow() => IsModelCompatClass(T2IModelClassSorter.CompatAuraFlow); @@ -263,7 +266,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n ["width"] = width }, id)); } - else if (IsAnyFlux2()) + else if (IsAnyFlux2() || IsErnie()) { return resultImage(CreateNode("EmptyFlux2LatentImage", new JObject() { @@ -578,6 +581,11 @@ public string GetMistralFlux2Model() return RequireClipModel("mistral_3_small_flux2.safetensors", "https://huggingface.co/Comfy-Org/flux2-dev/resolve/main/split_files/text_encoders/mistral_3_small_flux2_fp4_mixed.safetensors", "1ee1ff334d78228d73049ef0ee4fcd21c1700536b5a45c06547af057f92463a7", T2IParamTypes.MistralModel); } + public string GetMinistral3_3bModel() + { + return RequireClipModel("ministral-3-3b.safetensors", "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ministral-3-3b.safetensors", "49a750a128863854eac7d85e1a277a7b44bf6ec3646405b84686dfeeca3708ca", T2IParamTypes.MistralModel); + } + public string GetClipLModel() { if (g.UserInput.TryGet(T2IParamTypes.ClipLModel, out T2IModel model)) @@ -1032,6 +1040,11 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC) LoadingModel = [kvcached, 0]; } } + else if (IsErnie()) + { + helpers.LoadClip("flux2", helpers.GetMinistral3_3bModel()); + helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae"); + } else if (IsFlux() && (LoadingClip is null || LoadingVAE is null || UserInput.Get(T2IParamTypes.T5XXLModel) is not null || UserInput.Get(T2IParamTypes.ClipLModel) is not null)) { helpers.LoadClip2("flux", helpers.GetT5XXLModel(), helpers.GetClipLModel()); diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs index b88430802..be37787d0 100644 --- a/src/Text2Image/T2IModelClassSorter.cs +++ b/src/Text2Image/T2IModelClassSorter.cs @@ -65,6 +65,7 @@ public static T2IModelCompatClass CompatFlux2 = RegisterCompat(new() { 
ID = "flux-2", ShortCode = "Flux2", LorasTargetTextEnc = false }), CompatFlux2Klein4B = RegisterCompat(new() { ID = "flux-2-klein-4b", ShortCode = "Fl2K4", LorasTargetTextEnc = false }), CompatFlux2Klein9B = RegisterCompat(new() { ID = "flux-2-klein-9b", ShortCode = "Fl2K9", LorasTargetTextEnc = false }), + CompatErnieImage = RegisterCompat(new() { ID = "ernie-image", ShortCode = "Ernie", LorasTargetTextEnc = false }), CompatLtxv2 = RegisterCompat(new() { ID = "lightricks-ltx-video-2", ShortCode = "LTXV2", IsText2Video = true, IsImage2Video = true }), CompatZImage = RegisterCompat(new() { ID = "z-image", ShortCode = "ZImg", LorasTargetTextEnc = false }), CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }), @@ -215,6 +216,7 @@ bool isQwenImageLora(JObject h) => (hasLoraKey(h, "transformer_blocks.0.attn.add bool isHyVid15Lora(JObject h) => hasKey(h, "cond_type_embedding.lora_down.weight") && hasKey(h, "byt5_in.fc1.lora_down.weight") && hasKey(h, "vision_in.proj.1.lora_down.weight"); bool isHyImgRefiner(JObject h) => h.ContainsKey("double_blocks.0.img_attn_k_norm.weight") && h.TryGetValue("time_r_in.mlp.0.bias", out JToken timeTok) && timeTok["shape"].ToArray()[0].Value() == 3328; bool isAuraFlow(JObject h) => h.ContainsKey("model.cond_seq_linear.weight") && h.ContainsKey("model.double_layers.0.attn.w1k.weight"); + bool isErnie(JObject h) => hasKey(h, "layers.0.mlp.linear_fc2.weight") && hasKey(h, "x_embedder.proj.weight") && hasKey(h, "layers.0.adaLN_sa_ln.weight"); bool isKandinsky5(JObject h) => hasKey(h, "pooled_text_embeddings.in_layer.weight") && hasKey(h, "text_transformer_blocks.0.feed_forward.in_layer.weight"); bool tryGetKan5IdKey(JObject h, out JToken tok) => h.TryGetValue("text_embeddings.in_layer.weight", out tok); bool isKan5VidLite(JObject h) => tryGetKan5IdKey(h, out JToken tok) && tok["shape"].ToArray()[0].Value() == 1792; @@ -628,6 +630,11 @@ JToken GetEmbeddingKey(JObject h) { return 
isQwenImageLora(h); }}); + // ====================== Ernie Image ====================== + Register(new() { ID = "ernie-image", CompatClass = CompatErnieImage, Name = "Ernie Image", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) => + { + return isErnie(h); + }}); // ====================== Kandinsky5 ====================== Register(new() { ID = "kandinsky5-image-lite", CompatClass = CompatKandinsky5ImgLite, Name = "Kandinsky5 Image Lite", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) => {