diff --git a/docs/Model Support.md b/docs/Model Support.md index 9f5644719..dab1c18a2 100644 --- a/docs/Model Support.md +++ b/docs/Model Support.md @@ -19,6 +19,7 @@ [Kandinsky 5](#kandinsky-5) | DiT | 2025 | Kandinsky Lab | 6B | No | Modern, Decent Quality | [Anima](#anima) | DiT | 2026 | Circlestone Labs | 2B | WTF | Modern, very small, decent for anime | [ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast | +[HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality | Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md): @@ -590,6 +591,29 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended - Out of range doesn't corrupt immediately but will fail at composition. - Prefers aspects from square to 16:9, gets funny in 21:9 +# HiDream-O1 + +- HiDream's [HiDream O1]() is supported in SwarmUI! +- It is an 8B model, with both a base and an official 'dev' distill designed to run faster + - The "Dev" model (in fat BF16) can be downloaded here [Comfy-Org/HiDream-O1-Image - dev]() + - Dev FP8 version can be downloaded here [Comfy-Org/HiDream-O1-Image - dev FP8]() + - Or the base version (in fat BF16) [Comfy-Org/HiDream-O1-Image - base]() + - Base FP8 version can be downloaded here [Comfy-Org/HiDream-O1-Image - base FP8]() + - Save in `Stable-Diffusion` +- It has no VAE, but has in-middle dedicated large patch scaling to compensate +- Its text encoding is similarly native-integrated +- **Parameters:** + - **Prompt:** Supports general prompting in any format just fine. Speaks at least English and Chinese. Was designed to use LLM-written prompts. + - **Prompt Images:** You can upload up to 10 images for image-editing input, but it only strongly obeys single-image input. + - **Sampler:** Default is fine. + - **Scheduler:** Default is fine. 
+ - **CFG Scale:** For Dev, use `1`. For Base, use normal CFG ranges (around `5`). + - **Steps:** For Dev, `28` is HiDream's recommendation, but even just `4` works fine for simple images. For Base, 50 steps is the official recommendation. + - **Resolution:** Side length `2048` is the model's standard, but a wide range works well. + - Because of the aggressive patch scaling, 2048 on this model looks more like 1024 on most other models. 1024 on this model looks noticeably worse. Going above 2048 causes some color distortion. +- **Dev LoRA:** + - A low-step Dev LoRA can be downloaded here [Kijai/hidream-O1-image_comfy](). It allows use of the base model with the distilled behavior from the Dev model. 8 steps will generate a coherent image of lower quality; 16 steps gets closer to the original quality. Use CFG Scale 1. + # Video Models - Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md). diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs index 85a43237b..a878632f9 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs @@ -830,7 +830,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent latent = [srCond, 2]; } } - else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie()) + else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1()) { defscheduler ??= "simple"; } @@ -1023,6 +1023,41 @@ void makeRefLatent(JArray image) } } } + else if (IsHiDreamO1()) + { + List refImages = []; + if (UserInput.TryGet(T2IParamTypes.PromptImages, out List images) && images.Count > 0) + { + int count = Math.Min(images.Count, 10); + for (int i = 0; i < count; i++) + { + refImages.Add(GetPromptImage(true, false, i)); + } + } + 
else if (MaskShrunkInfo is not null && MaskShrunkInfo.ScaledImage is not null) + { + refImages.Add([MaskShrunkInfo.ScaledImage, 0]); + } + else if (BasicInputImage is not null) + { + refImages.Add(BasicInputImage.Path); + } + if (refImages.Count > 0) + { + JObject refInputs = new() + { + ["positive"] = pos, + ["negative"] = neg + }; + for (int i = 0; i < refImages.Count; i++) + { + refInputs[$"images.image_{i + 1}"] = refImages[i]; + } + string refNode = CreateNode("HiDreamO1ReferenceImages", refInputs); + pos = [refNode, 0]; + neg = [refNode, 1]; + } + } else if (IsWanVideo()) // TODO: Somehow check if this is actually a phantom model? { if (UserInput.TryGet(T2IParamTypes.PromptImages, out List images) && images.Count > 0) diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs index 144d3474f..b24ebd129 100644 --- a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs +++ b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs @@ -88,6 +88,9 @@ public bool IsKontext() /// Returns true if the current model is HiDream-i1. public bool IsHiDream() => IsModelCompatClass(T2IModelClassSorter.CompatHiDreamI1); + /// Returns true if the current model is HiDream-O1 Image. + public bool IsHiDreamO1() => IsModelCompatClass(T2IModelClassSorter.CompatHiDreamO1); + /// Returns true if the current model supports Flux Guidance. 
public bool HasFluxGuidance() { @@ -404,6 +407,15 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n ["width"] = width }, id)); } + else if (IsHiDreamO1()) + { + return resultImage(CreateNode("EmptyHiDreamO1LatentImage", new JObject() + { + ["batch_size"] = batchSize, + ["height"] = height, + ["width"] = width + }, id)); + } else if (UserInput.Get(ComfyUIBackendExtension.ShiftedLatentAverageInit, false)) { double offA = 0, offB = 0, offC = 0, offD = 0; @@ -1113,6 +1125,26 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC) LoadingClip = [quadClipLoader, 0]; helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFluxVAE, "flux-1", "flux-ae"); } + else if (IsHiDreamO1()) + { + string noiseScaleNode = CreateNode("ModelNoiseScale", new JObject() + { + ["model"] = LoadingModel, + ["noise_scale"] = 7.5 // TODO: Configurable? + }); + LoadingModel = [noiseScaleNode, 0]; + string seamSmoothingNode = CreateNode("HiDreamO1PatchSeamSmoothing", new JObject() + { // TODO: Configurable? 
+ ["model"] = LoadingModel, + ["start_percent"] = 0.8, + ["end_percent"] = 1.00, + ["pattern"] = "single_shift", + ["passes"] = "2", + ["blend"] = "average", + ["strength"] = 1.00 + }); + LoadingModel = [seamSmoothingNode, 0]; + } else if (IsOmniGen()) { helpers.LoadClip("omnigen2", helpers.GetOmniQwenModel()); diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs index e90e2cf06..6ceffa9c3 100644 --- a/src/Text2Image/T2IModelClassSorter.cs +++ b/src/Text2Image/T2IModelClassSorter.cs @@ -70,6 +70,7 @@ public static T2IModelCompatClass CompatZImage = RegisterCompat(new() { ID = "z-image", ShortCode = "ZImg", LorasTargetTextEnc = false }), CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }), CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }), + CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }), // Audio models CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }), // Obscure old random ones @@ -199,6 +200,8 @@ bool isZImageLora(JObject h) => (hasLoraKey(h, "layers.0.adaLN_modulation.0") && bool isWanVace(JObject h) => hasKey(h, "vace_blocks.0.after_proj.bias"); bool isHiDream(JObject h) => h.ContainsKey("caption_projection.0.linear.weight"); bool isHiDreamLora(JObject h) => hasKey(h, "double_stream_blocks.0.block.ff_i.shared_experts.w1.lora_A.weight"); + bool isHiDreamO1(JObject h) => (h.ContainsKey("model.t_embedder1.mlp.0.weight") && h.ContainsKey("model.t_embedder1.mlp.0.bias")); + bool isHiDreamO1Lora(JObject h) => hasLoraKey(h, "final_layer2.linear") && hasLoraKey(h, "language_model.layers.0.self_attn.q_proj"); bool isChroma(JObject h) => h.ContainsKey("distilled_guidance_layer.in_proj.bias") && h.ContainsKey("double_blocks.0.img_attn.proj.bias"); bool isChromaRadiance(JObject h) => 
h.ContainsKey("nerf_image_embedder.embedder.0.bias"); bool isOmniGen(JObject h) => h.ContainsKey("time_caption_embed.timestep_embedder.linear_2.weight") && h.ContainsKey("context_refiner.0.attn.norm_k.weight"); @@ -723,6 +726,14 @@ JToken GetEmbeddingKey(JObject h) { return isHiDreamLora(h); }}); + Register(new() { ID = "hidream-o1", CompatClass = CompatHiDreamO1, Name = "HiDream O1 Image", StandardWidth = 2048, StandardHeight = 2048, IsThisModelOfClass = (m, h) => + { + return isHiDreamO1(h); + }}); + Register(new() { ID = "hidream-o1/lora", CompatClass = CompatHiDreamO1, Name = "HiDream O1 LoRA", StandardWidth = 2048, StandardHeight = 2048, IsThisModelOfClass = (m, h) => + { + return isHiDreamO1Lora(h); + }}); Register(new() { ID = "omnigen-2", CompatClass = CompatOmniGen2, Name = "OmniGen 2", StandardWidth = 1024, StandardHeight = 1024, IsThisModelOfClass = (m, h) => { return isOmniGen(h); @@ -855,6 +866,7 @@ JToken GetEmbeddingKey(JObject h) Remaps["hunyuanvideo1.5_720p_i2v"] = "hunyuan-video-1_5"; Remaps["hunyuanvideo1.5_1080p_sr_distilled"] = "hunyuan-video-1_5-sr"; Remaps["hunyuanvideo1.5_720p_sr_distilled"] = "hunyuan-video-1_5-sr"; + Remaps["hidream_o1_image"] = "hidream-o1"; } /// Returns the model class that matches this model, or null if none.