Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions blueprints/Text to Image (LongCat-Image).json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id": "a7e3b1c0-4f2d-4e8a-9b1c-longcat00001", "revision": 0, "last_node_id": 20, "last_link_id": 20, "nodes": [{"id": 1, "type": "lc-subgraph-001", "pos": [0, 1230], "size": [400, 470], "flags": {}, "order": 0, "mode": 0, "inputs": [{"label": "prompt", "name": "text", "type": "STRING", "widget": {"name": "text"}, "link": null}, {"name": "width", "type": "INT", "widget": {"name": "width"}, "link": null}, {"name": "height", "type": "INT", "widget": {"name": "height"}, "link": null}, {"name": "unet_name", "type": "COMBO", "widget": {"name": "unet_name"}, "link": null}, {"name": "clip_name", "type": "COMBO", "widget": {"name": "clip_name"}, "link": null}, {"name": "vae_name", "type": "COMBO", "widget": {"name": "vae_name"}, "link": null}], "outputs": [{"localized_name": "IMAGE", "name": "IMAGE", "type": "IMAGE", "links": []}], "properties": {"proxyWidgets": [["-1", "text"], ["-1", "width"], ["-1", "height"], ["7", "seed"], ["7", "control_after_generate"], ["-1", "unet_name"], ["-1", "clip_name"], ["-1", "vae_name"]], "cnr_id": "comfy-core", "ver": "0.3.73", "enableTabs": false}, "widgets_values": ["A young Asian woman wearing a yellow knit sweater with a white necklace, sitting with her hands on her knees and a serene expression. 
The background is a rough brick wall with warm afternoon sunlight.", 768, 1344, null, null, "longcat_image_bf16.safetensors", "qwen_2.5_vl_7b.safetensors", "ae.safetensors"]}], "links": [], "groups": [], "definitions": {"subgraphs": [{"id": "lc-subgraph-001", "version": 1, "state": {"lastGroupId": 4, "lastNodeId": 20, "lastLinkId": 20, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "local-Text to Image (LongCat-Image)", "inputNode": {"id": -10, "bounding": [-80, 425, 120, 160]}, "outputNode": {"id": -20, "bounding": [1490, 415, 120, 60]}, "inputs": [{"id": "inp-text", "name": "text", "type": "STRING", "linkIds": [10], "label": "prompt", "pos": [20, 445]}, {"id": "inp-width", "name": "width", "type": "INT", "linkIds": [11], "pos": [20, 465]}, {"id": "inp-height", "name": "height", "type": "INT", "linkIds": [12], "pos": [20, 485]}, {"id": "inp-unet", "name": "unet_name", "type": "COMBO", "linkIds": [13], "pos": [20, 505]}, {"id": "inp-clip", "name": "clip_name", "type": "COMBO", "linkIds": [14], "pos": [20, 525]}, {"id": "inp-vae", "name": "vae_name", "type": "COMBO", "linkIds": [15], "pos": [20, 545]}], "outputs": [{"id": "out-image", "name": "IMAGE", "type": "IMAGE", "linkIds": [9], "localized_name": "IMAGE", "pos": [1510, 435]}], "widgets": [], "nodes": [{"id": 1, "type": "UNETLoader", "pos": [110, 200], "size": [270, 82], "flags": {}, "order": 0, "mode": 0, "inputs": [{"name": "unet_name", "type": "COMBO", "widget": {"name": "unet_name"}, "link": 13}, {"name": "weight_dtype", "type": "COMBO", "widget": {"name": "weight_dtype"}, "link": null}], "outputs": [{"name": "MODEL", "type": "MODEL", "links": [1]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "UNETLoader", "models": [{"name": "longcat_image_bf16.safetensors", "url": "https://huggingface.co/TalmajM/LongCat-Image_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/longcat_image_bf16.safetensors", "directory": "unet"}]}, "widgets_values": 
["longcat_image_bf16.safetensors", "default"]}, {"id": 2, "type": "CLIPLoader", "pos": [110, 330], "size": [270, 106], "flags": {}, "order": 1, "mode": 0, "inputs": [{"name": "clip_name", "type": "COMBO", "widget": {"name": "clip_name"}, "link": 14}, {"name": "type", "type": "COMBO", "widget": {"name": "type"}, "link": null}, {"name": "device", "shape": 7, "type": "COMBO", "widget": {"name": "device"}, "link": null}], "outputs": [{"name": "CLIP", "type": "CLIP", "links": [2, 16]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CLIPLoader", "models": [{"name": "qwen_2.5_vl_7b.safetensors", "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b.safetensors", "directory": "text_encoders"}]}, "widgets_values": ["qwen_2.5_vl_7b.safetensors", "longcat_image", "default"]}, {"id": 3, "type": "VAELoader", "pos": [110, 480], "size": [270, 58], "flags": {}, "order": 2, "mode": 0, "inputs": [{"name": "vae_name", "type": "COMBO", "widget": {"name": "vae_name"}, "link": 15}], "outputs": [{"name": "VAE", "type": "VAE", "links": [3]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "VAELoader", "models": [{"name": "ae.safetensors", "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/vae/ae.safetensors", "directory": "vae"}]}, "widgets_values": ["ae.safetensors"]}, {"id": 4, "type": "CLIPTextEncodeLongCatImage", "pos": [430, 200], "size": [410, 250], "flags": {}, "order": 3, "mode": 0, "inputs": [{"name": "clip", "type": "CLIP", "link": 2}, {"name": "text", "type": "STRING", "widget": {"name": "text"}, "link": 10}, {"name": "guidance", "type": "FLOAT", "widget": {"name": "guidance"}, "link": null}], "outputs": [{"name": "CONDITIONING", "type": "CONDITIONING", "links": [4]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CLIPTextEncodeLongCatImage"}, "widgets_values": ["", 4.0]}, {"id": 5, "type": 
"CLIPTextEncodeLongCatImage", "pos": [430, 510], "size": [410, 120], "flags": {}, "order": 4, "mode": 0, "inputs": [{"name": "clip", "type": "CLIP", "link": 16}, {"name": "text", "type": "STRING", "widget": {"name": "text"}, "link": null}, {"name": "guidance", "type": "FLOAT", "widget": {"name": "guidance"}, "link": null}], "outputs": [{"name": "CONDITIONING", "type": "CONDITIONING", "links": [6]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CLIPTextEncodeLongCatImage"}, "widgets_values": ["", 4.0]}, {"id": 10, "type": "CFGRenormLongCatImage", "pos": [880, 160], "size": [280, 26], "flags": {}, "order": 5, "mode": 0, "inputs": [{"name": "model", "type": "MODEL", "link": 1}], "outputs": [{"name": "MODEL", "type": "MODEL", "links": [17]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CFGRenormLongCatImage"}, "widgets_values": []}, {"id": 6, "type": "EmptySD3LatentImage", "pos": [110, 630], "size": [260, 106], "flags": {}, "order": 6, "mode": 0, "inputs": [{"name": "width", "type": "INT", "widget": {"name": "width"}, "link": 11}, {"name": "height", "type": "INT", "widget": {"name": "height"}, "link": 12}, {"name": "batch_size", "type": "INT", "widget": {"name": "batch_size"}, "link": null}], "outputs": [{"name": "LATENT", "type": "LATENT", "links": [7]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "EmptySD3LatentImage"}, "widgets_values": [768, 1344, 1]}, {"id": 7, "type": "KSampler", "pos": [880, 230], "size": [315, 262], "flags": {}, "order": 7, "mode": 0, "inputs": [{"name": "model", "type": "MODEL", "link": 17}, {"name": "positive", "type": "CONDITIONING", "link": 4}, {"name": "negative", "type": "CONDITIONING", "link": 6}, {"name": "latent_image", "type": "LATENT", "link": 7}, {"name": "seed", "type": "INT", "widget": {"name": "seed"}, "link": null}, {"name": "steps", "type": "INT", "widget": {"name": "steps"}, "link": null}, {"name": "cfg", "type": "FLOAT", 
"widget": {"name": "cfg"}, "link": null}, {"name": "sampler_name", "type": "COMBO", "widget": {"name": "sampler_name"}, "link": null}, {"name": "scheduler", "type": "COMBO", "widget": {"name": "scheduler"}, "link": null}, {"name": "denoise", "type": "FLOAT", "widget": {"name": "denoise"}, "link": null}], "outputs": [{"name": "LATENT", "type": "LATENT", "links": [8]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "KSampler"}, "widgets_values": [0, "randomize", 20, 4.0, "euler", "simple", 1.0]}, {"id": 8, "type": "VAEDecode", "pos": [1220, 160], "size": [210, 46], "flags": {}, "order": 8, "mode": 0, "inputs": [{"name": "samples", "type": "LATENT", "link": 8}, {"name": "vae", "type": "VAE", "link": 3}], "outputs": [{"name": "IMAGE", "type": "IMAGE", "links": [9]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "VAEDecode"}, "widgets_values": []}], "groups": [{"id": 1, "title": "Image size", "bounding": [100, 560, 290, 200], "color": "#3f789e", "font_size": 24, "flags": {}}, {"id": 2, "title": "Prompt", "bounding": [410, 130, 450, 540], "color": "#3f789e", "font_size": 24, "flags": {}}, {"id": 3, "title": "Models", "bounding": [100, 130, 290, 413], "color": "#3f789e", "font_size": 24, "flags": {}}], "links": [{"id": 1, "origin_id": 1, "origin_slot": 0, "target_id": 10, "target_slot": 0, "type": "MODEL"}, {"id": 2, "origin_id": 2, "origin_slot": 0, "target_id": 4, "target_slot": 0, "type": "CLIP"}, {"id": 3, "origin_id": 3, "origin_slot": 0, "target_id": 8, "target_slot": 1, "type": "VAE"}, {"id": 4, "origin_id": 4, "origin_slot": 0, "target_id": 7, "target_slot": 1, "type": "CONDITIONING"}, {"id": 6, "origin_id": 5, "origin_slot": 0, "target_id": 7, "target_slot": 2, "type": "CONDITIONING"}, {"id": 7, "origin_id": 6, "origin_slot": 0, "target_id": 7, "target_slot": 3, "type": "LATENT"}, {"id": 8, "origin_id": 7, "origin_slot": 0, "target_id": 8, "target_slot": 0, "type": "LATENT"}, {"id": 9, "origin_id": 8, 
"origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "IMAGE"}, {"id": 10, "origin_id": -10, "origin_slot": 0, "target_id": 4, "target_slot": 1, "type": "STRING"}, {"id": 11, "origin_id": -10, "origin_slot": 1, "target_id": 6, "target_slot": 0, "type": "INT"}, {"id": 12, "origin_id": -10, "origin_slot": 2, "target_id": 6, "target_slot": 1, "type": "INT"}, {"id": 13, "origin_id": -10, "origin_slot": 3, "target_id": 1, "target_slot": 0, "type": "COMBO"}, {"id": 14, "origin_id": -10, "origin_slot": 4, "target_id": 2, "target_slot": 0, "type": "COMBO"}, {"id": 15, "origin_id": -10, "origin_slot": 5, "target_id": 3, "target_slot": 0, "type": "COMBO"}, {"id": 16, "origin_id": 2, "origin_slot": 0, "target_id": 5, "target_slot": 0, "type": "CLIP"}, {"id": 17, "origin_id": 10, "origin_slot": 0, "target_id": 7, "target_slot": 0, "type": "MODEL"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Image generation and editing/Text to image"}]}, "config": {}, "extra": {"frontendVersion": "1.37.10", "workflowRendererVersion": "LG"}, "version": 0.4}
19 changes: 19 additions & 0 deletions comfy/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,25 @@ def extra_conds_shapes(self, **kwargs):
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()[2:]), ref_latents))])
return out

class LongCatImage(Flux):
    """Flux-variant model for LongCat-Image.

    Differences from plain Flux, as expressed here:
    - injects default RoPE shift values into ``transformer_options`` on every
      ``_apply_model`` call (caller-supplied values win),
    - has no ADM/vector conditioning (``encode_adm`` returns ``None``),
    - drops the ``guidance`` entry from the extra conds produced by Flux.
    """

    # Default RoPE axis shifts used when the caller does not provide them.
    _ROPE_DEFAULTS = {"shift_t": 1.0, "shift_y": 512.0, "shift_x": 512.0}

    def _apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        # Work on shallow copies so caller-owned dicts are never mutated.
        opts = dict(transformer_options)
        merged = dict(self._ROPE_DEFAULTS)
        merged.update(opts.get("rope_options", {}))  # explicit values override the defaults
        opts["rope_options"] = merged
        return super()._apply_model(x, t, c_concat, c_crossattn, control, opts, **kwargs)

    def encode_adm(self, **kwargs):
        # LongCat-Image has no vector/ADM conditioning path.
        return None

    def extra_conds(self, **kwargs):
        conds = super().extra_conds(**kwargs)
        # Guidance embedding is disabled for this model; remove it if Flux added one.
        conds.pop('guidance', None)
        return conds

class Flux2(Flux):
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
Expand Down
2 changes: 2 additions & 0 deletions comfy/model_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["txt_norm"] = any_suffix_in(state_dict_keys, key_prefix, 'txt_norm.', ["weight", "scale"])
if dit_config["yak_mlp"] and dit_config["txt_norm"]: # Ovis model
dit_config["txt_ids_dims"] = [1, 2]
if dit_config.get("context_in_dim") == 3584 and dit_config["vec_in_dim"] is None: # LongCat-Image
dit_config["txt_ids_dims"] = [1, 2]

return dit_config

Expand Down
5 changes: 5 additions & 0 deletions comfy/sd.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
import comfy.text_encoders.newbie
import comfy.text_encoders.anima
import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image

import comfy.model_patcher
import comfy.lora
Expand Down Expand Up @@ -1159,6 +1160,7 @@ class CLIPType(Enum):
KANDINSKY5_IMAGE = 23
NEWBIE = 24
FLUX2 = 25
LONGCAT_IMAGE = 26


def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
Expand Down Expand Up @@ -1371,6 +1373,9 @@ class EmptyClass:
if clip_type == CLIPType.HUNYUAN_IMAGE:
clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
elif clip_type == CLIPType.LONGCAT_IMAGE:
clip_target.clip = comfy.text_encoders.longcat_image.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.longcat_image.LongCatImageTokenizer
else:
clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer
Expand Down
34 changes: 33 additions & 1 deletion comfy/supported_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import comfy.text_encoders.z_image
import comfy.text_encoders.anima
import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image

from . import supported_models_base
from . import latent_formats
Expand Down Expand Up @@ -1667,6 +1668,37 @@ def clip_target(self, state_dict={}):
return supported_models_base.ClipTarget(comfy.text_encoders.ace15.ACE15Tokenizer, comfy.text_encoders.ace15.te(**detect))


models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
class LongCatImage(supported_models_base.BASE):
    """Supported-model entry for LongCat-Image.

    Detection keys: Flux architecture with guidance embedding disabled, no
    ``vec_in_dim``, a 3584-wide context (Qwen2.5-VL hidden size) and txt id
    dims ``[1, 2]``.
    """

    unet_config = {
        "image_model": "flux",
        "guidance_embed": False,
        "vec_in_dim": None,
        "context_in_dim": 3584,
        "txt_ids_dims": [1, 2],
    }

    # No sampling overrides; model_base defaults apply.
    sampling_settings = {}

    unet_extra_config = {}
    latent_format = latent_formats.Flux

    memory_usage_factor = 2.5

    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]

    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def get_model(self, state_dict, prefix="", device=None):
        # Model-specific behavior (RoPE shifts, dropped guidance cond) lives in model_base.LongCatImage.
        return model_base.LongCatImage(self, device=device)

    def clip_target(self, state_dict={}):
        # Detect the Qwen2.5-VL 7B text-encoder layout from the checkpoint keys,
        # then pair the LongCat tokenizer with the matching text-encoder class.
        te_prefix = "{}qwen25_7b.transformer.".format(self.text_encoder_key_prefix[0])
        detected = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, te_prefix)
        return supported_models_base.ClipTarget(comfy.text_encoders.longcat_image.LongCatImageTokenizer, comfy.text_encoders.longcat_image.te(**detected))

models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]

models += [SVD_img2vid]
Loading