diff --git a/examples/hyvideo_ip2v_experimental_dango.json b/examples/hyvideo_ip2v_experimental_dango.json index 507b76d..503fbda 100644 --- a/examples/hyvideo_ip2v_experimental_dango.json +++ b/examples/hyvideo_ip2v_experimental_dango.json @@ -1 +1 @@ -{"last_node_id":67,"last_link_id":75,"nodes":[{"id":7,"type":"HyVideoVAELoader","pos":[-277,-284],"size":[379.166748046875,82],"flags":{},"order":0,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7}],"outputs":[{"name":"vae","type":"VAE","links":[6],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoVAELoader"},"widgets_values":["hyvid/hunyuan_video_vae_bf16.safetensors","bf16"]},{"id":1,"type":"HyVideoModelLoader","pos":[-285,-94],"size":[426.1773986816406,194],"flags":{},"order":1,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7},{"name":"block_swap_args","type":"BLOCKSWAPARGS","link":null,"shape":7},{"name":"lora","type":"HYVIDLORA","link":null,"shape":7}],"outputs":[{"name":"model","type":"HYVIDEOMODEL","links":[2],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoModelLoader"},"widgets_values":["hyvideo/hunyuan_video_720_cfgdistill_bf16.safetensors","bf16","fp8_e4m3fn_fast","offload_device","sageattn_varlen"]},{"id":65,"type":"LoadImage","pos":[-540,530],"size":[315,314],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[{"name":"IMAGE","type":"IMAGE","links":[75],"slot_index":0},{"name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":5,"type":"HyVideoDecode","pos":[690,-230],"size":[345.4285888671875,150],"flags":{},"order":6,"mode":0,"inputs":[{"name":"vae","type":"VAE","link":6},{"name":"samples","type":"LATENT","link":4}],"outputs":[{"name":"images","type":"IMAGE","links":[42],"slot_index":0}],"properties":{"Node name for 
S&R":"HyVideoDecode"},"widgets_values":[true,64,256,true]},{"id":3,"type":"HyVideoSampler","pos":[260,-230],"size":[315,546],"flags":{},"order":5,"mode":0,"inputs":[{"name":"model","type":"HYVIDEOMODEL","link":2},{"name":"hyvid_embeds","type":"HYVIDEMBEDS","link":67},{"name":"samples","type":"LATENT","link":null,"shape":7},{"name":"stg_args","type":"STGARGS","link":null,"shape":7}],"outputs":[{"name":"samples","type":"LATENT","links":[4],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoSampler"},"widgets_values":[720,480,61,30,7.5,7.5,233,"fixed",true,1]},{"id":34,"type":"VHS_VideoCombine","pos":[660,30],"size":[580.7774658203125,697.8516235351562],"flags":{},"order":7,"mode":0,"inputs":[{"name":"images","type":"IMAGE","link":42},{"name":"audio","type":"AUDIO","link":null,"shape":7},{"name":"meta_batch","type":"VHS_BatchManager","link":null,"shape":7},{"name":"vae","type":"VAE","link":null,"shape":7}],"outputs":[{"name":"Filenames","type":"VHS_FILENAMES","links":null}],"properties":{"Node name for S&R":"VHS_VideoCombine"},"widgets_values":{"frame_rate":24,"loop_count":0,"filename_prefix":"HunyuanVideo","format":"video/h264-mp4","pix_fmt":"yuv420p","crf":20,"save_metadata":true,"pingpong":false,"save_output":true,"videopreview":{"hidden":false,"paused":false,"params":{"filename":"HunyuanVideo_00298.mp4","subfolder":"","type":"output","format":"video/h264-mp4","frame_rate":24},"muted":false}}},{"id":59,"type":"DownloadAndLoadHyVideoTextEncoder","pos":[-310,240],"size":[441,202],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[{"name":"hyvid_text_encoder","type":"HYVIDTEXTENCODER","links":[66]}],"properties":{"Node name for 
S&R":"DownloadAndLoadHyVideoTextEncoder"},"widgets_values":["xtuner/llava-llama-3-8b-v1_1-transformers","openai/clip-vit-large-patch14","vision_languague","fp16",false,2,"disabled"]},{"id":63,"type":"HyVideoTextEncode","pos":[180,500],"size":[443.4000244140625,322],"flags":{},"order":4,"mode":0,"inputs":[{"name":"text_encoders","type":"HYVIDTEXTENCODER","link":66},{"name":"custom_prompt_template","type":"PROMPT_TEMPLATE","link":null,"shape":7},{"name":"clip_l","type":"CLIP","link":null,"shape":7},{"name":"image1","type":"IMAGE","link":75,"shape":7},{"name":"image2","type":"IMAGE","link":null,"shape":7},{"name":"hyvid_cfg","type":"HYVID_CFG","link":null,"shape":7}],"outputs":[{"name":"hyvid_embeds","type":"HYVIDEMBEDS","links":[67]}],"properties":{"Node name for S&R":"HyVideoTextEncode"},"widgets_values":["Astonishing promotion video of a toy movie, high quality video 4k A fluffy plushie stuffed animal of , furry fox ears, dancing on grass land with blue sky. cinematic realistic rendering ","::3",true,"video","A dancing plushie, 4K, 8K, super detailed cinematic shot"]}],"links":[[2,1,0,3,0,"HYVIDEOMODEL"],[4,3,0,5,1,"LATENT"],[6,7,0,5,0,"VAE"],[42,5,0,34,0,"IMAGE"],[66,59,0,63,0,"HYVIDTEXTENCODER"],[67,63,0,3,1,"HYVIDEMBEDS"],[75,65,0,63,3,"IMAGE"]],"groups":[],"config":{},"extra":{"ds":{"scale":0.8140274938684717,"offset":[1434.7893740456389,367.5882823671863]},"workspace_info":{"id":"kZ4q7BpZY-s3NIJ0k8OPz"}},"version":0.4} \ No newline at end of file +{"last_node_id":67,"last_link_id":75,"nodes":[{"id":7,"type":"HyVideoVAELoader","pos":[-277,-284],"size":[379.166748046875,82],"flags":{},"order":0,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7}],"outputs":[{"name":"vae","type":"VAE","links":[6],"slot_index":0}],"properties":{"Node name for 
S&R":"HyVideoVAELoader"},"widgets_values":["hyvid/hunyuan_video_vae_bf16.safetensors","bf16"]},{"id":1,"type":"HyVideoModelLoader","pos":[-285,-94],"size":[426.1773986816406,194],"flags":{},"order":1,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7},{"name":"block_swap_args","type":"BLOCKSWAPARGS","link":null,"shape":7},{"name":"lora","type":"HYVIDLORA","link":null,"shape":7}],"outputs":[{"name":"model","type":"HYVIDEOMODEL","links":[2],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoModelLoader"},"widgets_values":["hyvideo/hunyuan_video_720_cfgdistill_bf16.safetensors","bf16","fp8_e4m3fn_fast","offload_device","sageattn_varlen"]},{"id":65,"type":"LoadImage","pos":[-540,530],"size":[315,314],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[{"name":"IMAGE","type":"IMAGE","links":[75],"slot_index":0},{"name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":5,"type":"HyVideoDecode","pos":[690,-230],"size":[345.4285888671875,150],"flags":{},"order":6,"mode":0,"inputs":[{"name":"vae","type":"VAE","link":6},{"name":"samples","type":"LATENT","link":4}],"outputs":[{"name":"images","type":"IMAGE","links":[42],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoDecode"},"widgets_values":[true,64,256,true]},{"id":3,"type":"HyVideoSampler","pos":[260,-230],"size":[315,546],"flags":{},"order":5,"mode":0,"inputs":[{"name":"model","type":"HYVIDEOMODEL","link":2},{"name":"hyvid_embeds","type":"HYVIDEMBEDS","link":67},{"name":"samples","type":"LATENT","link":null,"shape":7},{"name":"stg_args","type":"STGARGS","link":null,"shape":7}],"outputs":[{"name":"samples","type":"LATENT","links":[4],"slot_index":0}],"properties":{"Node name for 
S&R":"HyVideoSampler"},"widgets_values":[720,480,61,30,7.5,7.5,233,"fixed",true,1]},{"id":34,"type":"VHS_VideoCombine","pos":[660,30],"size":[580.7774658203125,697.8516235351562],"flags":{},"order":7,"mode":0,"inputs":[{"name":"images","type":"IMAGE","link":42},{"name":"audio","type":"AUDIO","link":null,"shape":7},{"name":"meta_batch","type":"VHS_BatchManager","link":null,"shape":7},{"name":"vae","type":"VAE","link":null,"shape":7}],"outputs":[{"name":"Filenames","type":"VHS_FILENAMES","links":null}],"properties":{"Node name for S&R":"VHS_VideoCombine"},"widgets_values":{"frame_rate":24,"loop_count":0,"filename_prefix":"HunyuanVideo","format":"video/h264-mp4","pix_fmt":"yuv420p","crf":20,"save_metadata":true,"pingpong":false,"save_output":true,"videopreview":{"hidden":false,"paused":false,"params":{"filename":"HunyuanVideo_00298.mp4","subfolder":"","type":"output","format":"video/h264-mp4","frame_rate":24},"muted":false}}},{"id":59,"type":"DownloadAndLoadHyVideoTextEncoder","pos":[-310,240],"size":[441,202],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[{"name":"hyvid_text_encoder","type":"HYVIDTEXTENCODER","links":[66]}],"properties":{"Node name for S&R":"DownloadAndLoadHyVideoTextEncoder"},"widgets_values":["xtuner/llava-llama-3-8b-v1_1-transformers","openai/clip-vit-large-patch14","fp16",false,2,"disabled"]},{"id":63,"type":"HyVideoTextImageEncode","pos":[180,500],"size":[443.4000244140625,322],"flags":{},"order":4,"mode":0,"inputs":[{"name":"text_encoders","type":"HYVIDTEXTENCODER","link":66},{"name":"custom_prompt_template","type":"PROMPT_TEMPLATE","link":null,"shape":7},{"name":"clip_l","type":"CLIP","link":null,"shape":7},{"name":"image1","type":"IMAGE","link":75,"shape":7},{"name":"image2","type":"IMAGE","link":null,"shape":7},{"name":"hyvid_cfg","type":"HYVID_CFG","link":null,"shape":7}],"outputs":[{"name":"hyvid_embeds","type":"HYVIDEMBEDS","links":[67]}],"properties":{"Node name for S&R":"HyVideoTextImageEncode"},"widgets_values":["Astonishing promotion 
video of a toy movie, high quality video 4k A fluffy plushie stuffed animal of , furry fox ears, dancing on grass land with blue sky. cinematic realistic rendering ","::3",true,"video","A dancing plushie, 4K, 8K, super detailed cinematic shot"]}],"links":[[2,1,0,3,0,"HYVIDEOMODEL"],[4,3,0,5,1,"LATENT"],[6,7,0,5,0,"VAE"],[42,5,0,34,0,"IMAGE"],[66,59,0,63,0,"HYVIDTEXTENCODER"],[67,63,0,3,1,"HYVIDEMBEDS"],[75,65,0,63,3,"IMAGE"]],"groups":[],"config":{},"extra":{"ds":{"scale":0.8140274938684717,"offset":[1434.7893740456389,367.5882823671863]},"workspace_info":{"id":"kZ4q7BpZY-s3NIJ0k8OPz"}},"version":0.4} \ No newline at end of file diff --git a/hyvideo/text_encoder/__init__.py b/hyvideo/text_encoder/__init__.py index 14b15f6..27d4839 100644 --- a/hyvideo/text_encoder/__init__.py +++ b/hyvideo/text_encoder/__init__.py @@ -274,7 +274,6 @@ def encode( hidden_state_skip_layer=None, return_texts=False, prompt_template=None, - image_token_strategy="text_only", image_token_selection_expr="::4", device=None, ): diff --git a/nodes.py b/nodes.py index bed5d74..efdabf6 100644 --- a/nodes.py +++ b/nodes.py @@ -568,7 +568,6 @@ def INPUT_TYPES(s): "required": { "llm_model": (["Kijai/llava-llama-3-8b-text-encoder-tokenizer","xtuner/llava-llama-3-8b-v1_1-transformers"],), "clip_model": (["disabled","openai/clip-vit-large-patch14",],), - "lm_type": (["languague","vision_languague"],), "precision": (["fp16", "fp32", "bf16"], {"default": "bf16"} ), @@ -586,8 +585,12 @@ def INPUT_TYPES(s): CATEGORY = "HunyuanVideoWrapper" DESCRIPTION = "Loads Hunyuan text_encoder model from 'ComfyUI/models/LLM'" - def loadmodel(self, llm_model, clip_model, precision, lm_type, apply_final_norm=False, hidden_state_skip_layer=2, quantization="disabled"): - + def loadmodel(self, llm_model, clip_model, precision, apply_final_norm=False, hidden_state_skip_layer=2, quantization="disabled"): + lm_type_mapping = { + "Kijai/llava-llama-3-8b-text-encoder-tokenizer": "llm", + 
"xtuner/llava-llama-3-8b-v1_1-transformers": "vlm", + } + lm_type = lm_type_mapping[llm_model] device = mm.get_torch_device() offload_device = mm.unet_offload_device() dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] @@ -636,11 +639,6 @@ def loadmodel(self, llm_model, clip_model, precision, lm_type, apply_final_norm= local_dir=base_path, local_dir_use_symlinks=False, ) - LM_TYPE = { - "languague": "llm", - "vision_languague": "vlm", - } - lm_type = LM_TYPE.get(lm_type, "llm") text_encoder = TextEncoder( text_encoder_path=base_path, text_encoder_type=lm_type, @@ -708,16 +706,12 @@ def INPUT_TYPES(s): return {"required": { "text_encoders": ("HYVIDTEXTENCODER",), "prompt": ("STRING", {"default": "", "multiline": True} ), - "image_token_selection_expr": ("STRING", {"default": "::4", "multiline": False} ), }, "optional": { "force_offload": ("BOOLEAN", {"default": True}), "prompt_template": (["video", "image", "custom", "disabled"], {"default": "video", "tooltip": "Use the default prompt templates for the llm text encoder"}), "custom_prompt_template": ("PROMPT_TEMPLATE", {"default": PROMPT_TEMPLATE["dit-llm-encode-video"], "multiline": True}), "clip_l": ("CLIP", {"tooltip": "Use comfy clip model instead, in this case the text encoder loader's clip_l should be disabled"}), - "image1": ("IMAGE", {"default": None}), - "image2": ("IMAGE", {"default": None}), - "clip_text_override": ("STRING", {"default": "", "multiline": True} ), "hyvid_cfg": ("HYVID_CFG", ), } } @@ -727,8 +721,8 @@ def INPUT_TYPES(s): FUNCTION = "process" CATEGORY = "HunyuanVideoWrapper" - def process(self, text_encoders, prompt, force_offload=True, prompt_template="video", custom_prompt_template=None, clip_l=None, image_token_strategy="text_only", image_token_selection_expr="::4", hyvid_cfg=None, image1=None, image2=None, clip_text_override=None): - if len(clip_text_override) == 0: + def process(self, text_encoders, prompt, force_offload=True, 
prompt_template="video", custom_prompt_template=None, clip_l=None, image_token_selection_expr="::4", hyvid_cfg=None, image1=None, image2=None, clip_text_override=None): + if clip_text_override is not None and len(clip_text_override) == 0: clip_text_override = None device = mm.text_encoder_device() offload_device = mm.text_encoder_offload_device() @@ -766,7 +760,7 @@ def process(self, text_encoders, prompt, force_offload=True, prompt_template="vi else: prompt_template_dict = None - def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_strategy="text_only", image_token_selection_expr="::4", image1=None, image2=None, clip_text_override=None): + def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_selection_expr="::4", image1=None, image2=None, clip_text_override=None): batch_size = 1 num_videos_per_prompt = 1 @@ -777,7 +771,6 @@ def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_strat clip_text_override=clip_text_override) prompt_outputs = text_encoder.encode(text_inputs, prompt_template=prompt_template_dict, - image_token_strategy=image_token_strategy, image_token_selection_expr=image_token_selection_expr, device=device ) @@ -850,7 +843,6 @@ def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_strat prompt, negative_prompt, text_encoder_1, - image_token_strategy=image_token_strategy, image_token_selection_expr=image_token_selection_expr, image1=image1, image2=image2) @@ -902,6 +894,32 @@ def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_strat } return (prompt_embeds_dict,) +class HyVideoTextImageEncode(HyVideoTextEncode): + # Experimental Image Prompt to Video (IP2V) via VLM implementation by @Dango233 + @classmethod + def INPUT_TYPES(s): + return {"required": { + "text_encoders": ("HYVIDTEXTENCODER",), + "prompt": ("STRING", {"default": "", "multiline": True} ), + "image_token_selection_expr": ("STRING", {"default": "::4", "multiline": False} ), + }, + 
"optional": { + "force_offload": ("BOOLEAN", {"default": True}), + "prompt_template": (["video", "image", "custom", "disabled"], {"default": "video", "tooltip": "Use the default prompt templates for the llm text encoder"}), + "custom_prompt_template": ("PROMPT_TEMPLATE", {"default": PROMPT_TEMPLATE["dit-llm-encode-video"], "multiline": True}), + "clip_l": ("CLIP", {"tooltip": "Use comfy clip model instead, in this case the text encoder loader's clip_l should be disabled"}), + "image1": ("IMAGE", {"default": None}), + "image2": ("IMAGE", {"default": None}), + "clip_text_override": ("STRING", {"default": "", "multiline": True} ), + "hyvid_cfg": ("HYVID_CFG", ), + } + } + + RETURN_TYPES = ("HYVIDEMBEDS", ) + RETURN_NAMES = ("hyvid_embeds",) + FUNCTION = "process" + CATEGORY = "HunyuanVideoWrapper" + # region CFG class HyVideoCFG: @classmethod @@ -1365,6 +1383,7 @@ def sample(self, samples, seed, min_val, max_val, r_bias, g_bias, b_bias): "HyVideoSampler": HyVideoSampler, "HyVideoDecode": HyVideoDecode, "HyVideoTextEncode": HyVideoTextEncode, + "HyVideoTextImageEncode": HyVideoTextImageEncode, "HyVideoModelLoader": HyVideoModelLoader, "HyVideoVAELoader": HyVideoVAELoader, "DownloadAndLoadHyVideoTextEncoder": DownloadAndLoadHyVideoTextEncoder, @@ -1384,6 +1403,7 @@ def sample(self, samples, seed, min_val, max_val, r_bias, g_bias, b_bias): "HyVideoSampler": "HunyuanVideo Sampler", "HyVideoDecode": "HunyuanVideo Decode", "HyVideoTextEncode": "HunyuanVideo TextEncode", + "HyVideoTextImageEncode": "HunyuanVideo TextImageEncode (IP2V)", "HyVideoModelLoader": "HunyuanVideo Model Loader", "HyVideoVAELoader": "HunyuanVideo VAE Loader", "DownloadAndLoadHyVideoTextEncoder": "(Down)Load HunyuanVideo TextEncoder",