Skip to content

Commit

Permalink
* TextImage Encoder now a separate node.
Browse files Browse the repository at this point in the history
* No more behavior change of original nodes
* Clean up
  • Loading branch information
Dango233 committed Dec 15, 2024
1 parent a3fbc3d commit fbfaa8d
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 19 deletions.
2 changes: 1 addition & 1 deletion examples/hyvideo_ip2v_experimental_dango.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"last_node_id":67,"last_link_id":75,"nodes":[{"id":7,"type":"HyVideoVAELoader","pos":[-277,-284],"size":[379.166748046875,82],"flags":{},"order":0,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7}],"outputs":[{"name":"vae","type":"VAE","links":[6],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoVAELoader"},"widgets_values":["hyvid/hunyuan_video_vae_bf16.safetensors","bf16"]},{"id":1,"type":"HyVideoModelLoader","pos":[-285,-94],"size":[426.1773986816406,194],"flags":{},"order":1,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7},{"name":"block_swap_args","type":"BLOCKSWAPARGS","link":null,"shape":7},{"name":"lora","type":"HYVIDLORA","link":null,"shape":7}],"outputs":[{"name":"model","type":"HYVIDEOMODEL","links":[2],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoModelLoader"},"widgets_values":["hyvideo/hunyuan_video_720_cfgdistill_bf16.safetensors","bf16","fp8_e4m3fn_fast","offload_device","sageattn_varlen"]},{"id":65,"type":"LoadImage","pos":[-540,530],"size":[315,314],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[{"name":"IMAGE","type":"IMAGE","links":[75],"slot_index":0},{"name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":5,"type":"HyVideoDecode","pos":[690,-230],"size":[345.4285888671875,150],"flags":{},"order":6,"mode":0,"inputs":[{"name":"vae","type":"VAE","link":6},{"name":"samples","type":"LATENT","link":4}],"outputs":[{"name":"images","type":"IMAGE","links":[42],"slot_index":0}],"properties":{"Node name for 
S&R":"HyVideoDecode"},"widgets_values":[true,64,256,true]},{"id":3,"type":"HyVideoSampler","pos":[260,-230],"size":[315,546],"flags":{},"order":5,"mode":0,"inputs":[{"name":"model","type":"HYVIDEOMODEL","link":2},{"name":"hyvid_embeds","type":"HYVIDEMBEDS","link":67},{"name":"samples","type":"LATENT","link":null,"shape":7},{"name":"stg_args","type":"STGARGS","link":null,"shape":7}],"outputs":[{"name":"samples","type":"LATENT","links":[4],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoSampler"},"widgets_values":[720,480,61,30,7.5,7.5,233,"fixed",true,1]},{"id":34,"type":"VHS_VideoCombine","pos":[660,30],"size":[580.7774658203125,697.8516235351562],"flags":{},"order":7,"mode":0,"inputs":[{"name":"images","type":"IMAGE","link":42},{"name":"audio","type":"AUDIO","link":null,"shape":7},{"name":"meta_batch","type":"VHS_BatchManager","link":null,"shape":7},{"name":"vae","type":"VAE","link":null,"shape":7}],"outputs":[{"name":"Filenames","type":"VHS_FILENAMES","links":null}],"properties":{"Node name for S&R":"VHS_VideoCombine"},"widgets_values":{"frame_rate":24,"loop_count":0,"filename_prefix":"HunyuanVideo","format":"video/h264-mp4","pix_fmt":"yuv420p","crf":20,"save_metadata":true,"pingpong":false,"save_output":true,"videopreview":{"hidden":false,"paused":false,"params":{"filename":"HunyuanVideo_00298.mp4","subfolder":"","type":"output","format":"video/h264-mp4","frame_rate":24},"muted":false}}},{"id":59,"type":"DownloadAndLoadHyVideoTextEncoder","pos":[-310,240],"size":[441,202],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[{"name":"hyvid_text_encoder","type":"HYVIDTEXTENCODER","links":[66]}],"properties":{"Node name for 
S&R":"DownloadAndLoadHyVideoTextEncoder"},"widgets_values":["xtuner/llava-llama-3-8b-v1_1-transformers","openai/clip-vit-large-patch14","vision_languague","fp16",false,2,"disabled"]},{"id":63,"type":"HyVideoTextEncode","pos":[180,500],"size":[443.4000244140625,322],"flags":{},"order":4,"mode":0,"inputs":[{"name":"text_encoders","type":"HYVIDTEXTENCODER","link":66},{"name":"custom_prompt_template","type":"PROMPT_TEMPLATE","link":null,"shape":7},{"name":"clip_l","type":"CLIP","link":null,"shape":7},{"name":"image1","type":"IMAGE","link":75,"shape":7},{"name":"image2","type":"IMAGE","link":null,"shape":7},{"name":"hyvid_cfg","type":"HYVID_CFG","link":null,"shape":7}],"outputs":[{"name":"hyvid_embeds","type":"HYVIDEMBEDS","links":[67]}],"properties":{"Node name for S&R":"HyVideoTextEncode"},"widgets_values":["Astonishing promotion video of a toy movie, high quality video 4k A fluffy plushie stuffed animal of <image>, furry fox ears, dancing on grass land with blue sky. cinematic realistic rendering ","::3",true,"video","A dancing plushie, 4K, 8K, super detailed cinematic shot"]}],"links":[[2,1,0,3,0,"HYVIDEOMODEL"],[4,3,0,5,1,"LATENT"],[6,7,0,5,0,"VAE"],[42,5,0,34,0,"IMAGE"],[66,59,0,63,0,"HYVIDTEXTENCODER"],[67,63,0,3,1,"HYVIDEMBEDS"],[75,65,0,63,3,"IMAGE"]],"groups":[],"config":{},"extra":{"ds":{"scale":0.8140274938684717,"offset":[1434.7893740456389,367.5882823671863]},"workspace_info":{"id":"kZ4q7BpZY-s3NIJ0k8OPz"}},"version":0.4}
{"last_node_id":67,"last_link_id":75,"nodes":[{"id":7,"type":"HyVideoVAELoader","pos":[-277,-284],"size":[379.166748046875,82],"flags":{},"order":0,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7}],"outputs":[{"name":"vae","type":"VAE","links":[6],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoVAELoader"},"widgets_values":["hyvid/hunyuan_video_vae_bf16.safetensors","bf16"]},{"id":1,"type":"HyVideoModelLoader","pos":[-285,-94],"size":[426.1773986816406,194],"flags":{},"order":1,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7},{"name":"block_swap_args","type":"BLOCKSWAPARGS","link":null,"shape":7},{"name":"lora","type":"HYVIDLORA","link":null,"shape":7}],"outputs":[{"name":"model","type":"HYVIDEOMODEL","links":[2],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoModelLoader"},"widgets_values":["hyvideo/hunyuan_video_720_cfgdistill_bf16.safetensors","bf16","fp8_e4m3fn_fast","offload_device","sageattn_varlen"]},{"id":65,"type":"LoadImage","pos":[-540,530],"size":[315,314],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[{"name":"IMAGE","type":"IMAGE","links":[75],"slot_index":0},{"name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":5,"type":"HyVideoDecode","pos":[690,-230],"size":[345.4285888671875,150],"flags":{},"order":6,"mode":0,"inputs":[{"name":"vae","type":"VAE","link":6},{"name":"samples","type":"LATENT","link":4}],"outputs":[{"name":"images","type":"IMAGE","links":[42],"slot_index":0}],"properties":{"Node name for 
S&R":"HyVideoDecode"},"widgets_values":[true,64,256,true]},{"id":3,"type":"HyVideoSampler","pos":[260,-230],"size":[315,546],"flags":{},"order":5,"mode":0,"inputs":[{"name":"model","type":"HYVIDEOMODEL","link":2},{"name":"hyvid_embeds","type":"HYVIDEMBEDS","link":67},{"name":"samples","type":"LATENT","link":null,"shape":7},{"name":"stg_args","type":"STGARGS","link":null,"shape":7}],"outputs":[{"name":"samples","type":"LATENT","links":[4],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoSampler"},"widgets_values":[720,480,61,30,7.5,7.5,233,"fixed",true,1]},{"id":34,"type":"VHS_VideoCombine","pos":[660,30],"size":[580.7774658203125,697.8516235351562],"flags":{},"order":7,"mode":0,"inputs":[{"name":"images","type":"IMAGE","link":42},{"name":"audio","type":"AUDIO","link":null,"shape":7},{"name":"meta_batch","type":"VHS_BatchManager","link":null,"shape":7},{"name":"vae","type":"VAE","link":null,"shape":7}],"outputs":[{"name":"Filenames","type":"VHS_FILENAMES","links":null}],"properties":{"Node name for S&R":"VHS_VideoCombine"},"widgets_values":{"frame_rate":24,"loop_count":0,"filename_prefix":"HunyuanVideo","format":"video/h264-mp4","pix_fmt":"yuv420p","crf":20,"save_metadata":true,"pingpong":false,"save_output":true,"videopreview":{"hidden":false,"paused":false,"params":{"filename":"HunyuanVideo_00298.mp4","subfolder":"","type":"output","format":"video/h264-mp4","frame_rate":24},"muted":false}}},{"id":59,"type":"DownloadAndLoadHyVideoTextEncoder","pos":[-310,240],"size":[441,202],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[{"name":"hyvid_text_encoder","type":"HYVIDTEXTENCODER","links":[66]}],"properties":{"Node name for 
S&R":"DownloadAndLoadHyVideoTextEncoder"},"widgets_values":["xtuner/llava-llama-3-8b-v1_1-transformers","openai/clip-vit-large-patch14","fp16",false,2,"disabled"]},{"id":63,"type":"HyVideoTextImageEncode","pos":[180,500],"size":[443.4000244140625,322],"flags":{},"order":4,"mode":0,"inputs":[{"name":"text_encoders","type":"HYVIDTEXTENCODER","link":66},{"name":"custom_prompt_template","type":"PROMPT_TEMPLATE","link":null,"shape":7},{"name":"clip_l","type":"CLIP","link":null,"shape":7},{"name":"image1","type":"IMAGE","link":75,"shape":7},{"name":"image2","type":"IMAGE","link":null,"shape":7},{"name":"hyvid_cfg","type":"HYVID_CFG","link":null,"shape":7}],"outputs":[{"name":"hyvid_embeds","type":"HYVIDEMBEDS","links":[67]}],"properties":{"Node name for S&R":"HyVideoTextEncode"},"widgets_values":["Astonishing promotion video of a toy movie, high quality video 4k A fluffy plushie stuffed animal of <image>, furry fox ears, dancing on grass land with blue sky. cinematic realistic rendering ","::3",true,"video","A dancing plushie, 4K, 8K, super detailed cinematic shot"]}],"links":[[2,1,0,3,0,"HYVIDEOMODEL"],[4,3,0,5,1,"LATENT"],[6,7,0,5,0,"VAE"],[42,5,0,34,0,"IMAGE"],[66,59,0,63,0,"HYVIDTEXTENCODER"],[67,63,0,3,1,"HYVIDEMBEDS"],[75,65,0,63,3,"IMAGE"]],"groups":[],"config":{},"extra":{"ds":{"scale":0.8140274938684717,"offset":[1434.7893740456389,367.5882823671863]},"workspace_info":{"id":"kZ4q7BpZY-s3NIJ0k8OPz"}},"version":0.4}
1 change: 0 additions & 1 deletion hyvideo/text_encoder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,6 @@ def encode(
hidden_state_skip_layer=None,
return_texts=False,
prompt_template=None,
image_token_strategy="text_only",
image_token_selection_expr="::4",
device=None,
):
Expand Down
54 changes: 37 additions & 17 deletions nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,6 @@ def INPUT_TYPES(s):
"required": {
"llm_model": (["Kijai/llava-llama-3-8b-text-encoder-tokenizer","xtuner/llava-llama-3-8b-v1_1-transformers"],),
"clip_model": (["disabled","openai/clip-vit-large-patch14",],),
"lm_type": (["languague","vision_languague"],),
"precision": (["fp16", "fp32", "bf16"],
{"default": "bf16"}
),
Expand All @@ -586,8 +585,12 @@ def INPUT_TYPES(s):
CATEGORY = "HunyuanVideoWrapper"
DESCRIPTION = "Loads Hunyuan text_encoder model from 'ComfyUI/models/LLM'"

def loadmodel(self, llm_model, clip_model, precision, lm_type, apply_final_norm=False, hidden_state_skip_layer=2, quantization="disabled"):

def loadmodel(self, llm_model, clip_model, precision, apply_final_norm=False, hidden_state_skip_layer=2, quantization="disabled"):
lm_type_mapping = {
"Kijai/llava-llama-3-8b-text-encoder-tokenizer": "llm",
"xtuner/llava-llama-3-8b-v1_1-transformers": "vlm",
}
lm_type = lm_type_mapping[llm_model]
device = mm.get_torch_device()
offload_device = mm.unet_offload_device()
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
Expand Down Expand Up @@ -636,11 +639,6 @@ def loadmodel(self, llm_model, clip_model, precision, lm_type, apply_final_norm=
local_dir=base_path,
local_dir_use_symlinks=False,
)
LM_TYPE = {
"languague": "llm",
"vision_languague": "vlm",
}
lm_type = LM_TYPE.get(lm_type, "llm")
text_encoder = TextEncoder(
text_encoder_path=base_path,
text_encoder_type=lm_type,
Expand Down Expand Up @@ -708,16 +706,12 @@ def INPUT_TYPES(s):
return {"required": {
"text_encoders": ("HYVIDTEXTENCODER",),
"prompt": ("STRING", {"default": "", "multiline": True} ),
"image_token_selection_expr": ("STRING", {"default": "::4", "multiline": False} ),
},
"optional": {
"force_offload": ("BOOLEAN", {"default": True}),
"prompt_template": (["video", "image", "custom", "disabled"], {"default": "video", "tooltip": "Use the default prompt templates for the llm text encoder"}),
"custom_prompt_template": ("PROMPT_TEMPLATE", {"default": PROMPT_TEMPLATE["dit-llm-encode-video"], "multiline": True}),
"clip_l": ("CLIP", {"tooltip": "Use comfy clip model instead, in this case the text encoder loader's clip_l should be disabled"}),
"image1": ("IMAGE", {"default": None}),
"image2": ("IMAGE", {"default": None}),
"clip_text_override": ("STRING", {"default": "", "multiline": True} ),
"hyvid_cfg": ("HYVID_CFG", ),
}
}
Expand All @@ -727,8 +721,8 @@ def INPUT_TYPES(s):
FUNCTION = "process"
CATEGORY = "HunyuanVideoWrapper"

def process(self, text_encoders, prompt, force_offload=True, prompt_template="video", custom_prompt_template=None, clip_l=None, image_token_strategy="text_only", image_token_selection_expr="::4", hyvid_cfg=None, image1=None, image2=None, clip_text_override=None):
if len(clip_text_override) == 0:
def process(self, text_encoders, prompt, force_offload=True, prompt_template="video", custom_prompt_template=None, clip_l=None, image_token_selection_expr="::4", hyvid_cfg=None, image1=None, image2=None, clip_text_override=None):
if clip_text_override is not None and len(clip_text_override) == 0:
clip_text_override = None
device = mm.text_encoder_device()
offload_device = mm.text_encoder_offload_device()
Expand Down Expand Up @@ -766,7 +760,7 @@ def process(self, text_encoders, prompt, force_offload=True, prompt_template="vi
else:
prompt_template_dict = None

def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_strategy="text_only", image_token_selection_expr="::4", image1=None, image2=None, clip_text_override=None):
def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_selection_expr="::4", image1=None, image2=None, clip_text_override=None):
batch_size = 1
num_videos_per_prompt = 1

Expand All @@ -777,7 +771,6 @@ def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_strat
clip_text_override=clip_text_override)
prompt_outputs = text_encoder.encode(text_inputs,
prompt_template=prompt_template_dict,
image_token_strategy=image_token_strategy,
image_token_selection_expr=image_token_selection_expr,
device=device
)
Expand Down Expand Up @@ -850,7 +843,6 @@ def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_strat
prompt,
negative_prompt,
text_encoder_1,
image_token_strategy=image_token_strategy,
image_token_selection_expr=image_token_selection_expr,
image1=image1,
image2=image2)
Expand Down Expand Up @@ -902,6 +894,32 @@ def encode_prompt(self, prompt, negative_prompt, text_encoder, image_token_strat
}
return (prompt_embeds_dict,)

class HyVideoTextImageEncode(HyVideoTextEncode):
    # Experimental Image Prompt to Video (IP2V) via VLM implementation by @Dango233
    #
    # This subclass only declares its own input specification and node
    # metadata. No `process` method is defined here; FUNCTION names it, so it
    # is expected to be inherited from HyVideoTextEncode.

    # Node metadata: a single output named "hyvid_embeds" of type HYVIDEMBEDS.
    RETURN_TYPES = ("HYVIDEMBEDS", )
    RETURN_NAMES = ("hyvid_embeds",)
    FUNCTION = "process"
    CATEGORY = "HunyuanVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification for the text+image encode node.

        The result is a dict with a "required" and an "optional" section,
        each mapping an input name to its (type, options) tuple. Entry order
        is kept exactly as declared.
        """
        # Inputs that must always be supplied.
        required_inputs = {
            "text_encoders": ("HYVIDTEXTENCODER",),
            "prompt": ("STRING", {"default": "", "multiline": True} ),
            "image_token_selection_expr": ("STRING", {"default": "::4", "multiline": False} ),
        }

        # Inputs that may be left unconnected / at their defaults.
        optional_inputs = {
            "force_offload": ("BOOLEAN", {"default": True}),
            "prompt_template": (
                ["video", "image", "custom", "disabled"],
                {"default": "video", "tooltip": "Use the default prompt templates for the llm text encoder"},
            ),
            "custom_prompt_template": (
                "PROMPT_TEMPLATE",
                {"default": PROMPT_TEMPLATE["dit-llm-encode-video"], "multiline": True},
            ),
            "clip_l": (
                "CLIP",
                {"tooltip": "Use comfy clip model instead, in this case the text encoder loader's clip_l should be disabled"},
            ),
            "image1": ("IMAGE", {"default": None}),
            "image2": ("IMAGE", {"default": None}),
            "clip_text_override": ("STRING", {"default": "", "multiline": True} ),
            "hyvid_cfg": ("HYVID_CFG", ),
        }

        return {"required": required_inputs, "optional": optional_inputs}

# region CFG
class HyVideoCFG:
@classmethod
Expand Down Expand Up @@ -1365,6 +1383,7 @@ def sample(self, samples, seed, min_val, max_val, r_bias, g_bias, b_bias):
"HyVideoSampler": HyVideoSampler,
"HyVideoDecode": HyVideoDecode,
"HyVideoTextEncode": HyVideoTextEncode,
"HyVideoTextImageEncode": HyVideoTextImageEncode,
"HyVideoModelLoader": HyVideoModelLoader,
"HyVideoVAELoader": HyVideoVAELoader,
"DownloadAndLoadHyVideoTextEncoder": DownloadAndLoadHyVideoTextEncoder,
Expand All @@ -1384,6 +1403,7 @@ def sample(self, samples, seed, min_val, max_val, r_bias, g_bias, b_bias):
"HyVideoSampler": "HunyuanVideo Sampler",
"HyVideoDecode": "HunyuanVideo Decode",
"HyVideoTextEncode": "HunyuanVideo TextEncode",
"HyVideoTextImageEncode": "HunyuanVideo TextImageEncode (IP2V)",
"HyVideoModelLoader": "HunyuanVideo Model Loader",
"HyVideoVAELoader": "HunyuanVideo VAE Loader",
"DownloadAndLoadHyVideoTextEncoder": "(Down)Load HunyuanVideo TextEncoder",
Expand Down

0 comments on commit fbfaa8d

Please sign in to comment.