IP2V - Experimental implementation based on Kijai Nodes, Initial Commit
Dango233 committed Dec 15, 2024
1 parent ecae5f2 commit a3fbc3d
Showing 8 changed files with 223 additions and 24 deletions.
1 change: 1 addition & 0 deletions examples/hyvideo_ip2v_experimental_dango.json
@@ -0,0 +1 @@
{"last_node_id":67,"last_link_id":75,"nodes":[{"id":7,"type":"HyVideoVAELoader","pos":[-277,-284],"size":[379.166748046875,82],"flags":{},"order":0,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7}],"outputs":[{"name":"vae","type":"VAE","links":[6],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoVAELoader"},"widgets_values":["hyvid/hunyuan_video_vae_bf16.safetensors","bf16"]},{"id":1,"type":"HyVideoModelLoader","pos":[-285,-94],"size":[426.1773986816406,194],"flags":{},"order":1,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7},{"name":"block_swap_args","type":"BLOCKSWAPARGS","link":null,"shape":7},{"name":"lora","type":"HYVIDLORA","link":null,"shape":7}],"outputs":[{"name":"model","type":"HYVIDEOMODEL","links":[2],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoModelLoader"},"widgets_values":["hyvideo/hunyuan_video_720_cfgdistill_bf16.safetensors","bf16","fp8_e4m3fn_fast","offload_device","sageattn_varlen"]},{"id":65,"type":"LoadImage","pos":[-540,530],"size":[315,314],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[{"name":"IMAGE","type":"IMAGE","links":[75],"slot_index":0},{"name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":5,"type":"HyVideoDecode","pos":[690,-230],"size":[345.4285888671875,150],"flags":{},"order":6,"mode":0,"inputs":[{"name":"vae","type":"VAE","link":6},{"name":"samples","type":"LATENT","link":4}],"outputs":[{"name":"images","type":"IMAGE","links":[42],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoDecode"},"widgets_values":[true,64,256,true]},{"id":3,"type":"HyVideoSampler","pos":[260,-230],"size":[315,546],"flags":{},"order":5,"mode":0,"inputs":[{"name":"model","type":"HYVIDEOMODEL","link":2},{"name":"hyvid_embeds","type":"HYVIDEMBEDS","link":67},{"name":"samples","type":"LATENT","link":null,"shape":7},{"name":"stg_args","type":"STGARGS","link":null,"shape":7}],"outputs":[{"name":"samples","type":"LATENT","links":[4],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoSampler"},"widgets_values":[720,480,61,30,7.5,7.5,233,"fixed",true,1]},{"id":34,"type":"VHS_VideoCombine","pos":[660,30],"size":[580.7774658203125,697.8516235351562],"flags":{},"order":7,"mode":0,"inputs":[{"name":"images","type":"IMAGE","link":42},{"name":"audio","type":"AUDIO","link":null,"shape":7},{"name":"meta_batch","type":"VHS_BatchManager","link":null,"shape":7},{"name":"vae","type":"VAE","link":null,"shape":7}],"outputs":[{"name":"Filenames","type":"VHS_FILENAMES","links":null}],"properties":{"Node name for S&R":"VHS_VideoCombine"},"widgets_values":{"frame_rate":24,"loop_count":0,"filename_prefix":"HunyuanVideo","format":"video/h264-mp4","pix_fmt":"yuv420p","crf":20,"save_metadata":true,"pingpong":false,"save_output":true,"videopreview":{"hidden":false,"paused":false,"params":{"filename":"HunyuanVideo_00298.mp4","subfolder":"","type":"output","format":"video/h264-mp4","frame_rate":24},"muted":false}}},{"id":59,"type":"DownloadAndLoadHyVideoTextEncoder","pos":[-310,240],"size":[441,202],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[{"name":"hyvid_text_encoder","type":"HYVIDTEXTENCODER","links":[66]}],"properties":{"Node name for 
S&R":"DownloadAndLoadHyVideoTextEncoder"},"widgets_values":["xtuner/llava-llama-3-8b-v1_1-transformers","openai/clip-vit-large-patch14","vision_languague","fp16",false,2,"disabled"]},{"id":63,"type":"HyVideoTextEncode","pos":[180,500],"size":[443.4000244140625,322],"flags":{},"order":4,"mode":0,"inputs":[{"name":"text_encoders","type":"HYVIDTEXTENCODER","link":66},{"name":"custom_prompt_template","type":"PROMPT_TEMPLATE","link":null,"shape":7},{"name":"clip_l","type":"CLIP","link":null,"shape":7},{"name":"image1","type":"IMAGE","link":75,"shape":7},{"name":"image2","type":"IMAGE","link":null,"shape":7},{"name":"hyvid_cfg","type":"HYVID_CFG","link":null,"shape":7}],"outputs":[{"name":"hyvid_embeds","type":"HYVIDEMBEDS","links":[67]}],"properties":{"Node name for S&R":"HyVideoTextEncode"},"widgets_values":["Astonishing promotion video of a toy movie, high quality video 4k A fluffy plushie stuffed animal of <image>, furry fox ears, dancing on grass land with blue sky. cinematic realistic rendering ","::3",true,"video","A dancing plushie, 4K, 8K, super detailed cinematic shot"]}],"links":[[2,1,0,3,0,"HYVIDEOMODEL"],[4,3,0,5,1,"LATENT"],[6,7,0,5,0,"VAE"],[42,5,0,34,0,"IMAGE"],[66,59,0,63,0,"HYVIDTEXTENCODER"],[67,63,0,3,1,"HYVIDEMBEDS"],[75,65,0,63,3,"IMAGE"]],"groups":[],"config":{},"extra":{"ds":{"scale":0.8140274938684717,"offset":[1434.7893740456389,367.5882823671863]},"workspace_info":{"id":"kZ4q7BpZY-s3NIJ0k8OPz"}},"version":0.4}
Binary file added examples/ip2v/example_input.png
Binary file added examples/ip2v/example_output.mp4
Binary file added examples/ip2v/example_output_with_workflow.png
87 changes: 76 additions & 11 deletions hyvideo/text_encoder/__init__.py
@@ -4,12 +4,13 @@

import torch
import torch.nn as nn
from transformers import CLIPTextModel, CLIPTokenizer, AutoTokenizer, AutoModel
from transformers import CLIPTextModel, CLIPTokenizer, AutoTokenizer, AutoModel, LlavaForConditionalGeneration, AutoProcessor
from transformers.utils import ModelOutput

from ..constants import TEXT_ENCODER_PATH, TOKENIZER_PATH
from ..constants import PRECISION_TO_TYPE

from ..utils.token_helper import find_subsequence, multi_slice_to_mask
from PIL import Image

def use_default(value, default):
return value if value is not None else default
@@ -41,6 +42,12 @@ def load_text_encoder(
quantization_config=quantization_config
)
text_encoder.final_layer_norm = text_encoder.norm
elif text_encoder_type == "vlm":
text_encoder = LlavaForConditionalGeneration.from_pretrained(
text_encoder_path,
low_cpu_mem_usage=True,
quantization_config=quantization_config
)
else:
raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
# from_pretrained will ensure that the model is in eval mode.
@@ -69,7 +76,7 @@ def load_tokenizer(

if tokenizer_type == "clipL":
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path, max_length=77)
elif tokenizer_type == "llm":
elif tokenizer_type == "llm" or tokenizer_type == "vlm":
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_path, padding_side=padding_side
)
@@ -149,8 +156,9 @@ def __init__(
self.output_key = output_key or "last_hidden_state"
elif "clip" in text_encoder_type:
self.output_key = output_key or "pooler_output"
elif "llm" in text_encoder_type or "glm" in text_encoder_type:
elif "llm" in text_encoder_type or "glm" in text_encoder_type or "vlm" in text_encoder_type:
self.output_key = output_key or "last_hidden_state"
self.processor = AutoProcessor.from_pretrained(text_encoder_path, device=device)
else:
raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")

@@ -193,15 +201,17 @@ def apply_text_to_template(text, template, prevent_empty_text=True):
else:
raise TypeError(f"Unsupported template type: {type(template)}")

def text2tokens(self, text, prompt_template):
def text2tokens(self, text, prompt_template, image1=None, image2=None, clip_text_override=None):
"""
Tokenize the input text.
Args:
text (str or list): Input text.
"""
if self.text_encoder_type != "vlm" and image1 is not None:
raise ValueError("Only vision_languague models support image input")
tokenize_input_type = "str"
if prompt_template is not None and self.text_encoder_type == "llm":
if prompt_template is not None and self.text_encoder_type == "llm" or self.text_encoder_type == "vlm":
if isinstance(text, (list, tuple)):
text = [
self.apply_text_to_template(one_text, prompt_template["template"])
@@ -215,21 +225,35 @@ def text2tokens(self, text, prompt_template):
tokenize_input_type = "list"
else:
raise TypeError(f"Unsupported text type: {type(text)}")
elif clip_text_override is not None and self.text_encoder_type == "clipL":
text = clip_text_override

kwargs = dict(
truncation=True,
max_length=self.max_length,
padding="max_length",
padding="max_length" if self.text_encoder_type != "vlm" else "do_not_pad",
return_tensors="pt",
)
if tokenize_input_type == "str":
return self.tokenizer(
text_tokens = self.tokenizer(
text,
return_length=False,
return_overflowing_tokens=False,
return_attention_mask=True,
**kwargs,
)
if self.text_encoder_type == "vlm":
raw_images = []
if image1 is not None:
raw_images.append(image1.squeeze(0)*255)
if image2 is not None:
raw_images.append(image2.squeeze(0)*255)
text_tokens = self.processor(
raw_images,
text,
**kwargs,
).to(0, torch.float16)
return text_tokens
elif tokenize_input_type == "list":
return self.tokenizer.apply_chat_template(
text,
@@ -250,6 +274,8 @@ def encode(
hidden_state_skip_layer=None,
return_texts=False,
prompt_template=None,
image_token_strategy="text_only",
image_token_selection_expr="::4",
device=None,
):
"""
@@ -275,12 +301,14 @@
attention_mask = (
batch_encoding["attention_mask"].to(device) if use_attention_mask else None
)
for k,v in batch_encoding.items():
batch_encoding[k] = v.to(device) if isinstance(v, torch.Tensor) else v
outputs = self.model(
input_ids=batch_encoding["input_ids"].to(device),
attention_mask=attention_mask,
**batch_encoding,
output_hidden_states=output_hidden_states
or hidden_state_skip_layer is not None,
)

if hidden_state_skip_layer is not None:
last_hidden_state = outputs.hidden_states[-(hidden_state_skip_layer + 1)]
# Real last hidden state already has layer norm applied. So here we only apply it
@@ -293,12 +321,49 @@
# Remove hidden states of instruction tokens, only keep prompt tokens.
if prompt_template is not None and self.text_encoder_type == "llm":
crop_start = prompt_template.get("crop_start", -1)

if crop_start > 0:
last_hidden_state = last_hidden_state[:, crop_start:]
attention_mask = (
attention_mask[:, crop_start:] if use_attention_mask else None
)
elif prompt_template is not None and self.text_encoder_type == "vlm":
# Temporary implementation for a one-round chat template: strips the system prompt and chat header
user_start_tokens = self.tokenizer(
text="<|start_header_id|>user<|end_header_id|>",
add_special_tokens=False,
return_tensors="pt"
)
image_token = self.tokenizer(
text="<image>",
add_special_tokens=False,
return_tensors="pt"
)
image_token = image_token["input_ids"].to(device)
user_start_tokens["input_ids"] = user_start_tokens["input_ids"].to(device)
tk_idx, tk_n, tk_len = find_subsequence(batch_encoding["input_ids"], user_start_tokens["input_ids"])
if tk_n != 1:
raise ValueError("Template seems not in the required format, do you have <|start_header_id|>user<|end_header_id|> in place, and only one round of user input?")
user_tokens = batch_encoding["input_ids"][:,tk_idx[0]+tk_len:]
img_idx, img_n, _ = find_subsequence(user_tokens, image_token)
img_seq_len = outputs["image_hidden_states"].shape[1]
last_hidden_state = last_hidden_state[:, tk_idx[0]+tk_len:]
# Build a boolean mask that keeps all text positions and only the image
# positions selected by image_token_selection_expr.
seq_mask = torch.ones_like(last_hidden_state, device=device, dtype=torch.bool)
img_mask = torch.zeros_like(outputs["image_hidden_states"][0:1], device=device, dtype=torch.bool)
img_mask[:, multi_slice_to_mask(image_token_selection_expr, img_mask.shape[1])] = True

drift = 0
for i in img_idx:
i = i + drift
seq_mask[:, i:i+img_seq_len, :] = img_mask
drift += img_seq_len

last_hidden_state = last_hidden_state[seq_mask].view(1, -1, outputs["image_hidden_states"].shape[-1])

attention_mask = torch.ones(last_hidden_state.shape[0], last_hidden_state.shape[1], device=device, dtype=torch.int64)

elif prompt_template is None and self.text_encoder_type == "vlm":
raise ValueError("Vlm encoders must use compatiable chat template.")

if output_hidden_states:
return TextEncoderModelOutput(
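
For context, the vlm path above feeds an image plus a prompt containing the <image> placeholder through a LLaVA-style processor and model, then reads the image hidden states back out of the forward output. Below is a minimal stand-alone sketch of that round trip, not part of this commit: the model id and input image come from the example workflow, while the keyword-style processor call and the dtype/device handling are simplifying assumptions.

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

# One-round llama-3 style user turn, matching the template check in encode().
prompt = (
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "A fluffy plushie stuffed animal of <image>, dancing on grass.<|eot_id|>"
)
image = Image.open("examples/ip2v/example_input.png")

inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device, torch.float16)
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# One hidden vector per image patch token; encode() subsets these with
# image_token_selection_expr before splicing them back into the text sequence.
print(outputs.image_hidden_states.shape)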
56 changes: 56 additions & 0 deletions hyvideo/utils/token_helper.py
@@ -0,0 +1,56 @@
import ast

import torch

def find_subsequence(sequence, sub_sequence):
# Find every start index of sub_sequence inside sequence (both shaped [1, N]).
# Returns (start_indices, number_of_matches, length_of_sub_sequence).
assert sequence.shape[0] == 1
sequence = sequence[0]
sub_sequence = sub_sequence[0]

sub_len = len(sub_sequence)
windows = sequence.unfold(0, sub_len, 1)
matches = (windows == sub_sequence).all(dim=1)
indices = matches.nonzero().flatten().tolist()

return indices, len(indices), sub_len


def multi_slice_to_mask(expr, length):
def process_single_slice(s):
s = s.replace(':', ',').replace(' ', '')
while ',,' in s:
s = s.replace(',,', ',None,')
if s.startswith(','):
s = 'None' + s
if s.endswith(','):
s = s + 'None'
return s

try:
slices = expr.split(',')
mask = torch.zeros(length, dtype=torch.bool)
if expr == "":
return mask
i = 0
while i < len(slices):
if ':' in slices[i]:
slice_expr = process_single_slice(slices[i])
slice_args = ast.literal_eval(f"({slice_expr})")
s = slice(*slice_args)
mask[s] = True
i += 1
else:
idx = ast.literal_eval(slices[i])
if idx < 0:
idx = length + idx
if 0 <= idx < length:
mask[idx] = True
i += 1

return mask
except Exception as e:
raise ValueError(f"Invalid slice expression: {e}")
