IP2V - Experimental implementation based on Kijai Nodes, Initial Commit
Dango233 committed Dec 15, 2024
1 parent ecae5f2 commit a3fbc3d
Showing 8 changed files with 223 additions and 24 deletions.
1 change: 1 addition & 0 deletions examples/hyvideo_ip2v_experimental_dango.json
@@ -0,0 +1 @@
{"last_node_id":67,"last_link_id":75,"nodes":[{"id":7,"type":"HyVideoVAELoader","pos":[-277,-284],"size":[379.166748046875,82],"flags":{},"order":0,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7}],"outputs":[{"name":"vae","type":"VAE","links":[6],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoVAELoader"},"widgets_values":["hyvid/hunyuan_video_vae_bf16.safetensors","bf16"]},{"id":1,"type":"HyVideoModelLoader","pos":[-285,-94],"size":[426.1773986816406,194],"flags":{},"order":1,"mode":0,"inputs":[{"name":"compile_args","type":"COMPILEARGS","link":null,"shape":7},{"name":"block_swap_args","type":"BLOCKSWAPARGS","link":null,"shape":7},{"name":"lora","type":"HYVIDLORA","link":null,"shape":7}],"outputs":[{"name":"model","type":"HYVIDEOMODEL","links":[2],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoModelLoader"},"widgets_values":["hyvideo/hunyuan_video_720_cfgdistill_bf16.safetensors","bf16","fp8_e4m3fn_fast","offload_device","sageattn_varlen"]},{"id":65,"type":"LoadImage","pos":[-540,530],"size":[315,314],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[{"name":"IMAGE","type":"IMAGE","links":[75],"slot_index":0},{"name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":5,"type":"HyVideoDecode","pos":[690,-230],"size":[345.4285888671875,150],"flags":{},"order":6,"mode":0,"inputs":[{"name":"vae","type":"VAE","link":6},{"name":"samples","type":"LATENT","link":4}],"outputs":[{"name":"images","type":"IMAGE","links":[42],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoDecode"},"widgets_values":[true,64,256,true]},{"id":3,"type":"HyVideoSampler","pos":[260,-230],"size":[315,546],"flags":{},"order":5,"mode":0,"inputs":[{"name":"model","type":"HYVIDEOMODEL","link":2},{"name":"hyvid_embeds","type":"HYVIDEMBEDS","link":67},{"name":"samples","type":"LATENT","link":null,"shape":7},{"name":"stg_args","type":"STGARGS","link":null,"shape":7}],"outputs":[{"name":"samples","type":"LATENT","links":[4],"slot_index":0}],"properties":{"Node name for S&R":"HyVideoSampler"},"widgets_values":[720,480,61,30,7.5,7.5,233,"fixed",true,1]},{"id":34,"type":"VHS_VideoCombine","pos":[660,30],"size":[580.7774658203125,697.8516235351562],"flags":{},"order":7,"mode":0,"inputs":[{"name":"images","type":"IMAGE","link":42},{"name":"audio","type":"AUDIO","link":null,"shape":7},{"name":"meta_batch","type":"VHS_BatchManager","link":null,"shape":7},{"name":"vae","type":"VAE","link":null,"shape":7}],"outputs":[{"name":"Filenames","type":"VHS_FILENAMES","links":null}],"properties":{"Node name for S&R":"VHS_VideoCombine"},"widgets_values":{"frame_rate":24,"loop_count":0,"filename_prefix":"HunyuanVideo","format":"video/h264-mp4","pix_fmt":"yuv420p","crf":20,"save_metadata":true,"pingpong":false,"save_output":true,"videopreview":{"hidden":false,"paused":false,"params":{"filename":"HunyuanVideo_00298.mp4","subfolder":"","type":"output","format":"video/h264-mp4","frame_rate":24},"muted":false}}},{"id":59,"type":"DownloadAndLoadHyVideoTextEncoder","pos":[-310,240],"size":[441,202],"flags":{},"order":3,"mode":0,"inputs":[],"outputs":[{"name":"hyvid_text_encoder","type":"HYVIDTEXTENCODER","links":[66]}],"properties":{"Node name for 
S&R":"DownloadAndLoadHyVideoTextEncoder"},"widgets_values":["xtuner/llava-llama-3-8b-v1_1-transformers","openai/clip-vit-large-patch14","vision_languague","fp16",false,2,"disabled"]},{"id":63,"type":"HyVideoTextEncode","pos":[180,500],"size":[443.4000244140625,322],"flags":{},"order":4,"mode":0,"inputs":[{"name":"text_encoders","type":"HYVIDTEXTENCODER","link":66},{"name":"custom_prompt_template","type":"PROMPT_TEMPLATE","link":null,"shape":7},{"name":"clip_l","type":"CLIP","link":null,"shape":7},{"name":"image1","type":"IMAGE","link":75,"shape":7},{"name":"image2","type":"IMAGE","link":null,"shape":7},{"name":"hyvid_cfg","type":"HYVID_CFG","link":null,"shape":7}],"outputs":[{"name":"hyvid_embeds","type":"HYVIDEMBEDS","links":[67]}],"properties":{"Node name for S&R":"HyVideoTextEncode"},"widgets_values":["Astonishing promotion video of a toy movie, high quality video 4k A fluffy plushie stuffed animal of <image>, furry fox ears, dancing on grass land with blue sky. cinematic realistic rendering ","::3",true,"video","A dancing plushie, 4K, 8K, super detailed cinematic shot"]}],"links":[[2,1,0,3,0,"HYVIDEOMODEL"],[4,3,0,5,1,"LATENT"],[6,7,0,5,0,"VAE"],[42,5,0,34,0,"IMAGE"],[66,59,0,63,0,"HYVIDTEXTENCODER"],[67,63,0,3,1,"HYVIDEMBEDS"],[75,65,0,63,3,"IMAGE"]],"groups":[],"config":{},"extra":{"ds":{"scale":0.8140274938684717,"offset":[1434.7893740456389,367.5882823671863]},"workspace_info":{"id":"kZ4q7BpZY-s3NIJ0k8OPz"}},"version":0.4}
Binary file added examples/ip2v/example_input.png
Binary file added examples/ip2v/example_output.mp4
Binary file added examples/ip2v/example_output_with_workflow.png
87 changes: 76 additions & 11 deletions hyvideo/text_encoder/__init__.py
@@ -4,12 +4,13 @@

import torch
import torch.nn as nn
from transformers import CLIPTextModel, CLIPTokenizer, AutoTokenizer, AutoModel
from transformers import CLIPTextModel, CLIPTokenizer, AutoTokenizer, AutoModel, LlavaForConditionalGeneration, AutoProcessor
from transformers.utils import ModelOutput

from ..constants import TEXT_ENCODER_PATH, TOKENIZER_PATH
from ..constants import PRECISION_TO_TYPE

from ..utils.token_helper import find_subsequence, multi_slice_to_mask
from PIL import Image

def use_default(value, default):
return value if value is not None else default
@@ -41,6 +42,12 @@ def load_text_encoder(
quantization_config=quantization_config
)
text_encoder.final_layer_norm = text_encoder.norm
elif text_encoder_type == "vlm":
text_encoder = LlavaForConditionalGeneration.from_pretrained(
text_encoder_path,
low_cpu_mem_usage=True,
quantization_config=quantization_config
)
else:
raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
# from_pretrained will ensure that the model is in eval mode.
@@ -69,7 +76,7 @@ def load_tokenizer(

if tokenizer_type == "clipL":
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path, max_length=77)
elif tokenizer_type == "llm":
elif tokenizer_type == "llm" or tokenizer_type == "vlm":
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_path, padding_side=padding_side
)
@@ -149,8 +156,9 @@ def __init__(
self.output_key = output_key or "last_hidden_state"
elif "clip" in text_encoder_type:
self.output_key = output_key or "pooler_output"
elif "llm" in text_encoder_type or "glm" in text_encoder_type:
elif "llm" in text_encoder_type or "glm" in text_encoder_type or "vlm" in text_encoder_type:
self.output_key = output_key or "last_hidden_state"
self.processor = AutoProcessor.from_pretrained(text_encoder_path, device=device)
else:
raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")

@@ -193,15 +201,17 @@ def apply_text_to_template(text, template, prevent_empty_text=True):
else:
raise TypeError(f"Unsupported template type: {type(template)}")

def text2tokens(self, text, prompt_template):
def text2tokens(self, text, prompt_template, image1=None, image2=None, clip_text_override=None):
"""
Tokenize the input text.
Args:
text (str or list): Input text.
"""
if self.text_encoder_type != "vlm" and image1 is not None:
raise ValueError("Only vision_languague models support image input")
tokenize_input_type = "str"
if prompt_template is not None and self.text_encoder_type == "llm":
if prompt_template is not None and self.text_encoder_type == "llm" or self.text_encoder_type == "vlm":
if isinstance(text, (list, tuple)):
text = [
self.apply_text_to_template(one_text, prompt_template["template"])
@@ -215,21 +225,35 @@ def text2tokens(self, text, prompt_template):
tokenize_input_type = "list"
else:
raise TypeError(f"Unsupported text type: {type(text)}")
elif clip_text_override is not None and self.text_encoder_type == "clipL":
text = clip_text_override

kwargs = dict(
truncation=True,
max_length=self.max_length,
padding="max_length",
padding="max_length" if self.text_encoder_type != "vlm" else "do_not_pad",
return_tensors="pt",
)
if tokenize_input_type == "str":
return self.tokenizer(
text_tokens = self.tokenizer(
text,
return_length=False,
return_overflowing_tokens=False,
return_attention_mask=True,
**kwargs,
)
if self.text_encoder_type == "vlm":
raw_images = []
if image1 is not None:
raw_images.append(image1.squeeze(0)*255)
if image2 is not None:
raw_images.append(image2.squeeze(0)*255)
text_tokens = self.processor(
raw_images,
text,
**kwargs,
).to(0, torch.float16)
return text_tokens
elif tokenize_input_type == "list":
return self.tokenizer.apply_chat_template(
text,
@@ -250,6 +274,8 @@ def encode(
hidden_state_skip_layer=None,
return_texts=False,
prompt_template=None,
image_token_strategy="text_only",
image_token_selection_expr="::4",
device=None,
):
"""
@@ -275,12 +301,14 @@
attention_mask = (
batch_encoding["attention_mask"].to(device) if use_attention_mask else None
)
for k,v in batch_encoding.items():
batch_encoding[k] = v.to(device) if isinstance(v, torch.Tensor) else v
outputs = self.model(
input_ids=batch_encoding["input_ids"].to(device),
attention_mask=attention_mask,
**batch_encoding,
output_hidden_states=output_hidden_states
or hidden_state_skip_layer is not None,
)

if hidden_state_skip_layer is not None:
last_hidden_state = outputs.hidden_states[-(hidden_state_skip_layer + 1)]
# Real last hidden state already has layer norm applied. So here we only apply it
@@ -293,12 +321,49 @@
# Remove hidden states of instruction tokens, only keep prompt tokens.
if prompt_template is not None and self.text_encoder_type == "llm":
crop_start = prompt_template.get("crop_start", -1)

if crop_start > 0:
last_hidden_state = last_hidden_state[:, crop_start:]
attention_mask = (
attention_mask[:, crop_start:] if use_attention_mask else None
)
elif prompt_template is not None and self.text_encoder_type == "vlm":
# Temporary implementation for a one-round chat template: strips the system prompt and chat header
user_start_tokens = self.tokenizer(
text="<|start_header_id|>user<|end_header_id|>",
add_special_tokens=False,
return_tensors="pt"
)
image_token = self.tokenizer(
text="<image>",
add_special_tokens=False,
return_tensors="pt"
)
image_token = image_token["input_ids"].to(device)
user_start_tokens["input_ids"] = user_start_tokens["input_ids"].to(device)
tk_idx, tk_n, tk_len = find_subsequence(batch_encoding["input_ids"], user_start_tokens["input_ids"])
if tk_n != 1:
raise ValueError("Template seems not in the required format, do you have <|start_header_id|>user<|end_header_id|> in place, and only one round of user input?")
user_tokens = batch_encoding["input_ids"][:,tk_idx[0]+tk_len:]
img_idx, img_n, _ = find_subsequence(user_tokens, image_token)
img_seq_len = outputs["image_hidden_states"].shape[1]
last_hidden_state = last_hidden_state[:, tk_idx[0]+tk_len:]
# Build a boolean mask that keeps all text positions and only the image
# positions selected by image_token_selection_expr.
seq_mask = torch.ones_like(last_hidden_state, device=device, dtype=torch.bool)
img_mask = torch.zeros_like(outputs["image_hidden_states"][0:1], device=device, dtype=torch.bool)
img_mask[:, multi_slice_to_mask(image_token_selection_expr, img_mask.shape[1])] = True

drift = 0
for i in img_idx:
i = i + drift
seq_mask[:, i:i+img_seq_len, :] = img_mask
drift += img_seq_len

last_hidden_state = last_hidden_state[seq_mask].view(1, -1, outputs["image_hidden_states"].shape[-1])

attention_mask = torch.ones(last_hidden_state.shape[0], last_hidden_state.shape[1], device=device, dtype=torch.int64)

elif prompt_template is None and self.text_encoder_type == "vlm":
raise ValueError("Vlm encoders must use compatiable chat template.")

if output_hidden_states:
return TextEncoderModelOutput(
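
For context, the vlm path above feeds an image plus a prompt containing the <image> placeholder through a LLaVA-style processor and model, then reads the image hidden states back out of the forward output. Below is a minimal stand-alone sketch of that round trip, not part of this commit: the model id and input image come from the example workflow, while the keyword-style processor call and the dtype/device handling are simplifying assumptions.

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

# One-round llama-3 style user turn, matching the template check in encode().
prompt = (
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "A fluffy plushie stuffed animal of <image>, dancing on grass.<|eot_id|>"
)
image = Image.open("examples/ip2v/example_input.png")

inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device, torch.float16)
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# One hidden vector per image patch token; encode() subsets these with
# image_token_selection_expr before splicing them back into the text sequence.
print(outputs.image_hidden_states.shape)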
56 changes: 56 additions & 0 deletions hyvideo/utils/token_helper.py
@@ -0,0 +1,56 @@
import ast

import torch

def find_subsequence(sequence, sub_sequence):
# Find every start index of sub_sequence inside sequence (both shaped [1, N]).
# Returns (start_indices, number_of_matches, length_of_sub_sequence).
assert sequence.shape[0] == 1
sequence = sequence[0]
sub_sequence = sub_sequence[0]

sub_len = len(sub_sequence)
windows = sequence.unfold(0, sub_len, 1)
matches = (windows == sub_sequence).all(dim=1)
indices = matches.nonzero().flatten().tolist()

return indices, len(indices), sub_len


def multi_slice_to_mask(expr, length):
def process_single_slice(s):
s = s.replace(':', ',').replace(' ', '')
while ',,' in s:
s = s.replace(',,', ',None,')
if s.startswith(','):
s = 'None' + s
if s.endswith(','):
s = s + 'None'
return s

try:
slices = expr.split(',')
mask = torch.zeros(length, dtype=torch.bool)
if expr == "":
return mask
i = 0
while i < len(slices):
if ':' in slices[i]:
slice_expr = process_single_slice(slices[i])
slice_args = ast.literal_eval(f"({slice_expr})")
s = slice(*slice_args)
mask[s] = True
i += 1
else:
idx = ast.literal_eval(slices[i])
if idx < 0:
idx = length + idx
if 0 <= idx < length:
mask[idx] = True
i += 1

return mask
except Exception as e:
raise ValueError(f"Invalid slice expression: {e}")
