@@ -1,4 +1,5 @@
 import os
+import warnings
 from io import BytesIO
 from typing import Any, Optional
 
@@ -7,17 +8,20 @@
 from lita.constants import (
     DEFAULT_IM_END_TOKEN,
     DEFAULT_IM_START_TOKEN,
+    DEFAULT_IMAGE_PATCH_TOKEN,
     DEFAULT_IMAGE_TOKEN,
     IMAGE_TOKEN_INDEX,
+    TIME_TOKEN_TEMPLATE,
 )
-from lita.model.builder import load_pretrained_model
+from lita.model.language_model.lita_llama import LitaLlamaForCausalLM
 from lita.utils import load_video
 from llava.conversation import SeparatorStyle, conv_templates
 from llava.mm_utils import (
     KeywordsStoppingCriteria,
     get_model_name_from_path,
     tokenizer_image_token,
 )
+from transformers import AutoConfig, AutoTokenizer, BitsAndBytesConfig
 
 from DPF.types import ModalityToDataMapping
 
@@ -29,6 +33,98 @@
 from torch.utils.data import default_collate
 
 
+def load_pretrained_model(model_path: str,
+                          model_base: str,
+                          model_name: str,
+                          load_8bit: bool = False,
+                          load_4bit: bool = False,
+                          device_map: str = "auto",
+                          device: str = "cuda"):
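+    # Build the kwargs passed to from_pretrained: device placement first,
+    # then quantization / dtype.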
+    kwargs = {"device_map": device_map}
+
+    if device != "cuda":
+        kwargs['device_map'] = {"": device}  # type: ignore
+
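+    # 8-bit and 4-bit loading are mutually exclusive; otherwise fall back to fp16.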
+    if load_8bit:
+        kwargs['load_in_8bit'] = True  # type: ignore
+    elif load_4bit:
+        kwargs['load_in_4bit'] = True  # type: ignore
+        kwargs['quantization_config'] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type='nf4'
+        )
+    else:
+        kwargs['torch_dtype'] = torch.float16  # type: ignore
+
+    if 'lita' not in model_name.lower():
+        warnings.warn("this function is for loading LITA models", stacklevel=2)
+    if 'lora' in model_name.lower():
+        warnings.warn("lora is currently not supported for LITA", stacklevel=2)
+    if 'mpt' in model_name.lower():
+        warnings.warn("mpt is currently not supported for LITA", stacklevel=2)
+
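+    # Two loading modes: with model_base, take the LLM weights from the base
+    # checkpoint and the multimodal projector from model_path; otherwise load
+    # everything from model_path.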
+    if model_base is not None:
+        print('Loading LITA from base model...')
+        tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+        cfg_pretrained = AutoConfig.from_pretrained(model_path)
+        model = LitaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
+
+        mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
+        mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items() if 'mm_projector' in k}
+        model.load_state_dict(mm_projector_weights, strict=False)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        model = LitaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+
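+    # Register the LLaVA-style special image tokens the checkpoint was trained with.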
+    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", False)
+    if mm_use_im_patch_token:
+        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+    if mm_use_im_start_end:
+        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+    model.resize_token_embeddings(len(tokenizer))
+
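+    # Ensure the vision tower weights are loaded and moved to the target device.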
+    vision_tower = model.get_vision_tower()
+    if not vision_tower.is_loaded:
+        vision_tower.load_model()
+    vision_tower.to(device=device, dtype=torch.float16)
+    image_processor = vision_tower.image_processor
+
+    # time tokens and embeddings
+    num_time_tokens = getattr(model.config, "num_time_tokens", 0)
+    if num_time_tokens > 0:
+        time_tokens = [TIME_TOKEN_TEMPLATE.format(t=x) for x in range(num_time_tokens)]
+        num_new_tokens = tokenizer.add_tokens(time_tokens)
+
+        if model_base is None:
+            assert num_new_tokens == 0, "time tokens should already be in the tokenizer for full finetune model"
+
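+        # Base-model case: the time tokens are new to the tokenizer, so copy
+        # their embeddings from the weights stored next to the projector.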
+        if num_new_tokens > 0:
+            warnings.warn("looking for weights in mm_projector.bin", stacklevel=2)
+            assert num_new_tokens == num_time_tokens
+            model.resize_token_embeddings(len(tokenizer))
+            input_embeddings = model.get_input_embeddings().weight.data
+            output_embeddings = model.get_output_embeddings().weight.data
+            weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
+            assert 'model.embed_tokens.weight' in weights and 'lm_head.weight' in weights
+
+            dtype = input_embeddings.dtype
+            device = input_embeddings.device
+
+            tokenizer_time_token_ids = tokenizer.convert_tokens_to_ids(time_tokens)
+            time_token_ids = getattr(model.config, 'time_token_ids', tokenizer_time_token_ids)
+            input_embeddings[tokenizer_time_token_ids] = weights['model.embed_tokens.weight'][time_token_ids].to(dtype).to(device)
+            output_embeddings[tokenizer_time_token_ids] = weights['lm_head.weight'][time_token_ids].to(dtype).to(device)
+
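+    # Fall back to a 2048-token context when the config does not specify one.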
+    if hasattr(model.config, "max_sequence_length"):
+        context_len = model.config.max_sequence_length
+    else:
+        context_len = 2048
+    return tokenizer, model, image_processor, context_len
+
+
 def disable_torch_init() -> None:
     """
     Disable the redundant torch default initialization to accelerate model creation.
@@ -79,7 +175,7 @@ def __init__( |
 
         disable_torch_init()
 
-        pretrainers = load_pretrained_model(weights_path, model_base, self.model_name, load_8bit, load_4bit)
+        pretrainers = load_pretrained_model(weights_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device)  # type: ignore
         self.tokenizer, self.model, self.processor, self.context_len = pretrainers
 
         self.model_num_frames = self.model.config.num_frames
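For reference, a minimal sketch of how the new loader might be called outside the wrapper class. The checkpoint path is hypothetical, and passing `model_base=None` assumes a fully finetuned LITA checkpoint rather than a base-plus-projector split:

```python
from llava.mm_utils import get_model_name_from_path

weights_path = "/path/to/lita-checkpoint"  # hypothetical checkpoint directory
model_name = get_model_name_from_path(weights_path)

# With model_base=None the tokenizer, LLM, projector, and time-token
# embeddings are all expected to live in weights_path.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    weights_path, None, model_name, load_4bit=True, device="cuda"
)
```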