Fix post encode #2643

Merged: 10 commits, Dec 12, 2024
2 changes: 1 addition & 1 deletion docs/source/Instruction/命令行参数.md
@@ -126,7 +126,7 @@
- 🔥freeze_llm: Freeze the LLM. Default is False. Applicable to full-parameter training and LoRA.
- 🔥freeze_vit: Freeze the ViT. Default is True. Applicable to full-parameter training and LoRA.
- 🔥freeze_aligner: Freeze the aligner. Default is True. Applicable to full-parameter training and LoRA.
- 🔥target_modules: Specify the LoRA modules; default is `all-linear`, which automatically finds linear layers except lm_head and attaches the tuner. This parameter is not limited to LoRA.
- 🔥target_modules: Specify the LoRA modules. Default is `all-linear`. The behavior differs between LLMs and multimodal LLMs: for an LLM, it automatically finds all linear layers except lm_head and attaches the tuner; for a multimodal LLM, the tuner is attached only to the LLM part by default, and this behavior can be controlled with `freeze_llm`, `freeze_vit`, and `freeze_aligner`. This parameter is not limited to LoRA.
- 🔥target_regex: Specify a regex expression for the LoRA modules. Default is `None`; if a value is passed, target_modules does not take effect. This parameter is not limited to LoRA.
- 🔥init_weights: The method used to initialize the weights. LoRA accepts `true`, `false`, `guassian`, `pissa`, `pissa_niter_[number of iters]`; Bone accepts `true`, `false`, `bat`. Default is `true`.
- modules_to_save: Modules of the original model that participate in training and are saved after the tuner has been attached. Default is `[]`. This parameter is not limited to LoRA.
2 changes: 1 addition & 1 deletion docs/source_en/Instruction/Command-line-parameters.md
@@ -127,7 +127,7 @@ Other important parameters:
- 🔥freeze_llm: Freeze the LLM. Default is False. Applicable to full-parameter training and LoRA.
- 🔥freeze_vit: Freeze the ViT. Default is True. Applicable to full-parameter training and LoRA.
- 🔥freeze_aligner: Freeze the aligner. Default is True. Applicable to full-parameter training and LoRA.
- 🔥target_modules: Specify the LoRA modules; default is `all-linear`, which automatically finds linear layers except lm_head and attaches the tuner. This parameter is not limited to LoRA.
- 🔥target_modules: Specify the LoRA modules. Default is `all-linear`. The behavior differs between LLMs and multimodal LLMs: for an LLM, it automatically finds all linear layers except lm_head and attaches the tuner; for a multimodal LLM, the tuner is attached only to the LLM part by default, and this behavior can be controlled with `freeze_llm`, `freeze_vit`, and `freeze_aligner` (see the sketch after this list). This parameter is not limited to LoRA.
- 🔥target_regex: Specify a regex expression for the LoRA modules. Default is `None`; if a value is provided, target_modules does not take effect. This parameter is not limited to LoRA.
- 🔥init_weights: The method used to initialize the tuner weights. For LoRA the accepted values are `true`, `false`, `guassian`, `pissa`, `pissa_niter_[number of iters]`; for Bone they are `true`, `false`, `bat`. Default is `true`.
- modules_to_save: Modules of the original model that participate in training and are saved after the tuner has been attached. Default is `[]`. This parameter is not limited to LoRA.
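For intuition, here is a rough, hypothetical sketch (not swift's actual implementation) of what `all-linear` implies for a multimodal model under the defaults `freeze_llm=False`, `freeze_vit=True`, `freeze_aligner=True`: only `nn.Linear` layers inside the language model, excluding `lm_head`, would receive a tuner. The `llm_prefix` name below is an assumption for illustration only.

```python
# Hypothetical sketch only: approximates the documented `all-linear` selection
# for a multimodal model with freeze_vit=True and freeze_aligner=True.
import torch.nn as nn

def collect_all_linear(model: nn.Module, llm_prefix: str = 'language_model.') -> list:
    """Names of Linear layers that would get a tuner under the defaults (illustrative)."""
    targets = []
    for name, module in model.named_modules():
        if not isinstance(module, nn.Linear):
            continue
        if 'lm_head' in name:                 # lm_head is always excluded
            continue
        if not name.startswith(llm_prefix):   # ViT and aligner stay frozen by default
            continue
        targets.append(name)
    return targets
```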
3 changes: 2 additions & 1 deletion swift/llm/argument/train_args.py
@@ -147,7 +147,8 @@ def __post_init__(self) -> None:
if self.lazy_tokenize is None:
self.lazy_tokenize = self.model_meta.is_multimodal and not self.streaming
logger.info(f'Setting args.lazy_tokenize: {self.lazy_tokenize}')
self.accelerator_config = {'dispatch_batches': False}
if getattr(self, 'accelerator_config', None) is None:
self.accelerator_config = {'dispatch_batches': False}
self.training_args = TrainerFactory.get_training_args(self)

self._add_version()
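The hunk above changes `__post_init__` so that the `dispatch_batches` default is applied only when the user has not set `accelerator_config` themselves. A minimal, self-contained sketch of the guard, using a hypothetical stand-in class rather than the real `TrainArguments`:

```python
# Minimal sketch of the guard: a user-supplied accelerator_config survives;
# the default is applied only when nothing was provided.
from dataclasses import dataclass
from typing import Dict, Optional

@dataclass
class Args:  # hypothetical stand-in for TrainArguments
    accelerator_config: Optional[Dict] = None

args = Args(accelerator_config={'dispatch_batches': True})
if getattr(args, 'accelerator_config', None) is None:
    args.accelerator_config = {'dispatch_batches': False}
print(args.accelerator_config)  # {'dispatch_batches': True} -- no longer overwritten
```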
2 changes: 2 additions & 0 deletions swift/llm/dataset/preprocessor/core.py
@@ -125,6 +125,8 @@ def rows_to_batched(rows: List[Dict[str, Any]]):
if k not in batched:
batched[k] = [None] * i
batched[k].append(v)
# Make all the lengths of v the same.
batched = {k: v + [None] * (len(rows) - len(v)) for k, v in batched.items()}
return batched

@staticmethod
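To illustrate the fix: when rows have different key sets, a key missing from the trailing rows previously left that column shorter than the others. A self-contained reconstruction of the method under that assumption (the surrounding loop is inferred from the hunk above):

```python
from typing import Any, Dict, List

def rows_to_batched(rows: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
    batched: Dict[str, List[Any]] = {}
    for i, row in enumerate(rows):
        for k, v in row.items():
            if k not in batched:
                batched[k] = [None] * i  # back-fill earlier rows that lacked this key
            batched[k].append(v)
    # Make all the lengths of v the same (the new line in this PR).
    return {k: v + [None] * (len(rows) - len(v)) for k, v in batched.items()}

print(rows_to_batched([{'a': 1, 'b': 2}, {'a': 3}]))
# {'a': [1, 3], 'b': [2, None]} -- without the padding, 'b' would stay at length 1
```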
8 changes: 4 additions & 4 deletions swift/llm/model/model/internlm.py
@@ -232,9 +232,9 @@ def get_model_tokenizer_internvl(model_dir: str,
], ),
],
TemplateType.xcomposer2_5,
get_model_tokenizer_internlm_xcomposer2,
partial(get_model_tokenizer_internlm_xcomposer2, version='v2.5'),
architectures=['InternLMXComposer2ForCausalLM'],
model_arch=ModelArch.internlm_xcomposer,
model_arch=ModelArch.xcomposer,
tags=['vision'],
requires=['decord'],
))
@@ -250,7 +250,7 @@ def get_model_tokenizer_internvl(model_dir: str,
TemplateType.xcomposer2,
get_model_tokenizer_internlm_xcomposer2,
architectures=['InternLMXComposer2ForCausalLM'],
model_arch=ModelArch.internlm_xcomposer,
model_arch=ModelArch.xcomposer,
tags=['vision'],
))

@@ -265,6 +265,6 @@ def get_model_tokenizer_internvl(model_dir: str,
TemplateType.xcomposer2,
partial(get_model_tokenizer_internlm_xcomposer2, version='v2-4khd'),
architectures=['InternLM2ForCausalLM', 'InternLMXComposer2ForCausalLM'],
model_arch=ModelArch.internlm_xcomposer,
model_arch=ModelArch.xcomposer,
tags=['vision'],
))
5 changes: 2 additions & 3 deletions swift/llm/model/model/microsoft.py
@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from functools import partial
from types import MethodType
from typing import Any, Dict

@@ -18,8 +19,6 @@ def get_model_tokenizer_phi3_vision(model_dir: str,
load_model: bool = True,
**kwargs):
processor_kwargs = {}
if 'Phi-3.5-vision-instruct' in model_dir:
kwargs['num_crops'] = kwargs.get('num_crops') or 4
if 'num_crops' in kwargs:
processor_kwargs['num_crops'] = get_env_args('num_crops', int, kwargs['num_crops'])
from transformers import AutoProcessor
@@ -43,7 +42,7 @@ def get_model_tokenizer_phi3_vision(model_dir: str,
])
],
TemplateType.phi3_vision,
get_model_tokenizer_phi3_vision,
partial(get_model_tokenizer_phi3_vision, num_crops=4),
architectures=['Phi3VForCausalLM'],
model_arch=ModelArch.phi3v,
requires=['transformers>=4.36'],
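The registration now bakes `num_crops=4` into the loader with `functools.partial` instead of special-casing the `Phi-3.5-vision-instruct` directory name inside the function body. A small sketch of the pattern, with a hypothetical stand-in loader rather than the real `get_model_tokenizer_phi3_vision`:

```python
from functools import partial

def get_model_tokenizer(model_dir: str, **kwargs):  # hypothetical stand-in loader
    print(f"loading {model_dir} with num_crops={kwargs.get('num_crops')}")

loader = partial(get_model_tokenizer, num_crops=4)
loader('some/phi3.5-vision-dir')                 # num_crops=4 (baked-in default)
loader('some/phi3.5-vision-dir', num_crops=16)   # a call-time keyword overrides the default
```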
4 changes: 2 additions & 2 deletions swift/llm/model/model_arch.py
@@ -39,7 +39,7 @@ class MLLMModelArch:
llava_llama = 'llava_llama'
llava_mistral = 'llava_mistral'

internlm_xcomposer = 'internlm_xcomposer'
xcomposer = 'xcomposer'
internvl = 'internvl'
minicpmv = 'minicpmv'
deepseek_vl = 'deepseek_vl'
@@ -330,7 +330,7 @@ def register_model_arch(model_arch: ModelKeys, *, exist_ok: bool = False) -> Non

register_model_arch(
MultiModelKeys(
MLLMModelArch.internlm_xcomposer,
MLLMModelArch.xcomposer,
language_model='model',
aligner='vision_proj',
vision_tower='vit',
7 changes: 3 additions & 4 deletions swift/llm/template/base.py
@@ -678,7 +678,7 @@ def pre_forward_hook(self, model: nn.Module, args, kwargs):
k: v
for k, v in kwargs.items() if k in {'input_ids', 'labels', 'attention_mask', 'position_ids'}
}
keep_kwargs.update(self._post_encode(model, to_device(kwargs, model.device)))
keep_kwargs.update(to_device(self._post_encode(model, to_device(kwargs, model.device)), model.device))
kwargs = keep_kwargs
if 'inputs_embeds' in kwargs:
kwargs.pop('input_ids', None)
@@ -755,10 +755,9 @@ def _fetch_inputs_startswith(batch: List[Dict[str, Any]], prefix: str) -> List[D
@staticmethod
def fetch_inputs(batch: List[Dict[str, Any]], keys: Optional[List[str]] = None) -> Dict[str, Any]:
from swift.llm import RowPreprocessor
keys = keys or []
rows = RowPreprocessor.rows_to_batched(batch)
if keys is not None:
rows = {k: rows[k] for k in keys}
return rows
return {k: rows[k] for k in keys if rows.get(k) is not None}

@staticmethod
def gather_list(batch: List[Dict[str, Any]], attr_name: str) -> Optional[List[Any]]:
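The rewritten `fetch_inputs` defaults `keys` to an empty list and drops any requested key that is absent from the batched rows. A compact, self-contained illustration, with an inlined stand-in for `RowPreprocessor.rows_to_batched`:

```python
from typing import Any, Dict, List, Optional

def fetch_inputs(batch: List[Dict[str, Any]], keys: Optional[List[str]] = None) -> Dict[str, Any]:
    keys = keys or []
    rows: Dict[str, List[Any]] = {}
    for i, b in enumerate(batch):  # inlined stand-in for rows_to_batched
        for k, v in b.items():
            rows.setdefault(k, [None] * i).append(v)
    rows = {k: v + [None] * (len(batch) - len(v)) for k, v in rows.items()}
    return {k: rows[k] for k in keys if rows.get(k) is not None}

print(fetch_inputs([{'images': ['a.png']}, {'images': ['b.png']}], keys=['images', 'videos']))
# {'images': [['a.png'], ['b.png']]} -- 'videos' is absent, so it is simply dropped
```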
96 changes: 50 additions & 46 deletions swift/llm/template/template/internlm.py
@@ -63,57 +63,61 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
return encoded

def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
input_ids = inputs['input_ids'][0].tolist()
labels = inputs.get('labels')
images = inputs['images']
if len(images) > 0: # ignore <s>
input_ids = input_ids[1:]
batch_size = len(inputs['input_ids'])
res = []
im_mask = []
length = inputs['length']
for i in range(batch_size):
input_ids = inputs['input_ids'][i].tolist()[:length[i]]
input_ids.append(2) # add dummy </s>
labels = inputs.get('labels')
if labels is not None:
labels = labels[1:]
input_ids.append(2) # add dummy </s>
if labels is not None:
labels = labels[0].tolist()
labels.append(2)
else:
labels = []
res_inputs_embeds = []
res_labels = []
wrap_im_mask = []
pre_i, i, idx = 0, 0, 0
device = model.device
internlm2_model = model.model
if not hasattr(internlm2_model, 'tok_embeddings'):
internlm2_model = internlm2_model.model
tok_embeddings = internlm2_model.tok_embeddings
if len(images) > 0:
images = torch.concat([model.img2emb(image[None])[0] for image in images], dim=0)
while i < len(input_ids):
if input_ids[i] == 2: # replace_token
res_input_ids = torch.tensor([1] + input_ids[pre_i:i], device=device)
res_inputs_embeds.append(tok_embeddings(res_input_ids[None])[0])
wrap_im_mask += [0] * len(res_input_ids)
res_labels += [-100] + labels[pre_i:i]
if len(images) > 0 and idx < images.shape[0]:
res_inputs_embeds.append(images[idx].to(device))
wrap_im_mask += [1] * images.shape[1]
res_labels += [-100] * images.shape[1]
idx += 1
labels = labels[i].tolist()[:length[i]]
labels.append(2)
else:
labels = []
images = inputs['images'][i]
res_inputs_embeds = []
res_labels = []
wrap_im_mask = []
pre_i, i, idx = 0, 0, 0
device = model.device
internlm2_model = model.model
if not hasattr(internlm2_model, 'tok_embeddings'):
internlm2_model = internlm2_model.model
tok_embeddings = internlm2_model.tok_embeddings
if len(images) > 0:
images = torch.concat([model.img2emb(image[None])[0] for image in images], dim=0)
add_bos = False
while i < len(input_ids):
if input_ids[i] == 2: # replace_token
res_input_ids = torch.tensor(([1] if add_bos else []) + input_ids[pre_i:i], device=device)
if not add_bos and self.version != 'v2.5':
add_bos = True
res_inputs_embeds.append(tok_embeddings(res_input_ids[None])[0])
wrap_im_mask += [0] * len(res_input_ids)
res_labels += ([-100] if add_bos else []) + labels[pre_i:i]
if len(images) > 0 and idx < images.shape[0]:
res_inputs_embeds.append(images[idx].to(device))
wrap_im_mask += [1] * images.shape[1]
res_labels += [-100] * images.shape[1]
idx += 1
i += 1
pre_i = i
continue
i += 1
pre_i = i
continue
i += 1
if len(labels) == 0:
res_labels = None
res_inputs_embeds = torch.concat(res_inputs_embeds, dim=0)
wrap_im_mask = torch.tensor(wrap_im_mask, dtype=torch.bool, device=device)[None]
return {'inputs_embeds': res_inputs_embeds, 'im_mask': wrap_im_mask, 'labels': res_labels}
if len(labels) == 0:
res_labels = None
im_mask.append(torch.tensor(wrap_im_mask, dtype=torch.bool, device=device))
res.append({'inputs_embeds': torch.concat(res_inputs_embeds, dim=0), 'labels': res_labels})
res = Template._data_collator(self, res)
res['im_mask'] = self._pad_sequence(im_mask, 0)
return res

def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
res = super()._data_collator(batch, padding_to=padding_to)
if 'im_mask' in batch[0]:
im_mask = [b['im_mask'][0] for b in batch]
im_mask = self._pad_sequence(im_mask, 0)
res['im_mask'] = im_mask
res['length'] = [len(b['input_ids']) for b in batch]
res.update(self.fetch_inputs(batch, ['images']))
return res


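The collator now records each sample's original `input_ids` length so that `_post_encode` can strip the padding per sample before building `inputs_embeds`. A toy sketch of that round trip (hypothetical tensors, not real template inputs):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

batch = [{'input_ids': torch.tensor([1, 5, 7])}, {'input_ids': torch.tensor([1, 9])}]
length = [len(b['input_ids']) for b in batch]  # [3, 2], stored by the collator
padded = pad_sequence([b['input_ids'] for b in batch], batch_first=True, padding_value=0)

for i in range(len(batch)):
    input_ids = padded[i].tolist()[:length[i]]  # padding stripped per sample
    print(input_ids)
# [1, 5, 7]
# [1, 9]
```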