modelscope · Jintao-Huang · Jan 1, 2025 · Jan 1, 2025 · Jan 1, 2025 · Jan 1, 2025
diff --git a/README.md b/README.md
@@ -128,7 +128,7 @@ swift sft \
     --gradient_accumulation_steps 16 \
     --eval_steps 50 \
     --save_steps 50 \
-    --save_total_limit 2 \
+    --save_total_limit 5 \
     --logging_steps 5 \
     --max_length 2048 \
     --output_dir output \

diff --git a/README_CN.md b/README_CN.md
@@ -121,7 +121,7 @@ swift sft \
     --gradient_accumulation_steps 16 \
     --eval_steps 50 \
     --save_steps 50 \
-    --save_total_limit 2 \
+    --save_total_limit 5 \
     --logging_steps 5 \
     --max_length 2048 \
     --output_dir output \

diff --git a/docs/source/GetStarted/快速开始.md b/docs/source/GetStarted/快速开始.md
@@ -45,7 +45,7 @@ swift sft \
     --gradient_accumulation_steps 16 \
     --eval_steps 50 \
     --save_steps 50 \
-    --save_total_limit 2 \
+    --save_total_limit 5 \
     --logging_steps 5 \
     --max_length 2048 \
     --output_dir output \

diff --git a/docs/source/Instruction/命令行参数.md b/docs/source/Instruction/命令行参数.md
@@ -49,6 +49,7 @@
 - truncation_strategy: 如果超长如何处理，支持`delete`, `left`和`right`，代表删除、左侧裁剪和右侧裁剪，默认为'delete'
 - 🔥max_pixels: 多模态模型图片前处理的最大像素数（H\*W），默认不缩放。
 - tools_prompt: 智能体训练时的工具列表转为system的格式，请参考[智能体训练](./智能体的支持.md)，默认为'react_en'
+- padding_side: 当训练`batch_size>=2`时的padding_side，可选值为'left', 'right'，默认为'right'。（`generate`的batch_size>=2时，只进行左padding）
 - loss_scale: 如何针对训练添加token的loss权重。默认为`'default'`，代表所有response（含history）以1计算交叉熵损失。具体可以查看[插件化](../Customization/插件化.md)和[智能体训练](./智能体的支持.md)
 - sequence_parallel_size: 序列并行数量。参考[example](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel/train.sh)
 - use_chat_template: 使用chat模板或generation模板，默认为`True`。`swift pt`会自动设置为generation模板
@@ -99,7 +100,6 @@
 - remove_unused_columns: 默认值False
 - logging_first_step: 是否记录第一个step的打印，默认值True
 - logging_steps: 日志打印间隔，默认值5
-- average_tokens_across_devices: 是否在设备之间对token数进行平均。如果设置为True，将使用all_reduce同步`num_tokens_in_batch`以进行精确的损失计算。默认为None，如果为分布式训练则设置为True，否则为False
 - metric_for_best_model: 默认为None. 即当`predict_with_generate`设置为False, 则为'loss', 否则设置为'rouge-l'
 - greater_is_better: 默认为None. 即当`metric_for_best_model`含'loss'时, 设置为False, 否则设置为True.
 
@@ -121,6 +121,7 @@
 - 🔥ddp_find_unused_parameters: 默认为None
 - 🔥dataloader_num_workers: 默认为0
 - 🔥neftune_noise_alpha: neftune添加的噪声系数, 默认为0，通常可以设置为5, 10, 15
+- average_tokens_across_devices: 是否在设备之间对token数进行平均。如果设置为True，将使用all_reduce同步`num_tokens_in_batch`以进行精确的损失计算。默认为False
 - max_grad_norm: 梯度裁剪. 默认为1.
 - push_to_hub: 推送训练权重到hub. 默认为False
 - hub_model_id: 默认为None

diff --git a/docs/source_en/GetStarted/Quick-start.md b/docs/source_en/GetStarted/Quick-start.md
@@ -45,7 +45,7 @@ swift sft \
     --gradient_accumulation_steps 16 \
     --eval_steps 50 \
     --save_steps 50 \
-    --save_total_limit 2 \
+    --save_total_limit 5 \
     --logging_steps 5 \
     --max_length 2048 \
     --output_dir output \

diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -49,6 +49,7 @@ The introduction to command line parameters will cover base arguments, atomic ar
 - truncation_strategy: How to handle overly long tokens, supports `delete`, `left`, `right`, representing deletion, left trimming, and right trimming, default is 'delete'.
 - 🔥max_pixels: Maximum pixel count for pre-processing images in multimodal models (H*W), default is no scaling.
 - tools_prompt: The list of tools for agent training converted to system format, refer to [Agent Training](./Agent-support.md), default is 'react_en'.
+- padding_side: The padding_side used when training with `batch_size >= 2`, with optional values of 'left' and 'right', defaulting to 'right'. (When the batch_size in `generate` is >= 2, only left padding is applied.)
 - loss_scale: How to add token loss weight during training. Default is `'default'`, meaning all responses (including history) are treated as 1 for cross-entropy loss. For specifics, see [Pluginization](../Customization/Pluginization.md) and [Agent Training](./Agent-support.md).
 - sequence_parallel_size: Number of sequence parallelism. Refer to [example](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel/train.sh).
 - use_chat_template: Use chat template or generation template, default is `True`. `swift pt` is automatically set to the generation template.
@@ -100,7 +101,6 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with
 - remove_unused_columns: Default is False.
 - logging_first_step: Whether to log the first step print, default is True.
 - logging_steps: Interval for logging prints, default is 5.
-- average_tokens_across_devices: Whether to average the token count across devices. If set to True, it will use all_reduce to synchronize `num_tokens_in_batch` for accurate loss computation. The default is None; set to True for distributed training, otherwise set to False.
 - metric_for_best_model: Default is None. When `predict_with_generate` is set to False, it is 'loss'; otherwise, it is 'rouge-l'.
 - greater_is_better: Default is None. When `metric_for_best_model` contains 'loss', set to False; otherwise, set to True.
 
@@ -121,6 +121,7 @@ Other important parameters:
 - 🔥ddp_find_unused_parameters: Default is None.
 - 🔥dataloader_num_workers: Default is 0.
 - 🔥neftune_noise_alpha: Noise coefficient added by neftune, default is 0. Generally can be set to 5, 10, 15.
+- average_tokens_across_devices: Whether to average the token count across devices. If set to True, it will use all_reduce to synchronize `num_tokens_in_batch` for accurate loss computation. The default is False.
 - max_grad_norm: Gradient clipping. The default value is 1.
 - push_to_hub: Push training weights to hub, default is False.
 - hub_model_id: Default is None.

diff --git a/examples/custom/sft.sh b/examples/custom/sft.sh
@@ -16,7 +16,7 @@ swift sft \
     --gradient_accumulation_steps 16 \
     --eval_steps 100 \
     --save_steps 100 \
-    --save_total_limit 2 \
+    --save_total_limit 5 \
     --logging_steps 5 \
     --warmup_ratio 0.05 \
     --dataloader_num_workers 4 \

diff --git a/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb b/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb
@@ -93,7 +93,7 @@
     "    gradient_accumulation_steps=16,\n",
     "    num_train_epochs=1,\n",
     "    metric_for_best_model='loss',\n",
-    "    save_total_limit=2,\n",
+    "    save_total_limit=5,\n",
     "    logging_steps=5,\n",
     "    dataloader_num_workers=1,\n",
     "    data_seed=data_seed,\n",

diff --git a/examples/notebook/qwen2.5-self-cognition/sft.sh b/examples/notebook/qwen2.5-self-cognition/sft.sh
@@ -18,7 +18,7 @@ swift sft \
     --gradient_accumulation_steps 16 \
     --eval_steps 50 \
     --save_steps 50 \
-    --save_total_limit 2 \
+    --save_total_limit 5 \
     --logging_steps 5 \
     --max_length 2048 \
     --output_dir output \

diff --git a/examples/notebook/qwen2vl-ocr/ocr-sft.ipynb b/examples/notebook/qwen2vl-ocr/ocr-sft.ipynb
@@ -95,7 +95,7 @@
     "    # Under normal circumstances, a larger number should be used.\n",
     "    num_train_epochs=1,\n",
     "    metric_for_best_model='loss',\n",
-    "    save_total_limit=2,\n",
+    "    save_total_limit=5,\n",
     "    logging_steps=5,\n",
     "    dataloader_num_workers=4,\n",
     "    data_seed=data_seed,\n",

diff --git a/examples/train/full/infer.sh b/examples/train/full/infer.sh
@@ -0,0 +1,7 @@
+# If you are using the validation set for inference, add the parameter `--load_data_args true`.
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --model output/vx-xxx/checkpoint-xxx \
+    --stream true \
+    --temperature 0 \
+    --max_new_tokens 2048
diff --git a/examples/train/infer.sh b/examples/train/infer.sh
@@ -0,0 +1,8 @@
+# For full-parameter training, use `--model xxx` instead of `--adapters xxx`.
+# If you are using the validation set for inference, add the parameter `--load_data_args true`.
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --stream true \
+    --temperature 0 \
+    --max_new_tokens 2048
diff --git a/examples/train/demo.sh → examples/train/lora_sft.sh b/examples/train/demo.sh → examples/train/lora_sft.sh
@@ -17,7 +17,7 @@ swift sft \
     --gradient_accumulation_steps 16 \
     --eval_steps 50 \
     --save_steps 50 \
-    --save_total_limit 2 \
+    --save_total_limit 5 \
     --logging_steps 5 \
     --max_length 2048 \
     --output_dir output \
@@ -26,13 +26,3 @@ swift sft \
     --dataloader_num_workers 4 \
     --model_author swift \
     --model_name swift-robot
-
-
-# After training, use the following command for inference.
-
-# CUDA_VISIBLE_DEVICES=0 \
-# swift infer \
-#     --adapters output/vx-xxx/checkpoint-xxx \
-#     --stream true \
-#     --temperature 0 \
-#     --max_new_tokens 2048
diff --git a/examples/train/pt/train.sh → examples/train/pretrain/train.sh b/examples/train/pt/train.sh → examples/train/pretrain/train.sh
diff --git a/swift/llm/argument/base_args/template_args.py b/swift/llm/argument/base_args/template_args.py
@@ -20,6 +20,7 @@ class TemplateArguments:
         truncation_strategy (Literal): Strategy for truncating the template. Default is 'delete'.
         max_pixels (Optional[int]): Maximum number of pixels for the template. Default is None.
         tools_prompt (str): Override the default tools prompt in the template. Default is 'react_en'.
+        padding_side (Literal): The padding side used when the training batch_size >= 2. Default is 'right'.
         loss_scale (str): Loss scale for training. Default is 'default',
             meaning only calculate the loss of the assistant.
         sequence_parallel_size (int): Size of sequence parallelism. Default is 1.
@@ -35,6 +36,7 @@ class TemplateArguments:
     max_pixels: Optional[int] = None
     tools_prompt: str = 'react_en'  # Override the default_tools_prompt in the template.
     # train
+    padding_side: Literal['left', 'right'] = 'right'
     loss_scale: str = 'default'
     sequence_parallel_size: int = 1
     # infer/deploy
@@ -59,6 +61,7 @@ def get_template_kwargs(self):
             'max_pixels': self.max_pixels,
             'tools_prompt': self.tools_prompt,
             'loss_scale': self.loss_scale,
+            'padding_side': self.padding_side,
             'sequence_parallel_size': self.sequence_parallel_size,
             'template_backend': self.template_backend,
             'use_chat_template': self.use_chat_template

diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
@@ -36,7 +36,6 @@ class Seq2SeqTrainingOverrideArguments(Seq2SeqTrainingArguments):
 
     remove_unused_columns: bool = False
     logging_first_step: bool = True
-    average_tokens_across_devices: Optional[bool] = None
 
     def _init_output_dir(self):
         if self.output_dir is not None:
@@ -56,8 +55,6 @@ def _init_eval_strategy(self):
 
     def __post_init__(self):
         self._init_output_dir()
-        if self.average_tokens_across_devices is None:
-            self.average_tokens_across_devices = self.global_world_size > 1
         if self.metric_for_best_model is None:
             self.metric_for_best_model = 'rouge-l' if self.predict_with_generate else 'loss'
         if self.greater_is_better is None:
@@ -112,18 +109,20 @@ class TrainArguments(TorchAccArguments, TunerArguments, Seq2SeqTrainingOverrideA
     add_version: bool = True
     resume_only_model: bool = False
     check_model: bool = True
-    loss_type: Optional[str] = field(default=None, metadata={'help': f'loss_func choices: {list(LOSS_MAPPING.keys())}'})
 
     # dataset
     packing: bool = False
     lazy_tokenize: Optional[bool] = None
 
+    # plugin
+    loss_type: Optional[str] = field(default=None, metadata={'help': f'loss_func choices: {list(LOSS_MAPPING.keys())}'})
+    optimizer: Optional[str] = None
+    metric: Optional[str] = None
+
     # extra
     acc_strategy: Literal['token', 'seq'] = 'token'
     max_new_tokens: int = 64
     temperature: float = 0.
-    optimizer: Optional[str] = None
-    metric: Optional[str] = None
 
     def __post_init__(self) -> None:
         if self.resume_from_checkpoint:

diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
@@ -45,31 +45,34 @@ class Template(ProcessorMixin):
     use_model = False
 
     is_encoder_decoder = False
-    padding_side: Literal['left', 'right'] = 'right'  # The padding_side when the training batch_size >= 2.
 
     def __init__(
-            self,
-            processor: Processor,
-            template_meta: 'TemplateMeta',
-            default_system: Optional[str] = None,
-            max_length: Optional[int] = None,
-            *,
-            use_chat_template: bool = True,
-            template_backend: Literal['swift', 'jinja'] = 'swift',
-            truncation_strategy: Literal['raise', 'left', 'right'] = 'raise',
-            max_pixels: Optional[int] = None,
-            tools_prompt: Optional[str] = None,
-            # only for train
-            loss_scale: str = 'default',
-            sequence_parallel_size: int = 1) -> None:
+        self,
+        processor: Processor,
+        template_meta: 'TemplateMeta',
+        default_system: Optional[str] = None,
+        max_length: Optional[int] = None,
+        *,
+        use_chat_template: bool = True,
+        truncation_strategy: Literal['raise', 'left', 'right'] = 'raise',
+        max_pixels: Optional[int] = None,
+        tools_prompt: Optional[str] = None,
+        # only for train
+        padding_side: Literal['left', 'right'] = 'right',
+        loss_scale: str = 'default',
+        sequence_parallel_size: int = 1,
+        # infer/deploy
+        template_backend: Literal['swift', 'jinja'] = 'swift',
+    ) -> None:
         """
         default_system: Override the default_system in the template.
         max_length: Max length of the sequence
         truncation_strategy: The truncation strategy
-        loss_scale: The loss scale function to use
         max_pixels: Rescale image to reduce memory usage, default `None` means no limitation.
             e.g. 512 * 512 (H*W)
         tools_prompt: The type of tools_prompt added in the system.
+        padding_side: The padding side ('left' or 'right') used when the training batch_size >= 2.
+        loss_scale: The loss scale function to use
         """
         from .template_meta import TemplateMeta
         self.processor = processor
@@ -96,6 +99,7 @@ def __init__(
         self.truncation_strategy = truncation_strategy
         self.loss_scale = loss_scale
         self.max_pixels = max_pixels
+        self.padding_side = padding_side
         self.sequence_parallel_size = sequence_parallel_size
         self.tools_prompt = tools_prompt or template_meta.default_tools_prompt
         if self.is_encoder_decoder:
@@ -853,7 +857,7 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
         keys = [
             'input_ids', 'inputs_embeds', 'attention_mask', 'labels', 'loss_scale', 'position_ids', 'token_type_ids'
         ]
-        pad_value = [self.tokenizer.pad_token_id, 0., 0, -100, 0., -1, 0]
+        pad_value = [self.tokenizer.pad_token_id, 0., 0, -100, 0., 1, 0]
         # Convert to tensor and remove unnecessary dimensions.
         seq_lens = None
         for key in keys:
@@ -869,6 +873,8 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
                 seq_lens = [seq.shape[0] for seq in res[key]]
         if seq_lens and ('input_ids' in res or 'inputs_embeds' in res):
             res['attention_mask'] = [torch.ones(seq_len, dtype=torch.int64) for seq_len in seq_lens]
+            if self.is_training and self.padding_side == 'left':
+                res['position_ids'] = [torch.arange(seq_len, dtype=torch.int64) for seq_len in seq_lens]
 
         for key, pad_value in zip(keys, pad_value):
             if key not in res:

diff --git a/swift/llm/template/register.py b/swift/llm/template/register.py
@@ -28,6 +28,7 @@ def get_template(
         max_pixels: Optional[int] = None,  # h * w
         tools_prompt: str = 'react_en',
         # train
+        padding_side: Literal['left', 'right'] = 'right',
         loss_scale: str = 'default',
         sequence_parallel_size: int = 1) -> 'Template':
     template_meta = TEMPLATE_MAPPING[template_type]
@@ -40,10 +41,12 @@ def get_template(
         use_chat_template=use_chat_template,
         template_backend=template_backend,
         truncation_strategy=truncation_strategy,
-        loss_scale=loss_scale,
         max_pixels=max_pixels,
+        tools_prompt=tools_prompt,
+        padding_side=padding_side,
+        loss_scale=loss_scale,
         sequence_parallel_size=sequence_parallel_size,
-        tools_prompt=tools_prompt)
+    )
 
 
 def get_template_meta(template_type: str) -> TemplateMeta:

diff --git a/swift/llm/template/template/internvl.py b/swift/llm/template/template/internvl.py
@@ -163,8 +163,6 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         return encoded
 
 
-# TODO: self.padding_side = 'left'
-
 _internvl2_system = '你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。'
 register_template(
     ChatmlTemplateMeta(

diff --git a/swift/llm/template/template/llava.py b/swift/llm/template/template/llava.py
@@ -19,11 +19,6 @@
 
 class LlavaHfTemplate(Template):
 
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        if version.parse(transformers.__version__) < version.parse('4.43.0'):
-            self.padding_side = 'left'
-
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                     inputs: StdTemplateInputs) -> List[Context]:
         assert media_type == 'image'