refactor(API): improve usability #414

Open

wants to merge 27 commits into develop from feat/refactor-impl

Commits (27)
d90c1da  enable fsdp training and support huggingface models with ckpt in or out (zigzagcai, Jan 16, 2025)
0bc7552  Merge remote-tracking branch 'origin/develop' into feat/refactor-impl (zigzagcai, Feb 18, 2025)
a70aaf6  initial refactor: (1) reorg src structure to avoid cyclic imports (2)… (zigzagcai, Feb 14, 2025)
1123c08  Merge branch 'develop' into feat/refactor-impl (zigzagcai, Feb 24, 2025)
2a17817  fix ci (zigzagcai, Feb 24, 2025)
9ef40ee  fix ci (zigzagcai, Feb 24, 2025)
99b3555  ljx adapt npu (li126com, Feb 24, 2025)
55e6055  update npu adapt (zigzagcai, Feb 24, 2025)
e0dda21  Merge branch 'develop' into feat/refactor-impl (zigzagcai, Feb 25, 2025)
0167129  fix pylint (zigzagcai, Feb 25, 2025)
73c3ce1  update setup (zigzagcai, Feb 25, 2025)
70865ce  remove unused settings and temporarily remove other model_implementat… (zigzagcai, Mar 3, 2025)
991f07c  Merge branch 'develop' into feat/refactor-impl (zigzagcai, Mar 3, 2025)
6498236  fix pylint (zigzagcai, Mar 3, 2025)
ded6daa  Merge branch 'develop' into feat/refactor-impl (zigzagcai, Mar 4, 2025)
25cf10f  Merge branch 'develop' into feat/refactor-impl (zigzagcai, Mar 5, 2025)
bdf5b0b  fix merge (zigzagcai, Mar 5, 2025)
05b58a9  rename transformers to huggingface_models to avoid name conflict (zigzagcai, Mar 5, 2025)
4cd207e  update args sanity checks and add support for FP8 (zigzagcai, Mar 5, 2025)
1cf9ee6  add 7B_internlm2_hf config and refine some fsdp or fp8 codes (zigzagcai, Mar 5, 2025)
f77b4f2  fix pylint (zigzagcai, Mar 5, 2025)
e854c90  typo fix (zigzagcai, Mar 7, 2025)
8e04b09  update fsdp wrap (zigzagcai, Mar 12, 2025)
cbf73d6  Merge branch 'develop' into feat/refactor-impl (zigzagcai, Mar 13, 2025)
ca942a2  typo fix (zigzagcai, Mar 13, 2025)
f48c4af  Merge branch 'develop' into feat/refactor-impl (zigzagcai, Mar 14, 2025)
5da0471  support ep for fsdp (zigzagcai, Mar 26, 2025)
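
The unifying change across these commits is moving the training entry point from the repo-root train.py into the internlm package. A minimal before/after sketch of the launch flow, using the partition name and config path that appear in the README and CI diffs below:

```bash
# Before this PR: training is launched via the repo-root script.
srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 \
    python train.py --config ./configs/7B_sft.py

# After this PR: training is launched via the packaged module. The repo
# root must be importable, which is presumably why the CI workflows below
# now add `export PYTHONPATH=$PWD:$PYTHONPATH`.
export PYTHONPATH=$PWD:$PYTHONPATH
srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 \
    python -m internlm.launcher.launch --config ./configs/7B_sft.py
```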
2 changes: 2 additions & 0 deletions .github/workflows/demo_in_readme.yaml
@@ -63,6 +63,7 @@ jobs:
export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
export SLURM_PARTITION=$SLURM_PARTITION
source activate ${evo_env_torch21_flash2}
+export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
EOF

@@ -97,6 +98,7 @@ jobs:
export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
export SLURM_PARTITION=$SLURM_PARTITION
source activate ${evo_env_torch21_flash2}
+export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
EOF
8 changes: 2 additions & 6 deletions .github/workflows/lint_check.yaml
@@ -18,25 +18,21 @@ jobs:
run: |
pip install flake8==v3.8.4
FLAKE_DISABLE_LIST="F403,F405,W504,W503,E203"
-flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST --exclude=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST ./train.py
+flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST --exclude=./internlm/model/model_ops/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*

- name: lint-isort
run: |
pip install isort==5.12.0
isort --check --profile=black ./internlm/*
-isort --check --profile=black ./train.py

- name: lint-black
run: |
pip install black==22.8.0
BLACK_EXCLUDE_SETTINGS='\.venv/|\.local/|\.cache/|\.git/'
black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./internlm/*
-black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./train.py

- name: lint-pylint
run: |
pip install pylint==v2.17.2
PYLINT_DISABLE_LIST="C0114,C0415,W0212,W0235,W0238,W0621,C0103,R1735,C2801,E0402,C0412,W0719,R1728,W1514,W0718,W0105,W0707,C0209,W0703,W1203"
-pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST --ignore=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST ./train.py
+pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST --ignore=./internlm/model/model_ops/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
6 changes: 3 additions & 3 deletions README-ja-JP.md
@@ -99,7 +99,7 @@ data = dict(

When using 2 nodes and 16 GPUs in a Slurm environment, the command is as follows:
```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
```

When running with torch on 1 node and 8 GPUs, the command is as follows:
@@ -166,8 +166,8 @@ $ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py -
</td>
<td>
<ul>
<li><a href="tools/transformers/README.md">Convert ckpt to HF</a></li>
<li><a href="tools/transformers/README.md">Revert ckpt from HF</a></li>
<li><a href="huggingface_models/README.md">Convert ckpt to HF</a></li>
<li><a href="huggingface_models/README.md">Revert ckpt from HF</a></li>
<li><a href="tools/tokenizer.py">Raw Data Tokenizer</a></li>
<li><a href="tools/alpaca_tokenizer.py">Alpaca data Tokenizer</a></li>
</ul>
6 changes: 3 additions & 3 deletions README-zh-Hans.md
@@ -99,7 +99,7 @@ data = dict(

In a slurm environment, with 2 nodes and 16 GPUs, start training with the following command:
```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
```

In a torch environment, with 1 node and 8 GPUs, start training with the following command:
@@ -166,8 +166,8 @@ $ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py -
</td>
<td>
<ul>
<li><a href="tools/transformers/README-zh-Hans.md">将ckpt转为huggingface格式</a></li>
<li><a href="tools/transformers/README-zh-Hans.md">将ckpt从huggingface格式转为InternEvo格式</a></li>
<li><a href="huggingface_models/README-zh-Hans.md">将ckpt转为huggingface格式</a></li>
<li><a href="huggingface_models/README-zh-Hans.md">将ckpt从huggingface格式转为InternEvo格式</a></li>
<li><a href="tools/tokenizer.py">原始数据分词器</a></li>
<li><a href="tools/alpaca_tokenizer.py">Alpaca数据分词器</a></li>
</ul>
6 changes: 3 additions & 3 deletions README.md
@@ -99,7 +99,7 @@ Training can be started on slurm or torch distributed environment.

On slurm, using 2 nodes and 16 cards, the command is as follows:
```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
```

On torch, using 1 node and 8 cards, the command is as follows:
@@ -166,8 +166,8 @@ Please refer to the [System Architecture document](./doc/en/structure.md) for ar
</td>
<td>
<ul>
<li><a href="tools/transformers/README.md">Convert ckpt to HF</a></li>
<li><a href="tools/transformers/README.md">Revert ckpt from HF</a></li>
<li><a href="huggingface_models/README.md">Convert ckpt to HF</a></li>
<li><a href="huggingface_models/README.md">Revert ckpt from HF</a></li>
<li><a href="tools/tokenizer.py">Raw Data Tokenizer</a></li>
<li><a href="tools/alpaca_tokenizer.py">Alpaca data Tokenizer</a></li>
</ul>
2 changes: 1 addition & 1 deletion ci_scripts/model/convert_to_hf.sh
@@ -25,7 +25,7 @@ if [[ -d ${CKPTS_OUTPUT} ]]; then
fi
fi

-python ./transformers/convert2hf_internlm.py --src ${CKPTS_INPUT} --tgt ${CKPTS_OUTPUT} --tokenizer ./tools/tokenizer_internlm.model
+python ./huggingface_models/convert2hf_internlm.py --src ${CKPTS_INPUT} --tgt ${CKPTS_OUTPUT} --tokenizer ./tools/tokenizer_internlm.model
[[ $? -ne 0 ]] && { echo "test convert2hf_internlm.py failed."; exit_code=$(($exit_code + 1)); }

#assert exists model
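
Commit 05b58a9 ("rename transformers to huggingface_models to avoid name conflict") addresses module shadowing: with the repository root on sys.path, a local transformers/ directory can take precedence over the pip-installed HuggingFace package. A hypothetical reproduction of the conflict, assuming the pre-rename layout (paths are illustrative):

```bash
# Hypothetical repro of the name conflict fixed by the rename. With the
# repo root on sys.path, `import transformers` may resolve to the local
# ./transformers/ directory instead of the HuggingFace library.
cd /path/to/InternEvo            # illustrative repo-root path
export PYTHONPATH=$PWD:$PYTHONPATH
python -c "import transformers; print(transformers.__file__)"
# Before the rename: prints the local ./transformers/... path (shadowed)
# After the rename:  prints .../site-packages/transformers/__init__.py
```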
2 changes: 0 additions & 2 deletions ci_scripts/train/ci_7B_sft.py
@@ -101,14 +101,12 @@
model = dict(
checkpoint=False,
num_attention_heads=NUM_ATTENTION_HEAD,
-embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
-apply_post_layer_norm=False,
dtype="torch.bfloat16",
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
2 changes: 1 addition & 1 deletion ci_scripts/train/generate_config.py
@@ -5,7 +5,7 @@
import os

from ci_scripts.common import com_func
-from internlm.core.context import Config
+from internlm.utils.config import Config


def generate_new_config(config_py_file, test_config_json, case_name):
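
As the diff above shows, the Config class now lives in internlm.utils.config rather than internlm.core.context, so out-of-tree scripts need the same one-line import update. A quick smoke check of the new path (assuming the repo root is on PYTHONPATH, as in the CI workflows):

```bash
# Verify the relocated import resolves; the module path is taken from the
# diff above, not from separate documentation.
python -c "from internlm.utils.config import Config; print(Config)"
```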
2 changes: 1 addition & 1 deletion ci_scripts/train/load_ckpt.sh
@@ -22,7 +22,7 @@ if [[ ! -f ${file} ]]; then
exit_code=$(($exit_code + 1))
fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launcher/launch.py --config ${file}
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }


2 changes: 1 addition & 1 deletion ci_scripts/train/slurm_train.sh
@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
fi
fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launcher/launch.py --config ./ci_scripts/train/ci_7B_sft.py
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

num=$(num_files "${CKPTS20_OUTPUT}")
2 changes: 1 addition & 1 deletion ci_scripts/train/torchrun.sh
@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
fi
fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 internlm/launcher/launch.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
[[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }

num=$(num_files "${CKPTS_OUTPUT}")
2 changes: 0 additions & 2 deletions configs/1.8B_MoE16_sft.py
@@ -136,14 +136,12 @@
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
-embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=False,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
-apply_post_layer_norm=False,
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,