Merge branch 'main' into data_mix
Spico197 committed Dec 13, 2023
2 parents 53217dd + 0c37546 commit f5810a6
Showing 14 changed files with 208 additions and 250 deletions.
312 changes: 167 additions & 145 deletions docs/moefication/README.md

(Large diff not rendered.)

Binary file removed docs/moefication/readme-image.png
2 changes: 1 addition & 1 deletion scripts/cpt/fpt_13b.sh
@@ -71,7 +71,7 @@ source ~/anaconda3/bin/activate smoe
# tokenizer_path="/mnt/petrelfs/share_data/quxiaoye/models/llama_3B"

# dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/SlimPajama_processed/
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/SlimPajama_processed
# dataset_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized
validation_dir=/mnt/petrelfs/share_data/quxiaoye/data/llama1_7B_val_set_tokenized/

8 changes: 4 additions & 4 deletions scripts/cpt/fpt_7b_residual.sh
@@ -52,7 +52,7 @@ source ~/anaconda3/bin/activate llama-moe

model_type="llama_moe_residual"
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/SlimPajama_processed
validation_dir=/mnt/petrelfs/share_data/quxiaoye/data/llama1_7B_val_set_tokenized

lr=3e-4
@@ -61,7 +61,7 @@ source ~/anaconda3/bin/activate llama-moe
per_device_eval_batch_size=8
gradient_accumulation_steps=4
block_size=4096
num_tokens="1*10^11"
num_tokens="2*10^11"
seed=1227
deepspeed_config_file=conf/deepspeed/bf16_zero1_default.json

@@ -77,7 +77,7 @@ source ~/anaconda3/bin/activate llama-moe
echo "#tokens/batch: $tokens_per_batch"

data_cache=resources/cache
output_dir=outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
output_dir=/mnt/petrelfs/share_data/quxiaoye/runs/residual_2_2_14_scale2_112gpus/
mkdir -p $output_dir
echo "output_dir: $output_dir"
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
@@ -127,7 +127,7 @@ source ~/anaconda3/bin/activate llama-moe
--max_steps ${max_steps} \
--max_train_samples ${max_train_samples} \
--save_strategy steps \
--save_total_limit 1 \
--save_total_limit 2 \
--save_steps 1000 \
--dataloader_num_workers 0 \
--dataloader_pin_memory True \
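For reference, num_tokens, block_size, and the global batch geometry determine the step budget that the script later echoes as "#tokens/batch". A minimal sketch of that arithmetic in Python; per_device_train_batch_size is an illustrative assumption not shown in this excerpt, and num_gpus is inferred from the "112gpus" run name:

```python
# Hypothetical sketch of the token-budget arithmetic; values marked "assumed"
# are illustrations, not taken from the script.
num_tokens = 2 * 10**11               # token budget, matching num_tokens="2*10^11"
block_size = 4096                     # sequence length per sample
per_device_train_batch_size = 4       # assumed; not shown in the excerpt
gradient_accumulation_steps = 4
num_gpus = 112                        # assumed from the "112gpus" output_dir name

tokens_per_batch = (
    block_size * per_device_train_batch_size * gradient_accumulation_steps * num_gpus
)
max_steps = num_tokens // tokens_per_batch
print(f"#tokens/batch: {tokens_per_batch}, max_steps: {max_steps}")
```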
13 changes: 7 additions & 6 deletions scripts/moefication/convert/run_convert.sh
@@ -2,15 +2,16 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
llama_size="llama_7B"
# open_llama_7b
llama_size="llama2_7B"

num_experts=16 # 8 16
num_selects=4 # 2 4
num_experts=16 # 4 8 16 32
num_selects=4 # 1 2 4 8
convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification
split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
proj_type=up_proj # gate_proj up_proj

score_scale_factor=16.0 # 1.0 2.0 4.0 8.0 16.0
score_scale_factor=4.0 # 1.0 2.0 4.0 8.0 16.0
score_scale_factor_file_path=""
#score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_dense

@@ -23,7 +24,7 @@ split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_exper

if [ ${use_default_gate} = "True" ]; then
select_file_path=""
save_path=${data_path}/models/${convert_type}/${split_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
save_path=${data_path}/models/${convert_type}/${split_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-Scale${score_scale_factor}
else
select_file_path=${data_path}/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type}
save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
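For context on score_scale_factor: it rescales each selected expert's output so that the converted MoE layer keeps a magnitude comparable to the dense MLP it was split from. A hypothetical sketch of how such a factor might enter the forward pass, written in Python; this illustrates the idea only and is not the repository's actual calculator code:

```python
import torch

def moe_mlp_output(x, experts, gate_scores, selected, score_scale_factor=4.0):
    """Hypothetical sketch.
    x: (hidden,) input vector; experts: list of callables mapping (hidden,) -> (hidden,);
    gate_scores: (num_experts,) gate weights; selected: indices of the top-k experts.
    Each selected expert's contribution is rescaled by score_scale_factor."""
    out = torch.zeros_like(x)
    for idx in selected:                                   # e.g. top-4 of 16 experts
        out = out + gate_scores[idx] * experts[idx](x) * score_scale_factor
    return out
```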
@@ -13,7 +13,7 @@ expert_size=1376
# 688 1376 2752 5504 11008
# 864 1728 3456 6912 13824

score_scale_factor_residual=4.0 # 4.0 8.0 12.0 16.0
score_scale_factor_residual=1.0 # 4.0 8.0 12.0 16.0
score_scale_factor=4.0 # 4.0 8.0 12.0 16.0

convert_type=LlamaMoEResidualForCausalLM # LlamaMoEResidualModel LlamaMoEResidualForCausalLM LlamaMoEResidualForSequenceClassification
68 changes: 0 additions & 68 deletions scripts/moefication/split/run_split_graph.py

This file was deleted.

12 changes: 7 additions & 5 deletions scripts/moefication/split/run_split_graph.sh
@@ -4,9 +4,9 @@
# llama2_7B llama2_13B llama2_30B llama2_base
llama_size=llama_13B

num_experts=8 # 8 16
metric=l2_norm # l1_norm l2_norm plain
template=layers.{}.mlp.up_proj.weight # gate_proj up_proj
num_experts=16 # 8 16
metric=l1_norm # l1_norm l2_norm plain
proj_type=up_proj # gate_proj up_proj
threshold=1

data_path=/mnt/petrelfs/share_data/quxiaoye
@@ -25,7 +25,7 @@ for specify_layer in {0..39}; do
--model_path ${model_path} \
--save_path ${save_path} \
--specify_layer ${specify_layer} \
--template ${template} \
--template layers.{}.mlp.${proj_type}.weight \
--num_experts ${num_experts} \
--threshold ${threshold} \
--metric ${metric} \
@@ -38,14 +38,15 @@ wait

gpmetis_run=/mnt/petrelfs/share_data/quxiaoye/metis_for_graph_split/bin/gpmetis
template1=layers.
template2=.mlp.up_proj.weight
template2=.mlp.${proj_type}.weight

for layer in {0..39}; do
OMP_NUM_THREADS=8 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \
${gpmetis_run} ${save_path}/${template1}${layer}${template2} ${num_experts} &
sleep 0.7
done
wait

# STEP3

template3=.part.${num_experts}
@@ -57,4 +58,5 @@ for layer in {0..39}; do
sleep 0.7
done
wait

chmod -R 755 ${save_path} >/dev/null 2>&1
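The gpmetis call in STEP2 writes, for each layer's neuron graph, a partition file named <graph>.part.<num_experts>, and STEP3 consumes it. Assuming the standard gpmetis output format (one expert index per neuron, one integer per line), a minimal Python sketch for loading such a file into a neuron-to-expert assignment is:

```python
import torch

def load_metis_partition(part_file: str) -> torch.Tensor:
    """Read a gpmetis partition file: line i holds the expert id assigned
    to intermediate neuron i. Returns a LongTensor of shape (num_neurons,)."""
    with open(part_file) as f:
        assignments = [int(line.strip()) for line in f if line.strip()]
    return torch.tensor(assignments, dtype=torch.long)

# e.g. (hypothetical path, following the template1/template2/template3 naming above)
# indices = load_metis_partition(f"{save_path}/layers.0.mlp.up_proj.weight.part.16")
```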
8 changes: 4 additions & 4 deletions scripts/moefication/split/run_split_random.sh
@@ -2,10 +2,10 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
llama_size="llama2_7B"
# open_llama_7b
llama_size="open_llama_7b"

num_experts=8 # 8 16
proj_type=gate_proj # gate_proj up_proj
num_experts=8 # 8 16

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
@@ -17,7 +17,7 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${
python -m smoe.entrypoint.moefication.llama_split_random \
--model_path ${model_path} \
--save_path ${save_path} \
--template layers.{}.mlp.${proj_type}.weight \
--template layers.{}.mlp.up_proj.weight \
--num_experts ${num_experts}

chmod -R 755 ${save_path} >/dev/null 2>&1
8 changes: 4 additions & 4 deletions scripts/moefication/split/run_split_random_one4all.sh
@@ -2,13 +2,13 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
llama_size="llama_13B"
# open_llama_7b
llama_size="open_llama_7b"

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
save_path=${data_path}/moefication_results/split

# all possible result combinations
gpus=0
cpus=8
for num_experts in 4 8 16 32; do
@@ -18,8 +18,8 @@ for num_experts in 4 8 16 32; do
--model_path ${model_path} \
--save_path ${save_path} \
--template layers.{}.mlp.${proj_type}.weight \
--num_experts ${num_experts} & # run the next command in parallel
sleep 0.7 # wait 0.5s
--num_experts ${num_experts} &
sleep 0.7
done
done

Expand Down
2 changes: 1 addition & 1 deletion smoe/entrypoint/moefication/llama_split_random.py
@@ -11,7 +11,7 @@
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str, default="/home/data/models/llama-transformers/7B")
parser.add_argument('--save_path', type=str, default="/home/dongdz/workspace/moefication/llama_moe_temp_files/")
parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight')
parser.add_argument('--template', type=str, default='layers.{}.mlp.up_proj.weight')
parser.add_argument('--num_experts', type=int, default=8, help='number of experts')

args = parser.parse_args()
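For context, a random split of this kind amounts to shuffling the intermediate (up_proj) neurons of each MLP and chunking them evenly across experts. A minimal sketch of that idea in Python, not the actual smoe implementation:

```python
import torch

def random_split(intermediate_size: int, num_experts: int, seed: int = 0) -> list[list[int]]:
    """Randomly assign intermediate neurons to num_experts equally sized experts
    (assumes intermediate_size is divisible by num_experts)."""
    g = torch.Generator().manual_seed(seed)
    perm = torch.randperm(intermediate_size, generator=g)
    return [chunk.tolist() for chunk in perm.chunk(num_experts)]

# e.g. the 11008 up_proj neurons of a LLaMA-7B MLP into 8 experts of 1376 neurons each
expert_neurons = random_split(11008, 8)
```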
6 changes: 2 additions & 4 deletions smoe/modules/moe/moe_gates.py
@@ -232,7 +232,6 @@ def __init__(

self.gate_network_type = gate_network
self.gate_network = get_gate_network(gate_network, input_size, num_experts)
# self.gate_network = get_gate_network("linear", input_size, num_experts)

self.use_softmax = use_softmax
self.softmax = nn.Softmax(1)
@@ -286,7 +285,6 @@ def forward(self, x):
logits_gate = self.gate_network(x) # weights computed by the gate
if self.training and self.add_noise:
noise_mm = self.weight_noise(x) # result of the noise matrix computation
# noise_mm = torch.mm(x, self.weight_noise) # result of the noise matrix computation
noise_control = self.softplus(noise_mm) + self.noise_epsilon # noise magnitude produced by the controller
logits_noise = torch.randn_like(logits_gate) * noise_control # weights contributed by the noise
logits = logits_gate + logits_noise # final weights
@@ -323,7 +321,7 @@ def forward(self, x):
load = prob.sum(0)
else:
load = (scores_filtered > 0).sum(0)
if not self.warned:
if not self.add_noise and not self.warned:
warnings.warn('Gradient-trackable implementation for load calculation is only available when "add_noise=True". '
'Training without noise will block the gradient from "load" path and lead to inconsistency in optimization objectives.')
self.warned = True
@@ -436,7 +434,7 @@ def __init__(
add_noise=True,
):
super(SwitchBalancedGate, self).__init__()
assert num_selects in [1, 2]
assert num_selects in (1, 2)
self.input_size = input_size
self.num_experts = num_experts
self.num_selects = num_selects
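For context on the gating math referenced throughout this file, a simplified Python sketch of noisy top-k gating (gate logits plus Softplus-controlled noise, then top-k selection and softmax over the selected experts). This is an illustration of the general technique, not the smoe gate class itself, and it omits the load-balancing terms:

```python
import torch
import torch.nn as nn

class NoisyTopKGateSketch(nn.Module):
    """Simplified sketch of noisy top-k gating; names are illustrative."""

    def __init__(self, input_size, num_experts, num_selects, noise_epsilon=1e-2):
        super().__init__()
        self.gate_network = nn.Linear(input_size, num_experts, bias=False)
        self.weight_noise = nn.Linear(input_size, num_experts, bias=False)
        self.softplus = nn.Softplus()
        self.num_selects = num_selects
        self.noise_epsilon = noise_epsilon

    def forward(self, x, add_noise=True):
        # x: (batch, input_size)
        logits_gate = self.gate_network(x)                       # weights computed by the gate
        if self.training and add_noise:
            noise_control = self.softplus(self.weight_noise(x)) + self.noise_epsilon
            logits = logits_gate + torch.randn_like(logits_gate) * noise_control
        else:
            logits = logits_gate
        top_logits, top_indices = logits.topk(self.num_selects, dim=1)
        scores = torch.softmax(top_logits, dim=1)                # normalized over selected experts
        return top_indices, scores
```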
2 changes: 1 addition & 1 deletion smoe/utils/io.py
@@ -25,7 +25,7 @@ def torch_load_template_file(path, template, layer):

def torch_load_template_score_file(path, template, layer):
score_list = []
for expert_folder_name in os.listdir(path):
for expert_folder_name in sorted(os.listdir(path)):
score_file = os.path.join(path, expert_folder_name, template.format(layer))
score = torch.load(score_file, map_location="cpu")
score_list.append(score)
15 changes: 9 additions & 6 deletions smoe/utils/moefication/convert_llama_moe.py
@@ -26,6 +26,7 @@ def convert_llama_model(
num_selects,
score_scale_factor=None,
use_default_gate=False,
gate_type="mlp", # "linear"
):
"""
LlamaMoEModel
@@ -63,7 +64,7 @@ config_llama_moe.num_experts = num_experts
config_llama_moe.num_experts = num_experts
config_llama_moe.num_selects = num_selects
config_llama_moe.size_experts = size_experts
config_llama_moe.gates = "mlp"
config_llama_moe.gates = gate_type
config_llama_moe.score_scale_factor = (
1.0 if score_scale_factor is None else score_scale_factor
)
@@ -91,7 +92,7 @@
model_llama_moe_state_dict["layers.{}.mlp.calculator.experts.weight_down.{}".format(layer_index, expert_index)] = model_llama_state_dict[key].transpose(0, 1)[moe_indices[layer_index] == expert_index].transpose(0, 1).cpu().half()

for layer_index in range(num_layers):
if not use_default_gate:
if not use_default_gate and gate_type == "mlp":
model_llama_moe_state_dict["layers.{}.mlp.gate.gate_network.0.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.0.weight"].cpu().half()
model_llama_moe_state_dict["layers.{}.mlp.gate.gate_network.2.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.2.weight"].cpu().half()
model_llama_moe_state_dict["layers.{}.mlp.gate.weight_noise.weight".format(layer_index)] = torch.zeros((num_experts, hidden_size), requires_grad=True)
@@ -123,6 +124,7 @@ def convert_llama_model_for_causal_lm(
num_selects,
score_scale_factor=None,
use_default_gate=False,
gate_type="mlp", # "linear"
):
"""
LlamaMoEForCausalLM
@@ -160,7 +162,7 @@ config_llama_moe.num_experts = num_experts
config_llama_moe.num_experts = num_experts
config_llama_moe.num_selects = num_selects
config_llama_moe.size_experts = size_experts
config_llama_moe.gates = "mlp"
config_llama_moe.gates = gate_type
config_llama_moe.score_scale_factor = (
1.0 if score_scale_factor is None else score_scale_factor
)
@@ -188,7 +190,7 @@
model_llama_moe_state_dict["model.layers.{}.mlp.calculator.experts.weight_down.{}".format(layer_index, expert_index)] = model_llama_state_dict[key].transpose(0, 1)[moe_indices[layer_index] == expert_index].transpose(0, 1).cpu().half()

for layer_index in range(num_layers):
if not use_default_gate:
if not use_default_gate and gate_type == "mlp":
model_llama_moe_state_dict["model.layers.{}.mlp.gate.gate_network.0.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.0.weight"].cpu().half()
model_llama_moe_state_dict["model.layers.{}.mlp.gate.gate_network.2.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.2.weight"].cpu().half()
model_llama_moe_state_dict["model.layers.{}.mlp.gate.weight_noise.weight".format(layer_index)] = torch.zeros((num_experts, hidden_size), requires_grad=True)
@@ -220,6 +222,7 @@ def convert_llama_model_for_sequence_classification(
num_selects,
score_scale_factor=None,
use_default_gate=False,
gate_type="mlp", # "linear"
):
"""
LlamaMoEForSequenceClassification
@@ -257,7 +260,7 @@ config_llama_moe.num_experts = num_experts
config_llama_moe.num_experts = num_experts
config_llama_moe.num_selects = num_selects
config_llama_moe.size_experts = size_experts
config_llama_moe.gates = "mlp"
config_llama_moe.gates = gate_type
config_llama_moe.score_scale_factor = (
1.0 if score_scale_factor is None else score_scale_factor
)
@@ -285,7 +288,7 @@
model_llama_moe_state_dict["model.layers.{}.mlp.calculator.experts.weight_down.{}".format(layer_index, expert_index)] = model_llama_state_dict[key].transpose(0, 1)[moe_indices[layer_index] == expert_index].transpose(0, 1).cpu().half()

for layer_index in range(num_layers):
if not use_default_gate:
if not use_default_gate and gate_type == "mlp":
model_llama_moe_state_dict["model.layers.{}.mlp.gate.gate_network.0.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.0.weight"].cpu().half()
model_llama_moe_state_dict["model.layers.{}.mlp.gate.gate_network.2.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.2.weight"].cpu().half()
model_llama_moe_state_dict["model.layers.{}.mlp.gate.weight_noise.weight".format(layer_index)] = torch.zeros((num_experts, hidden_size), requires_grad=True)
