Moefication: Format Standardization (v8)
DaizeDong committed Dec 24, 2023
1 parent ee301c6 commit 6c57790
Showing 47 changed files with 875 additions and 490 deletions.
23 changes: 9 additions & 14 deletions docs/moefication/README.md
@@ -28,7 +28,7 @@ Remember to change the following variables:
```shell
num_experts="" # number of experts in each MoE layer

model_path="" # path to the LLaMA checkpoint
save_path="" # path to save the indices sets
```
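For reference, a filled-in configuration for an 8-expert split of a 7B checkpoint might look like this (the paths are illustrative, not part of the repository):

```shell
num_experts="8"                                # number of experts in each MoE layer

model_path="/path/to/llama-7b"                 # path to the LLaMA checkpoint
save_path="/path/to/moefication_results/split" # path to save the indices sets
```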

@@ -47,7 +47,7 @@ Remember to change the following variables:
```shell
num_experts="" # number of experts in each MoE layer

model_path="" # path to the LLaMA checkpoint
save_path="" # path to save the indices sets

metric="" # metric for clustering, choices: `l2` `cos`
@@ -73,7 +73,7 @@ Remember to change the following variables:
```shell
num_experts="" # number of experts in each MoE layer

model_path="" # path to the LLaMA checkpoint
save_path="" # path to save the indices sets

metric="" # metric to measure the sparsity, choices: `l1_norm` `l2_norm` `plain`
@@ -82,7 +82,7 @@ proj_type="" # weights to perform clustering, choices: `up_proj` `gate_proj`



#### Gradient Split

Before performing gradient-based splitting (Eq. 8 in the technical report), you need to prepare pretraining data and group it into different clusters by running:

@@ -101,7 +101,7 @@ Remember to change the following variables:
```shell
dataset_dir="" # path to clustered data
pretrained_model="" # path to the LLaMA checkpoint
tokenizer_path="" # path to the LLaMA tokenizer
save_path="" # path to save the indices sets

accumulate_level="" # should be set to `sample`
@@ -111,7 +111,7 @@ importance_type="" # should be set to `feature_change`



##### Neuron Independent

> This part is not included in our technical report.
@@ -128,7 +128,7 @@ expert_num="" # number of experts in each MoE layer
expert_size="" # intermediate neurons in each expert
share_neurons="False" ######### SET AS FALSE TO BE NEURON-INDEPENDENT #########

model_path="" # path to the LLaMA checkpoint
score_file_path="" # path to the score files generated above
save_path="" # path to save the indices sets
visualization_path="" # path to save the visualization results
@@ -154,7 +154,7 @@ expert_num="" # number of experts in each MoE layer
expert_size="" # intermediate neurons in each expert
share_neurons="True" ######### SET AS TRUE TO BE INNER-SHARING #########

model_path="" # path to the LLaMA checkpoint
score_file_path="" # path to the score files generated above
save_path="" # path to save the indices sets
visualization_path="" # path to save the visualization results
@@ -181,7 +181,7 @@ expert_num_residual="" # number of residual experts
expert_size="" # intermediate neurons in each expert
share_neurons="" # Whether to share neurons in non-residual experts

model_path="" # path to the LLaMA checkpoint
score_file_path="" # path to the score files generated above
save_path="" # path to save the indices sets
visualization_path="" # path to save the visualization results
@@ -239,8 +239,3 @@ bash ./scripts/moefication/convert/run_convert_gradient_residual.sh
-- entrypoint
-- moefication
```
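The stages above are typically run in order: split the MLP neurons into experts, (optionally) train the gates for expert selection, then convert the dense checkpoint into an MoE model. A sketch of one end-to-end pass using script names from this repository (the exact scripts depend on the split and selection strategy chosen above):

```shell
# 1. split each MLP layer into experts (clustering-based split shown here)
bash ./scripts/moefication/split/run_split_clustering.sh

# 2. train the MLP gates on hidden features for expert selection
bash ./scripts/moefication/select/run_select.sh

# 3. convert the dense LLaMA checkpoint into a LLaMA-MoE model
bash ./scripts/moefication/convert/run_convert.sh
```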
12 changes: 12 additions & 0 deletions scripts/examples/load_relu_llama.sh
@@ -0,0 +1,12 @@
#!/usr/bin/bash

base_model=ReluLLaMA-7B
model_path=/mnt/petrelfs/share_data/quxiaoye/models/${base_model}/

gpus=1
cpus=8
quotatype=spot # spot reserved auto
OMP_NUM_THREADS=2 srun --partition=MoE --job-name=example --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=${quotatype} \
python -m smoe.entrypoint.examples.load_relu_llama \
--tokenizer_path ${model_path} \
    --model_path ${model_path}
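The same entrypoint should also work without Slurm; a minimal sketch for a local run (the checkpoint path is hypothetical):

```shell
model_path=/path/to/ReluLLaMA-7B # hypothetical local checkpoint directory

python -m smoe.entrypoint.examples.load_relu_llama \
    --tokenizer_path ${model_path} \
    --model_path ${model_path}
```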
40 changes: 24 additions & 16 deletions scripts/moefication/convert/run_convert.sh
@@ -3,31 +3,38 @@
# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
# open_llama_7b
-llama_size="llama2_7B"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"

-num_experts=8 # 4 8 16 32
-num_selects=2 # 1 2 4 8
-convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification
-split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
-proj_type=up_proj # gate_proj up_proj
+num_experts=16 # 4 8 16 32
+num_selects=4 # 1 2 4 8
+split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
+proj_type=gate_proj # gate_proj up_proj
+select_type=positive # plain positive l1_norm l2_norm

+use_random_gate="False" # True False
+gate_type="mlp" # mlp linear
+use_softmax="False"
+multiply_gate_scores="False"

-score_scale_factor=4.0 # 1.0 2.0 4.0 8.0 16.0
+score_scale_factor=1.0 # 1.0 2.0 4.0 8.0 16.0
score_scale_factor_file_path=""
#score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_dense

-use_default_gate=True # True False
-select_type=l2_norm # plain positive l1_norm l2_norm
+convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type}

-if [ ${use_default_gate} = "True" ]; then
+if [ ${use_random_gate} = "True" ]; then
select_file_path=""
save_path=${data_path}/models/${convert_type}/${split_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-Scale${score_scale_factor}
else
-select_file_path=${data_path}/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type}
-save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
+select_file_path="/mnt/petrelfs/share_data/quxiaoye/moefication_results/select/Clustering-l2/ReluLLaMA-7B-16Expert-Select-MLP-positive-random"
+save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-HardBCE
+# select_file_path=${data_path}/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type}
+# save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
fi

gpus=0
@@ -41,9 +48,10 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:
    --template layers.{}.mlp.${proj_type}.weight \
    --num_experts ${num_experts} \
    --num_selects ${num_selects} \
+   --use_random_gate ${use_random_gate} \
+   --gate_type ${gate_type} \
+   --use_softmax ${use_softmax} \
+   --multiply_gate_scores ${multiply_gate_scores} \
    --score_scale_factor ${score_scale_factor} \
    --score_scale_factor_file_path "${score_scale_factor_file_path}" \
-   --convert_type ${convert_type} \
-   --use_default_gate ${use_default_gate}
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
+   --convert_type ${convert_type}
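With use_random_gate="False" as set above, the else-branch is taken, so the converted model's save location can be traced by hand; a worked expansion using the values from this script:

```shell
# convert_type=LlamaMoEForCausalLM, split_type=Clustering-l2, select_type=positive,
# llama_size=ReluLLaMA-7B, num_experts=16, num_selects=4, proj_type=gate_proj
echo ${save_path}
# -> /mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2-positive/ReluLLaMA-7B-16Select4-gate_proj-HardBCE
```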
6 changes: 2 additions & 4 deletions scripts/moefication/convert/run_convert_gradient.sh
@@ -47,7 +47,7 @@ if [ ${share_neurons} = "True" ]; then
    --score_scale_factor ${score_scale_factor} \
    --score_scale_factor_file_path "${score_scale_factor_file_path}" \
    --convert_type ${convert_type} \
-   --use_default_gate True
+   --use_random_gate True
else
OMP_NUM_THREADS=8 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \
python -m smoe.entrypoint.moefication.llama_convert \
@@ -61,7 +61,5 @@ else
    --score_scale_factor ${score_scale_factor} \
    --score_scale_factor_file_path "${score_scale_factor_file_path}" \
    --convert_type ${convert_type} \
-   --use_default_gate True
+   --use_random_gate True
fi
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
4 changes: 1 addition & 3 deletions scripts/moefication/convert/run_convert_gradient_residual.sh
@@ -48,6 +48,4 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:
    --score_scale_factor ${score_scale_factor} \
    --score_scale_factor_residual ${score_scale_factor_residual} \
    --convert_type ${convert_type} \
-   --use_default_gate True
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
+   --use_random_gate True
@@ -2,23 +2,25 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
-llama_size="llama_13B"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"
save_interval=1
batch_size=4
block_size=2048
-data_use_percent=0.01
+data_use_percent=0.002

-proj_type=up_proj # gate_proj up_proj
+proj_type=gate_proj # gate_proj up_proj

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
train_data_path=${data_path}/data/moefication_LLAMA_data
train_data_cache_path=${data_path}/data/moefication_LLAMA_data_cache
save_path=${data_path}/moefication_results/features

-gpus=8
+gpus=4
cpus=$((gpus * 16))
-OMP_NUM_THREADS=2 srun --partition=MoE --job-name=get_features --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \
+quotatype=auto # auto spot reserved
+OMP_NUM_THREADS=2 srun --partition=MoE --job-name=get_features --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=${quotatype} \
torchrun --nproc_per_node=${gpus} -m smoe.entrypoint.moefication.llama_get_hidden_features \
--model_path ${model_path} \
--train_data_path ${train_data_path} \
@@ -31,4 +33,3 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=get_features --mpi=pmi2 --gres
--block_size ${block_size}

wait
-chmod -R 755 ${save_path}/${llama_size} >/dev/null 2>&1
4 changes: 1 addition & 3 deletions scripts/moefication/prune/run_prune_gradient_convert.sh
@@ -41,6 +41,4 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gre
    --num_experts 1 \
    --num_selects 1 \
    --convert_type ${convert_type} \
-   --use_default_gate True
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
+   --use_random_gate True
@@ -42,9 +42,8 @@ for retain_percent in 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05; do
        --num_experts 1 \
        --num_selects 1 \
        --convert_type ${convert_type} \
-       --use_default_gate True &
+       --use_random_gate True &
    sleep 1
done

wait
-chmod -R 755 ${save_path} >/dev/null 2>&1
4 changes: 1 addition & 3 deletions scripts/moefication/prune/run_prune_random_convert.sh
@@ -29,6 +29,4 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gre
    --num_experts 1 \
    --num_selects 1 \
    --convert_type ${convert_type} \
-   --use_default_gate True
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
+   --use_random_gate True
@@ -30,9 +30,8 @@ for retain_percent in 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05; do
        --num_experts 1 \
        --num_selects 1 \
        --convert_type ${convert_type} \
-       --use_default_gate True &
+       --use_random_gate True &
    sleep 1
done

wait
-chmod -R 755 ${save_path} >/dev/null 2>&1
35 changes: 22 additions & 13 deletions scripts/moefication/select/run_select.sh
@@ -2,29 +2,35 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
-llama_size="llama2_7B"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"

-num_experts=8 # 8 16
-num_selects=2 # 2 4
-split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
-select_type=l2_norm # plain positive l1_norm l2_norm
-proj_type=gate_proj # gate_proj up_proj
+num_experts=16 # 8 16
+num_selects=4 # 2 4
+split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
+select_type=positive # plain positive l1_norm l2_norm
+mlp_init_criterion=random # weight random
+proj_type=gate_proj # gate_proj up_proj

+use_balance="False"
+balance_loss_lambda=0.0 # 0.0001
+add_noise="False"
+use_softmax="False"

data_use_percent=1.0 # 1.0 0.71 0.43
-train_percent=0.95
+train_percent=0.97
batch_size=1024
-epochs=200
-lr=0.01
+epochs=800
+lr=0.5

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type}
hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features
save_path=${data_path}/moefication_results/select/${split_type}

-save_visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
+save_visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}-${mlp_init_criterion}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}

#node=108
gpus=1
cpus=16
for specify_layer in "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31"; do # launch layer groups in parallel
@@ -40,7 +46,11 @@ for specify_layer in "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 1
    --num_experts ${num_experts} \
    --num_selects ${num_selects} \
    --select_criterion ${select_type} \
-   --use_softmax \
+   --mlp_init_criterion ${mlp_init_criterion} \
+   --use_balance ${use_balance} \
+   --balance_loss_lambda ${balance_loss_lambda} \
+   --add_noise ${add_noise} \
+   --use_softmax ${use_softmax} \
    --data_use_percent ${data_use_percent} \
    --train_percent ${train_percent} \
    --batch_size ${batch_size} \
Expand All @@ -54,4 +64,3 @@ done
# "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31"
# "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
wait
-chmod -R 755 ${save_path} >/dev/null 2>&1
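The hard-coded layer pairs in the loop above cover a 32-layer model two layers at a time; an equivalent generated form (a sketch, assuming 32 decoder layers) would be:

```shell
# generates "0 1", "2 3", ..., "30 31" (the same pairs hard-coded above)
for ((layer = 0; layer < 32; layer += 2)); do
    specify_layer="${layer} $((layer + 1))"
    echo "${specify_layer}"
done
```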
9 changes: 4 additions & 5 deletions scripts/moefication/split/run_split_clustering.sh
@@ -2,9 +2,10 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
-llama_size="llama_13B"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"

-num_experts=8 # 8 16
+num_experts=16 # 8 16
metric=l2 # l2 cos
proj_type=up_proj # gate_proj up_proj

@@ -13,7 +14,7 @@ model_path=${data_path}/models/${llama_size}
save_path=${data_path}/moefication_results/split

gpus=0
-cpus=32
+cpus=16
OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \
python -m smoe.entrypoint.moefication.llama_split_clustering \
--model_path ${model_path} \
@@ -22,5 +23,3 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${
    --num_experts ${num_experts} \
    --metric ${metric} \
    --cpu_threads ${cpus}
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
9 changes: 4 additions & 5 deletions scripts/moefication/split/run_split_random.sh
@@ -4,9 +4,10 @@
# llama2_7B llama2_13B llama2_30B llama2_base
# open_llama_7b
# Mistral-7B-v0.1
-llama_size="Mistral-7B-v0.1"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"

-num_experts=8 # 8 16
+num_experts=16 # 8 16

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
@@ -18,7 +19,5 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${
python -m smoe.entrypoint.moefication.llama_split_random \
    --model_path ${model_path} \
    --save_path ${save_path} \
-   --template layers.{}.mlp.up_proj.weight \
+   --template layers.{}.mlp.gate_proj.weight \
    --num_experts ${num_experts}
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
