Moefication: Format Standardization (v8)
DaizeDong committed Dec 24, 2023
1 parent ee301c6 commit 6c57790
Showing 47 changed files with 875 additions and 490 deletions.
23 changes: 9 additions & 14 deletions docs/moefication/README.md
@@ -28,7 +28,7 @@ Remember to change the following variables:
```shell
num_experts="" # number of experts in each MoE layer

model_path="" # path to the LLaMA checkpoint
save_path="" # path to save the indices sets
```
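For reference, a filled-in configuration for an 8-expert split of a 7B checkpoint might look like this (the paths are illustrative, not part of the repository):

```shell
num_experts="8"                                # number of experts in each MoE layer

model_path="/path/to/llama-7b"                 # path to the LLaMA checkpoint
save_path="/path/to/moefication_results/split" # path to save the indices sets
```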

@@ -47,7 +47,7 @@ Remember to change the following variables:
```shell
num_experts="" # number of experts in each MoE layer

model_path="" # path to the LLaMA checkpoint
save_path="" # path to save the indices sets

metric="" # metric for clustering, choices: `l2` `cos`
@@ -73,7 +73,7 @@ Remember to change the following variables:
```shell
num_experts="" # number of experts in each MoE layer

model_path="" # path to the LLaMA checkpoint
save_path="" # path to save the indices sets

metric="" # metric to measure the sparsity, choices: `l1_norm` `l2_norm` `plain`
@@ -82,7 +82,7 @@ proj_type="" # weights to perform clustering, choices: `up_proj` `gate_proj`



#### Gradient Split

Before performing gradient-based splitting (Eq. 8 in the technical report), you need to prepare pretraining data and group it into different clusters by running:

@@ -101,7 +101,7 @@ Remember to change the following variables:
```shell
dataset_dir="" # path to clustered data
pretrained_model="" # path to the LLaMA checkpoint
tokenizer_path="" # path to the LLaMA tokenizer
save_path="" # path to save the indices sets

accumulate_level="" # should be set to `sample`
@@ -111,7 +111,7 @@ importance_type="" # should be set to `feature_change`



##### Neuron Independent

> This part is not included in our technical report.
@@ -128,7 +128,7 @@ expert_num="" # number of experts in each MoE layer
expert_size="" # intermediate neurons in each expert
share_neurons="False" ######### SET AS FALSE TO BE NEURON-INDEPENDENT #########

model_path="" # path to the LLaMA checkpoint
score_file_path="" # path to the score files generated above
save_path="" # path to save the indices sets
visualization_path="" # path to save the visualization results
@@ -154,7 +154,7 @@ expert_num="" # number of experts in each MoE layer
expert_size="" # intermediate neurons in each expert
share_neurons="True" ######### SET AS TRUE TO BE INNER-SHARING #########

model_path="" # path to the LLaMA checkpoint
score_file_path="" # path to the score files generated above
save_path="" # path to save the indices sets
visualization_path="" # path to save the visualization results
@@ -181,7 +181,7 @@ expert_num_residual="" # number of residual experts
expert_size="" # intermediate neurons in each expert
share_neurons="" # Whether to share neurons in non-residual experts

model_path="" # path to the LLaMA checkpoint
score_file_path="" # path to the score files generated above
save_path="" # path to save the indices sets
visualization_path="" # path to save the visualization results
@@ -239,8 +239,3 @@ bash ./scripts/moefication/convert/run_convert_gradient_residual.sh
-- entrypoint
-- moefication
```
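The stages above are typically run in order: split the MLP neurons into experts, (optionally) train the gates for expert selection, then convert the dense checkpoint into an MoE model. A sketch of one end-to-end pass using script names from this repository (the exact scripts depend on the split and selection strategy chosen above):

```shell
# 1. split each MLP layer into experts (clustering-based split shown here)
bash ./scripts/moefication/split/run_split_clustering.sh

# 2. train the MLP gates on hidden features for expert selection
bash ./scripts/moefication/select/run_select.sh

# 3. convert the dense LLaMA checkpoint into a LLaMA-MoE model
bash ./scripts/moefication/convert/run_convert.sh
```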
12 changes: 12 additions & 0 deletions scripts/examples/load_relu_llama.sh
@@ -0,0 +1,12 @@
#!/usr/bin/bash

base_model=ReluLLaMA-7B
model_path=/mnt/petrelfs/share_data/quxiaoye/models/${base_model}/

gpus=1
cpus=8
quotatype=spot # spot reserved auto
OMP_NUM_THREADS=2 srun --partition=MoE --job-name=example --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=${quotatype} \
python -m smoe.entrypoint.examples.load_relu_llama \
--tokenizer_path ${model_path} \
    --model_path ${model_path}
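The same entrypoint should also work without Slurm; a minimal sketch for a local run (the checkpoint path is hypothetical):

```shell
model_path=/path/to/ReluLLaMA-7B # hypothetical local checkpoint directory

python -m smoe.entrypoint.examples.load_relu_llama \
    --tokenizer_path ${model_path} \
    --model_path ${model_path}
```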
40 changes: 24 additions & 16 deletions scripts/moefication/convert/run_convert.sh
@@ -3,31 +3,38 @@
# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
# open_llama_7b
-llama_size="llama2_7B"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"

-num_experts=8 # 4 8 16 32
-num_selects=2 # 1 2 4 8
-convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification
-split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
-proj_type=up_proj # gate_proj up_proj
+num_experts=16 # 4 8 16 32
+num_selects=4 # 1 2 4 8
+split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
+proj_type=gate_proj # gate_proj up_proj
+select_type=positive # plain positive l1_norm l2_norm

+use_random_gate="False" # True False
+gate_type="mlp" # mlp linear
+use_softmax="False"
+multiply_gate_scores="False"

-score_scale_factor=4.0 # 1.0 2.0 4.0 8.0 16.0
+score_scale_factor=1.0 # 1.0 2.0 4.0 8.0 16.0
score_scale_factor_file_path=""
#score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_dense

-use_default_gate=True # True False
-select_type=l2_norm # plain positive l1_norm l2_norm
+convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type}

-if [ ${use_default_gate} = "True" ]; then
+if [ ${use_random_gate} = "True" ]; then
select_file_path=""
save_path=${data_path}/models/${convert_type}/${split_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-Scale${score_scale_factor}
else
-select_file_path=${data_path}/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type}
-save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
+select_file_path="/mnt/petrelfs/share_data/quxiaoye/moefication_results/select/Clustering-l2/ReluLLaMA-7B-16Expert-Select-MLP-positive-random"
+save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-HardBCE
+# select_file_path=${data_path}/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type}
+# save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
fi

gpus=0
@@ -41,9 +48,10 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:
    --template layers.{}.mlp.${proj_type}.weight \
    --num_experts ${num_experts} \
    --num_selects ${num_selects} \
+   --use_random_gate ${use_random_gate} \
+   --gate_type ${gate_type} \
+   --use_softmax ${use_softmax} \
+   --multiply_gate_scores ${multiply_gate_scores} \
    --score_scale_factor ${score_scale_factor} \
    --score_scale_factor_file_path "${score_scale_factor_file_path}" \
-   --convert_type ${convert_type} \
-   --use_default_gate ${use_default_gate}
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
+   --convert_type ${convert_type}
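With use_random_gate="False" as set above, the else-branch is taken, so the converted model's save location can be traced by hand; a worked expansion using the values from this script:

```shell
# convert_type=LlamaMoEForCausalLM, split_type=Clustering-l2, select_type=positive,
# llama_size=ReluLLaMA-7B, num_experts=16, num_selects=4, proj_type=gate_proj
echo ${save_path}
# -> /mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2-positive/ReluLLaMA-7B-16Select4-gate_proj-HardBCE
```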
6 changes: 2 additions & 4 deletions scripts/moefication/convert/run_convert_gradient.sh
@@ -47,7 +47,7 @@ if [ ${share_neurons} = "True" ]; then
    --score_scale_factor ${score_scale_factor} \
    --score_scale_factor_file_path "${score_scale_factor_file_path}" \
    --convert_type ${convert_type} \
-   --use_default_gate True
+   --use_random_gate True
else
OMP_NUM_THREADS=8 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \
python -m smoe.entrypoint.moefication.llama_convert \
@@ -61,7 +61,5 @@ else
    --score_scale_factor ${score_scale_factor} \
    --score_scale_factor_file_path "${score_scale_factor_file_path}" \
    --convert_type ${convert_type} \
-   --use_default_gate True
+   --use_random_gate True
fi
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
4 changes: 1 addition & 3 deletions scripts/moefication/convert/run_convert_gradient_residual.sh
@@ -48,6 +48,4 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:
    --score_scale_factor ${score_scale_factor} \
    --score_scale_factor_residual ${score_scale_factor_residual} \
    --convert_type ${convert_type} \
-   --use_default_gate True
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
+   --use_random_gate True
@@ -2,23 +2,25 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
-llama_size="llama_13B"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"
save_interval=1
batch_size=4
block_size=2048
-data_use_percent=0.01
+data_use_percent=0.002

-proj_type=up_proj # gate_proj up_proj
+proj_type=gate_proj # gate_proj up_proj

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
train_data_path=${data_path}/data/moefication_LLAMA_data
train_data_cache_path=${data_path}/data/moefication_LLAMA_data_cache
save_path=${data_path}/moefication_results/features

-gpus=8
+gpus=4
cpus=$((gpus * 16))
-OMP_NUM_THREADS=2 srun --partition=MoE --job-name=get_features --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \
+quotatype=auto # auto spot reserved
+OMP_NUM_THREADS=2 srun --partition=MoE --job-name=get_features --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=${quotatype} \
torchrun --nproc_per_node=${gpus} -m smoe.entrypoint.moefication.llama_get_hidden_features \
--model_path ${model_path} \
--train_data_path ${train_data_path} \
@@ -31,4 +33,3 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=get_features --mpi=pmi2 --gres
--block_size ${block_size}

wait
-chmod -R 755 ${save_path}/${llama_size} >/dev/null 2>&1
4 changes: 1 addition & 3 deletions scripts/moefication/prune/run_prune_gradient_convert.sh
@@ -41,6 +41,4 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gre
    --num_experts 1 \
    --num_selects 1 \
    --convert_type ${convert_type} \
-   --use_default_gate True
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
+   --use_random_gate True
@@ -42,9 +42,8 @@ for retain_percent in 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05; do
        --num_experts 1 \
        --num_selects 1 \
        --convert_type ${convert_type} \
-       --use_default_gate True &
+       --use_random_gate True &
    sleep 1
done

wait
-chmod -R 755 ${save_path} >/dev/null 2>&1
4 changes: 1 addition & 3 deletions scripts/moefication/prune/run_prune_random_convert.sh
@@ -29,6 +29,4 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gre
    --num_experts 1 \
    --num_selects 1 \
    --convert_type ${convert_type} \
-   --use_default_gate True
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
+   --use_random_gate True
@@ -30,9 +30,8 @@ for retain_percent in 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05; do
        --num_experts 1 \
        --num_selects 1 \
        --convert_type ${convert_type} \
-       --use_default_gate True &
+       --use_random_gate True &
    sleep 1
done

wait
-chmod -R 755 ${save_path} >/dev/null 2>&1
35 changes: 22 additions & 13 deletions scripts/moefication/select/run_select.sh
@@ -2,29 +2,35 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
-llama_size="llama2_7B"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"

-num_experts=8 # 8 16
-num_selects=2 # 2 4
-split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
-select_type=l2_norm # plain positive l1_norm l2_norm
-proj_type=gate_proj # gate_proj up_proj
+num_experts=16 # 8 16
+num_selects=4 # 2 4
+split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
+select_type=positive # plain positive l1_norm l2_norm
+mlp_init_criterion=random # weight random
+proj_type=gate_proj # gate_proj up_proj

+use_balance="False"
+balance_loss_lambda=0.0 # 0.0001
+add_noise="False"
+use_softmax="False"

data_use_percent=1.0 # 1.0 0.71 0.43
-train_percent=0.95
+train_percent=0.97
batch_size=1024
-epochs=200
-lr=0.01
+epochs=800
+lr=0.5

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type}
hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features
save_path=${data_path}/moefication_results/select/${split_type}

-save_visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
+save_visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}-${mlp_init_criterion}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}

#node=108
gpus=1
cpus=16
for specify_layer in "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31"; do # launch layer groups in parallel
@@ -40,7 +46,11 @@ for specify_layer in "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 1
    --num_experts ${num_experts} \
    --num_selects ${num_selects} \
    --select_criterion ${select_type} \
-   --use_softmax \
+   --mlp_init_criterion ${mlp_init_criterion} \
+   --use_balance ${use_balance} \
+   --balance_loss_lambda ${balance_loss_lambda} \
+   --add_noise ${add_noise} \
+   --use_softmax ${use_softmax} \
    --data_use_percent ${data_use_percent} \
    --train_percent ${train_percent} \
    --batch_size ${batch_size} \
Expand All @@ -54,4 +64,3 @@ done
# "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31"
# "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
wait
-chmod -R 755 ${save_path} >/dev/null 2>&1
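The hard-coded layer pairs in the loop above cover a 32-layer model two layers at a time; an equivalent generated form (a sketch, assuming 32 decoder layers) would be:

```shell
# generates "0 1", "2 3", ..., "30 31" (the same pairs hard-coded above)
for ((layer = 0; layer < 32; layer += 2)); do
    specify_layer="${layer} $((layer + 1))"
    echo "${specify_layer}"
done
```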
9 changes: 4 additions & 5 deletions scripts/moefication/split/run_split_clustering.sh
@@ -2,9 +2,10 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
-llama_size="llama_13B"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"

-num_experts=8 # 8 16
+num_experts=16 # 8 16
metric=l2 # l2 cos
proj_type=up_proj # gate_proj up_proj

@@ -13,7 +14,7 @@ model_path=${data_path}/models/${llama_size}
save_path=${data_path}/moefication_results/split

gpus=0
-cpus=32
+cpus=16
OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \
python -m smoe.entrypoint.moefication.llama_split_clustering \
--model_path ${model_path} \
@@ -22,5 +23,3 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${
    --num_experts ${num_experts} \
    --metric ${metric} \
    --cpu_threads ${cpus}
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
9 changes: 4 additions & 5 deletions scripts/moefication/split/run_split_random.sh
@@ -4,9 +4,10 @@
# llama2_7B llama2_13B llama2_30B llama2_base
# open_llama_7b
# Mistral-7B-v0.1
-llama_size="Mistral-7B-v0.1"
+# ReluLLaMA-7B
+llama_size="ReluLLaMA-7B"

-num_experts=8 # 8 16
+num_experts=16 # 8 16

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
@@ -18,7 +19,5 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${
python -m smoe.entrypoint.moefication.llama_split_random \
    --model_path ${model_path} \
    --save_path ${save_path} \
-   --template layers.{}.mlp.up_proj.weight \
+   --template layers.{}.mlp.gate_proj.weight \
    --num_experts ${num_experts}
-
-chmod -R 755 ${save_path} >/dev/null 2>&1
