Merge branch 'main' into data_mix
Spico197 committed Dec 13, 2023
2 parents 53217dd + 0c37546 commit f5810a6
Showing 14 changed files with 208 additions and 250 deletions.
312 changes: 167 additions & 145 deletions docs/moefication/README.md

(Large diff not rendered.)

Binary file removed docs/moefication/readme-image.png
2 changes: 1 addition & 1 deletion scripts/cpt/fpt_13b.sh
@@ -71,7 +71,7 @@ source ~/anaconda3/bin/activate smoe
# tokenizer_path="/mnt/petrelfs/share_data/quxiaoye/models/llama_3B"

# dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/SlimPajama_processed/
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/SlimPajama_processed
# dataset_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized
validation_dir=/mnt/petrelfs/share_data/quxiaoye/data/llama1_7B_val_set_tokenized/

8 changes: 4 additions & 4 deletions scripts/cpt/fpt_7b_residual.sh
@@ -52,7 +52,7 @@ source ~/anaconda3/bin/activate llama-moe

model_type="llama_moe_residual"
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/SlimPajama_processed
validation_dir=/mnt/petrelfs/share_data/quxiaoye/data/llama1_7B_val_set_tokenized

lr=3e-4
@@ -61,7 +61,7 @@ source ~/anaconda3/bin/activate llama-moe
per_device_eval_batch_size=8
gradient_accumulation_steps=4
block_size=4096
num_tokens="1*10^11"
num_tokens="2*10^11"
seed=1227
deepspeed_config_file=conf/deepspeed/bf16_zero1_default.json

@@ -77,7 +77,7 @@ source ~/anaconda3/bin/activate llama-moe
echo "#tokens/batch: $tokens_per_batch"

data_cache=resources/cache
output_dir=outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
output_dir=/mnt/petrelfs/share_data/quxiaoye/runs/residual_2_2_14_scale2_112gpus/
mkdir -p $output_dir
echo "output_dir: $output_dir"
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
@@ -127,7 +127,7 @@ source ~/anaconda3/bin/activate llama-moe
--max_steps ${max_steps} \
--max_train_samples ${max_train_samples} \
--save_strategy steps \
--save_total_limit 1 \
--save_total_limit 2 \
--save_steps 1000 \
--dataloader_num_workers 0 \
--dataloader_pin_memory True \
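For reference, num_tokens, block_size, and the global batch geometry determine the step budget that the script later echoes as "#tokens/batch". A minimal sketch of that arithmetic in Python; per_device_train_batch_size is an illustrative assumption not shown in this excerpt, and num_gpus is inferred from the "112gpus" run name:

```python
# Hypothetical sketch of the token-budget arithmetic; values marked "assumed"
# are illustrations, not taken from the script.
num_tokens = 2 * 10**11               # token budget, matching num_tokens="2*10^11"
block_size = 4096                     # sequence length per sample
per_device_train_batch_size = 4       # assumed; not shown in the excerpt
gradient_accumulation_steps = 4
num_gpus = 112                        # assumed from the "112gpus" output_dir name

tokens_per_batch = (
    block_size * per_device_train_batch_size * gradient_accumulation_steps * num_gpus
)
max_steps = num_tokens // tokens_per_batch
print(f"#tokens/batch: {tokens_per_batch}, max_steps: {max_steps}")
```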
13 changes: 7 additions & 6 deletions scripts/moefication/convert/run_convert.sh
@@ -2,15 +2,16 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
llama_size="llama_7B"
# open_llama_7b
llama_size="llama2_7B"

num_experts=16 # 8 16
num_selects=4 # 2 4
num_experts=16 # 4 8 16 32
num_selects=4 # 1 2 4 8
convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification
split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random
proj_type=up_proj # gate_proj up_proj

score_scale_factor=16.0 # 1.0 2.0 4.0 8.0 16.0
score_scale_factor=4.0 # 1.0 2.0 4.0 8.0 16.0
score_scale_factor_file_path=""
#score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_dense

@@ -23,7 +24,7 @@ split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_exper

if [ ${use_default_gate} = "True" ]; then
select_file_path=""
save_path=${data_path}/models/${convert_type}/${split_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
save_path=${data_path}/models/${convert_type}/${split_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-Scale${score_scale_factor}
else
select_file_path=${data_path}/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type}
save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}
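For context on score_scale_factor: it rescales each selected expert's output so that the converted MoE layer keeps a magnitude comparable to the dense MLP it was split from. A hypothetical sketch of how such a factor might enter the forward pass, written in Python; this illustrates the idea only and is not the repository's actual calculator code:

```python
import torch

def moe_mlp_output(x, experts, gate_scores, selected, score_scale_factor=4.0):
    """Hypothetical sketch.
    x: (hidden,) input vector; experts: list of callables mapping (hidden,) -> (hidden,);
    gate_scores: (num_experts,) gate weights; selected: indices of the top-k experts.
    Each selected expert's contribution is rescaled by score_scale_factor."""
    out = torch.zeros_like(x)
    for idx in selected:                                   # e.g. top-4 of 16 experts
        out = out + gate_scores[idx] * experts[idx](x) * score_scale_factor
    return out
```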
@@ -13,7 +13,7 @@ expert_size=1376
# 688 1376 2752 5504 11008
# 864 1728 3456 6912 13824

score_scale_factor_residual=4.0 # 4.0 8.0 12.0 16.0
score_scale_factor_residual=1.0 # 4.0 8.0 12.0 16.0
score_scale_factor=4.0 # 4.0 8.0 12.0 16.0

convert_type=LlamaMoEResidualForCausalLM # LlamaMoEResidualModel LlamaMoEResidualForCausalLM LlamaMoEResidualForSequenceClassification
68 changes: 0 additions & 68 deletions scripts/moefication/split/run_split_graph.py

This file was deleted.

12 changes: 7 additions & 5 deletions scripts/moefication/split/run_split_graph.sh
@@ -4,9 +4,9 @@
# llama2_7B llama2_13B llama2_30B llama2_base
llama_size=llama_13B

num_experts=8 # 8 16
metric=l2_norm # l1_norm l2_norm plain
template=layers.{}.mlp.up_proj.weight # gate_proj up_proj
num_experts=16 # 8 16
metric=l1_norm # l1_norm l2_norm plain
proj_type=up_proj # gate_proj up_proj
threshold=1

data_path=/mnt/petrelfs/share_data/quxiaoye
@@ -25,7 +25,7 @@ for specify_layer in {0..39}; do
--model_path ${model_path} \
--save_path ${save_path} \
--specify_layer ${specify_layer} \
--template ${template} \
--template layers.{}.mlp.${proj_type}.weight \
--num_experts ${num_experts} \
--threshold ${threshold} \
--metric ${metric} \
@@ -38,14 +38,15 @@ wait

gpmetis_run=/mnt/petrelfs/share_data/quxiaoye/metis_for_graph_split/bin/gpmetis
template1=layers.
template2=.mlp.up_proj.weight
template2=.mlp.${proj_type}.weight

for layer in {0..39}; do
OMP_NUM_THREADS=8 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \
${gpmetis_run} ${save_path}/${template1}${layer}${template2} ${num_experts} &
sleep 0.7
done
wait

# STEP3

template3=.part.${num_experts}
@@ -57,4 +58,5 @@ for layer in {0..39}; do
sleep 0.7
done
wait

chmod -R 755 ${save_path} >/dev/null 2>&1
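The gpmetis call in STEP2 writes, for each layer's neuron graph, a partition file named <graph>.part.<num_experts>, and STEP3 consumes it. Assuming the standard gpmetis output format (one expert index per neuron, one integer per line), a minimal Python sketch for loading such a file into a neuron-to-expert assignment is:

```python
import torch

def load_metis_partition(part_file: str) -> torch.Tensor:
    """Read a gpmetis partition file: line i holds the expert id assigned
    to intermediate neuron i. Returns a LongTensor of shape (num_neurons,)."""
    with open(part_file) as f:
        assignments = [int(line.strip()) for line in f if line.strip()]
    return torch.tensor(assignments, dtype=torch.long)

# e.g. (hypothetical path, following the template1/template2/template3 naming above)
# indices = load_metis_partition(f"{save_path}/layers.0.mlp.up_proj.weight.part.16")
```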
8 changes: 4 additions & 4 deletions scripts/moefication/split/run_split_random.sh
@@ -2,10 +2,10 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
llama_size="llama2_7B"
# open_llama_7b
llama_size="open_llama_7b"

num_experts=8 # 8 16
proj_type=gate_proj # gate_proj up_proj
num_experts=8 # 8 16

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
@@ -17,7 +17,7 @@ OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${
python -m smoe.entrypoint.moefication.llama_split_random \
--model_path ${model_path} \
--save_path ${save_path} \
--template layers.{}.mlp.${proj_type}.weight \
--template layers.{}.mlp.up_proj.weight \
--num_experts ${num_experts}

chmod -R 755 ${save_path} >/dev/null 2>&1
8 changes: 4 additions & 4 deletions scripts/moefication/split/run_split_random_one4all.sh
@@ -2,13 +2,13 @@

# llama_7B llama_13B llama_30B llama_base
# llama2_7B llama2_13B llama2_30B llama2_base
llama_size="llama_13B"
# open_llama_7b
llama_size="open_llama_7b"

data_path=/mnt/petrelfs/share_data/quxiaoye
model_path=${data_path}/models/${llama_size}
save_path=${data_path}/moefication_results/split

# all possible result combinations
gpus=0
cpus=8
for num_experts in 4 8 16 32; do
@@ -18,8 +18,8 @@ for num_experts in 4 8 16 32; do
--model_path ${model_path} \
--save_path ${save_path} \
--template layers.{}.mlp.${proj_type}.weight \
--num_experts ${num_experts} & # run the next command in parallel
sleep 0.7 # wait 0.5s
--num_experts ${num_experts} &
sleep 0.7
done
done

Expand Down
2 changes: 1 addition & 1 deletion smoe/entrypoint/moefication/llama_split_random.py
@@ -11,7 +11,7 @@
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str, default="/home/data/models/llama-transformers/7B")
parser.add_argument('--save_path', type=str, default="/home/dongdz/workspace/moefication/llama_moe_temp_files/")
parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight')
parser.add_argument('--template', type=str, default='layers.{}.mlp.up_proj.weight')
parser.add_argument('--num_experts', type=int, default=8, help='number of experts')

args = parser.parse_args()
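For context, a random split of this kind amounts to shuffling the intermediate (up_proj) neurons of each MLP and chunking them evenly across experts. A minimal sketch of that idea in Python, not the actual smoe implementation:

```python
import torch

def random_split(intermediate_size: int, num_experts: int, seed: int = 0) -> list[list[int]]:
    """Randomly assign intermediate neurons to num_experts equally sized experts
    (assumes intermediate_size is divisible by num_experts)."""
    g = torch.Generator().manual_seed(seed)
    perm = torch.randperm(intermediate_size, generator=g)
    return [chunk.tolist() for chunk in perm.chunk(num_experts)]

# e.g. the 11008 up_proj neurons of a LLaMA-7B MLP into 8 experts of 1376 neurons each
expert_neurons = random_split(11008, 8)
```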
6 changes: 2 additions & 4 deletions smoe/modules/moe/moe_gates.py
@@ -232,7 +232,6 @@ def __init__(

self.gate_network_type = gate_network
self.gate_network = get_gate_network(gate_network, input_size, num_experts)
# self.gate_network = get_gate_network("linear", input_size, num_experts)

self.use_softmax = use_softmax
self.softmax = nn.Softmax(1)
@@ -286,7 +285,6 @@ def forward(self, x):
logits_gate = self.gate_network(x) # weights computed by the gate
if self.training and self.add_noise:
noise_mm = self.weight_noise(x) # result of the noise matrix computation
# noise_mm = torch.mm(x, self.weight_noise) # result of the noise matrix computation
noise_control = self.softplus(noise_mm) + self.noise_epsilon # noise magnitude produced by the controller
logits_noise = torch.randn_like(logits_gate) * noise_control # weights contributed by the noise
logits = logits_gate + logits_noise # final weights
@@ -323,7 +321,7 @@ def forward(self, x):
load = prob.sum(0)
else:
load = (scores_filtered > 0).sum(0)
if not self.warned:
if not self.add_noise and not self.warned:
warnings.warn('Gradient-trackable implementation for load calculation is only available when "add_noise=True". '
'Training without noise will block the gradient from "load" path and lead to inconsistency in optimization objectives.')
self.warned = True
@@ -436,7 +434,7 @@ def __init__(
add_noise=True,
):
super(SwitchBalancedGate, self).__init__()
assert num_selects in [1, 2]
assert num_selects in (1, 2)
self.input_size = input_size
self.num_experts = num_experts
self.num_selects = num_selects
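For context on the gating math referenced throughout this file, a simplified Python sketch of noisy top-k gating (gate logits plus Softplus-controlled noise, then top-k selection and softmax over the selected experts). This is an illustration of the general technique, not the smoe gate class itself, and it omits the load-balancing terms:

```python
import torch
import torch.nn as nn

class NoisyTopKGateSketch(nn.Module):
    """Simplified sketch of noisy top-k gating; names are illustrative."""

    def __init__(self, input_size, num_experts, num_selects, noise_epsilon=1e-2):
        super().__init__()
        self.gate_network = nn.Linear(input_size, num_experts, bias=False)
        self.weight_noise = nn.Linear(input_size, num_experts, bias=False)
        self.softplus = nn.Softplus()
        self.num_selects = num_selects
        self.noise_epsilon = noise_epsilon

    def forward(self, x, add_noise=True):
        # x: (batch, input_size)
        logits_gate = self.gate_network(x)                       # weights computed by the gate
        if self.training and add_noise:
            noise_control = self.softplus(self.weight_noise(x)) + self.noise_epsilon
            logits = logits_gate + torch.randn_like(logits_gate) * noise_control
        else:
            logits = logits_gate
        top_logits, top_indices = logits.topk(self.num_selects, dim=1)
        scores = torch.softmax(top_logits, dim=1)                # normalized over selected experts
        return top_indices, scores
```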
2 changes: 1 addition & 1 deletion smoe/utils/io.py
@@ -25,7 +25,7 @@ def torch_load_template_file(path, template, layer):

def torch_load_template_score_file(path, template, layer):
score_list = []
for expert_folder_name in os.listdir(path):
for expert_folder_name in sorted(os.listdir(path)):
score_file = os.path.join(path, expert_folder_name, template.format(layer))
score = torch.load(score_file, map_location="cpu")
score_list.append(score)
15 changes: 9 additions & 6 deletions smoe/utils/moefication/convert_llama_moe.py
@@ -26,6 +26,7 @@ def convert_llama_model(
num_selects,
score_scale_factor=None,
use_default_gate=False,
gate_type="mlp", # "linear"
):
"""
LlamaMoEModel
@@ -63,7 +64,7 @@ config_llama_moe.num_experts = num_experts
config_llama_moe.num_experts = num_experts
config_llama_moe.num_selects = num_selects
config_llama_moe.size_experts = size_experts
config_llama_moe.gates = "mlp"
config_llama_moe.gates = gate_type
config_llama_moe.score_scale_factor = (
1.0 if score_scale_factor is None else score_scale_factor
)
@@ -91,7 +92,7 @@
model_llama_moe_state_dict["layers.{}.mlp.calculator.experts.weight_down.{}".format(layer_index, expert_index)] = model_llama_state_dict[key].transpose(0, 1)[moe_indices[layer_index] == expert_index].transpose(0, 1).cpu().half()

for layer_index in range(num_layers):
if not use_default_gate:
if not use_default_gate and gate_type == "mlp":
model_llama_moe_state_dict["layers.{}.mlp.gate.gate_network.0.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.0.weight"].cpu().half()
model_llama_moe_state_dict["layers.{}.mlp.gate.gate_network.2.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.2.weight"].cpu().half()
model_llama_moe_state_dict["layers.{}.mlp.gate.weight_noise.weight".format(layer_index)] = torch.zeros((num_experts, hidden_size), requires_grad=True)
@@ -123,6 +124,7 @@ def convert_llama_model_for_causal_lm(
num_selects,
score_scale_factor=None,
use_default_gate=False,
gate_type="mlp", # "linear"
):
"""
LlamaMoEForCausalLM
@@ -160,7 +162,7 @@ config_llama_moe.num_experts = num_experts
config_llama_moe.num_experts = num_experts
config_llama_moe.num_selects = num_selects
config_llama_moe.size_experts = size_experts
config_llama_moe.gates = "mlp"
config_llama_moe.gates = gate_type
config_llama_moe.score_scale_factor = (
1.0 if score_scale_factor is None else score_scale_factor
)
@@ -188,7 +190,7 @@
model_llama_moe_state_dict["model.layers.{}.mlp.calculator.experts.weight_down.{}".format(layer_index, expert_index)] = model_llama_state_dict[key].transpose(0, 1)[moe_indices[layer_index] == expert_index].transpose(0, 1).cpu().half()

for layer_index in range(num_layers):
if not use_default_gate:
if not use_default_gate and gate_type == "mlp":
model_llama_moe_state_dict["model.layers.{}.mlp.gate.gate_network.0.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.0.weight"].cpu().half()
model_llama_moe_state_dict["model.layers.{}.mlp.gate.gate_network.2.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.2.weight"].cpu().half()
model_llama_moe_state_dict["model.layers.{}.mlp.gate.weight_noise.weight".format(layer_index)] = torch.zeros((num_experts, hidden_size), requires_grad=True)
@@ -220,6 +222,7 @@ def convert_llama_model_for_sequence_classification(
num_selects,
score_scale_factor=None,
use_default_gate=False,
gate_type="mlp", # "linear"
):
"""
LlamaMoEForSequenceClassification
@@ -257,7 +260,7 @@ config_llama_moe.num_experts = num_experts
config_llama_moe.num_experts = num_experts
config_llama_moe.num_selects = num_selects
config_llama_moe.size_experts = size_experts
config_llama_moe.gates = "mlp"
config_llama_moe.gates = gate_type
config_llama_moe.score_scale_factor = (
1.0 if score_scale_factor is None else score_scale_factor
)
@@ -285,7 +288,7 @@
model_llama_moe_state_dict["model.layers.{}.mlp.calculator.experts.weight_down.{}".format(layer_index, expert_index)] = model_llama_state_dict[key].transpose(0, 1)[moe_indices[layer_index] == expert_index].transpose(0, 1).cpu().half()

for layer_index in range(num_layers):
if not use_default_gate:
if not use_default_gate and gate_type == "mlp":
model_llama_moe_state_dict["model.layers.{}.mlp.gate.gate_network.0.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.0.weight"].cpu().half()
model_llama_moe_state_dict["model.layers.{}.mlp.gate.gate_network.2.weight".format(layer_index)] = moe_gates[layer_index]["gate_network.2.weight"].cpu().half()
model_llama_moe_state_dict["model.layers.{}.mlp.gate.weight_noise.weight".format(layer_index)] = torch.zeros((num_experts, hidden_size), requires_grad=True)
