trying to debug aggregate step of moe mlp #1552

Merged
Changes from 1 commit
53 commits
17f162f  with . (hugolatendresse, Dec 2, 2024)
f5c3f7f  all _ instead of . (hugolatendresse, Dec 3, 2024)
22e2a58  _block (hugolatendresse, Dec 3, 2024)
1cd424a  try nullptr (hugolatendresse, Dec 3, 2024)
df0c59b  no comma (hugolatendresse, Dec 3, 2024)
3598ee9  debug (hugolatendresse, Dec 3, 2024)
06259f7  debug (hugolatendresse, Dec 3, 2024)
c89a327  revert aggregate (hugolatendresse, Dec 6, 2024)
71c5d69  one expert (hugolatendresse, Dec 7, 2024)
cb1eaa8  sync (hugolatendresse, Dec 7, 2024)
286a1fe  dont redefien mlpout (hugolatendresse, Dec 7, 2024)
8709035  sync (hugolatendresse, Dec 7, 2024)
51f9701  sync (hugolatendresse, Dec 7, 2024)
a65e733  register tokenizer for mixtral (hugolatendresse, Dec 7, 2024)
62bf012  sync (hugolatendresse, Dec 7, 2024)
61adc0f  rename weights (hugolatendresse, Dec 7, 2024)
8c69b8b  sync (hugolatendresse, Dec 7, 2024)
0b89169  sync (hugolatendresse, Dec 7, 2024)
76cac36  permission (Dec 7, 2024)
baa30a8  Merge branch 'nomlp2' of https://github.com/hugolatendresse/FlexFlow … (Dec 7, 2024)
53a4cc4  sync (hugolatendresse, Dec 7, 2024)
bd1ffa0  Merge branch 'nomlp2' of github.com:hugolatendresse/FlexFlow into nomlp2 (hugolatendresse, Dec 7, 2024)
aeb29e9  sync (hugolatendresse, Dec 7, 2024)
d27804f  sync (hugolatendresse, Dec 7, 2024)
ecb9675  sync (hugolatendresse, Dec 7, 2024)
af665bd  which loading (hugolatendresse, Dec 7, 2024)
16ab912  sync (hugolatendresse, Dec 7, 2024)
c3945e3  sync (hugolatendresse, Dec 7, 2024)
9385e82  .o (hugolatendresse, Dec 7, 2024)
ce91966  sync (hugolatendresse, Dec 7, 2024)
7e558bc  able to output with mixtral (!!!) but it's all etc etc etc (hugolatendresse, Dec 7, 2024)
c8007fd  try expert 1 (hugolatendresse, Dec 7, 2024)
aa01156  revert experts (hugolatendresse, Dec 7, 2024)
539b491  tmp fix (hugolatendresse, Dec 7, 2024)
28b2df0  dummy gate (hugolatendresse, Dec 7, 2024)
8ad9478  bad softmax fix (hugolatendresse, Dec 7, 2024)
9dcb5c2  printf (hugolatendresse, Dec 8, 2024)
7ed7d65  dims (hugolatendresse, Dec 8, 2024)
a906c6a  sync (hugolatendresse, Dec 8, 2024)
0af8064  sync (hugolatendresse, Dec 8, 2024)
4d26fb5  comments on dims (hugolatendresse, Dec 8, 2024)
c0e4524  sync (hugolatendresse, Dec 8, 2024)
7462fb4  sync (hugolatendresse, Dec 8, 2024)
21ecf77  sync (hugolatendresse, Dec 8, 2024)
742ec59  sync (hugolatendresse, Dec 8, 2024)
b04af7a  sync (hugolatendresse, Dec 8, 2024)
99954e5  sync (hugolatendresse, Dec 8, 2024)
511fc25  sync (hugolatendresse, Dec 8, 2024)
e590ce5  sync (hugolatendresse, Dec 8, 2024)
1ed4bff  sync (hugolatendresse, Dec 8, 2024)
5700378  CHECKPOINT (hugolatendresse, Dec 8, 2024)
381d3cd  2222:22 port (hugolatendresse, Dec 8, 2024)
6309e70  tmp_volume (hugolatendresse, Dec 8, 2024)
revert experts
hugolatendresse committed Dec 7, 2024
commit aa01156669f1e8d65cb81e56a9510e3c7c38bded
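
This commit ("revert experts") drops the temporary single-expert shortcut (the dense layers named block_sparse_moe_experts_1_w1/w3/w2) and restores the full MoE block: a gate projection followed by softmax, top_k to pick num_experts_per_tok experts per token, group_by to bucket tokens per expert, a w1/w3 -> sigmoid_silu_multi -> w2 MLP per expert, and a final aggregate that recombines the expert outputs. The sketch below is a minimal, self-contained illustration of that routing and aggregation math in plain C++; it does not use the FlexFlow API, and all sizes, values, and the toy expert function are assumptions made for this example, not values taken from mixtral.cc.

// Minimal sketch (plain C++, no FlexFlow APIs) of the routing/aggregation math
// this commit wires up: softmax over gate logits, top-k expert selection,
// renormalized weights, and a weighted sum of the selected experts' outputs.
// All sizes and the toy "expert" below are illustrative assumptions.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  int const num_experts = 4; // stand-in for num_local_experts
  int const top_k = 2;       // stand-in for num_experts_per_tok
  int const hidden = 3;      // toy hidden size

  std::vector<float> token = {0.5f, -1.0f, 2.0f};
  std::vector<float> gate_logits = {1.0f, 0.2f, -0.5f, 0.7f};

  // softmax over the gate logits (block_sparse_moe_softmax)
  float max_logit = *std::max_element(gate_logits.begin(), gate_logits.end());
  std::vector<float> probs(num_experts);
  float sum = 0.0f;
  for (int e = 0; e < num_experts; e++) {
    probs[e] = std::exp(gate_logits[e] - max_logit);
    sum += probs[e];
  }
  for (float &p : probs) {
    p /= sum;
  }

  // top-k selection (block_sparse_moe_topk)
  std::vector<int> order(num_experts);
  std::iota(order.begin(), order.end(), 0);
  std::partial_sort(order.begin(), order.begin() + top_k, order.end(),
                    [&](int a, int b) { return probs[a] > probs[b]; });

  // sum of the selected gate values, used below to renormalize them
  // (the reduce_sum + divide on topk_values in the diff)
  float topk_sum = 0.0f;
  for (int k = 0; k < top_k; k++) {
    topk_sum += probs[order[k]];
  }

  // toy per-expert "MLP": a per-expert scaling of the token, standing in
  // for the w1/w3 -> sigmoid_silu_multi -> w2 chain of the real model
  auto expert_out = [&](int e) {
    std::vector<float> out(hidden);
    for (int d = 0; d < hidden; d++) {
      out[d] = (e + 1) * 0.1f * token[d];
    }
    return out;
  };

  // weighted aggregation of the selected experts
  // (block_sparse_moe_experts_aggregate)
  std::vector<float> mlp_out(hidden, 0.0f);
  for (int k = 0; k < top_k; k++) {
    int e = order[k];
    float w = probs[e] / topk_sum;
    std::vector<float> out = expert_out(e);
    for (int d = 0; d < hidden; d++) {
      mlp_out[d] += w * out[d];
    }
  }

  for (int d = 0; d < hidden; d++) {
    printf("mlp_out[%d] = %f\n", d, mlp_out[d]);
  }
  return 0;
}

Compiling and running this prints the weighted combination for a single toy token; in the actual model the same arithmetic is expressed with the ff.softmax, ff.top_k, ff.group_by, and ff.aggregate operators shown in the diff below.
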
147 changes: 110 additions & 37 deletions inference/models/mixtral.cc
@@ -167,47 +167,120 @@ void MIXTRAL::create_mixtral_model(FFModel &ff,
Tensor ff_norm = token_ff_norm[1];

// MoE
-    Tensor w1 = ff.dense(
-        ff_norm,
-        mixtral_config.intermediate_size,
-        AC_MODE_NONE,
-        false,
-        DT_NONE,
-        nullptr,
-        nullptr,
-        nullptr,
-        REG_MODE_NONE,
-        0.0f,
-        std::string("layers." + std::to_string(i) + ".block_sparse_moe_experts_1_w1").c_str());
+    Tensor gate = ff.dense(
+        ff_norm,
+        mixtral_config.num_local_experts,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
+        std::string("layers." + std::to_string(i) + ".block_sparse_moe_gate")
+            .c_str());
+    gate = ff.softmax(
+        gate,
+        0,
+        DT_NONE,
+        std::string("layers." + std::to_string(i) + ".block_sparse_moe_softmax")
+            .c_str());

-    Tensor w3 = ff.dense(
-        ff_norm,
-        mixtral_config.intermediate_size,
-        AC_MODE_NONE,
-        false,
-        DT_NONE,
-        nullptr,
-        nullptr,
-        nullptr,
-        REG_MODE_NONE,
-        0.0f,
-        std::string("layers." + std::to_string(i) + ".block_sparse_moe_experts_1_w3").c_str());
+    Tensor topk_out[2] = {nullptr, nullptr};
+    ff.top_k(
+        gate,
+        topk_out,
+        mixtral_config.num_experts_per_tok,
+        false,
+        std::string("layers." + std::to_string(i) + ".block_sparse_moe_topk")
+            .c_str());
+    Tensor topk_values = topk_out[0];
+    Tensor topk_indices = topk_out[1];

-    Tensor multi = ff.sigmoid_silu_multi(w1, w3); //DT_NONE,std::string("layers." + std::to_string(i) +".block_sparse_moe_experts." +std::to_string(expert_idx) + "ssm").c_str());
+    Tensor grouped_tokens[mixtral_config.num_local_experts] = {nullptr};
+    ff.group_by(
+        ff_norm,
+        topk_indices,
+        grouped_tokens,
+        mixtral_config.num_local_experts,
+        0.0f,
+        std::string("layers." + std::to_string(i) + ".block_sparse_moe_groupby")
+            .c_str());

-    mlp_out = ff.dense(
-        multi,
-        mixtral_config.hidden_size,
-        AC_MODE_NONE,
-        false,
-        DT_NONE,
-        nullptr,
-        nullptr,
-        nullptr,
-        REG_MODE_NONE,
-        0.0f,
-        std::string("layers." + std::to_string(i) + ".block_sparse_moe_experts_1_w2").c_str());
+    Tensor aggregate_inputs[4 + mixtral_config.num_local_experts] = {nullptr};
+    for (int expert_idx = 0; expert_idx < mixtral_config.num_local_experts;
+         expert_idx++) {
+      Tensor w1 = ff.dense(grouped_tokens[expert_idx],
+                           mixtral_config.intermediate_size,
+                           AC_MODE_NONE,
+                           false,
+                           DT_NONE,
+                           nullptr,
+                           nullptr,
+                           nullptr,
+                           REG_MODE_NONE,
+                           0.0f,
+                           std::string("layers." + std::to_string(i) +
+                                       ".block_sparse_moe_experts_" +
+                                       std::to_string(expert_idx) + "_w1")
+                               .c_str());
+
+      Tensor w3 = ff.dense(grouped_tokens[expert_idx],
+                           mixtral_config.intermediate_size,
+                           AC_MODE_NONE,
+                           false,
+                           DT_NONE,
+                           nullptr,
+                           nullptr,
+                           nullptr,
+                           REG_MODE_NONE,
+                           0.0f,
+                           std::string("layers." + std::to_string(i) +
+                                       ".block_sparse_moe_experts_" +
+                                       std::to_string(expert_idx) + "_w3")
+                               .c_str());
+
+      Tensor multi =
+          ff.sigmoid_silu_multi(w1,
+                                w3,
+                                DT_NONE,
+                                std::string("layers." + std::to_string(i) +
+                                            ".block_sparse_moe_experts_" +
+                                            std::to_string(expert_idx) + "ssm")
+                                    .c_str());
+
+      Tensor w2 = ff.dense(multi,
+                           mixtral_config.hidden_size,
+                           AC_MODE_NONE,
+                           false,
+                           DT_NONE,
+                           nullptr,
+                           nullptr,
+                           nullptr,
+                           REG_MODE_NONE,
+                           0.0f,
+                           std::string("layers." + std::to_string(i) +
+                                       ".block_sparse_moe_experts_" +
+                                       std::to_string(expert_idx) + "_w2")
+                               .c_str());
+      aggregate_inputs[4 + expert_idx] = w2;
+    }
+
+    Tensor topk_values_reduced = ff.reduce_sum(topk_values, {0}, true);
+    topk_values = ff.divide(topk_values, topk_values_reduced);
+
+    aggregate_inputs[0] = topk_values;
+    aggregate_inputs[1] = topk_indices;
+    aggregate_inputs[2] = aggregate_inputs[3] = nullptr;
+    mlp_out = ff.aggregate(aggregate_inputs,
+                           mixtral_config.num_local_experts,
+                           0.0f,
+                           std::string("layers." + std::to_string(i) +
+                                       ".block_sparse_moe_experts_aggregate")
+                               .c_str());
}

// final normalization and linear
Tensor final_rms_norm_output[2] = {nullptr, nullptr};
ff.residual_rms_norm(token,
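
One detail of the aggregate call above that is easy to miss: aggregate_inputs[0] carries the top-k gate values after the reduce_sum/divide renormalization (so that each token's selected weights sum to 1), aggregate_inputs[1] carries the expert assignments from top_k, slots 2 and 3 are left as nullptr, and slots 4 onward hold each expert's output computed on its grouped tokens. The sketch below, again in plain C++ with made-up token values and a trivial stand-in expert, shows the group_by / scatter-back pattern that this arrangement expresses; it is illustrative only and not the FlexFlow kernel.

// Minimal sketch (plain C++, illustrative only) of the group_by / aggregate
// pattern used in the diff: tokens are bucketed by their assigned expert,
// each expert processes its bucket, and results are scattered back to the
// original token order weighted by the (renormalized) gate values.
// Token count, expert count, and the toy "expert" are assumptions.
#include <cstdio>
#include <vector>

int main() {
  int const num_tokens = 5;
  int const num_experts = 3;

  // one routing choice per token; in the real model there are
  // num_experts_per_tok choices (and weights) per token
  std::vector<int> topk_indices = {0, 2, 1, 0, 2};
  std::vector<float> topk_values = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  std::vector<float> tokens = {10.f, 20.f, 30.f, 40.f, 50.f};

  // group_by: bucket token positions by their assigned expert
  std::vector<std::vector<int>> groups(num_experts);
  for (int t = 0; t < num_tokens; t++) {
    groups[topk_indices[t]].push_back(t);
  }

  // each expert processes its own bucket (toy expert: add the expert id),
  // and aggregate scatters the results back to the original token positions
  std::vector<float> mlp_out(num_tokens, 0.0f);
  for (int e = 0; e < num_experts; e++) {
    for (int t : groups[e]) {
      float expert_result = tokens[t] + e; // stand-in for the expert MLP
      mlp_out[t] += topk_values[t] * expert_result;
    }
  }

  for (int t = 0; t < num_tokens; t++) {
    printf("token %d -> %f\n", t, mlp_out[t]);
  }
  return 0;
}
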