
Commit 1350b2c

tiandeyu-cs, jahatef, and Quentin-Anthony committed
fix 'intermediate_size' in Llama configuration files after the 'mlp_type' option was removed (#1309)
* fix 'intermediate_size' in Llama configuration files after the 'mlp_type' option was removed
* config adjustments for llama and gated activations
* pre-commit

---------

Co-authored-by: jahatef <hatef.4@buckeyemail.osu.edu>
Co-authored-by: Quentin Anthony <qganthony@yahoo.com>
1 parent ff7f328 commit 1350b2c
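A note on where the new values come from: with the 'mlp_type' option gone, gated activations such as SwiGLU appear to derive the per-projection FFN width from `intermediate_size` by scaling it by 2/3, splitting it across the gate and up projections, and rounding up to a multiple of 256. Under that reading, the Llama-1 configs in this commit set `intermediate_size` to 8 × `hidden_size` so the derived width matches the published checkpoints. The helper below is an illustrative sketch of that arithmetic only; the function name and the exact rounding rule are assumptions, not code from this commit.

```python
def gated_ffn_width(intermediate_size: int, multiple_of: int = 256) -> int:
    """Per-projection FFN width for a gated MLP (illustrative sketch only).

    Assumes the configured intermediate_size is scaled by 2/3, split between
    the gate and up projections, then rounded up to a multiple of 256.
    """
    per_projection = int(intermediate_size * 2 / 3) // 2
    return multiple_of * ((per_projection + multiple_of - 1) // multiple_of)


# Llama-1 configs in this commit use intermediate_size = 8 * hidden_size:
assert gated_ffn_width(8 * 4096) == 11008  # 7B  -> published FFN width
assert gated_ffn_width(8 * 5120) == 13824  # 13B -> published FFN width
assert gated_ffn_width(8 * 6656) == 17920  # 30B -> published FFN width
assert gated_ffn_width(8 * 8192) == 22016  # 65B -> published FFN width
```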

File tree

9 files changed: 14 additions & 5 deletions

configs/llama/13B.yml

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@
 # model settings
 "num_layers": 40,
 "hidden_size": 5120,
+"intermediate_size": 40960,
 "num_attention_heads": 40,
 "seq_length": 2048,
 "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
 "output_layer_parallelism": "column",
 "norm": "rmsnorm",
 "rms_norm_epsilon": 1.0e-6,
+"use_bias_in_mlp": False,

 "scaled_upper_triang_masked_softmax_fusion": true,
 "bias_gelu_fusion": false,

configs/llama/30B.yml

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@
 # model settings
 "num_layers": 60,
 "hidden_size": 6656,
+"intermediate_size": 53248,
 "num_attention_heads": 52,
 "seq_length": 2048,
 "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
 "output_layer_parallelism": "column",
 "norm": "rmsnorm",
 "rms_norm_epsilon": 1.0e-6,
+"use_bias_in_mlp": False,

 "scaled_upper_triang_masked_softmax_fusion": true,
 "bias_gelu_fusion": false,

configs/llama/65B.yml

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@
 # model settings
 "num_layers": 80,
 "hidden_size": 8192,
+"intermediate_size": 65536,
 "num_attention_heads": 64,
 "seq_length": 2048,
 "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
 "output_layer_parallelism": "column",
 "norm": "rmsnorm",
 "rms_norm_epsilon": 1.0e-6,
+"use_bias_in_mlp": False,

 "scaled_upper_triang_masked_softmax_fusion": true,
 "bias_gelu_fusion": false,

configs/llama/7B.yml

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@
 # model settings
 "num_layers": 32,
 "hidden_size": 4096,
+"intermediate_size": 32768,
 "num_attention_heads": 32,
 "seq_length": 2048,
 "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
 "output_layer_parallelism": "column",
 "norm": "rmsnorm",
 "rms_norm_epsilon": 1.0e-6,
+"use_bias_in_mlp": False,

 "scaled_upper_triang_masked_softmax_fusion": true,
 "bias_gelu_fusion": false,

configs/llama/train_config.yml

Lines changed: 1 addition & 1 deletion
@@ -70,5 +70,5 @@
 "steps_per_print": 10,
 "keep_last_n_checkpoints": 4,
 "wall_clock_breakdown": true,
-"mlp_multiple_of": 256,
+
 }

configs/llama2/13B.yml

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 # model settings
 "num_layers": 40,
 "hidden_size": 5120,
+"intermediate_size": 41472,
 "num_attention_heads": 40,
 "seq_length": 4096,
 "max_position_embeddings": 4096,

configs/llama2/70B.yml

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 # model settings
 "num_layers": 80,
 "hidden_size": 8192,
-"intermediate_size": 28672,
+"intermediate_size": 86016,
 "num_attention_heads": 64,
 "num_kv_heads": 8,
 "seq_length": 4096,

configs/llama2/7B.yml

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 # model settings
 "num_layers": 32,
 "hidden_size": 4096,
+"intermediate_size": 32768,
 "num_attention_heads": 32,
 "seq_length": 4096,
 "max_position_embeddings": 4096,

megatron/model/transformer.py

Lines changed: 2 additions & 3 deletions
@@ -1245,9 +1245,8 @@ def forward(self, x, attention_mask, layer_past=None):

             with torch.enable_grad() if not self.eval else nullcontext():
                 if (
-                    self.activation == "swiglu"
-                    or self.num_experts > 1
-                    and self.moe_type == "deepspeed"
+                    mlp_bias == None,
+                    self.num_experts > 1 and self.moe_type == "deepspeed",
                 ):
                     # No dropout either
                     assert mlp_bias is None
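For readers tracing the control flow: after the 'mlp_type' removal, this branch keys off whether the MLP handed back a bias term rather than off the activation name, and a gated MLP built without bias terms returns `None` in that slot. Below is a minimal, illustrative SwiGLU-style module showing why `mlp_bias is None` holds on this path; the class and attribute names are assumptions made for the sketch, not the repository's actual implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class GatedMLPSketch(nn.Module):
    """Illustrative SwiGLU-style MLP: bias-free projections, so the
    forward pass returns (output, None) for the bias slot."""

    def __init__(self, hidden_size: int, ffn_dim: int):
        super().__init__()
        # gate and up projections fused into one matmul, no bias
        self.w_in = nn.Linear(hidden_size, 2 * ffn_dim, bias=False)
        self.w_out = nn.Linear(ffn_dim, hidden_size, bias=False)

    def forward(self, x):
        gate, up = self.w_in(x).chunk(2, dim=-1)
        out = self.w_out(F.silu(gate) * up)
        return out, None  # no bias to hand back to the residual path
```

A quick smoke test: `GatedMLPSketch(4096, 11008)(torch.randn(1, 4096))` returns a `(tensor, None)` pair, so the no-bias branch above is taken.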
