@@ -213,6 +213,18 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
         MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
     },
+    MODEL_ARCH.GPTJ: {
+        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+        MODEL_TENSOR.OUTPUT:      "output",
+        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_Q:      "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_K:      "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_V:      "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
+    },
     MODEL_ARCH.GPT2: {
         # TODO
     },
@@ -237,7 +249,7 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in",           # gptneox
-            "transformer.wte",             # gpt2 mpt
+            "transformer.wte",             # gpt2 gpt-j mpt
             "transformer.word_embeddings", # falcon
             "model.embed_tokens",          # llama-hf
             "tok_embeddings",              # llama-pth
@@ -258,14 +270,14 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out", # gptneox
-            "lm_head",   # gpt2 mpt falcon llama-hf baichuan
+            "lm_head",   # gpt2 gpt-j mpt falcon llama-hf baichuan
             "output",    # llama-pth
         ),

         # Output norm
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm", # gptneox
-            "transformer.ln_f",          # gpt2 falcon
+            "transformer.ln_f",          # gpt2 gpt-j falcon
             "model.norm",                # llama-hf baichuan
             "norm",                      # llama-pth
             "embeddings.LayerNorm",      # bert
@@ -282,7 +294,7 @@ class TensorNameMap:
         # Attention norm
         MODEL_TENSOR.ATTN_NORM: (
             "gpt_neox.layers.{bid}.input_layernorm", # gptneox
-            "transformer.h.{bid}.ln_1",              # gpt2
+            "transformer.h.{bid}.ln_1",              # gpt2 gpt-j
             "transformer.blocks.{bid}.norm_1",       # mpt
             "transformer.h.{bid}.input_layernorm",   # falcon7b
             "transformer.h.{bid}.ln_mlp",            # falcon40b
@@ -309,20 +321,23 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_proj",      # llama-hf
             "layers.{bid}.attention.wq",                # llama-pth
             "encoder.layer.{bid}.attention.self.query", # bert
+            "transformer.h.{bid}.attn.q_proj",          # gpt-j
         ),

         # Attention key
         MODEL_TENSOR.ATTN_K: (
             "model.layers.{bid}.self_attn.k_proj",    # llama-hf
             "layers.{bid}.attention.wk",              # llama-pth
             "encoder.layer.{bid}.attention.self.key", # bert
+            "transformer.h.{bid}.attn.k_proj",        # gpt-j
         ),

         # Attention value
         MODEL_TENSOR.ATTN_V: (
             "model.layers.{bid}.self_attn.v_proj",      # llama-hf
             "layers.{bid}.attention.wv",                # llama-pth
             "encoder.layer.{bid}.attention.self.value", # bert
+            "transformer.h.{bid}.attn.v_proj",          # gpt-j
         ),

         # Attention output
@@ -334,6 +349,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.o_proj",        # llama-hf
             "layers.{bid}.attention.wo",                  # llama-pth
             "encoder.layer.{bid}.attention.output.dense", # bert
+            "transformer.h.{bid}.attn.out_proj",          # gpt-j
         ),

         # Rotary embeddings
@@ -361,6 +377,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj",         # llama-hf
             "layers.{bid}.feed_forward.w3",           # llama-pth
             "encoder.layer.{bid}.intermediate.dense", # bert
+            "transformer.h.{bid}.mlp.fc_in",          # gpt-j
         ),

         # Feed-forward gate
@@ -378,6 +395,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.down_proj", # llama-hf
             "layers.{bid}.feed_forward.w2",     # llama-pth
             "encoder.layer.{bid}.output.dense", # bert
+            "transformer.h.{bid}.mlp.fc_out",   # gpt-j
         ),
     }

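For reference, the effect of the new gpt-j entries is that a Hugging Face GPT-J checkpoint tensor such as transformer.h.7.attn.q_proj resolves to the GGUF name blk.7.attn_q. The snippet below is a minimal standalone sketch of that lookup: the GPTJ_TENSOR_MAP dict and gguf_name helper are illustrative stand-ins rather than the actual TensorNameMap API, and the 28-block default is an assumption matching GPT-J-6B.

# Illustrative sketch only, not the real TensorNameMap API. The dict mirrors
# the gpt-j template pairs added in this diff (HF name -> GGUF name).
from __future__ import annotations

GPTJ_TENSOR_MAP = {
    "transformer.wte":                   "token_embd",             # TOKEN_EMBD
    "transformer.ln_f":                  "output_norm",            # OUTPUT_NORM
    "lm_head":                           "output",                 # OUTPUT
    "transformer.h.{bid}.ln_1":          "blk.{bid}.attn_norm",    # ATTN_NORM
    "transformer.h.{bid}.attn.q_proj":   "blk.{bid}.attn_q",       # ATTN_Q
    "transformer.h.{bid}.attn.k_proj":   "blk.{bid}.attn_k",       # ATTN_K
    "transformer.h.{bid}.attn.v_proj":   "blk.{bid}.attn_v",       # ATTN_V
    "transformer.h.{bid}.attn.out_proj": "blk.{bid}.attn_output",  # ATTN_OUT
    "transformer.h.{bid}.mlp.fc_in":     "blk.{bid}.ffn_up",       # FFN_UP
    "transformer.h.{bid}.mlp.fc_out":    "blk.{bid}.ffn_down",     # FFN_DOWN
}

def gguf_name(hf_name: str, n_blocks: int = 28) -> str | None:
    """Resolve a GPT-J checkpoint tensor name to its GGUF name (None if unmapped).

    n_blocks=28 is an assumption matching GPT-J-6B; in practice the mapping
    would be built from the model's actual block count.
    """
    for hf_tmpl, gguf_tmpl in GPTJ_TENSOR_MAP.items():
        if "{bid}" not in hf_tmpl:
            if hf_name == hf_tmpl:
                return gguf_tmpl
            continue
        # Per-block tensors: expand the {bid} placeholder for each block index.
        for bid in range(n_blocks):
            if hf_name == hf_tmpl.format(bid=bid):
                return gguf_tmpl.format(bid=bid)
    return None

print(gguf_name("transformer.h.7.attn.q_proj"))  # blk.7.attn_q
print(gguf_name("transformer.wte"))              # token_embd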