Commit ab54d2b

llama: define architecture for granite models
It currently works only for the small models, 3B and 8B. There are enough differences from the base llama arch that it is worth defining a new architecture. To create the .gguf files, it is necessary to specify GraniteForCausalLM in the architectures list of the HF model's config.json.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
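The note about GraniteForCausalLM deserves a concrete illustration. Below is a hedged sketch of the conversion workflow, assuming a llama.cpp checkout containing this commit; the checkpoint directory and output filename are placeholders:

```python
import json
import pathlib
import subprocess

model_dir = pathlib.Path("granite-3b-code-base")  # placeholder HF checkpoint dir

# Make sure config.json advertises the architecture the converter dispatches on.
cfg_path = model_dir / "config.json"
cfg = json.loads(cfg_path.read_text())
cfg["architectures"] = ["GraniteForCausalLM"]
cfg_path.write_text(json.dumps(cfg, indent=2))

# Run the HF-to-GGUF converter from the llama.cpp checkout.
subprocess.run(
    ["python", "convert-hf-to-gguf.py", str(model_dir), "--outfile", "granite-3b.gguf"],
    check=True,
)
```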
1 parent 829141c commit ab54d2b

3 files changed: +100, -0 lines changed

convert-hf-to-gguf.py (28 additions & 0 deletions)

```diff
@@ -2427,6 +2427,34 @@ def set_vocab(self, *args, **kwargs):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
+
+@Model.register("GraniteForCausalLM")
+class GraniteModel(Model):
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def set_vocab(self):
+        tokens, toktypes, _ = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre("starcoder")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_name("Granite")
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        return [(self.map_tensor_name(name), data_torch)]
+
 
 ###### CONVERSION LOGIC ######
```
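For readers unfamiliar with the converter's plumbing, the sketch below re-creates the registration pattern that @Model.register relies on. It is an illustration of the mechanism with made-up names, not llama.cpp's actual implementation:

```python
# Minimal registry-decorator pattern: HF "architectures" string -> converter class.
registry: dict[str, type] = {}

class Model:
    @classmethod
    def register(cls, *names: str):
        def wrapper(model_cls: type) -> type:
            for name in names:
                registry[name] = model_cls  # remember which class handles each arch
            return model_cls
        return wrapper

@Model.register("GraniteForCausalLM")
class GraniteModel(Model):
    pass

# The converter reads config.json's "architectures" list and looks up the class:
assert registry["GraniteForCausalLM"] is GraniteModel
```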

gguf-py/gguf/constants.py (26 additions & 0 deletions)

```diff
@@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R = auto()
     DBRX      = auto()
     OLMO      = auto()
+    GRANITE   = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -218,6 +219,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX:      "dbrx",
     MODEL_ARCH.OLMO:      "olmo",
+    MODEL_ARCH.GRANITE:   "granite",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -732,6 +734,26 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GRANITE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
@@ -765,6 +787,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.GRANITE: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }
 
 #
```
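The new entries can be exercised directly from Python. A quick sketch, assuming gguf-py is importable and using the dictionary names as they appear in constants.py:

```python
from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSORS

arch = MODEL_ARCH.GRANITE
print(MODEL_ARCH_NAMES[arch])    # "granite" -- the name written into the .gguf
for tensor in MODEL_TENSORS[arch]:
    print(tensor.name)           # tensor kinds the converter may emit for Granite
```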

llama.cpp (46 additions & 0 deletions)

```diff
@@ -225,6 +225,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_GRANITE,
     LLM_ARCH_UNKNOWN,
 };
@@ -261,6 +262,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R,   "command-r" },
     { LLM_ARCH_DBRX,        "dbrx"      },
     { LLM_ARCH_OLMO,        "olmo"      },
+    { LLM_ARCH_GRANITE,     "granite"   },
     { LLM_ARCH_UNKNOWN,     "(unknown)" },
 };
@@ -1036,6 +1038,32 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -4288,6 +4316,18 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GRANITE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    case 36: model.type = e_model::MODEL_8B; break;
+                    case 52: model.type = e_model::MODEL_20B; break;
+                    case 88: model.type = e_model::MODEL_34B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
```
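Note that the size label is inferred from the layer count alone. A hedged Python mirror of the switch above (the helper name is made up), useful for predicting which e_model label a Granite checkpoint will get:

```python
# Layer counts copied from the hunk above; anything else maps to UNKNOWN.
GRANITE_N_LAYER_TO_TYPE = {32: "3B", 36: "8B", 52: "20B", 88: "34B"}

def granite_model_type(n_layer: int) -> str:
    return GRANITE_N_LAYER_TO_TYPE.get(n_layer, "UNKNOWN")

assert granite_model_type(36) == "8B"  # the 8B model has 36 layers
```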

llama.cpp (continued):

```diff
@@ -4397,6 +4437,9 @@ static void llm_load_vocab(
         } else {
             if (tokenizer_model == "gpt2") {
                 vocab.type = LLAMA_VOCAB_TYPE_BPE;
+                if (model.arch == LLM_ARCH_LLAMA) {
+                    vocab.add_space_prefix = false;
+                }
             } else {
                 LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
                 LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -4967,6 +5010,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10668,6 +10712,7 @@ static struct ggml_cgraph * llama_build_graph(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 result = llm.build_llama();
             } break;
@@ -15811,6 +15856,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GRANITE:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
```
