
Commit 431fde0 (parent 0211330)

llama: add support for small granite models

It works only for the small models, 3b and 8b. The convert-hf-to-gguf.py script uses the vocabulary size of the granite models to detect granite and to set the correct configuration.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
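As a quick illustration of the heuristic the message describes, here is a minimal standalone sketch (the helper name is hypothetical; in the diff below the check is inlined in LlamaModel.set_gguf_parameters and modify_tensors):

# Sketch of the vocab-size heuristic; is_small_granite is a hypothetical
# helper, not part of the commit.
GRANITE_VOCAB_SIZE = 49152  # vocab length of the small granite models

def is_small_granite(hparams: dict) -> bool:
    # Llama-family checkpoints default to a 32000-token vocab, so an
    # exact match on 49152 singles out the granite 3b/8b checkpoints.
    return hparams.get("vocab_size", 32000) == GRANITE_VOCAB_SIZE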

File tree: 2 files changed, +29 -5 lines

convert-hf-to-gguf.py

Lines changed: 19 additions & 4 deletions
@@ -1315,6 +1315,19 @@ def set_gguf_parameters(self):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+            self.gguf_writer.add_rope_type(gguf.RopeType.NEOX)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
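To see what the new tokenizer_config.json probe does, a small self-contained example (the JSON excerpt is hypothetical, merely illustrative of the fields an HF checkpoint ships):

import json

# Hypothetical excerpt of a granite tokenizer_config.json.
sample = json.loads('{"tokenizer_class": "GPT2Tokenizer", "add_prefix_space": false}')

# Mirrors the added block above: the GGUF key is only written when the
# flag is present, so configs without it keep the writer's default.
if "add_prefix_space" in sample:
    print("add_space_prefix ->", sample["add_prefix_space"])  # -> False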
@@ -1329,10 +1342,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")

-        if name.endswith("q_proj.weight"):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        # Skip for granite models
+        if self.hparams.get("vocab_size", 32000) != 49152:
+            if name.endswith("q_proj.weight"):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith("k_proj.weight"):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
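The skip exists because LlamaModel.permute rearranges Q/K rows into the interleaved layout that llama.cpp's default (NORM-style) RoPE expects, while granite uses NEOX-style RoPE and needs the weights untouched. Here is a sketch of the helper, assuming the implementation this script normally carries (only its signature is visible in the first hunk):

from torch import Tensor

# Assumed body of LlamaModel.permute: split each head's rows into two
# halves and interleave them, the layout NORM-style RoPE reads.
def permute(weights: Tensor, n_head: int, n_head_kv: int | None) -> Tensor:
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))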

llama.cpp

Lines changed: 10 additions & 1 deletion
@@ -4001,7 +4001,9 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+                    // granite uses a vocab with len 49152
+                    case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+                    case 36: model.type = e_model::MODEL_8B; break; // granite
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
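Unrolled for readability, the new 32-layer branch resolves like this (illustrative Python, not part of the commit):

# Illustrative unrolling of the nested ternary in the hunk above.
def model_type_for_32_layers(n_vocab: int) -> str:
    if n_vocab == 49152:  # granite vocabulary
        return "MODEL_3B"
    return "MODEL_7B" if n_vocab < 40000 else "MODEL_8B"

assert model_type_for_32_layers(49152) == "MODEL_3B"
assert model_type_for_32_layers(32000) == "MODEL_7B"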
@@ -4271,6 +4273,8 @@ static void llm_load_hparams(
                     case 30: model.type = e_model::MODEL_3B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_15B; break;
+                    case 52: model.type = e_model::MODEL_20B; break; // granite
+                    case 88: model.type = e_model::MODEL_34B; break; // granite
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -4521,6 +4525,11 @@ static void llm_load_vocab(
     } else {
         if (tokenizer_model == "gpt2") {
             vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+            if (add_space_prefix_keyidx != -1) {
+                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            }
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
