@@ -255,29 +255,33 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+            if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8) {
+            if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
+            else if (qs.i_ffn_down < qs.n_ffn_down/8) {
                 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
+            else if (qs.model.hparams.n_expert == 8) {
                 new_type = GGML_TYPE_Q5_K;
             } else {
                 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+        if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
@@ -315,7 +319,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs.params->attn_k_type < GGML_TYPE_COUNT) new_type = qs.params->attn_k_type;
+        else if (qs.model.hparams.n_expert == 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -327,7 +332,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+        if (qs.params->attn_q_type < GGML_TYPE_COUNT) new_type = qs.params->attn_q_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
@@ -336,7 +342,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
@@ -379,7 +386,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (arch != LLM_ARCH_FALCON) {
+        if (qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
+        else if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
@@ -399,7 +407,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+        if (qs.params->attn_qkv_type < GGML_TYPE_COUNT) new_type = qs.params->attn_qkv_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
@@ -408,15 +417,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+        if (qs.params->ffn_gate_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_gate_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+        if (qs.params->ffn_up_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_up_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_up;
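
Every branch added above follows the same guard. As an illustrative sketch only (not part of the patch): GGML_TYPE_COUNT is the "not set" sentinel used by the default params, so a value below it means the caller explicitly pinned the type for that tensor class, and only then is the ftype heuristic bypassed.

// Illustrative sketch only (not part of the patch): the guard repeated in
// llama_tensor_get_type above. GGML_TYPE_COUNT doubles as the "no override"
// sentinel, so anything below it is an explicit user choice.
#include "ggml.h"   // ggml_type, GGML_TYPE_COUNT

static ggml_type pick_type(ggml_type user_override, ggml_type heuristic_choice) {
    return user_override < GGML_TYPE_COUNT ? user_override : heuristic_choice;
}
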
@@ -968,6 +979,30 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
             new_type = params->output_tensor_type;
         }
+        if (params->attn_q_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_q.weight") == 0) {
+            new_type = params->attn_q_type;
+        }
+        if (params->attn_k_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_k.weight") == 0) {
+            new_type = params->attn_k_type;
+        }
+        if (params->attn_v_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_v.weight") == 0) {
+            new_type = params->attn_v_type;
+        }
+        if (params->attn_qkv_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_qkv.weight") == 0) {
+            new_type = params->attn_qkv_type;
+        }
+        if (params->attn_output_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_output.weight") == 0) {
+            new_type = params->attn_output_type;
+        }
+        if (params->ffn_gate_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_gate") == 0) {
+            new_type = params->ffn_gate_type;
+        }
+        if (params->ffn_down_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_down") == 0) {
+            new_type = params->ffn_down_type;
+        }
+        if (params->ffn_up_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_up") == 0) {
+            new_type = params->ffn_up_type;
+        }

         // If we've decided to quantize to the same type the tensor is already
         // in then there's nothing to do.
@@ -1006,9 +1041,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             }
         }
         if ((new_type == GGML_TYPE_IQ2_XXS ||
-             new_type == GGML_TYPE_IQ2_XS  ||
-             new_type == GGML_TYPE_IQ2_S   ||
-             new_type == GGML_TYPE_IQ1_S   ||
+            new_type == GGML_TYPE_IQ2_XS ||
+            new_type == GGML_TYPE_IQ2_S ||
+            new_type == GGML_TYPE_IQ1_S ||
            (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
            (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
             LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -1114,6 +1149,14 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
         /*.token_embedding_type        =*/ GGML_TYPE_COUNT,
+        /*.attn_q_type                 =*/ GGML_TYPE_COUNT,
+        /*.attn_k_type                 =*/ GGML_TYPE_COUNT,
+        /*.attn_v_type                 =*/ GGML_TYPE_COUNT,
+        /*.attn_qkv_type               =*/ GGML_TYPE_COUNT,
+        /*.attn_output_type            =*/ GGML_TYPE_COUNT,
+        /*.ffn_gate_type               =*/ GGML_TYPE_COUNT,
+        /*.ffn_down_type               =*/ GGML_TYPE_COUNT,
+        /*.ffn_up_type                 =*/ GGML_TYPE_COUNT,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
         /*.only_copy                   =*/ false,
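
For reference, a minimal usage sketch (the file names are hypothetical, and it assumes the new fields are wired into llama_model_quantize_params exactly as initialized above): fields left at GGML_TYPE_COUNT keep the existing ftype-driven heuristics, while any field set to a concrete ggml_type pins that tensor class.

#include "llama.h"

// Hypothetical driver for the per-tensor overrides added in this commit:
// quantize mostly to Q4_K_M, but pin attn_v and ffn_down to larger types.
int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype         = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.attn_v_type   = GGML_TYPE_Q6_K;  // overrides the attn_v.weight heuristics
    params.ffn_down_type = GGML_TYPE_Q5_K;  // overrides the ffn_down heuristics
    // every other *_type field stays GGML_TYPE_COUNT -> default behaviour

    // llama_model_quantize returns 0 on success
    return llama_model_quantize("model-f16.gguf", "model-Q4_K_M-custom.gguf", &params);
}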