@@ -255,29 +255,33 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M    || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+            if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8) {
+            if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
+            else if (qs.i_ffn_down < qs.n_ffn_down/8) {
                 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
+            else if (qs.model.hparams.n_expert == 8) {
                 new_type = GGML_TYPE_Q5_K;
             } else {
                 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+        if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
+        else if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
@@ -315,7 +319,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs.params->attn_k_type < GGML_TYPE_COUNT) new_type = qs.params->attn_k_type;
+        else if (qs.model.hparams.n_expert == 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -327,7 +332,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+        if (qs.params->attn_q_type < GGML_TYPE_COUNT) new_type = qs.params->attn_q_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
@@ -336,7 +342,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
+        else if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
@@ -379,7 +386,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
         }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (arch != LLM_ARCH_FALCON) {
+        if (qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
+        else if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL  ||
@@ -399,7 +407,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+        if (qs.params->attn_qkv_type < GGML_TYPE_COUNT) new_type = qs.params->attn_qkv_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
@@ -408,15 +417,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+        if (qs.params->ffn_gate_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_gate_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+        if (qs.params->ffn_up_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_up_type;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_up;
@@ -968,6 +979,30 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
             if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                 new_type = params->output_tensor_type;
             }
+            if (params->attn_q_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_q.weight") == 0) {
+                new_type = params->attn_q_type;
+            }
+            if (params->attn_k_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_k.weight") == 0) {
+                new_type = params->attn_k_type;
+            }
+            if (params->attn_v_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_v.weight") == 0) {
+                new_type = params->attn_v_type;
+            }
+            if (params->attn_qkv_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_qkv.weight") == 0) {
+                new_type = params->attn_qkv_type;
+            }
+            if (params->attn_output_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_output.weight") == 0) {
+                new_type = params->attn_output_type;
+            }
+            if (params->ffn_gate_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_gate") == 0) {
+                new_type = params->ffn_gate_type;
+            }
+            if (params->ffn_down_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_down") == 0) {
+                new_type = params->ffn_down_type;
+            }
+            if (params->ffn_up_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_up") == 0) {
+                new_type = params->ffn_up_type;
+            }
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -1006,9 +1041,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
                 }
             }
             if ((new_type == GGML_TYPE_IQ2_XXS ||
-                 new_type == GGML_TYPE_IQ2_XS  ||
-                 new_type == GGML_TYPE_IQ2_S   ||
-                 new_type == GGML_TYPE_IQ1_S   ||
+                new_type == GGML_TYPE_IQ2_XS  ||
+                new_type == GGML_TYPE_IQ2_S   ||
+                new_type == GGML_TYPE_IQ1_S   ||
                 (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
                 (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                 LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -1114,6 +1149,14 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
         /*.token_embedding_type        =*/ GGML_TYPE_COUNT,
+        /*.attn_q_type                 =*/ GGML_TYPE_COUNT,
+        /*.attn_k_type                 =*/ GGML_TYPE_COUNT,
+        /*.attn_v_type                 =*/ GGML_TYPE_COUNT,
+        /*.attn_qkv_type               =*/ GGML_TYPE_COUNT,
+        /*.attn_output_type            =*/ GGML_TYPE_COUNT,
+        /*.ffn_gate_type               =*/ GGML_TYPE_COUNT,
+        /*.ffn_down_type               =*/ GGML_TYPE_COUNT,
+        /*.ffn_up_type                 =*/ GGML_TYPE_COUNT,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
         /*.only_copy                   =*/ false,
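For reference, a minimal usage sketch of the new per-tensor overrides through the existing quantization C API. It assumes the fields land in llama_model_quantize_params exactly as in the default-params hunk above; the file names and the particular override types chosen here (Q6_K for attn_v, Q5_K for attn_output) are illustrative only, not part of this PR.

// quantize_example.cpp -- usage sketch only, not part of this PR.
// Quantize to Q4_K_M overall while pinning attn_v and attn_output to
// higher-precision types via the fields added in this diff.
#include "llama.h"
#include <cstdint>
#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype            = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base mixture
    params.attn_v_type      = GGML_TYPE_Q6_K;            // override for attn_v.weight
    params.attn_output_type = GGML_TYPE_Q5_K;            // override for attn_output.weight
    // Fields left at GGML_TYPE_COUNT fall through to the ftype-based selection.

    // Placeholder file names.
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
    if (rc != 0) {
        fprintf(stderr, "quantization failed with code %u\n", rc);
        return 1;
    }
    return 0;
}

Because llama_tensor_get_type now checks qs.params->*_type first in each branch, an override below GGML_TYPE_COUNT takes precedence over the ftype heuristics wherever the tensor name matches.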