@@ -163,6 +163,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                      ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
                 new_type = GGML_TYPE_Q5_K;
             }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
+                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+                new_type = !qs.has_output ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            }
             else if (new_type != GGML_TYPE_Q8_0) {
                 new_type = GGML_TYPE_Q6_K;
             }
@@ -236,7 +240,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 2) {
             new_type = GGML_TYPE_Q5_K;
@@ -256,14 +260,15 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         // else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
         //          use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         // else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M ||
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == LLM_TYPE_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_Q3_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K) new_type = GGML_TYPE_Q6_K;
         }
         if (qs.model.hparams.n_expert >= 2) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
@@ -277,15 +282,22 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q6_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = GGML_TYPE_IQ3_XXS;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M)
+            && qs.model.hparams.n_gqa() >= 2) {
+            new_type = GGML_TYPE_IQ4_XS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 2) {
+            new_type = GGML_TYPE_IQ3_S;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L)
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL)
             && qs.model.hparams.n_gqa() >= 2)
             new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K)
+            && qs.model.hparams.n_gqa() >= 2)
+            new_type = GGML_TYPE_Q3_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L)
+            && qs.model.hparams.n_gqa() >= 2)
+            new_type = GGML_TYPE_Q4_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M)
             && qs.model.hparams.n_gqa() >= 2)
             new_type = GGML_TYPE_Q5_K;
@@ -296,11 +308,27 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS && qs.has_imatrix) {
+            new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S)
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && qs.has_imatrix) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
             new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
+            new_type = GGML_TYPE_Q4_0;
+        }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
@@ -332,11 +360,13 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         // if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         // }
         // }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS)
+            && !qs.has_imatrix && use_more_bits(i_layer, n_layer)) {
             new_type = GGML_TYPE_Q5_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
@@ -363,12 +393,13 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_IQ4_XS;
             }
         } else {
+            // if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
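
Taken together, the hunks above follow one pattern: start from the tensor type implied by the requested ftype, then bump attention and output tensors to a larger (i-)quant when key/value heads are shared, i.e. when qs.model.hparams.n_gqa() >= 2. Below is a minimal, self-contained sketch of that bump step, using hypothetical names (pick_bumped_type, with GQA taken as n_head / n_head_kv) rather than the actual llama.cpp helpers:

    #include <cstdint>

    // Illustrative subset of the quant types appearing in the hunks above.
    enum class qtype { q2_k, q3_k, q4_k, q5_k, q6_k };

    // Hypothetical helper: move an attention tensor one quant step up when
    // KV heads are shared (grouped-query attention), mirroring the
    // n_gqa() >= 2 checks in the diff. Not the llama.cpp API.
    static qtype pick_bumped_type(qtype base, uint32_t n_head, uint32_t n_head_kv) {
        const uint32_t n_gqa = n_head_kv > 0 ? n_head / n_head_kv : 1;
        if (n_gqa < 2) {
            return base; // no sharing: keep the ftype default
        }
        switch (base) {
            case qtype::q2_k: return qtype::q3_k;
            case qtype::q3_k: return qtype::q4_k;
            case qtype::q4_k: return qtype::q5_k;
            case qtype::q5_k: return qtype::q6_k;
            default:          return base;
        }
    }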