@@ -7985,6 +7985,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
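+// shared state for the quantization pass: the source model, the user parameters and (for k-quants) per-tensor-class counters plus fallback bookkeeping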
+struct quantize_state_internal {
+    const llama_model & model;
+    const llama_model_quantize_params * params;
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized = 0;
+    int n_fallback = 0;
+#endif
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+    {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -8045,20 +8063,21 @@ static void llama_convert_tensor_internal(
 
 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
@@ -8067,46 +8086,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
             new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
             if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8135,6 +8154,8 @@ static ggml_type get_k_quant_type(
         if (nx % QK_K != 0) {
             LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
@@ -8147,6 +8168,7 @@ static ggml_type get_k_quant_type(
             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
@@ -8204,6 +8226,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8217,29 +8241,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
         }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
-
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
@@ -8306,9 +8324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
+            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8434,6 +8450,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+#ifdef GGML_USE_K_QUANTS
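+    // report tensors that were incompatible with k-quants and had to use a fallback type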
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+            __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+#endif
 }
 
 static int llama_apply_lora_from_file_internal(