@@ -8049,6 +8049,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
+struct quantize_state_internal {
+    const llama_model & model;
+    const llama_model_quantize_params * params;
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv    = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized     = 0;
+    int n_fallback        = 0;
+#endif
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+    {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -8109,20 +8127,21 @@ static void llama_convert_tensor_internal(
 
 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
@@ -8131,46 +8150,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
             new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8197,20 +8216,23 @@ static ggml_type get_k_quant_type(
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
-        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-            new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
-            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-            new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
-            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-        } else {
-            throw std::runtime_error("Unsupported tensor size encountered\n");
+        switch (new_type) {
+            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
@@ -8268,6 +8290,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8281,29 +8305,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
         }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
-
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
@@ -8370,9 +8388,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
+            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8498,6 +8514,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+#ifdef GGML_USE_K_QUANTS
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+#endif
 }
 
 static int llama_apply_lora_from_file_internal(
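
For reference, a minimal self-contained sketch (not part of the diff above) of the fallback rule this change introduces: when a tensor's row size is not divisible by QK_K, the chosen k-quant is demoted to a legacy quant of comparable bit-width instead of aborting. The enum, the `k_quant_fallback` helper, and the example sizes below are illustrative stand-ins, not the ggml API.

```cpp
#include <cstdio>
#include <stdexcept>

// Local stand-in for ggml_type, for illustration only.
enum fake_type { Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0 };

// Mirrors the switch added in get_k_quant_type(): demote a k-quant to a
// legacy quant of similar bit-width when the tensor cannot use k-quants.
static fake_type k_quant_fallback(fake_type t) {
    switch (t) {
        case Q2_K: return Q4_0;
        case Q3_K: return Q4_1;
        case Q4_K: return Q5_0;
        case Q5_K: return Q5_1;
        case Q6_K: return Q8_0;
        default:   throw std::runtime_error("unsupported tensor type");
    }
}

int main() {
    const int QK_K = 256;   // k-quant super-block size (assumed default build)
    const int nx   = 4544;  // example row size; 4544 % 256 != 0, so k-quants do not apply
    fake_type new_type = Q4_K;
    if (nx % QK_K != 0) {
        new_type = k_quant_fallback(new_type);  // Q4_K -> Q5_0
    }
    std::printf("selected type id: %d\n", static_cast<int>(new_type));
    return 0;
}
```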