@@ -8480,28 +8480,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
84808480 new_type = GGML_TYPE_Q8_0;
84818481 }
84828482 } else if (name.find (" ffn_down" ) != std::string::npos) {
8483+ const int n_expert = std::max (1 , (int )qs.model .hparams .n_expert );
8484+ int i_layer, n_layer;
8485+ if (n_expert == 1 ) {
8486+ i_layer = qs.i_feed_forward_w2 ;
8487+ n_layer = qs.n_feed_forward_w2 ;
8488+ } else {
8489+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
8490+ // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
8491+ // for getting the current layer as I initially thought, and we need to resort to parsing the
8492+ // tensor name.
8493+ n_layer = qs.n_feed_forward_w2 / n_expert;
8494+ if (sscanf (name.c_str (), " blk.%d.ffn_down" , &i_layer) != 1 ) {
8495+ throw std::runtime_error (format (" Failed to determine layer for tensor %s" , name.c_str ()));
8496+ }
8497+ if (i_layer < 0 || i_layer >= n_layer) {
8498+ throw std::runtime_error (format (" Bad layer %d for tensor %s. Must be in [0, %d)" , i_layer, name.c_str (), n_layer));
8499+ }
8500+ }
84838501 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
84848502 else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
8485- if (qs. i_feed_forward_w2 < qs. n_feed_forward_w2 /8 ) new_type = GGML_TYPE_Q4_K;
8503+ if (i_layer < n_layer /8 ) new_type = GGML_TYPE_Q4_K;
84868504 }
84878505 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
8488- new_type = qs. i_feed_forward_w2 < qs. n_feed_forward_w2 /16 ? GGML_TYPE_Q5_K
8489- : arch != LLM_ARCH_FALCON || use_more_bits (qs. i_feed_forward_w2 , qs. n_feed_forward_w2 ) ? GGML_TYPE_Q4_K
8506+ new_type = i_layer < n_layer /16 ? GGML_TYPE_Q5_K
8507+ : arch != LLM_ARCH_FALCON || use_more_bits (i_layer, n_layer ) ? GGML_TYPE_Q4_K
84908508 : GGML_TYPE_Q3_K;
84918509 }
84928510 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
84938511 new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
84948512 }
84958513 else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
84968514 if (arch == LLM_ARCH_FALCON) {
8497- new_type = qs. i_feed_forward_w2 < qs. n_feed_forward_w2 /16 ? GGML_TYPE_Q6_K :
8498- use_more_bits (qs. i_feed_forward_w2 , qs. n_feed_forward_w2 ) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
8515+ new_type = i_layer < n_layer /16 ? GGML_TYPE_Q6_K :
8516+ use_more_bits (i_layer, n_layer ) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
84998517 } else {
8500- if (use_more_bits (qs. i_feed_forward_w2 , qs. n_feed_forward_w2 )) new_type = GGML_TYPE_Q6_K;
8518+ if (use_more_bits (i_layer, n_layer )) new_type = GGML_TYPE_Q6_K;
85018519 }
85028520 }
8503- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits (qs. i_feed_forward_w2 , qs. n_feed_forward_w2 )) new_type = GGML_TYPE_Q6_K;
8504- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs. i_feed_forward_w2 < qs. n_feed_forward_w2 /8 ) {
8521+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits (i_layer, n_layer )) new_type = GGML_TYPE_Q6_K;
8522+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer /8 ) {
85058523 new_type = GGML_TYPE_Q5_K;
85068524 }
85078525 ++qs.i_feed_forward_w2 ;
0 commit comments