Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restore intended k-quants quantization mixes for MoE models #4872

Merged
merged 3 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Restore intended k-quants quantization mixes for MoE models
  • Loading branch information
Kawrakow committed Jan 11, 2024
commit a38378d2ff72c17bad33e4a788d6ca82aac4b318
1 change: 1 addition & 0 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
Expand Down
24 changes: 15 additions & 9 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2585,7 +2585,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";

// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
Expand Down Expand Up @@ -8953,10 +8954,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
} else if (name.find("ffn_down.weight") != std::string::npos) {
} else if (name.find("ffn_down") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
}
Expand All @@ -8965,14 +8969,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (arch == LLM_ARCH_FALCON) {
new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
}
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
new_type = GGML_TYPE_Q5_K;
}
++qs.i_feed_forward_w2;
Expand All @@ -8990,9 +8994,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// This can be used to reduce the size of the Q5_K_S model.
// The associated PPL increase is fully in line with the size reduction
//else {
Expand Down Expand Up @@ -9041,6 +9046,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
Expand Down Expand Up @@ -9098,7 +9104,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
++qs.n_attention_wv;
}
else if (name.find("ffn_down.weight") != std::string::npos) {
else if (name.find("ffn_down") != std::string::npos) {
++qs.n_feed_forward_w2;
}
}
Expand Down
1 change: 1 addition & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K_S = 20, // except 1d tensors

LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
Expand Down