
llama : temporary disable Q6_K output quantization (ggerganov#1711)
ggerganov committed Jun 6, 2023
1 parent 590250f commit 7a74dee
Showing 1 changed file with 9 additions and 4 deletions.
llama.cpp: 13 changes (9 additions & 4 deletions)
@@ -2198,27 +2198,32 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-            if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
-            else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                     (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                     (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             }
-            else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                     (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                     (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             }
-            else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
 
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
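For reference, a minimal standalone sketch (not part of the commit) of the layer-index rule applied above to the attention.wv tensors: for Q4_K_M and Q5_K_M, the first eighth and the last eighth of those tensors, plus every third tensor in between, are promoted to Q6_K. The tensor count of 32 is an assumed example value (one wv tensor per transformer layer) and is not taken from the commit.

#include <cstdio>

int main() {
    const int n_attention_wv = 32; // assumed example value, not from the commit
    for (int i_attention_wv = 0; i_attention_wv < n_attention_wv; ++i_attention_wv) {
        // Same predicate as the Q4_K_M / Q5_K_M branch in the diff above.
        const bool use_q6_k =
            i_attention_wv <  n_attention_wv/8   ||          // first eighth of the tensors
            i_attention_wv >= 7*n_attention_wv/8 ||          // last eighth of the tensors
            (i_attention_wv - n_attention_wv/8) % 3 == 2;    // every third tensor in between
        printf("attention.wv %2d -> %s\n", i_attention_wv, use_q6_k ? "Q6_K" : "Q4_K/Q5_K");
    }
    return 0;
}

In this example 16 of the 32 wv tensors end up at Q6_K (the 4 at each end plus 8 of the middle 24).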
