Skip to content

Commit

Permalink
quantize : use map to assign quantization type from string (ggerg…
Browse files Browse the repository at this point in the history
…anov#1191)

instead of `int` (the `int` option is still supported)

This allows the following usage:

`./quantize ggml-model-f16.bin ggml-model-q4_0.bin q4_0`

instead of:

`./quantize ggml-model-f16.bin ggml-model-q4_0.bin 2`
  • Loading branch information
prusnak authored Apr 26, 2023
1 parent 4afcc37 commit 859fee6
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .devops/tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./quantize "$i" "${i/f16/q4_0}" 2
./quantize "$i" "${i/f16/q4_0}" q4_0
fi
done
else
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ python3 -m pip install -r requirements.txt
# convert the 7B model to ggml FP16 format
python3 convert.py models/7B/
# quantize the model to 4-bits (using method 2 = q4_0)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
# quantize the model to 4-bits (using q4_0 method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
# run the inference
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
Expand Down
30 changes: 24 additions & 6 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,17 @@
#include "llama.h"

#include <cstdio>
#include <map>
#include <string>

// Maps the quantization type names accepted on the command line to their
// llama_ftype values. main() uses this table both to print the supported types
// in the usage message and to resolve a type that was passed by name.
// NOTE: main() only consults this map when the type argument starts with 'q'
// (any other argument is parsed as an integer), so a key that does not start
// with 'q' would never be matched.
static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
{"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
};

// usage:
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
Expand All @@ -12,11 +21,9 @@ int main(int argc, char ** argv) {

if (argc < 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
fprintf(stderr, " type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
fprintf(stderr, " type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
fprintf(stderr, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
}
return 1;
}

Expand All @@ -30,7 +37,18 @@ int main(int argc, char ** argv) {
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];

const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
enum llama_ftype ftype;
if (argv[3][0] == 'q') {
auto it = LLAMA_FTYPE_MAP.find(argv[3]);
if (it == LLAMA_FTYPE_MAP.end()) {
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
return 1;
}
ftype = it->second;
} else {
ftype = (enum llama_ftype)atoi(argv[3]);
}

int nthread = argc > 4 ? atoi(argv[4]) : 0;

const int64_t t_main_start_us = ggml_time_us();
Expand Down

0 comments on commit 859fee6

Please sign in to comment.