Introduce enum llama_ftype
sw committed Apr 2, 2023
1 parent d8d4e86 commit 9c987ee
Showing 3 changed files with 46 additions and 31 deletions.
14 changes: 11 additions & 3 deletions examples/quantize/quantize.cpp
@@ -12,8 +12,8 @@ int main(int argc, char ** argv) {

if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
return 1;
}

@@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];

const int itype = atoi(argv[3]);
const enum llama_ftype itype = (enum llama_ftype)atoi(argv[3]);
switch (itype) {
case LLAMA_FTYPE_MOSTLY_Q4_0:
case LLAMA_FTYPE_MOSTLY_Q4_1:
break;
default:
fprintf(stderr, "Invalid model file type %d\n", itype);
return 1;
}

const int64_t t_main_start_us = ggml_time_us();

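A minimal standalone sketch (not part of this commit) of how a numeric command-line argument can be checked against the new enum, mirroring the switch added to quantize.cpp above; the llama_ftype values are copied from the llama.h hunk further down.

#include <cstdio>

// Local mirror of the enum introduced in llama.h by this commit (for a
// self-contained example only).
enum llama_ftype {
    LLAMA_FTYPE_ALL_F32           = 0,
    LLAMA_FTYPE_MOSTLY_F16        = 1, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_0       = 2, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_1       = 3, // except 1d tensors
    LLAMA_FTYPE_PER_LAYER_IS_Q4_1 = 4, // but tok_embeddings.weight and output.weight are F16
};

// Returns true only for the file types the quantize tool accepts.
static bool is_valid_quantize_ftype(int value) {
    switch ((enum llama_ftype) value) {
        case LLAMA_FTYPE_MOSTLY_Q4_0:
        case LLAMA_FTYPE_MOSTLY_Q4_1:
            return true;
        default:
            return false;
    }
}

int main() {
    // 2 and 3 are the values printed in the tool's usage message above.
    printf("2 -> %s\n", is_valid_quantize_ftype(2) ? "accepted" : "rejected"); // q4_0
    printf("3 -> %s\n", is_valid_quantize_ftype(3) ? "accepted" : "rejected"); // q4_1
    printf("7 -> %s\n", is_valid_quantize_ftype(7) ? "accepted" : "rejected"); // unknown value
    return 0;
}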
52 changes: 25 additions & 27 deletions llama.cpp
@@ -54,6 +54,8 @@ enum e_model {
MODEL_65B,
};

static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "gptq" };

static const size_t MB = 1024*1024;

// computed for n_ctx == 2048
@@ -100,7 +102,7 @@ struct llama_hparams {
int32_t n_head = 32;
int32_t n_layer = 32;
int32_t n_rot = 64;
int32_t f16 = 1;
int32_t f16 = LLAMA_FTYPE_MOSTLY_F16;
};

struct llama_layer {
@@ -435,7 +437,7 @@ static bool llama_model_load(
}

// temp warning to tell the user to use "--n_parts"
if (hparams.f16 == 4 && n_parts != 1) {
if (hparams.f16 == LLAMA_FTYPE_PER_LAYER_IS_Q4_1 && n_parts != 1) {
fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
}
@@ -508,11 +510,14 @@ static bool llama_model_load(
// wtype is for per-layer weights, while vtype is for other weights
ggml_type wtype, vtype;
switch (model.hparams.f16) {
case 0: wtype = vtype = GGML_TYPE_F32; break;
case 1: wtype = vtype = GGML_TYPE_F16; break;
case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
case LLAMA_FTYPE_ALL_F32: wtype = vtype = GGML_TYPE_F32; break;
case LLAMA_FTYPE_MOSTLY_F16: wtype = vtype = GGML_TYPE_F16; break;
case LLAMA_FTYPE_MOSTLY_Q4_0: wtype = vtype = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: wtype = vtype = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_PER_LAYER_IS_Q4_1:
wtype = GGML_TYPE_Q4_1;
vtype = GGML_TYPE_F16;
break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
@@ -684,16 +689,15 @@ static bool llama_model_load(
return false;
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
}

switch (ftype) {
case 0: // f32
case 1: // f16
case LLAMA_FTYPE_ALL_F32:
case LLAMA_FTYPE_MOSTLY_F16:
break;
case 2: // q4_0
case 3: // q4_1
case LLAMA_FTYPE_MOSTLY_Q4_0:
case LLAMA_FTYPE_MOSTLY_Q4_1:
assert(ne[0] % 64 == 0);
break;
default:
@@ -1273,20 +1277,15 @@ static llama_vocab::id llama_sample_top_p_top_k(
//

// TODO: reuse code from the llama_model_load() somehow
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
ggml_type type = GGML_TYPE_Q4_1;
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype itype) {
ggml_type type;

switch (itype) {
case 2: type = GGML_TYPE_Q4_0; break;
case 3: type = GGML_TYPE_Q4_1; break;
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
case LLAMA_FTYPE_MOSTLY_Q4_0: type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: type = GGML_TYPE_Q4_1; break;
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return false;
};

if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
return false;
}

llama_vocab vocab;

printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -1438,7 +1437,6 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
}

{
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
}

@@ -1459,12 +1457,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= (n_dims == 2);

if (quantize) {
if (ftype != 0 && ftype != 1) {
if (ftype != LLAMA_FTYPE_ALL_F32 && ftype != LLAMA_FTYPE_MOSTLY_F16) {
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
return false;
}

if (ftype == 1) {
if (ftype == LLAMA_FTYPE_MOSTLY_F16) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data_f32.resize(nelements);
@@ -1478,7 +1476,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s

ftype = itype;
} else {
const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
const int bpe = (ftype == LLAMA_FTYPE_ALL_F32) ? sizeof(float) : sizeof(uint16_t);

data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
@@ -1659,7 +1657,7 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype) {
enum llama_ftype itype) {
if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
fprintf(stderr, "%s: failed to quantize\n", __func__);
return 1;
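A standalone helper sketch (illustration only, not part of the commit) of the wtype/vtype selection that llama_model_load now performs per llama_ftype; it assumes the repo's ggml.h and the llama.h from this commit are on the include path.

#include <cstdio>
#include "ggml.h"
#include "llama.h"

// Picks the tensor types implied by a model file type: wtype is used for
// per-layer weights, vtype for everything else (as in llama_model_load).
// Returns false for a file type this sketch does not know about.
static bool llama_ftype_to_ggml_types(enum llama_ftype ftype,
                                      enum ggml_type * wtype,
                                      enum ggml_type * vtype) {
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32:     *wtype = *vtype = GGML_TYPE_F32;  return true;
        case LLAMA_FTYPE_MOSTLY_F16:  *wtype = *vtype = GGML_TYPE_F16;  return true;
        case LLAMA_FTYPE_MOSTLY_Q4_0: *wtype = *vtype = GGML_TYPE_Q4_0; return true;
        case LLAMA_FTYPE_MOSTLY_Q4_1: *wtype = *vtype = GGML_TYPE_Q4_1; return true;
        case LLAMA_FTYPE_PER_LAYER_IS_Q4_1:
            *wtype = GGML_TYPE_Q4_1; // per-layer weights stay quantized
            *vtype = GGML_TYPE_F16;  // tok_embeddings.weight and output.weight stay F16
            return true;
        default:
            return false;
    }
}

int main() {
    enum ggml_type wtype, vtype;
    if (llama_ftype_to_ggml_types(LLAMA_FTYPE_PER_LAYER_IS_Q4_1, &wtype, &vtype)) {
        printf("wtype=%d vtype=%d\n", (int) wtype, (int) vtype);
    }
    return 0;
}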
11 changes: 10 additions & 1 deletion llama.h
@@ -64,6 +64,15 @@ extern "C" {
void * progress_callback_user_data;
};

// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_PER_LAYER_IS_Q4_1 = 4, // but tok_embeddings.weight and output.weight are F16
};

LLAMA_API struct llama_context_params llama_context_default_params();

// Various functions for loading a ggml llama model.
@@ -81,7 +90,7 @@ extern "C" {
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype);
enum llama_ftype itype);

// Returns the KV cache that will contain the context for the
// ongoing prediction with the model.
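A caller sketch (not from the commit): with this change, code that uses the public quantization entry point passes one of the enum values instead of a bare integer. The file names are placeholders, and a non-zero return is treated as failure, matching the error path shown in llama.cpp above.

#include <cstdio>
#include "llama.h"

int main() {
    // Placeholder paths for illustration only.
    const char * fname_inp = "models/7B/ggml-model-f16.bin";
    const char * fname_out = "models/7B/ggml-model-q4_0.bin";

    // Request the "mostly q4_0" file type (value 2 in the quantize tool's usage).
    if (llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
        fprintf(stderr, "failed to quantize %s\n", fname_inp);
        return 1;
    }
    printf("wrote %s\n", fname_out);
    return 0;
}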
