ggml : add Q5_0 and Q5_1 quantization (ggerganov#1187)
* ggml : add Q5_0 quantization (cuBLAS only)

* ggml : fix Q5_0 qh -> uint32_t

* ggml : fix q5_0 histogram stats

* ggml : q5_0 scalar dot product

* ggml : q5_0 ARM NEON dot

* ggml : q5_0 more efficient ARM NEON using uint64_t masks

* ggml : rename Q5_0 -> Q5_1

* ggml : adding Q5_0 mode

* quantize : add Q5_0 and Q5_1 to map

* ggml : AVX2 optimizations for Q5_0, Q5_1 (ggerganov#1195)

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
ggerganov and sw authored Apr 26, 2023
1 parent 87a6f84 commit 574406d
Showing 8 changed files with 711 additions and 30 deletions.
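Both new formats pack 32 weights per block as 5-bit quants: the low four bits of each quant are stored as nibbles in qs and the fifth bits are collected into a 32-bit qh field. Q5_0 stores only an fp16 scale d and reconstructs a weight as d*(q - 16); Q5_1 stores an fp16 scale d plus a minimum m and reconstructs a weight as d*q + m, as the block definitions and dequantization kernels in ggml-cuda.cu below show.
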
1 change: 1 addition & 0 deletions .gitignore
@@ -15,6 +15,7 @@ build-em/
build-debug/
build-release/
build-static/
build-cublas/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
2 changes: 2 additions & 0 deletions examples/quantize/quantize.cpp
@@ -10,6 +10,8 @@ static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
{"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
};

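With these two entries in the type map, the quantize example accepts the new formats by name on the command line, e.g. something like ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q5_0.bin q5_0 (paths are illustrative).
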
85 changes: 85 additions & 0 deletions ggml-cuda.cu
@@ -37,6 +37,23 @@ typedef struct {
} block_q4_3;
static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");

#define QK5_0 32
typedef struct {
__half d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");

#define QK5_1 32
typedef struct {
__half d; // delta
__half m; // min
uint32_t qh; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

#define QK8_0 32
typedef struct {
float d; // delta
@@ -138,6 +155,64 @@ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
}
}

static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
const block_q5_0 * x = (const block_q5_0 *) vx;

const int i = blockIdx.x;

const float d = x[i].d;

const uint8_t * pp = x[i].qs;

uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));

for (int l = 0; l < QK5_0; l += 2) {
const uint8_t vi = pp[l/2];

const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;

const int8_t vi0 = ((vi & 0xf) | vh0);
const int8_t vi1 = ((vi >> 4) | vh1);

const float v0 = (vi0 - 16)*d;
const float v1 = (vi1 - 16)*d;

y[i*QK5_0 + l + 0] = v0;
y[i*QK5_0 + l + 1] = v1;
}
}

static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
const block_q5_1 * x = (const block_q5_1 *) vx;

const int i = blockIdx.x;

const float d = x[i].d;
const float m = x[i].m;

const uint8_t * pp = x[i].qs;

const uint32_t qh = x[i].qh;

for (int l = 0; l < QK5_1; l += 2) {
const uint8_t vi = pp[l/2];

const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;

const int8_t vi0 = (vi & 0xf) | vh0;
const int8_t vi1 = (vi >> 4) | vh1;

const float v0 = vi0*d + m;
const float v1 = vi1*d + m;

y[i*QK5_1 + l + 0] = v0;
y[i*QK5_1 + l + 1] = v1;
}
}

static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
const block_q8_0 * x = (const block_q8_0 *) vx;

@@ -174,6 +249,16 @@ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t st
dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
}

void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_0;
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
}

void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_1;
dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
}

void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK8_0;
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
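As a companion to the CUDA dequantization kernels above, here is a minimal host-side sketch of how a Q5_0 block could be filled from 32 floats. It is illustration only and not part of this commit's diff (only four of the eight changed files are rendered on this page, and the actual quantization routines are in the ones that are not); the struct and function names are hypothetical, and the scale is kept as a plain float rather than the fp16 delta stored by the real block_q5_0. Q5_1 uses the same packing but additionally stores a per-block minimum m, reconstructing values as d*q + m instead of d*(q - 16).

#include <math.h>
#include <stdint.h>
#include <string.h>

#define QK5_0 32

// simplified stand-in for block_q5_0 (the real struct stores d as fp16)
typedef struct {
    float   d;             // delta
    uint8_t qh[4];         // 5-th bit of each of the 32 quants
    uint8_t qs[QK5_0 / 2]; // low nibbles, two quants per byte
} block_q5_0_ref;

static void quantize_block_q5_0_ref(const float * x, block_q5_0_ref * y) {
    // find the value with the largest magnitude in the block
    float max = 0.0f;
    for (int l = 0; l < QK5_0; l++) {
        if (fabsf(x[l]) > fabsf(max)) {
            max = x[l];
        }
    }

    // map the extreme value to -16 so the quants land in [0, 31]
    const float d  = max / -16.0f;
    const float id = d ? 1.0f/d : 0.0f;

    y->d = d;

    uint32_t qh = 0;
    for (int l = 0; l < QK5_0; l += 2) {
        const float v0 = x[l + 0]*id + 16.5f;
        const float v1 = x[l + 1]*id + 16.5f;

        const uint8_t vi0 = v0 < 31.0f ? (uint8_t) v0 : 31;
        const uint8_t vi1 = v1 < 31.0f ? (uint8_t) v1 : 31;

        // low 4 bits of each quant go into the packed nibble array ...
        y->qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4);

        // ... and the 5-th bit goes into the matching position of qh
        qh |= (uint32_t)((vi0 & 0x10) >> 4) << (l + 0);
        qh |= (uint32_t)((vi1 & 0x10) >> 4) << (l + 1);
    }

    memcpy(y->qh, &qh, sizeof(qh));
}

Running dequantize_block_q5_0 above over a block produced this way recovers approximately the original values, since (q - 16)*d ≈ x.
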
2 changes: 2 additions & 0 deletions ggml-cuda.h
@@ -35,6 +35,8 @@ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t st
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);

#ifdef __cplusplus
