From d9a239c4104c888eafda672c1e42c9bbc5084cb8 Mon Sep 17 00:00:00 2001 From: Marco Matthies <71844+marcom@users.noreply.github.com> Date: Mon, 10 Apr 2023 19:57:59 +0200 Subject: [PATCH 001/773] Simplify to include lower-case windows.h always, fix compile on mingw32 (#747) --- ggml.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 9616eb9fd5e393..a817f8321ed22b 100644 --- a/ggml.c +++ b/ggml.c @@ -26,14 +26,9 @@ #define static_assert(cond, msg) struct global_scope_noop_trick #endif -#if defined _MSC_VER || defined(__MINGW32__) +#if defined(_WIN32) -#if !defined(__MINGW32__) -#include -#else -// ref: https://github.com/ggerganov/whisper.cpp/issues/168 #include -#endif typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; From 9d634ef452d0fc24fcd49592952d13d0ab0f41b7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Apr 2023 19:32:45 +0300 Subject: [PATCH 002/773] ggml : remove trailing whitespaces --- ggml.c | 82 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/ggml.c b/ggml.c index a817f8321ed22b..6db6fde8deb5d9 100644 --- a/ggml.c +++ b/ggml.c @@ -1944,7 +1944,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); - /* Prepare the constants we will need during execution */ + /* Prepare the constants we will need during execution */ const __m256i lowMask = _mm256_set1_epi8( 0xF ); const __m256i offset_8 = _mm256_set1_epi16( 8 ); @@ -1954,61 +1954,59 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // Main loop for (int i = 0; i < nb; i+=UNROLL_COUNT) { - - // This loop will be unrolled by the compiler + // This loop will be unrolled by the compiler for (int u=0;u we now have a vector of 8 int_32t */ - __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); + /* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */ + __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); - /* Convert to vectore of 8 int32_t to 8 floats */ - __m256 q = _mm256_cvtepi32_ps( xy_q ); + /* Convert to vectore of 8 int32_t to 8 floats */ + __m256 q = _mm256_cvtepi32_ps( xy_q ); - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( scale, q, acc ); + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( scale, q, acc ); } - - } + } // Return horizontal sum of the acc vector __m128 res = _mm256_extractf128_ps( acc, 1 ); From c3ac702e5ee3533457e0489df4906ee112fe88e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Apr 2023 22:40:28 +0300 Subject: [PATCH 003/773] ggml : add ggml_cont() + optimize ggml_cpy() for contiguous dst --- ggml.c | 254 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- ggml.h | 6 ++ 2 files changed, 252 insertions(+), 8 deletions(-) diff --git a/ggml.c b/ggml.c index 6db6fde8deb5d9..4f6420678403d5 100644 --- a/ggml.c +++ b/ggml.c @@ -2609,6 +2609,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "SCALE", "CPY", + "CONT", "RESHAPE", "VIEW", "PERMUTE", @@ -2624,7 +2625,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "FLASH_FF", }; -static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35"); +static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -2653,6 +2654,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "x*v", "x-\\>y", + "cont(x)", "reshape(x)", 
"view(x)", "permute(x)", @@ -2668,7 +2670,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_ff(x)", }; -static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35"); +static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -4301,6 +4303,41 @@ struct ggml_tensor * ggml_cpy_inplace( return ggml_cpy_impl(ctx, a, b, true); } +// ggml_cont + +struct ggml_tensor * ggml_cont_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_CONT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, true); +} + // ggml_reshape struct ggml_tensor * ggml_reshape( @@ -4843,6 +4880,85 @@ static void ggml_compute_forward_dup_f16( // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + if (ggml_is_contiguous(dst)) { + if (src0->nb[0] == sizeof(ggml_fp16_t)) { + if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + const size_t rs = ne00*nb00; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + char * dst_ptr = (char *) dst->data + id*rs; + + memcpy(dst_ptr, src0_ptr, rs); + + id++; + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + id++; + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + } + } + } else { + 
GGML_ASSERT(false); // TODO: implement + } + } + return; + } + // dst counters int64_t i10 = 0; int64_t i11 = 0; @@ -4937,6 +5053,105 @@ static void ggml_compute_forward_dup_f32( return; } + if (src0->type == dst->type && + src0->ne[0] == dst->ne[0] && + src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_is_contiguous(dst)) { + // TODO: simplify + if (src0->nb[0] == sizeof(float)) { + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + const size_t rs = ne00*nb00; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + char * dst_ptr = (char *) dst->data + id*rs; + + memcpy(dst_ptr, src0_ptr, rs); + + id++; + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + + return; + } + // dst counters int64_t i10 = 0; int64_t i11 = 0; @@ -5057,14 +5272,18 @@ static void ggml_compute_forward_add_f32( GGML_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { - const int j0 = (n/nth)*ith; - const int j1 = ith == nth - 1 ? 
n : (n/nth)*(ith + 1); - - for (int j = j0; j < j1; j++) { + for (int j = ith; j < n; j += nth) { +#ifdef GGML_USE_ACCELERATE + vDSP_vadd( + (float *) ((char *) src0->data + j*nb01), 1, + (float *) ((char *) src1->data + j*nb11), 1, + (float *) ((char *) dst->data + j*nb1), 1, nc); +#else ggml_vec_add_f32(nc, (float *) ((char *) dst->data + j*nb1), (float *) ((char *) src0->data + j*nb01), (float *) ((char *) src1->data + j*nb11)); +#endif } } else { // src1 is not contiguous @@ -6812,6 +7031,15 @@ static void ggml_compute_forward_cpy( ggml_compute_forward_dup(params, src0, dst); } +// ggml_compute_forward_cont + +static void ggml_compute_forward_cont( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, src0, dst); +} + // ggml_compute_forward_reshape static void ggml_compute_forward_reshape( @@ -8642,6 +8870,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_cpy(params, tensor->src0, tensor); } break; + case GGML_OP_CONT: + { + ggml_compute_forward_cont(params, tensor->src0, tensor); + } break; case GGML_OP_RESHAPE: { ggml_compute_forward_reshape(params, tensor->src0, tensor); @@ -8886,8 +9118,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1->grad = ggml_add_impl(ctx, src1->grad, - // TODO: fix transpose, the node will break the graph connections - ggml_mul_mat(ctx, ggml_transpose(ctx, src0), tensor->grad), + ggml_mul_mat(ctx, + ggml_cont(ctx, ggml_transpose(ctx, src0)), + tensor->grad), inplace); } } break; @@ -8899,6 +9132,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONT: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_RESHAPE: { GGML_ASSERT(false); // TODO: not implemented @@ -9353,6 +9590,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) node->n_tasks = n_threads; } break; case GGML_OP_CPY: + case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: diff --git a/ggml.h b/ggml.h index af16c647c97088..a5245a8ae6256c 100644 --- a/ggml.h +++ b/ggml.h @@ -236,6 +236,7 @@ enum ggml_op { GGML_OP_SCALE, GGML_OP_CPY, + GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_VIEW, GGML_OP_PERMUTE, @@ -525,6 +526,11 @@ struct ggml_tensor * ggml_cpy( struct ggml_tensor * a, struct ggml_tensor * b); +// make contiguous +struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + // return view(a), b specifies the new shape // TODO: when we start computing gradient, make a copy instead of view struct ggml_tensor * ggml_reshape( From 461ba9e66ed3885f80680d71495e055580573c74 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Apr 2023 23:20:01 +0300 Subject: [PATCH 004/773] ggml : fix WASM build --- ggml.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml.c b/ggml.c index 4f6420678403d5..ada3bbbdcda801 100644 --- a/ggml.c +++ b/ggml.c @@ -2067,18 +2067,18 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest float sum1 = 0.0f; for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &px[i + 0]; - const block_q4_0 * restrict y0 = &py[i + 0]; - const block_q4_0 * restrict x1 = &px[i + 1]; - const block_q4_0 * restrict y1 = &py[i + 1]; + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict y0 = 
&y[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q4_0 * restrict y1 = &y[i + 1]; const v128_t m4b = wasm_u8x16_splat(0xf); const v128_t s8b = wasm_i8x16_splat(0x8); - const v128_t v0_0 = wasm_v128_load(x0.qs); - const v128_t v0_1 = wasm_v128_load(y0.qs); - const v128_t v1_0 = wasm_v128_load(x1.qs); - const v128_t v1_1 = wasm_v128_load(y1.qs); + const v128_t v0_0 = wasm_v128_load(x0->qs); + const v128_t v0_1 = wasm_v128_load(y0->qs); + const v128_t v1_0 = wasm_v128_load(x1->qs); + const v128_t v1_1 = wasm_v128_load(y1->qs); // 4-bit -> 8-bit const v128_t v0_0l = wasm_v128_and(v0_0, m4b); From a0caa34b162449b5c13b8d604573053300ff54a1 Mon Sep 17 00:00:00 2001 From: qouoq Date: Tue, 11 Apr 2023 04:41:53 +0800 Subject: [PATCH 005/773] Add BAIR's Koala to supported models (#877) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5ef4318eb4873f..ef82855e49e924 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ New features will probably be added mostly through community contributions. - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894) +- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) **Bindings:** From 2663d2c6784ad7b77998c6874df25648d597f74b Mon Sep 17 00:00:00 2001 From: comex Date: Tue, 11 Apr 2023 06:19:54 -0700 Subject: [PATCH 006/773] Windows fixes (#890) Mostly for msys2 and mingw64 builds, which are different from each other and different from standard Visual Studio builds. Isn't Windows fun? - Define _GNU_SOURCE in more files (it's already used in ggml.c for Linux's sake). - Don't use PrefetchVirtualMemory if not building for Windows 8 or later (mingw64 doesn't by default). But warn the user about this situation since it's probably not intended. - Check for NOMINMAX already being defined, which it is on mingw64. - Actually use the `increment` variable (bug in my `pizza` PR). - Suppress unused variable warnings in the fake pthread_create and pthread_join implementations for Windows. - (not Windows-related) Remove mention of `asprintf` from comment; `asprintf` is no longer used. Fixes #871. 
--- examples/main/main.cpp | 5 +++++ ggml.c | 4 +++- llama.cpp | 5 +++++ llama_util.h | 12 +++++++++--- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d333d0dbacd205..bf756c16de19d1 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,3 +1,8 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include "common.h" #include "llama.h" diff --git a/ggml.c b/ggml.c index ada3bbbdcda801..897b67d9306149 100644 --- a/ggml.c +++ b/ggml.c @@ -1,4 +1,4 @@ -// Defines CLOCK_MONOTONIC and asprintf on Linux +// Defines CLOCK_MONOTONIC on Linux #define _GNU_SOURCE #include "ggml.h" @@ -50,6 +50,7 @@ typedef HANDLE pthread_t; typedef DWORD thread_ret_t; static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { + (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); if (handle == NULL) { @@ -61,6 +62,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void } static int pthread_join(pthread_t thread, void* unused) { + (void) unused; return (int) WaitForSingleObject(thread, INFINITE); } diff --git a/llama.cpp b/llama.cpp index 203a1adc0928a0..54ba01eefbade0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,3 +1,8 @@ +// Defines fileno on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include "llama_util.h" #include "llama.h" #include "llama_internal.h" diff --git a/llama_util.h b/llama_util.h index d68f49bd239a48..653bf71388bff2 100755 --- a/llama_util.h +++ b/llama_util.h @@ -26,7 +26,9 @@ #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN - #define NOMINMAX + #ifndef NOMINMAX + #define NOMINMAX + #endif #include #include #include // for _fseeki64 @@ -209,6 +211,7 @@ struct llama_mmap { throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()); } + #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 // Advise the kernel to preload the mapped memory WIN32_MEMORY_RANGE_ENTRY range; range.VirtualAddress = addr; @@ -217,6 +220,9 @@ struct llama_mmap { fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", llama_format_win_err(GetLastError()).c_str()); } + #else + #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") + #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 } ~llama_mmap() { @@ -338,8 +344,8 @@ struct llama_mlock { // Hopefully a megabyte is enough overhead: size_t increment = size + 1048576; // The minimum must be <= the maximum, so we need to increase both: - min_ws_size += size; - max_ws_size += size; + min_ws_size += increment; + max_ws_size += increment; if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", llama_format_win_err(GetLastError()).c_str()); From 3e6e70d8e8917b5bd14c7c9f9b89a585f1ff0b31 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Tue, 11 Apr 2023 15:03:51 +0000 Subject: [PATCH 007/773] Add enum llama_ftype, sync ggml_type to model files (#709) --- examples/quantize/quantize.cpp | 10 ++--- ggml.c | 35 ++++++++---------- ggml.h | 9 +++-- llama.cpp | 67 ++++++++++++++++++++-------------- llama.h | 10 ++++- 5 files changed, 74 insertions(+), 57 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 680757c6bf3561..5c9e2ad9420b3a 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -5,15 +5,15 @@ #include // usage: -// ./llama-quantize 
models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type +// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // int main(int argc, char ** argv) { ggml_time_init(); if (argc != 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); - fprintf(stderr, " type = 2 - q4_0\n"); - fprintf(stderr, " type = 3 - q4_1\n"); + fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0); + fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1); return 1; } @@ -27,7 +27,7 @@ int main(int argc, char ** argv) { const std::string fname_inp = argv[1]; const std::string fname_out = argv[2]; - const int itype = atoi(argv[3]); + const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]); const int64_t t_main_start_us = ggml_time_us(); @@ -37,7 +37,7 @@ int main(int argc, char ** argv) { { const int64_t t_start_us = ggml_time_us(); - if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) { + if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) { fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); return 1; } diff --git a/ggml.c b/ggml.c index 897b67d9306149..31947c4c10a91f 100644 --- a/ggml.c +++ b/ggml.c @@ -2560,29 +2560,26 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x // static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { - QK, - QK, - 1, - 1, - 1, - 1, - 1, + [GGML_TYPE_F32] = 1, + [GGML_TYPE_F16] = 1, + [GGML_TYPE_Q4_0] = QK, + [GGML_TYPE_Q4_1] = QK, + [GGML_TYPE_I8] = 1, + [GGML_TYPE_I16] = 1, + [GGML_TYPE_I32] = 1, }; - -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5"); +static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE is outdated"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { - sizeof(block_q4_0), - sizeof(block_q4_1), - sizeof(int8_t ), - sizeof(int16_t), - sizeof(int32_t), - sizeof(ggml_fp16_t), - sizeof(float ), + [GGML_TYPE_F32] = sizeof(float), + [GGML_TYPE_F16] = sizeof(ggml_fp16_t), + [GGML_TYPE_Q4_0] = sizeof(block_q4_0), + [GGML_TYPE_Q4_1] = sizeof(block_q4_1), + [GGML_TYPE_I8] = sizeof(int8_t), + [GGML_TYPE_I16] = sizeof(int16_t), + [GGML_TYPE_I32] = sizeof(int32_t), }; - -// don't forget to update the array above when adding new types -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5"); +static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", diff --git a/ggml.h b/ggml.h index a5245a8ae6256c..7d8b7a1829dd01 100644 --- a/ggml.h +++ b/ggml.h @@ -198,13 +198,14 @@ struct ggml_object; struct ggml_context; enum ggml_type { - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, + // explicitly numbered values are used in llama.cpp files + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, - GGML_TYPE_F16, - GGML_TYPE_F32, GGML_TYPE_COUNT, }; diff --git a/llama.cpp b/llama.cpp index 54ba01eefbade0..653558be94885e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -82,7 +82,7 @@ struct llama_hparams { uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; - uint32_t f16 = 1; + enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; bool operator!=(const llama_hparams & other) const { return memcmp(this, &other, sizeof(llama_hparams)); @@ -432,7 +432,7 @@ struct llama_file_loader { hparams.n_head = file.read_u32(); hparams.n_layer = file.read_u32(); hparams.n_rot = file.read_u32(); - hparams.f16 = file.read_u32(); + hparams.ftype = (enum llama_ftype) 
file.read_u32(); } void read_vocab() { vocab.id_to_token.resize(hparams.n_vocab); @@ -458,20 +458,21 @@ struct llama_file_loader { llama_load_tensor_shard shard; uint32_t n_dims = file.read_u32(); uint32_t name_len = file.read_u32(); - uint32_t ftype = file.read_u32(); + shard.type = (enum ggml_type) file.read_u32(); shard.ne.resize(n_dims); file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); std::string name = file.read_string(name_len); if (n_dims < 1 || n_dims > 2) { throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); } - switch (ftype) { - case 0: shard.type = GGML_TYPE_F32; break; - case 1: shard.type = GGML_TYPE_F16; break; - case 2: shard.type = GGML_TYPE_Q4_0; break; - case 3: shard.type = GGML_TYPE_Q4_1; break; + switch (shard.type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + break; default: { - throw format("unrecognized ftype %u\n", ftype); + throw format("unrecognized tensor type %u\n", shard.type); } } @@ -502,18 +503,18 @@ struct llama_file_loader { struct llama_file_saver { llama_file file; llama_file_loader * any_file_loader; - llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16) + llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype) : file(fname, "wb"), any_file_loader(any_file_loader) { fprintf(stderr, "llama.cpp: saving model to %s\n", fname); write_magic(); - write_hparams(new_f16); + write_hparams(new_ftype); write_vocab(); } void write_magic() { file.write_u32('ggjt'); // magic file.write_u32(1); // version } - void write_hparams(uint32_t new_f16) { + void write_hparams(enum llama_ftype new_ftype) { const llama_hparams & hparams = any_file_loader->hparams; file.write_u32(hparams.n_vocab); file.write_u32(hparams.n_embd); @@ -521,7 +522,7 @@ struct llama_file_saver { file.write_u32(hparams.n_head); file.write_u32(hparams.n_layer); file.write_u32(hparams.n_rot); - file.write_u32(new_f16); + file.write_u32(new_ftype); } void write_vocab() { if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { @@ -536,17 +537,17 @@ struct llama_file_saver { } } void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { - uint32_t ftype; switch (new_type) { - case GGML_TYPE_F32: ftype = 0; break; - case GGML_TYPE_F16: ftype = 1; break; - case GGML_TYPE_Q4_0: ftype = 2; break; - case GGML_TYPE_Q4_1: ftype = 3; break; + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + break; default: LLAMA_ASSERT(false); } file.write_u32((uint32_t) tensor.ne.size()); file.write_u32((uint32_t) tensor.name.size()); - file.write_u32(ftype); + file.write_u32(new_type); file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); file.write_raw(tensor.name.data(), tensor.name.size()); file.seek(-file.tell() & 31, SEEK_CUR); @@ -820,6 +821,16 @@ static const char *llama_file_version_name(llama_file_version version) { } } +static const char *llama_ftype_name(enum llama_ftype ftype) { + switch (ftype) { + case LLAMA_FTYPE_ALL_F32: return "all F32"; + case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; + case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; + case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; + default: LLAMA_ASSERT(false); + } +} + static const char *llama_model_type_name(e_model type) { switch (type) { case MODEL_7B: return "7B"; @@ -872,7 +883,7 @@ static void llama_model_load_internal( 
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); - fprintf(stderr, "%s: f16 = %u\n", __func__, hparams.f16); + fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); @@ -1544,17 +1555,17 @@ static llama_vocab::id llama_sample_top_p_top_k( // quantization // -static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { +static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) { ggml_type quantized_type; - switch (itype) { - case 2: quantized_type = GGML_TYPE_Q4_0; break; - case 3: quantized_type = GGML_TYPE_Q4_1; break; - default: throw format("invalid quantization type %d\n", itype); + switch (ftype) { + case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; + case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; + default: throw format("invalid output file type %d\n", ftype); }; std::unique_ptr model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, /*vocab_only*/ false)); - llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype); + llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); size_t total_size_org = 0; size_t total_size_new = 0; @@ -1745,9 +1756,9 @@ void llama_free(struct llama_context * ctx) { int llama_model_quantize( const char * fname_inp, const char * fname_out, - int itype) { + enum llama_ftype ftype) { try { - llama_model_quantize_internal(fname_inp, fname_out, itype); + llama_model_quantize_internal(fname_inp, fname_out, ftype); return 0; } catch (const std::string & err) { fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); diff --git a/llama.h b/llama.h index 42c364c6b342e6..8a0d50fb80cec6 100644 --- a/llama.h +++ b/llama.h @@ -65,6 +65,14 @@ extern "C" { void * progress_callback_user_data; }; + // model file types + enum llama_ftype { + LLAMA_FTYPE_ALL_F32 = 0, + LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + }; + LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API bool llama_mmap_supported(); @@ -85,7 +93,7 @@ extern "C" { LLAMA_API int llama_model_quantize( const char * fname_inp, const char * fname_out, - int itype); + enum llama_ftype ftype); // Returns the KV cache that will contain the context for the // ongoing prediction with the model. 
From 8b679987cdce292ff36bd741f6715e4927e26f9b Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Tue, 11 Apr 2023 21:45:44 +0200 Subject: [PATCH 008/773] Fix whitespace, add .editorconfig, add GitHub workflow (#883) --- .devops/main.Dockerfile | 2 +- .dockerignore | 2 +- .ecrc | 5 +++++ .editorconfig | 16 ++++++++++++++++ .github/ISSUE_TEMPLATE/custom.md | 16 ++++++++-------- .github/workflows/docker.yml | 2 +- .github/workflows/editorconfig.yml | 17 +++++++++++++++++ README.md | 10 +++++----- examples/Miku.sh | 12 ++++++------ examples/common.cpp | 14 +++++++------- examples/embedding/README.md | 6 +++--- examples/main/README.md | 6 +++--- examples/main/main.cpp | 2 +- examples/perplexity/README.md | 6 +++--- ggml.c | 14 +++++++------- 15 files changed, 84 insertions(+), 46 deletions(-) create mode 100644 .ecrc create mode 100644 .editorconfig create mode 100644 .github/workflows/editorconfig.yml diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile index cd575efa013d1c..2e629f8ce9a69e 100644 --- a/.devops/main.Dockerfile +++ b/.devops/main.Dockerfile @@ -15,4 +15,4 @@ FROM ubuntu:$UBUNTU_VERSION as runtime COPY --from=build /app/main /main -ENTRYPOINT [ "/main" ] \ No newline at end of file +ENTRYPOINT [ "/main" ] diff --git a/.dockerignore b/.dockerignore index 952990f2689009..462fac23a69321 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,4 +21,4 @@ models/* arm_neon.h compile_commands.json -Dockerfile \ No newline at end of file +Dockerfile diff --git a/.ecrc b/.ecrc new file mode 100644 index 00000000000000..b682057dd68916 --- /dev/null +++ b/.ecrc @@ -0,0 +1,5 @@ +{ + "Disable": { + "IndentSize": true + } +} diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000000000..df8aaf5046720a --- /dev/null +++ b/.editorconfig @@ -0,0 +1,16 @@ +# https://EditorConfig.org + +# Top-most EditorConfig file +root = true + +# Unix-style newlines with a newline ending every file, utf-8 charset +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +charset = utf-8 +indent_style = space +indent_size = 4 + +[Makefile] +indent_style = tab diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md index 0d508802d0e0ff..8fd95535677803 100644 --- a/.github/ISSUE_TEMPLATE/custom.md +++ b/.github/ISSUE_TEMPLATE/custom.md @@ -22,9 +22,9 @@ Please provide a detailed written description of what you were trying to do, and # Current Behavior -Please provide a detailed written description of what `llama.cpp` did, instead. +Please provide a detailed written description of what `llama.cpp` did, instead. -# Environment and Context +# Environment and Context Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. @@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin. llama_model_load: .......................................................................................... done llama_model_load: model size = 4869.09 MB / num tensors = 723 -system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | +system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | main: prompt: 'Please close your issue when it has been answered.' 
main: number of tokens in prompt = 11 @@ -166,14 +166,14 @@ main: total time = 246406.42 ms Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.': - 3636882.89 msec task-clock # 14.677 CPUs utilized - 13509 context-switches # 3.714 /sec - 2436 cpu-migrations # 0.670 /sec - 10476679 page-faults # 2.881 K/sec + 3636882.89 msec task-clock # 14.677 CPUs utilized + 13509 context-switches # 3.714 /sec + 2436 cpu-migrations # 0.670 /sec + 10476679 page-faults # 2.881 K/sec 13133115082869 cycles # 3.611 GHz (16.77%) 29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%) 10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%) - 23479217109614 instructions # 1.79 insn per cycle + 23479217109614 instructions # 1.79 insn per cycle # 0.44 stalled cycles per insn (16.76%) 2353072268027 branches # 647.002 M/sec (16.77%) 1998682780 branch-misses # 0.08% of all branches (16.76%) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f70821de23e36a..28402c9336156a 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -60,4 +60,4 @@ jobs: push: ${{ github.event_name == 'push' }} platforms: linux/amd64,linux/arm64 tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}" - file: ${{ matrix.config.dockerfile }} \ No newline at end of file + file: ${{ matrix.config.dockerfile }} diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml new file mode 100644 index 00000000000000..b4e535acf1f647 --- /dev/null +++ b/.github/workflows/editorconfig.yml @@ -0,0 +1,17 @@ +name: EditorConfig Checker + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + editorconfig: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: editorconfig-checker/action-editorconfig-checker@main + - run: editorconfig-checker diff --git a/README.md b/README.md index ef82855e49e924..da05ef87a05ec9 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,7 @@ There 26 letters in the English Alphabet The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis > List 5 words that start with "ca". cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. -> +> ``` ### Using [GPT4All](https://github.com/nomic-ai/gpt4all) @@ -254,17 +254,17 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. 
convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py): ```bash - python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model + python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin ``` - + - You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models - The original model is saved in the same folder with a suffix `.orig` ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data - **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.** -- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. +- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. - Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. - The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory: @@ -284,7 +284,7 @@ convert the model from the old format to the new format with [./migrate-ggml-202 - GPT-3.5 / InstructGPT / ChatGPT: - [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) - + ### Perplexity (Measuring model quality) You can use the `perplexity` example to measure perplexity over the given prompt. For more background, diff --git a/examples/Miku.sh b/examples/Miku.sh index 352478a15c829b..c4cbf80f227b56 100755 --- a/examples/Miku.sh +++ b/examples/Miku.sh @@ -19,15 +19,15 @@ GEN_OPTIONS=(--batch_size 1024 --top_p 0.5) if [ -n "$N_THREAD" ]; then - GEN_OPTIONS+=(--threads "$N_THREAD") + GEN_OPTIONS+=(--threads "$N_THREAD") fi ./main "${GEN_OPTIONS[@]}" \ - --model "$MODEL" \ - --n_predict "$N_PREDICTS" \ - --color --interactive \ - --reverse-prompt "${USER_NAME}:" \ - --prompt " + --model "$MODEL" \ + --n_predict "$N_PREDICTS" \ + --color --interactive \ + --reverse-prompt "${USER_NAME}:" \ + --prompt " This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer. ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. 
diff --git a/examples/common.cpp b/examples/common.cpp index f909eed24b3252..91d96efae67ff1 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -22,9 +22,9 @@ extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHand extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode); extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID); extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID); -extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags, - const wchar_t * lpWideCharStr, int cchWideChar, - char * lpMultiByteStr, int cbMultiByte, +extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags, + const wchar_t * lpWideCharStr, int cchWideChar, + char * lpMultiByteStr, int cbMultiByte, const char * lpDefaultChar, bool * lpUsedDefaultChar); #define CP_UTF8 65001 #endif @@ -328,9 +328,9 @@ void win32_console_init(bool enable_color) { // Convert a wide Unicode string to an UTF8 string void win32_utf8_encode(const std::wstring & wstr, std::string & str) { - int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL); - std::string strTo(size_needed, 0); - WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL); - str = strTo; + int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL); + std::string strTo(size_needed, 0); + WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL); + str = strTo; } #endif diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 21d8be65f18e3c..fe8f5dcc62ed9e 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -1,3 +1,3 @@ -# embedding - -TODO +# embedding + +TODO diff --git a/examples/main/README.md b/examples/main/README.md index 4701aa55892c34..f09e7ba9797648 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -1,3 +1,3 @@ -# main - -TODO +# main + +TODO diff --git a/examples/main/main.cpp b/examples/main/main.cpp index bf756c16de19d1..ba153cb82dcf67 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -168,7 +168,7 @@ int main(int argc, char ** argv) { } // enable interactive mode if reverse prompt or interactive start is specified - if (params.antiprompt.size() != 0 || params.interactive_start) { + if (params.antiprompt.size() != 0 || params.interactive_start) { params.interactive = true; } diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md index a932275c2366d5..eacfb17c67fb2e 100644 --- a/examples/perplexity/README.md +++ b/examples/perplexity/README.md @@ -1,3 +1,3 @@ -# perplexity - -TODO +# perplexity + +TODO diff --git a/ggml.c b/ggml.c index 31947c4c10a91f..a26b4853f7eae5 100644 --- a/ggml.c +++ b/ggml.c @@ -228,12 +228,12 @@ static inline float fp32_from_bits(uint32_t w) { } static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; } static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { @@ -1881,7 +1881,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + 
vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3)); #endif #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); From 4dbbd407500cf500ca5f47e4e947635797997c05 Mon Sep 17 00:00:00 2001 From: Nicolai Weitkemper Date: Wed, 12 Apr 2023 08:46:20 +0200 Subject: [PATCH 009/773] readme: link to sha256sums file (#902) This is to emphasize that these do not need to be obtained from elsewhere. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index da05ef87a05ec9..914c07995bb663 100644 --- a/README.md +++ b/README.md @@ -266,7 +266,7 @@ convert the model from the old format to the new format with [./migrate-ggml-202 - **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.** - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. -- Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. +- Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. 
- The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory: `sha256sum --ignore-missing -c SHA256SUMS` on Linux From 782438070f7568380755ffc7bf5e09b20c1e8272 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Apr 2023 14:31:12 +0300 Subject: [PATCH 010/773] readme : update hot topics with link to "GPU support" issue --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 914c07995bb663..ccb0f8774879bd 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/issues/914) - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784) ## Description From f76cb3a34d6a6b03afb96650e39495f201eac042 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Apr 2023 14:48:57 +0300 Subject: [PATCH 011/773] readme : change "GPU support" link to discussion --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ccb0f8774879bd..dbc088532b05dd 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** -- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/issues/914) +- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915) - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784) ## Description From e7f6997f897a18b6372a6460e25c5f89e1469f1d Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Wed, 12 Apr 2023 15:06:16 +0000 Subject: [PATCH 012/773] Don't crash on ftype (formerly f16) == 4 (#917) --- llama.cpp | 4 +++- llama.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 653558be94885e..6d8b706b982587 100644 --- a/llama.cpp +++ b/llama.cpp @@ -827,7 +827,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; - default: LLAMA_ASSERT(false); + case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + return "mostly Q4_1, some F16"; + default: return "unknown, may not work"; } } diff --git a/llama.h b/llama.h index 8a0d50fb80cec6..7a258a1e16d351 100644 --- a/llama.h +++ b/llama.h @@ -71,6 +71,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 }; LLAMA_API struct llama_context_params llama_context_default_params(); From 82d146df9b43cf677e0dbce20b03cf864958a0cc Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Thu, 13 Apr 2023 11:33:16 +0200 Subject: [PATCH 013/773] do not force the prompt file to end with a new line (#908) --- .editorconfig | 3 +++ prompts/chat-with-bob.txt | 2 +- prompts/reason-act.txt | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.editorconfig b/.editorconfig index df8aaf5046720a..135a7e4bce5a16 100644 --- a/.editorconfig +++ b/.editorconfig @@ -14,3 +14,6 @@ indent_size = 4 [Makefile] indent_style = tab + +[prompts/*.txt] +insert_final_newline = unset diff --git a/prompts/chat-with-bob.txt b/prompts/chat-with-bob.txt index 009da39aee0665..ad494d831f6fb8 100644 --- a/prompts/chat-with-bob.txt +++ 
b/prompts/chat-with-bob.txt @@ -4,4 +4,4 @@ User: Hello, Bob. Bob: Hello. How may I help you today? User: Please tell me the largest city in Europe. Bob: Sure. The largest city in Europe is Moscow, the capital of Russia. -User: +User: \ No newline at end of file diff --git a/prompts/reason-act.txt b/prompts/reason-act.txt index 8720166312efd7..a4f4f4ee665c47 100644 --- a/prompts/reason-act.txt +++ b/prompts/reason-act.txt @@ -15,4 +15,4 @@ Answer: The calculate tool says it is 9.3333333333 Question: What is capital of france? Thought: Do I need to use an action? No, I know the answer Answer: Paris is the capital of France -Question: +Question: \ No newline at end of file From 95ea26f6e92d620a5437f576b80868aee7f808d6 Mon Sep 17 00:00:00 2001 From: SebastianApel <13675545+SebastianApel@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:46:23 +0200 Subject: [PATCH 014/773] benchmark : add tool for timing q4_0 matrix multiplication (#653) * Initial version of q4_0 matrix multiplication benchmark * Bugfix: Added dependency to ggml.o to benchmark * Reviewer requests: added parameter for threads, switched to ggml_time_us() * Reviewer input: removed rtsc, use epsilon for check * Review comment: Removed set_locale * Feature: Param for numer of iterations, Bugfix for use of parameter threads * Reviewer suggestion: Moved to examples * Reviewer feedback: Updated clean: and benchmark: sections --------- Co-authored-by: Georgi Gerganov --- Makefile | 7 +- examples/benchmark/benchmark-q4_0-matmult.c | 270 ++++++++++++++++++++ 2 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 examples/benchmark/benchmark-q4_0-matmult.c diff --git a/Makefile b/Makefile index 3e58a28a751ab3..fe2f26ecb52283 100644 --- a/Makefile +++ b/Makefile @@ -149,7 +149,7 @@ common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: - rm -vf *.o main quantize quantize-stats perplexity embedding + rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult main: examples/main/main.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @@ -171,10 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o libllama.so: llama.o ggml.o $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) + # # Tests # +benchmark: ggml.o + $(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS) + ./benchmark-q4_0-matmult + .PHONY: tests tests: bash ./tests/run-tests.sh diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c new file mode 100644 index 00000000000000..9ca9b133a92902 --- /dev/null +++ b/examples/benchmark/benchmark-q4_0-matmult.c @@ -0,0 +1,270 @@ +/* + License: MIT License + + Changelog: + - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel) + +*/ + +#include +#include "ggml.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +float tensor_sum_elements(struct ggml_tensor * tensor) { + float sum = 0; + if (tensor->type==6) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; + } + } + } + return sum; +} + + +/* + These are mapping to unknown + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_COUNT, +*/ + +#define TENSOR_TYPE_AS_STR(TYPE) TYPE == 
GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN" + +#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \ + TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\ + TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \ + { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); } + +struct benchmark_params_struct { + int32_t n_threads = 1; + int32_t n_iterations = 10; +}; + +void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations); + fprintf(stderr, "\n"); +} + +int main(int argc, char ** argv) { + + + struct benchmark_params_struct benchmark_params; + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + benchmark_params.n_threads = std::stoi(argv[i]); + } else if (arg == "-i" || arg == "--iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + benchmark_params.n_iterations = std::stoi(argv[i]); + } else if (arg == "-h" || arg == "--help") { + print_usage(argc, argv, benchmark_params); + exit(0); + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv, benchmark_params); + exit(1); + } + } + + + // create the ggml context + printf("Starting Test\n"); + + + + struct ggml_context * ctx; + //const int sizex = 4096; + //const int sizey = 11008; + +#undef VERBOSE_DEBUGGING +#ifndef VERBOSE_DEBUGGING + const int sizey = 4096; + const int sizex = 11008; + const int sizez = 128; +#else + /* Working - let's increase size */ + const int sizey = 1; + const int sizex = (8*32); + const int sizez = 1; + + /*const int sizey = 1; + const int sizex = 3*(8*32); + const int sizez = 1;*/ +#endif + + //printf("Memsize required = %i\n", sizex*sizex); + ggml_type wtype = GGML_TYPE_F32; + + size_t ctx_size = 0; + ctx_size += sizex*sizey*ggml_type_sizef(wtype); + ctx_size += sizex*sizey*ggml_type_sizef(wtype); + ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += sizex*sizeof(float); + ctx_size += 1024*1024*100; + + printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024)); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /* no_alloc =*/ 0 + }; + + ctx = ggml_init(params); + if (!ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + + + printf("Creating new tensors\n"); + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m11, 1.0f); + + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m12, 1.5f); + + // printf("Creating new tensor m2\n"); + struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); + 
ggml_set_f32(m2, 2.0f); + + printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n"); + // printf("Creating new tensor m11xm2\n"); + struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph gf = ggml_build_forward(m11xm2); + + gf.n_threads=benchmark_params.n_threads; + printf("cgraph->n_threads=%i\n",gf.n_threads); + + TENSOR_DUMP(m11); + TENSOR_DUMP(m2); + + ggml_graph_compute(ctx, &gf); + + TENSOR_DUMP(gf.nodes[0]); + + printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n"); + + int32_t nelements = sizex*sizey; + int32_t ne[2] = { sizex, sizey }; + + std::vector hist_cur(1 << 4, 0); + + // Set up a the benchmark matrices + // printf("Creating new tensor q11 & Running quantize\n"); + struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); + ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data()); + + // Set up a the compute graph + // printf("Creating new tensor q31\n"); + struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph gf31 = ggml_build_forward(q31); + gf31.n_threads=benchmark_params.n_threads; + + // Set up a second graph computation to make sure we override the CPU cache lines + // printf("Creating new tensor q12 & Running quantize\n"); + struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); + ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data()); + + // printf("Creating new tensor q32\n"); + struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); + + //printf("Creating compute graph\n"); + struct ggml_cgraph gf32 = ggml_build_forward(q32); + gf32.n_threads=benchmark_params.n_threads; + printf("cgraph->n_threads=%i\n",gf31.n_threads); + + const int dimx = sizex; + const int dimy = sizey; + const int dimz = sizez; + long long int flops_per_dot_product = dimy + dimy; + long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; + printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); + + + // Let's use the F32 result from above as a reference for the q4_0 multiplication + float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); + + + printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n"); + printf("==============================================================================================\n"); + + for (int i=0;i allowed_delta) { + printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n", + sum_of_F32_reference, + sum_of_Q4_result, + delta, + allowed_delta + ); + exit(0); + } + + // Running a different graph computation to make sure we override the CPU cache lines + ggml_graph_compute(ctx, &gf32); + + } + +} From 585d91a156794d30eec16ebe67c8d7a1d41406c1 Mon Sep 17 00:00:00 2001 From: anzz1 Date: Thu, 13 Apr 2023 15:48:21 +0300 Subject: [PATCH 015/773] cmake : add explicit F16C option (x86) (#576) Fixes building for x86 processors missing F16C featureset MSVC not included, as in MSVC F16C is implied with AVX2/AVX512 --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 6bec1f97befd92..affff3ea17e3ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,10 @@ option(LLAMA_AVX "llama: enable AVX" option(LLAMA_AVX2 "llama: enable AVX2" ON) option(LLAMA_AVX512 "llama: enable AVX512" OFF) option(LLAMA_FMA "llama: enable FMA" ON) +# in MSVC F16C is implied with AVX2/AVX512 +if (NOT MSVC) + option(LLAMA_F16C "llama: enable F16C" ON) +endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) @@ -207,7 +211,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") add_compile_options(/arch:AVX) endif() else() - add_compile_options(-mf16c) + if (LLAMA_F16C) + add_compile_options(-mf16c) + endif() if (LLAMA_FMA) add_compile_options(-mfma) endif() From 107980d970808c2ccf9334ad033e2782a560b911 Mon Sep 17 00:00:00 2001 From: niansa/tuxifan Date: Thu, 13 Apr 2023 15:03:39 +0200 Subject: [PATCH 016/773] examples : add -n to alpaca and gpt4all scripts (#706) --- examples/alpaca.sh | 2 +- examples/gpt4all.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/alpaca.sh b/examples/alpaca.sh index 4c9aa5077c9ac9..8d626173030cd2 100755 --- a/examples/alpaca.sh +++ b/examples/alpaca.sh @@ -7,4 +7,4 @@ cd `dirname $0` cd .. -./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 +./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 diff --git a/examples/gpt4all.sh b/examples/gpt4all.sh index d974f95a920e19..5fd739e55c554a 100755 --- a/examples/gpt4all.sh +++ b/examples/gpt4all.sh @@ -10,6 +10,6 @@ cd .. ./main --color --instruct --threads 4 \ --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \ --file ./prompts/alpaca.txt \ - --batch_size 8 --ctx_size 2048 \ + --batch_size 8 --ctx_size 2048 -n -1 \ --repeat_last_n 64 --repeat_penalty 1.3 \ --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95 From 8c3ffc2f048a372639906fb30ec3c2070288d3be Mon Sep 17 00:00:00 2001 From: Vladimir Date: Thu, 13 Apr 2023 15:24:30 +0200 Subject: [PATCH 017/773] ggml : update cblas_sgemm columns var to be more reasonable (#838) --- ggml.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index a26b4853f7eae5..546da30d1d5391 100644 --- a/ggml.c +++ b/ggml.c @@ -6435,7 +6435,7 @@ static void ggml_compute_forward_mul_mat_f32( cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, - x, ne10, + x, ne00, 0.0f, d, ne01); } } @@ -6607,7 +6607,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, - x, ne10, + x, ne00, 0.0f, d, ne01); } } @@ -6820,7 +6820,7 @@ static void ggml_compute_forward_mul_mat_q_f32( cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, - x, ne10, + x, ne00, 0.0f, d, ne01); } } From 4579af95e8e16910f6dbab0994917a5b3901f0cf Mon Sep 17 00:00:00 2001 From: Judd Date: Thu, 13 Apr 2023 21:43:22 +0800 Subject: [PATCH 018/773] zig : update build.zig (#872) * update * update readme * minimize the changes. 
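An aside on the cblas_sgemm change in the preceding patch (ggml : update cblas_sgemm columns var to be more reasonable): in all three mul_mat paths the call computes d = y * x^T in row-major storage, and the leading dimension supplied for x has to be the length of one of its rows, which is ne00. The inner dimensions must agree, so ne00 == ne10 and the numerical result is unchanged; the edit simply passes the stride that actually belongs to x. A hedged sketch of the call shape, with parameter annotations added here for illustration only:

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                /*M=*/ne11, /*N=*/ne01, /*K=*/ne10,
                1.0f, y, /*lda=*/ne10,  // y: ne11 rows of ne10 floats
                      x, /*ldb=*/ne00,  // x: ne01 rows of ne00 floats, used transposed
                0.0f, d, /*ldc=*/ne01); // d: ne11 rows of ne01 floats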
--------- Co-authored-by: zjli2019 --- README.md | 40 +++++++++++++++++++++++++++++++--------- build.zig | 22 ++++++++-------------- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index dbc088532b05dd..c0958ebd69cb08 100644 --- a/README.md +++ b/README.md @@ -149,21 +149,43 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the step for the LLaMA-7B model: +Here are the step for the LLaMA-7B model. + +### Get the Code ```bash -# build this repo git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -make +``` + +### Build + +Note: For Windows, CMake or Zig can be used. + +1. Use `make` + + ```bash + make + ``` -#For Windows and CMake, use the following command instead: -cd -mkdir build -cd build -cmake .. -cmake --build . --config Release +1. Use CMake + ```bash + mkdir build + cd build + cmake .. + cmake --build . --config Release + ``` + +1. Use Zig + + ```bash + zig build -Drelease-fast + ``` + +### Prepare Data & Run + +```bash # obtain the original LLaMA model weights and place them in ./models ls ./models 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model diff --git a/build.zig b/build.zig index defc2c3ad4434d..306127ffe2a73f 100644 --- a/build.zig +++ b/build.zig @@ -1,16 +1,14 @@ const std = @import("std"); -pub fn build(b: *std.Build) void { +pub fn build(b: *std.build.Builder) void { const target = b.standardTargetOptions(.{}); - const optimize = b.standardOptimizeOption(.{}); + const optimize = b.standardReleaseOptions(); const want_lto = b.option(bool, "lto", "Want -fLTO"); - const lib = b.addStaticLibrary(.{ - .name = "llama", - .target = target, - .optimize = optimize, - }); + const lib = b.addStaticLibrary("llama", null); lib.want_lto = want_lto; + lib.setTarget(target); + lib.setBuildMode(optimize); lib.linkLibCpp(); lib.addIncludePath("."); lib.addIncludePath("examples"); @@ -44,16 +42,12 @@ pub fn build(b: *std.Build) void { fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep { const b = args.b; const lib = args.lib; - const target = args.target; - const optimize = args.optimize; const want_lto = args.want_lto; - const exe = b.addExecutable(.{ - .name = name, - .target = target, - .optimize = optimize, - }); + const exe = b.addExecutable(name, null); exe.want_lto = want_lto; + lib.setTarget(args.target); + lib.setBuildMode(args.optimize); exe.addIncludePath("."); exe.addIncludePath("examples"); exe.addCSourceFiles(&.{ From c729ff730a46a135817a3d9988a097e3678a9722 Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Thu, 13 Apr 2023 15:49:05 +0200 Subject: [PATCH 019/773] flake.nix: add all binaries from bin (#848) --- flake.nix | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/flake.nix b/flake.nix index cd1b6d28e78c8e..91d2edd79fb5c5 100644 --- a/flake.nix +++ b/flake.nix @@ -28,10 +28,8 @@ ]; installPhase = '' mkdir -p $out/bin - mv bin/main $out/bin/llama - mv bin/quantize $out/bin/quantize - mv bin/embedding $out/bin/embedding - mv bin/perplexity $out/bin/perplexity + mv bin/* $out/bin/ + mv $out/bin/main $out/bin/llama echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml From 7e941b95eba067cb5b92785e642fd803657376ee Mon Sep 17 00:00:00 2001 From: "Genkagaku.GPT" Date: Thu, 13 Apr 2023 21:54:27 +0800 Subject: [PATCH 020/773] readme : llama node binding (#911) * chore: add nodejs binding * chore: add nodejs binding --- README.md | 1 + 1 file 
changed, 1 insertion(+) diff --git a/README.md b/README.md index c0958ebd69cb08..a7f220eb2a71ef 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ New features will probably be added mostly through community contributions. - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) +- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) **UI:** From ec29272175d7a79681d9919f3e755b1bcefa0478 Mon Sep 17 00:00:00 2001 From: CRD716 Date: Thu, 13 Apr 2023 08:59:53 -0500 Subject: [PATCH 021/773] readme : remove python 3.10 warning (#929) --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index a7f220eb2a71ef..c88e0de2874722 100644 --- a/README.md +++ b/README.md @@ -204,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 ``` -Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11. - When running the larger models, make sure you have enough disk space to store all the intermediate files. ### Memory/Disk Requirements From 8cda5c981d0bf4dcb7664194b2cb9a06e2dbdd54 Mon Sep 17 00:00:00 2001 From: CRD716 Date: Thu, 13 Apr 2023 09:03:57 -0500 Subject: [PATCH 022/773] fix whitespace (#944) --- Makefile | 6 +- examples/benchmark/benchmark-q4_0-matmult.c | 106 ++++++++++---------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/Makefile b/Makefile index fe2f26ecb52283..c7ccf462d335c4 100644 --- a/Makefile +++ b/Makefile @@ -171,15 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o libllama.so: llama.o ggml.o $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) - + # # Tests # benchmark: ggml.o - $(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS) + $(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS) ./benchmark-q4_0-matmult - + .PHONY: tests tests: bash ./tests/run-tests.sh diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c index 9ca9b133a92902..90f537fd8ae40c 100644 --- a/examples/benchmark/benchmark-q4_0-matmult.c +++ b/examples/benchmark/benchmark-q4_0-matmult.c @@ -24,12 +24,12 @@ float tensor_sum_elements(struct ggml_tensor * tensor) { float sum = 0; - if (tensor->type==6) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; - } - } + if (tensor->type==6) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; + } + } } return sum; } @@ -39,7 +39,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) { These are mapping to unknown GGML_TYPE_I8, GGML_TYPE_I16, - GGML_TYPE_I32, + GGML_TYPE_I32, GGML_TYPE_COUNT, */ @@ -50,7 +50,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) { TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \ { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); } -struct benchmark_params_struct { +struct benchmark_params_struct { int32_t n_threads = 1; int32_t n_iterations = 10; }; @@ -67,7 +67,7 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para int main(int argc, char ** argv) { - + struct 
benchmark_params_struct benchmark_params; bool invalid_param = false; @@ -90,7 +90,7 @@ int main(int argc, char ** argv) { } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv, benchmark_params); exit(0); - } + } if (invalid_param) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); print_usage(argc, argv, benchmark_params); @@ -101,9 +101,9 @@ int main(int argc, char ** argv) { // create the ggml context printf("Starting Test\n"); - - + + struct ggml_context * ctx; //const int sizex = 4096; //const int sizey = 11008; @@ -111,31 +111,31 @@ int main(int argc, char ** argv) { #undef VERBOSE_DEBUGGING #ifndef VERBOSE_DEBUGGING const int sizey = 4096; - const int sizex = 11008; + const int sizex = 11008; const int sizez = 128; #else /* Working - let's increase size */ const int sizey = 1; - const int sizex = (8*32); + const int sizex = (8*32); const int sizez = 1; /*const int sizey = 1; - const int sizex = 3*(8*32); + const int sizex = 3*(8*32); const int sizez = 1;*/ #endif //printf("Memsize required = %i\n", sizex*sizex); - ggml_type wtype = GGML_TYPE_F32; - + ggml_type wtype = GGML_TYPE_F32; + size_t ctx_size = 0; ctx_size += sizex*sizey*ggml_type_sizef(wtype); ctx_size += sizex*sizey*ggml_type_sizef(wtype); ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += sizex*sizeof(float); - ctx_size += 1024*1024*100; - + ctx_size += 1024*1024*100; + printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024)); - + struct ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, @@ -147,88 +147,88 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } - - + + printf("Creating new tensors\n"); // printf("Creating new tensor m1\n"); struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); ggml_set_f32(m11, 1.0f); - + // printf("Creating new tensor m1\n"); struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); ggml_set_f32(m12, 1.5f); - + // printf("Creating new tensor m2\n"); struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); ggml_set_f32(m2, 2.0f); - + printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n"); // printf("Creating new tensor m11xm2\n"); struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); - + // printf("Creating compute graph\n"); struct ggml_cgraph gf = ggml_build_forward(m11xm2); - + gf.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf.n_threads); - + printf("cgraph->n_threads=%i\n",gf.n_threads); + TENSOR_DUMP(m11); TENSOR_DUMP(m2); - + ggml_graph_compute(ctx, &gf); TENSOR_DUMP(gf.nodes[0]); - + printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n"); - + int32_t nelements = sizex*sizey; int32_t ne[2] = { sizex, sizey }; - - std::vector hist_cur(1 << 4, 0); + + std::vector hist_cur(1 << 4, 0); // Set up a the benchmark matrices // printf("Creating new tensor q11 & Running quantize\n"); struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data()); - + // Set up a the compute graph // printf("Creating new tensor q31\n"); struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); - + // printf("Creating compute graph\n"); struct ggml_cgraph gf31 = 
ggml_build_forward(q31); gf31.n_threads=benchmark_params.n_threads; - - // Set up a second graph computation to make sure we override the CPU cache lines + + // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data()); // printf("Creating new tensor q32\n"); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); - + //printf("Creating compute graph\n"); struct ggml_cgraph gf32 = ggml_build_forward(q32); gf32.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf31.n_threads); - + printf("cgraph->n_threads=%i\n",gf31.n_threads); + const int dimx = sizex; const int dimy = sizey; const int dimz = sizez; long long int flops_per_dot_product = dimy + dimy; long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); - + // Let's use the F32 result from above as a reference for the q4_0 multiplication float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); - + printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n"); printf("==============================================================================================\n"); - + for (int i=0;i allowed_delta) { printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n", - sum_of_F32_reference, + sum_of_F32_reference, sum_of_Q4_result, delta, allowed_delta ); exit(0); } - - // Running a different graph computation to make sure we override the CPU cache lines + + // Running a different graph computation to make sure we override the CPU cache lines ggml_graph_compute(ctx, &gf32); - + } - + } From 6c248707f51c8a50f7792e7f7787ec481881db88 Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Thu, 13 Apr 2023 16:08:32 +0200 Subject: [PATCH 023/773] ggml : introduce GGML_ALIGNED_MALLOC/GGML_ALIGNED_FREE macros (#884) which allows us to use aligned_alloc or _aligned_malloc functions --- ggml.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 546da30d1d5391..281fd8ec9590d7 100644 --- a/ggml.c +++ b/ggml.c @@ -114,6 +114,14 @@ typedef void* thread_ret_t; #define GGML_MEM_ALIGN 16 #endif +#if defined(_MSC_VER) || defined(__MINGW32__) +#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#else +#define GGML_ALIGNED_MALLOC(size) aligned_alloc(GGML_MEM_ALIGN, size) +#define GGML_ALIGNED_FREE(ptr) free(ptr) +#endif + #define UNUSED(x) (void)(x) #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) @@ -2966,7 +2974,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { *ctx = (struct ggml_context) { /*.mem_size =*/ params.mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? 
false : true, /*.no_alloc =*/ params.no_alloc, /*.n_objects =*/ 0, @@ -3001,7 +3009,7 @@ void ggml_free(struct ggml_context * ctx) { __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); if (ctx->mem_buffer_owned) { - free(ctx->mem_buffer); + GGML_ALIGNED_FREE(ctx->mem_buffer); } found = true; From 6232f2d7fd7a22d5eeb62182b2f21fcf01359754 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Thu, 13 Apr 2023 14:59:50 +0000 Subject: [PATCH 024/773] ggml : optimize non-SIMD Q4_0 vector dot product (#703) --- ggml.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 281fd8ec9590d7..eb47d8298cae16 100644 --- a/ggml.c +++ b/ggml.c @@ -2160,18 +2160,20 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const uint8_t * restrict p0 = x[i].qs; const uint8_t * restrict p1 = y[i].qs; + int sumi = 0; for (int j = 0; j < QK/2; j++) { const uint8_t v0 = p0[j]; const uint8_t v1 = p1[j]; - const float f0 = d0*((int8_t) (v0 & 0xf) - 8); - const float f1 = d0*((int8_t) (v0 >> 4) - 8); + const int8_t i0 = (int8_t) (v0 & 0xf) - 8; + const int8_t i1 = (int8_t) (v0 >> 4) - 8; - const float f2 = d1*((int8_t) (v1 & 0xf) - 8); - const float f3 = d1*((int8_t) (v1 >> 4) - 8); + const int8_t i2 = (int8_t) (v1 & 0xf) - 8; + const int8_t i3 = (int8_t) (v1 >> 4) - 8; - sumf += f0*f2 + f1*f3; + sumi += i0*i2 + i1*i3; } + sumf += d0 * d1 * sumi; } #endif From c85980acd04631a7c43d13676276f76ec72f5dfe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Apr 2023 18:01:22 +0300 Subject: [PATCH 025/773] gitignore : benchmark --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d8dd34fb973e3b..ba5cbf1ed4374e 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ models/* /result /perplexity /embedding +/benchmark-q4_0-matmult /Pipfile arm_neon.h From 9190e8eac8bdc108c40d2d7505e9b45fa773251f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Apr 2023 18:04:45 +0300 Subject: [PATCH 026/773] llama : merge llama_internal.h into llama.h Hide it behind an #ifdef --- CMakeLists.txt | 1 - Makefile | 2 +- examples/quantize-stats/quantize-stats.cpp | 3 ++- llama.cpp | 1 - llama.h | 11 +++++++++++ llama_internal.h | 12 ------------ 6 files changed, 14 insertions(+), 16 deletions(-) delete mode 100644 llama_internal.h diff --git a/CMakeLists.txt b/CMakeLists.txt index affff3ea17e3ee..d5715d92aa8dfb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -253,7 +253,6 @@ endif() add_library(llama llama.cpp llama.h - llama_internal.h llama_util.h) target_include_directories(llama PUBLIC .) 
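Looping back to the Q4_0 change a couple of patches above (ggml : optimize non-SIMD Q4_0 vector dot product): the scalar fallback now accumulates the nibble products in an integer and applies the two block scales once per block instead of per element. Below is a minimal self-contained sketch of that path, assuming QK == 32 and the block_q4_0 layout used by ggml.c at this point (a float delta followed by QK/2 packed nibbles); the function name is illustrative, not part of the library API. The SIMD branches in the same function compute the same per-block quantity with vector instructions.

    #include <stdint.h>

    #define QK 32

    typedef struct {
        float   d;          /* scaling factor (delta)           */
        uint8_t qs[QK / 2]; /* two 4-bit quants packed per byte */
    } block_q4_0;

    /* scalar Q4_0 dot product over nb blocks, integer accumulation per block */
    static float q4_0_dot_scalar(const block_q4_0 * x, const block_q4_0 * y, int nb) {
        float sumf = 0.0f;
        for (int i = 0; i < nb; i++) {
            int sumi = 0;
            for (int j = 0; j < QK / 2; j++) {
                const uint8_t v0 = x[i].qs[j];
                const uint8_t v1 = y[i].qs[j];
                /* nibbles are stored with a +8 bias */
                const int i0 = (int) (v0 & 0xf) - 8;
                const int i1 = (int) (v0 >> 4)  - 8;
                const int i2 = (int) (v1 & 0xf) - 8;
                const int i3 = (int) (v1 >> 4)  - 8;
                sumi += i0*i2 + i1*i3;
            }
            sumf += x[i].d * y[i].d * sumi; /* scale once per block */
        }
        return sumf;
    }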
diff --git a/Makefile b/Makefile index c7ccf462d335c4..7db2466508d4eb 100644 --- a/Makefile +++ b/Makefile @@ -142,7 +142,7 @@ default: main quantize perplexity embedding ggml.o: ggml.c ggml.h $(CC) $(CFLAGS) -c ggml.c -o ggml.o -llama.o: llama.cpp llama.h llama_util.h llama_internal.h +llama.o: llama.cpp llama.h llama_util.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o common.o: examples/common.cpp examples/common.h diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 203bfe8cc10574..c786fe208c78d3 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,6 +1,7 @@ #include "ggml.h" + +#define LLAMA_API_INTERNAL #include "llama.h" -#include "llama_internal.h" #include #include diff --git a/llama.cpp b/llama.cpp index 6d8b706b982587..c7229568473dd1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5,7 +5,6 @@ #include "llama_util.h" #include "llama.h" -#include "llama_internal.h" #include "ggml.h" diff --git a/llama.h b/llama.h index 7a258a1e16d351..19221759376858 100644 --- a/llama.h +++ b/llama.h @@ -179,4 +179,15 @@ extern "C" { } #endif +// Internal API to be implemented by llama.cpp and used by tests/benchmarks only +#ifdef LLAMA_API_INTERNAL + +#include +#include +struct ggml_tensor; + +std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); + +#endif + #endif // LLAMA_H diff --git a/llama_internal.h b/llama_internal.h deleted file mode 100644 index 543eed9968e82f..00000000000000 --- a/llama_internal.h +++ /dev/null @@ -1,12 +0,0 @@ -// Internal header to be included by llama.cpp and tests/benchmarks only. - -#ifndef LLAMA_INTERNAL_H -#define LLAMA_INTERNAL_H - -#include -#include -struct ggml_tensor; - -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); - -#endif // LLAMA_INTERNAL_H From d990e3fffc5b0f5448e90a16c79a4f2675100af0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Apr 2023 18:32:36 +0300 Subject: [PATCH 027/773] ggml : speed-up ggml_vec_dot_q4_1() ARM_NEON + 32-bit ARM support (#900) * ggml : speed-up q4_1 ARM_NEON by ~5% * ggml : implement vaddvq when missing * ggml : implement vminvq and vmaxvq when missing * ggml : implement vzip when missing * ggml : fix comment * ggml : try to use correct ifdef --- ggml.c | 166 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 123 insertions(+), 43 deletions(-) diff --git a/ggml.c b/ggml.c index eb47d8298cae16..b6a24b40c52fc6 100644 --- a/ggml.c +++ b/ggml.c @@ -491,6 +491,77 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) } #endif +#if __ARM_NEON + +#if !defined(__aarch64__) + +inline static uint16_t vaddvq_u8(uint8x16_t v) { + return + (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + + (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + + (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + + (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + + (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + + (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + + (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + + (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); +} + +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + 
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static uint32_t vaddvq_u16(uint16x8_t v) { + return + (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + + (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + + (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + + (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline float vminvq_f32(float32x4_t v) { + return + MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) { + return vget_low_s8(vcombine_s8(a, b)); +} + +inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) { + return vget_high_s8(vcombine_s8(a, b)); +} + +inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + return vget_low_u8(vcombine_u8(a, b)); +} + +inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + return vget_high_u8(vcombine_u8(a, b)); +} + +#endif +#endif + // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) @@ -1218,15 +1289,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) #define GGML_F32x4_ADD vaddq_f32 #define GGML_F32x4_MUL vmulq_f32 -#if defined(__ARM_FEATURE_QRDMX) - #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#else - #define GGML_F32x4_REDUCE_ONE(x) \ - (vgetq_lane_f32(x, 0) + \ - vgetq_lane_f32(x, 1) + \ - vgetq_lane_f32(x, 2) + \ - vgetq_lane_f32(x, 3)) -#endif +#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) #define GGML_F32x4_REDUCE(res, x) \ { \ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ @@ -1849,55 +1912,43 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // 4-bit -> 8-bit const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4)); const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b)); const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4)); // sub 8 const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b); #if defined(__ARM_FEATURE_DOTPROD) - // dot product into int16x8_t + // dot product into int32x4_t int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls); int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls); p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs); p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs); - // scalar -#if 
defined(__ARM_FEATURE_QRDMX) - sum0 += x0->d * y0->d * vaddvq_s32(p_0); - sum1 += x1->d * y1->d * vaddvq_s32(p_1); -#else - sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3)); - sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3)); -#endif + sum0 += x0->d*y0->d*vaddvq_s32(p_0); + sum1 += x1->d*y1->d*vaddvq_s32(p_1); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); @@ -1910,14 +1961,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const int16x8_t p_0 = vaddq_s16(pl_0, ph_0); const int16x8_t p_1 = vaddq_s16(pl_1, ph_1); - // scalar -#if defined(__ARM_FEATURE_QRDMX) - sum0 += x0->d * y0->d * vaddvq_s16(p_0); - sum1 += x1->d * y1->d * vaddvq_s16(p_1); -#else - sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7)); - sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7)); -#endif + sum0 += x0->d*y0->d*vaddvq_s16(p_0); + sum1 += x1->d*y1->d*vaddvq_s16(p_1); #endif } @@ -2265,36 +2310,71 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest float sum10 = 0.0f; float sum11 = 0.0f; - for (int i = 0; i < nb; ++i) { + for (int i = 0; i < nb; i += 2) { const block_q4_1 * restrict x0 = &x[i + 0]; const block_q4_1 * restrict y0 = &y[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q4_1 * restrict y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t v0_0 = vld1q_u8(x0->qs); const uint8x16_t v1_0 = vld1q_u8(y0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + const uint8x16_t v1_1 = vld1q_u8(y1->qs); - // and with 0xf + // 4-bit -> 8-bit const uint8x16_t v0_0l = vandq_u8(v0_0, m4b); const uint8x16_t v1_0l = vandq_u8(v1_0, m4b); - const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4); const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4); - // dot product into uint16x8_t + const uint8x16_t v0_1l = vandq_u8(v0_1, m4b); + const uint8x16_t v1_1l = vandq_u8(v1_1, m4b); + const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4); + const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4); + + sum00 += x0->m*y0->m; + sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h)); + sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h)); + + sum00 += x1->m*y1->m; + sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h)); + sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h)); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l); + int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l); + + p_0 = vdotq_s32(p_0, v0_0h, v1_0h); + 
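// (editorial comment, not part of the patch) What the running sums in this
// loop represent: a Q4_1 value is m + d*q with q in [0,15], so for one pair of
// blocks
//   sum_j (m_x + d_x*q_xj)*(m_y + d_y*q_yj)
//     = QK*m_x*m_y                  -> accumulated in sum00
//     + m_y*d_x * sum_j q_xj        -> sum01
//     + m_x*d_y * sum_j q_yj        -> sum10
//     + d_x*d_y * sum_j q_xj*q_yj   -> sum11
// which is why the final result is sumf = QK*sum00 + sum01 + sum10 + sum11.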
p_1 = vdotq_s32(p_1, v0_1h, v1_1h); + + sum11 += x0->d*y0->d*vaddvq_s32(p_0); + sum11 += x1->d*y1->d*vaddvq_s32(p_1); +#else const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l)); const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); - const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h)); const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h)); - const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h); - const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h); + const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l)); + const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l)); + const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h)); + const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h)); - sum00 += x0->m*y0->m; - sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h)); - sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h)); - sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0)); + const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h); + const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h); + + const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h); + const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h); + + const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0); + const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1); + + sum11 += x0->d*y0->d*vaddvq_u16(p_0); + sum11 += x1->d*y1->d*vaddvq_u16(p_1); +#endif } sumf = QK*sum00 + sum01 + sum10 + sum11; From a3a2a0eda8828b60436e9f69d9ac2c1060d03e7a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Apr 2023 18:36:40 +0300 Subject: [PATCH 028/773] ggml : add GGML_DEFAULT_N_THREADS --- ggml.c | 6 +++--- ggml.h | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ggml.c b/ggml.c index b6a24b40c52fc6..42e3ee314424d5 100644 --- a/ggml.c +++ b/ggml.c @@ -9363,7 +9363,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ 0, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, /*.work_size =*/ 0, /*.work =*/ NULL, /*.nodes =*/ { NULL }, @@ -9983,8 +9983,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT("=== GRAPH ===\n"); - GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size); + GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); + GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { diff --git a/ggml.h b/ggml.h index 7d8b7a1829dd01..c06c09e060db5e 100644 --- a/ggml.h +++ b/ggml.h @@ -177,11 +177,12 @@ extern "C" { #include #include -#define GGML_MAX_DIMS 4 -#define GGML_MAX_NODES 4096 -#define GGML_MAX_PARAMS 16 -#define GGML_MAX_CONTEXTS 64 -#define GGML_MAX_OPT 4 +#define GGML_MAX_DIMS 4 +#define GGML_MAX_NODES 4096 +#define GGML_MAX_PARAMS 16 +#define GGML_MAX_CONTEXTS 64 +#define GGML_MAX_OPT 4 +#define GGML_DEFAULT_N_THREADS 4 #ifdef __ARM_NEON // we use the built-in 16-bit float type From 0e07e6a8399fd993739a3ba3c6f95f92bfab6f58 Mon Sep 17 00:00:00 2001 From: CRD716 Date: Thu, 13 Apr 2023 10:39:25 -0500 Subject: [PATCH 029/773] common : remove unnecessary includes (#947) --- examples/common.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 91d96efae67ff1..0772dbfe142ffe 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -7,12 +7,6 @@ #include 
#include -#if defined(_MSC_VER) || defined(__MINGW32__) -#include // using malloc.h with MSC/MINGW -#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -#include -#endif - #if defined (_WIN32) #include #include From be87b6ed20a5f7528bf491a83e759a9fc6a24fea Mon Sep 17 00:00:00 2001 From: Gary Linscott Date: Thu, 13 Apr 2023 14:50:42 -0700 Subject: [PATCH 030/773] perplexity : add support for batch size to `--perplexity` (#407) * Add support to batch size for perplexity * Revert "Fix memory allocation issues and seg faults" This reverts commit 4870e455b3653f7d7769fa5772b2c90ffad088df. * update from merge * Remove perplexity from main * updates * Update batch size for efficiency --- examples/perplexity/perplexity.cpp | 36 +++++++++++++++++------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index b62f00d0cf110f..38e3643b1ca5eb 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -27,20 +27,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) { int count = 0; int seq_count = tokens.size() / params.n_ctx; + int n_vocab = llama_n_vocab(ctx); double nll = 0.0; - - fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); + fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch); for (int i = 0; i < seq_count; ++i) { int start = i * params.n_ctx; - int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512 - // it is better to always be power of 2 for better performance - std::vector embd(tokens.begin() + start, tokens.begin() + end); + int end = start + params.n_ctx; + + std::vector logits; + int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch; auto start_t = std::chrono::high_resolution_clock::now(); - if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return; + for (int j = 0; j < num_batches; ++j) { + int batch_start = start + j * params.n_batch; + int batch_size = std::min(end - batch_start, params.n_batch); + if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + auto batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); } auto end_t = std::chrono::high_resolution_clock::now(); if (i == 0) { @@ -59,15 +66,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) { // Example, we have a context window of 512, we will compute perplexity for each of the // last 256 tokens. Then, we split the input up into context window size chunks to // process the entire prompt. - - auto logits = llama_get_logits(ctx); - for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) { + for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) { // Calculate probability of next token, given the previous ones. 
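// (editorial comment, not part of the patch) The code below softmaxes the
// logits at position j, looks up the probability the model assigned to the
// token that actually followed, and accumulates its negative log.  Scoring
// starts at j = min(512, n_ctx/2), so every scored token has a reasonable
// amount of preceding context even though each chunk is now evaluated in
// n_batch-sized pieces; the per-chunk perplexity then follows as
// exp(nll / count).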
- int n_vocab = llama_n_vocab(ctx); std::vector tok_logits( - logits + j * n_vocab, - logits + (j + 1) * n_vocab); - const float prob = softmax(tok_logits)[tokens[start + j + 1]]; + logits.begin() + j * n_vocab, + logits.begin() + (j + 1) * n_vocab); + float prob = softmax(tok_logits)[tokens[start + j + 1]]; nll += -std::log(prob); ++count; } @@ -82,11 +86,13 @@ int main(int argc, char ** argv) { gpt_params params; params.model = "models/llama-7B/ggml-model.bin"; + params.n_batch = 512; if (gpt_params_parse(argc, argv, params) == false) { return 1; } params.perplexity = true; + params.n_batch = std::min(params.n_batch, params.n_ctx); if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" From c5d70f5c9ea5a8f0f6b0d6aa741455978a1dabfd Mon Sep 17 00:00:00 2001 From: Howard Su Date: Fri, 14 Apr 2023 14:24:52 +0800 Subject: [PATCH 031/773] ggml : optimize rope function to avoid call powf in the tight loop (#807) --- ggml.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/ggml.c b/ggml.c index 42e3ee314424d5..e2ebc9e4b58e79 100644 --- a/ggml.c +++ b/ggml.c @@ -7507,19 +7507,20 @@ static void ggml_compute_forward_rope_f32( // row index used to determine which thread to use int ir = 0; + const float theta_scale = powf(10000.0, ((float)-2)/n_dims); + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; - + float theta = (float)p; for (int i0 = 0; i0 < n_dims; i0 += 2) { - const float theta = powf(10000.0, ((float)-i0)/n_dims); - - const float cos_theta = cosf(p*theta); - const float sin_theta = sinf(p*theta); + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + theta *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -7580,19 +7581,20 @@ static void ggml_compute_forward_rope_f16( // row index used to determine which thread to use int ir = 0; + const float theta_scale = powf(10000.0, ((float)-2)/n_dims); + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? 
n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; - + float theta = (float)p; for (int i0 = 0; i0 < n_dims; i0 += 2) { - const float theta = powf(10000.0, ((float)-i0)/n_dims); - - const float cos_theta = cosf(p*theta); - const float sin_theta = sinf(p*theta); + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + theta *= theta_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); From 0f07cacb05f49704d35a39aa27cfd4b419eb6f8d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 14 Apr 2023 09:45:42 +0300 Subject: [PATCH 032/773] ggml : fix q4_1 dot product types --- ggml.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index e2ebc9e4b58e79..d620cd11f95651 100644 --- a/ggml.c +++ b/ggml.c @@ -2344,14 +2344,14 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l); - int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l); + uint32x4_t p_0 = vdotq_u32(vdupq_n_u32(0), v0_0l, v1_0l); + uint32x4_t p_1 = vdotq_u32(vdupq_n_u32(0), v0_1l, v1_1l); - p_0 = vdotq_s32(p_0, v0_0h, v1_0h); - p_1 = vdotq_s32(p_1, v0_1h, v1_1h); + p_0 = vdotq_u32(p_0, v0_0h, v1_0h); + p_1 = vdotq_u32(p_1, v0_1h, v1_1h); - sum11 += x0->d*y0->d*vaddvq_s32(p_0); - sum11 += x1->d*y1->d*vaddvq_s32(p_1); + sum11 += x0->d*y0->d*vaddvq_u32(p_0); + sum11 += x1->d*y1->d*vaddvq_u32(p_1); #else const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l)); const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); From 723dac55fa2ba7adc6e3fc8609781d1ad0378906 Mon Sep 17 00:00:00 2001 From: comex Date: Fri, 14 Apr 2023 00:03:03 -0700 Subject: [PATCH 033/773] py : new conversion script (#545) Current status: Working, except for the latest GPTQ-for-LLaMa format that includes `g_idx`. This turns out to require changes to GGML, so for now it only works if you use the `--outtype` option to dequantize it back to f16 (which is pointless except for debugging). I also included some cleanup for the C++ code. This script is meant to replace all the existing conversion scripts (including the ones that convert from older GGML formats), while also adding support for some new formats. Specifically, I've tested with: - [x] `LLaMA` (original) - [x] `llama-65b-4bit` - [x] `alpaca-native` - [x] `alpaca-native-4bit` - [x] LLaMA converted to 'transformers' format using `convert_llama_weights_to_hf.py` - [x] `alpaca-native` quantized with `--true-sequential --act-order --groupsize 128` (dequantized only) - [x] same as above plus `--save_safetensors` - [x] GPT4All - [x] stock unversioned ggml - [x] ggmh There's enough overlap in the logic needed to handle these different cases that it seemed best to move to a single script. I haven't tried this with Alpaca-LoRA because I don't know where to find it. Useful features: - Uses multiple threads for a speedup in some cases (though the Python GIL limits the gain, and sometimes it's disk-bound anyway). - Combines split models into a single file (both the intra-tensor split of the original and the inter-tensor split of 'transformers' format files). 
Single files are more convenient to work with and more friendly to future changes to use memory mapping on the C++ side. To accomplish this without increasing memory requirements, it has some custom loading code which avoids loading whole input files into memory at once. - Because of the custom loading code, it no longer depends in PyTorch, which might make installing dependencies slightly easier or faster... although it still depends on NumPy and sentencepiece, so I don't know if there's any meaningful difference. In any case, I also added a requirements.txt file to lock the dependency versions in case of any future breaking changes. - Type annotations checked with mypy. - Some attempts to be extra user-friendly: - The script tries to be forgiving with arguments, e.g. you can specify either the model file itself or the directory containing it. - The script doesn't depend on config.json / params.json, just in case the user downloaded files individually and doesn't have those handy. But you still need tokenizer.model and, for Alpaca, added_tokens.json. - The script tries to give a helpful error message if added_tokens.json is missing. --- README.md | 4 +- convert-ggml-to-pth.py | 299 ------- convert-gpt4all-to-ggml.py | 107 --- convert-gptq-to-ggml.py | 172 ---- convert-pth-to-ggml.py | 277 +------ convert-unversioned-ggml-to-ggml.py | 100 --- convert.py | 1143 +++++++++++++++++++++++++++ migrate-ggml-2023-03-30-pr613.py | 311 -------- requirements.txt | 2 + 9 files changed, 1154 insertions(+), 1261 deletions(-) delete mode 100644 convert-ggml-to-pth.py delete mode 100644 convert-gpt4all-to-ggml.py delete mode 100644 convert-gptq-to-ggml.py delete mode 100644 convert-unversioned-ggml-to-ggml.py create mode 100644 convert.py delete mode 100644 migrate-ggml-2023-03-30-pr613.py create mode 100644 requirements.txt diff --git a/README.md b/README.md index c88e0de2874722..78215c9ce40527 100644 --- a/README.md +++ b/README.md @@ -192,10 +192,10 @@ ls ./models 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model # install Python dependencies -python3 -m pip install torch numpy sentencepiece +python3 -m pip install -r requirements.txt # convert the 7B model to ggml FP16 format -python3 convert-pth-to-ggml.py models/7B/ 1 +python3 convert.py models/7B/ # quantize the model to 4-bits (using method 2 = q4_0) ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2 diff --git a/convert-ggml-to-pth.py b/convert-ggml-to-pth.py deleted file mode 100644 index 25a44237a092aa..00000000000000 --- a/convert-ggml-to-pth.py +++ /dev/null @@ -1,299 +0,0 @@ -# Author: github.com/ductai199x -import argparse -import os -import struct - -import numpy as np -import torch -from numba import njit -from tqdm.auto import tqdm - - -def read_header(fin): - values = struct.unpack("i" * 9, fin.read(4 * 9)) - _, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values - return { - "vocab_size": vocab_size, - "dim": dim, - "multiple_of": multiple_of, - "n_heads": n_heads, - "n_layers": n_layers, - }, ftype - - -def read_tokens(fin, vocab_size): - tokens = [] - for _ in range(vocab_size): - text_len = struct.unpack("i", fin.read(4))[0] - text_bytes = fin.read(text_len) - try: - text = text_bytes.decode() - except UnicodeDecodeError: - text = text_bytes.decode(errors="replace") - score = struct.unpack("f", fin.read(4))[0] - tokens.append((text, score)) - return tokens - - -@njit -def dequantize_weights_numba(fin_data, n_rows, n_cols): - qk = 32 - nb = n_cols // qk - bs = 4 + (qk // 2) - - 
weights = np.zeros((n_rows, n_cols), dtype=np.float32) - data_pos = 0 - - for row in range(n_rows): - for block in range(nb): - d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0] - data_pos += 4 - packed_values = fin_data[data_pos : data_pos + (qk // 2)] - data_pos += qk // 2 - - for i in range(qk // 2): - packed_value = packed_values[i] - v0 = np.float32((packed_value & 0b00001111) - 8) * d - v1 = np.float32((packed_value >> 4) - 8) * d - - weights[row, block * qk + 2 * i] = v0 - weights[row, block * qk + 2 * i + 1] = v1 - - return weights - - -def dequantize_weights(fin, n_rows, n_cols): - qk = 32 - nb = n_cols // qk - data_size = n_rows * n_cols // 2 + n_rows * nb * 4 - fin_data = fin.read(data_size) - return dequantize_weights_numba(fin_data, n_rows, n_cols) - - -def read_variables(fin): - model = {} - pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables") - while True: - start_pos = fin.tell() - try: - n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3)) - except struct.error: - break - - shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims))) - shape = shape[::-1] - name = fin.read(name_length).decode() - - # ensure tensor data is aligned - tensor_data_offset = fin.tell() - tensor_data_offset = (tensor_data_offset + 31) & -32 - fin.seek(tensor_data_offset) - - if ftype_cur == 2: - # 4-bit quantized weights - dtype = np.uint8 - data = dequantize_weights(fin, shape[0], shape[1]) - data = data.reshape(shape) - elif ftype_cur == 0: - dtype = np.float32 - data_size = np.prod(shape) - data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape) - elif ftype_cur == 1: - dtype = np.float16 - data_size = np.prod(shape) - data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape) - - model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16) - - pbar.update(fin.tell() - start_pos) - - return model - - -def convert_to_hf_format(model, hparams): - # This works for llama 7B, need to test with other models - n_layers = hparams["n_layers"] - n_heads = hparams["n_heads"] - dim = hparams["dim"] - dims_per_head = dim // n_heads - base = 10000.0 - inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - - # permute for sliced rotary - def permute(w): - return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim) - - state_dict = {} - for layer_i in range(n_layers): - state_dict.update( - { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - model[f"layers.{layer_i}.attention.wq.weight"] - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - model[f"layers.{layer_i}.attention.wk.weight"] - ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": model[ - f"layers.{layer_i}.attention.wv.weight" - ], - f"model.layers.{layer_i}.self_attn.o_proj.weight": model[ - f"layers.{layer_i}.attention.wo.weight" - ], - f"model.layers.{layer_i}.mlp.gate_proj.weight": model[ - f"layers.{layer_i}.feed_forward.w1.weight" - ], - f"model.layers.{layer_i}.mlp.down_proj.weight": model[ - f"layers.{layer_i}.feed_forward.w2.weight" - ], - f"model.layers.{layer_i}.mlp.up_proj.weight": model[ - f"layers.{layer_i}.feed_forward.w3.weight" - ], - f"model.layers.{layer_i}.input_layernorm.weight": model[ - f"layers.{layer_i}.attention_norm.weight" - ], - f"model.layers.{layer_i}.post_attention_layernorm.weight": model[ - f"layers.{layer_i}.ffn_norm.weight" - ], - } - ) - 
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq - state_dict.update( - { - "model.embed_tokens.weight": model["tok_embeddings.weight"], - "model.norm.weight": model["norm.weight"], - "lm_head.weight": model["output.weight"], - } - ) - - return state_dict - - -def chat(model, hparams, llama_dir): - from transformers import (GenerationConfig, LlamaForCausalLM, - LlamaTokenizer, StoppingCriteria, - StoppingCriteriaList) - from transformers.models.llama.configuration_llama import LlamaConfig - - class StoppingCriteriaSub(StoppingCriteria): - def __init__(self): - super().__init__() - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]): - print(tokenizer.decode(input_ids[0]), end="", flush=True) - if input_ids[0][-1] == 13: - return True - - return False - - config = LlamaConfig( - vocab_size=hparams["vocab_size"], - dim=hparams["dim"], - num_hidden_layers=hparams["n_layers"], - num_attention_heads=hparams["n_heads"], - ) - - llama = LlamaForCausalLM(config=config) - llama.load_state_dict(state_dict=model, strict=True) - tokenizer = LlamaTokenizer.from_pretrained(llama_dir) - - device = torch.device("cpu") - llama = llama.to(device) - - ctx = """You are AI. -This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself. -User: Hello, AI. -AI: Hello! How can I assist you today? -""" - print(ctx.rstrip("\n")) - while True: - print("-" * 60) - prompt = input("User: ") - if ctx != "": - ctx = f"{ctx}User: {prompt}\n" - else: - ctx = f"{prompt}\nAI:" - - ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx - - print("-" * 60) - if len(ctx.strip()) > 0: - input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device) - generation_config = GenerationConfig( - temperature=0.8, - top_p=0.95, - top_k=50, - repetition_penalty=1.1764, - ) - with torch.no_grad(): - generation_output = llama.generate( - input_ids=input_ids, - generation_config=generation_config, - return_dict_in_generate=True, - output_scores=True, - max_length=2048, - do_sample=True, - stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]), - ) - s = generation_output.sequences[0] - decoded = tokenizer.decode(s) - ctx = f"{decoded}\n" - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files." - ) - parser.add_argument( - "--prefix", - "-p", - type=str, - required=True, - help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).", - ) - parser.add_argument( - "--hf", - action="store_true", - help="Whether to save the model in the Hugging Face format. (default: False)", - ) - parser.add_argument( - "--chat", "-c", action="store_true", help="Whether to open a chat with the model. 
(default: False)" - ) - args = parser.parse_args() - - llama_dir = os.path.abspath(f"{args.input_dir}/../") - - ggml_files = sorted( - [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)] - ) - - fin = open(ggml_files[0], "rb") - hparams, ftype = read_header(fin) - tokens = read_tokens(fin, hparams["vocab_size"]) - model = read_variables(fin) - - for f in tqdm(ggml_files[1:]): - fin = open(f, "rb") - read_header(fin) - read_tokens(fin, hparams["vocab_size"]) - model.update(read_variables(fin)) - - if args.hf: - model = convert_to_hf_format(model, hparams) - - pth_ckpt = { - "state_dict": model, - "hparams": hparams, - "tokens": tokens, - } - - torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth") - - if args.chat: - if not args.hf: - model = convert_to_hf_format(model, hparams) - chat(model, hparams, llama_dir) - - -if __name__ == "__main__": - main() diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py deleted file mode 100644 index b1a5e0560e0838..00000000000000 --- a/convert-gpt4all-to-ggml.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python3 - -# -# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py -# - -# Original by https://github.com/eiz -# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818 -import argparse -import glob -import os -import struct -import sys -from sentencepiece import SentencePieceProcessor - -HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"] - -def parse_args(): - parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format') - parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin') - parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file') - return parser.parse_args() - -def read_header(f_in): - struct_fmt = "i" * (3 + len(HPARAMS)) - struct_size = struct.calcsize(struct_fmt) - buf = f_in.read(struct_size) - return struct.unpack(struct_fmt, buf) - -def write_header(f_out, header): - (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header - - if magic != 0x67676d6c: - raise Exception('Invalid file magic. 
Must be an old style ggml file.') - - values = [ - 0x67676d66, # magic: ggml in hex - 1, # file version - vocab_size, - dim, - multiple_of, - n_heads, - n_layers, - rot, - ftype - ] - f_out.write(struct.pack("i" * len(values), *values)) - -def write_tokens(fout, tokenizer): - for i in range(tokenizer.vocab_size()): - if tokenizer.is_unknown(i): - text = " \u2047 ".encode() - elif tokenizer.is_control(i): - text = b"" - elif tokenizer.is_byte(i): - piece = tokenizer.id_to_piece(i) - if len(piece) != 6: - print(f"Invalid token: {piece}") - sys.exit(1) - byte_value = int(piece[3:-1], 16) - text = struct.pack("B", byte_value) - else: - text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode() - fout.write(struct.pack("i", len(text))) - fout.write(text) - fout.write(struct.pack("f", tokenizer.get_score(i))) - - # TODO: GPT4All - add extra token - text = "".encode() - fout.write(struct.pack("i", len(text))) - fout.write(text) - fout.write(struct.pack("f", 0.0)) - -def read_tokens(f_in, tokenizer): - for i in range(tokenizer.vocab_size()): - len_b = f_in.read(4) - (length,) = struct.unpack("i", len_b) - f_in.read(length) - -def copy_all_data(f_out, f_in): - while True: - buf = f_in.read(1024 * 1024) - if not buf: - break - f_out.write(buf) - -def convert_one_file(path_in, tokenizer): - path_tmp = f"{path_in}.tmp" - path_orig= f"{path_in}.orig" - print(f"converting {path_in}") - with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out: - write_header(f_out, read_header(f_in)) - read_tokens(f_in, tokenizer) - write_tokens(f_out, tokenizer) - copy_all_data(f_out, f_in) - os.rename(path_in, path_orig) - os.rename(path_tmp, path_in) - -def main(): - args = parse_args() - - tokenizer = SentencePieceProcessor(args.tokenizer_model) - - convert_one_file(args.gpt4all_model, tokenizer) - -if __name__ == "__main__": - main() diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py deleted file mode 100644 index 42e99c2ff8d3b8..00000000000000 --- a/convert-gptq-to-ggml.py +++ /dev/null @@ -1,172 +0,0 @@ -# Convert a GPTQ quantized LLaMA model to a ggml compatible file -# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa -# -import os -import re -import sys -import json -import struct -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor - -if len(sys.argv) != 4: - print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n") - sys.exit(1) - -fname_model = sys.argv[1] -fname_tokenizer = sys.argv[2] -dir_out = sys.argv[3] - -model = torch.load(fname_model, map_location="cpu") - -n_vocab, n_embd = model['model.embed_tokens.weight'].shape -n_layer = 1 + max(int(m.group(1)) for name in model - if (m := re.match(r'model\.layers\.([0-9]+)', name))) - -# hardcoded: -n_mult = 256 -n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer] - -tokenizer = SentencePieceProcessor(fname_tokenizer) - -assert tokenizer.vocab_size() == n_vocab - -fname_out = sys.argv[3] - -fout = open(fname_out, "wb") - -fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex -fout.write(struct.pack("i", 1)) # file version -fout.write(struct.pack("i", n_vocab)) -fout.write(struct.pack("i", n_embd)) -fout.write(struct.pack("i", n_mult)) -fout.write(struct.pack("i", n_head)) -fout.write(struct.pack("i", n_layer)) -fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete) -fout.write(struct.pack("i", 4)) - - -# This loop unchanged from convert-pth-to-ggml.py: -for i in range(tokenizer.vocab_size()): - if tokenizer.is_unknown(i): - text = " \u2047 ".encode() - elif 
tokenizer.is_control(i): - text = b"" - elif tokenizer.is_byte(i): - piece = tokenizer.id_to_piece(i) - if len(piece) != 6: - print(f"Invalid token: {piece}") - sys.exit(1) - byte_value = int(piece[3:-1], 16) - text = struct.pack("B", byte_value) - else: - text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode() - fout.write(struct.pack("i", len(text))) - fout.write(text) - fout.write(struct.pack("f", tokenizer.get_score(i))) - -def write_header(shape, dst_name, ftype_cur): - sname = dst_name.encode() - fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur)) - fout.write(struct.pack("i" * len(shape), *shape[::-1])) - fout.write(sname) - - # ensure tensor data is aligned - tensor_data_offset = fout.tell() - tensor_data_offset = (tensor_data_offset + 31) & -32 - fout.seek(tensor_data_offset) - -def convert_non_q4(src_name, dst_name): - v = model[src_name] - shape = v.shape - print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}") - if len(shape) == 1: - print(" Converting to float32") - v = v.to(torch.float32) - - ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype] - - # header - write_header(shape, dst_name, ftype_cur) - - # data - v.numpy().tofile(fout) - -def convert_q4(src_name, dst_name, permute=False): - zeros = model[f"{src_name}.zeros"].numpy() - scales = model[f"{src_name}.scales"].numpy() - bias = model[f"{src_name}.bias"].numpy() - qweight = model[f"{src_name}.qweight"].numpy().T # transpose - - # Q4_1 does not support bias; good thing the bias is always all zeros. - assert not np.any(bias) - - # Each int32 item is actually 8 int4 items packed together, and it's transposed. - shape = (qweight.shape[0], qweight.shape[1] * 8) - - print(f"Processing Q4 variable: {src_name} with shape: {shape}") - - # The output format has the int4 weights in groups of 32 rather than 8. - # It looks like this: - # For each row: - # For each group of 32 columns: - # - addend (float32, 4 bytes) - # - scale (float32, 4 bytes) - # - weights (int4 * 32, 16 bytes) - # Note that in the input, the scales and addends are shared between all - # the columns in a row, so we end up wasting quite a bit of memory with - # repeated scales and addends. - - addends = -zeros # flip sign - - # Since the output format is mixed between integers and floats, we have - # to hackily view the floats as int32s just so numpy will let us - # concatenate them. - addends_view = addends.view(dtype=np.int32) - scales_view = scales.view(dtype=np.int32) - - # Split into groups of 4 columns (i.e. 32 columns of quantized data): - grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4]) - - # Repeat addends and scales: - addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1) - scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1) - - blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no') - - if permute: - # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py. - # This can be done after the above conversion because it doesn't affect column order/layout. 
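# (For scale: in this layout each group of 32 columns costs 4 + 4 + 16 = 24 bytes
#  -- addend, scale, and 32 packed int4s -- so a 4096-column row of the blob
#  written below occupies 4096 // 32 * 24 = 3072 bytes, versus 2048 bytes for the
#  raw 4-bit weights alone; the difference is the repeated scales and addends
#  mentioned above.)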
- blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:]) - .swapaxes(1, 2) - .reshape(blob.shape)) - - # header - write_header(shape, dst_name, 3) # ftype = Q4_1 - - # data - blob.tofile(fout) - -convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight") -convert_non_q4("model.norm.weight", "norm.weight") -convert_non_q4("lm_head.weight", "output.weight") - -for i in range(n_layer): - convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True) - convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True) - convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight") - convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight") - - convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight") - convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight") - convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight") - - convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight") - convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight") - - -fout.close() - -print(f"Done. Output file: {fname_out}") -print() diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py index dcef2f6a322139..f87ac270cd91d6 100644 --- a/convert-pth-to-ggml.py +++ b/convert-pth-to-ggml.py @@ -1,274 +1,11 @@ -# Convert a LLaMA model checkpoint to a ggjt compatible file -# -# Load the model using Torch -# Iterate over all variables and write them to a binary file. -# -# For each variable, write the following: -# - Number of dimensions (int) -# - Name length (int) -# - Dimensions (int[n_dims]) -# - Name (char[name_length]) -# - Data (float[n_dims]) -# -# At the start of the ggml file we write the model parameters -# and vocabulary. 
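# (To illustrate the list above: a float32 "norm.weight" of shape (4096,) is
#  written as the int 1 for the dimension count, the int 11 for the name length,
#  the dimension 4096, the 11 name bytes, and then 4096 float32 values.)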
-# +# Compatibility stub import argparse -import os -import sys -import json -import struct -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor +import convert -QK = 32 - -GGML_TYPE_Q4_0 = 0 -GGML_TYPE_Q4_1 = 1 -GGML_TYPE_I8 = 2 -GGML_TYPE_I16 = 3 -GGML_TYPE_I32 = 4 -GGML_TYPE_F16 = 5 -GGML_TYPE_F32 = 6 - -WTYPES = { - 0: GGML_TYPE_F32, - 1: GGML_TYPE_F16, - 2: GGML_TYPE_Q4_0, - 3: GGML_TYPE_Q4_1, -} - -GGML_BLCK_SIZE = { - GGML_TYPE_Q4_0: QK, - GGML_TYPE_Q4_1: QK, - GGML_TYPE_I8: 1, - GGML_TYPE_I16: 1, - GGML_TYPE_I32: 1, - GGML_TYPE_F16: 1, - GGML_TYPE_F32: 1, -} - -GGML_TYPE_SIZE = { - GGML_TYPE_Q4_0: 4 + QK//2, - GGML_TYPE_Q4_1: 4*2 + QK//2, - GGML_TYPE_I8: 1, - GGML_TYPE_I16: 2, - GGML_TYPE_I32: 4, - GGML_TYPE_F16: 2, - GGML_TYPE_F32: 4, -} - -def ggml_nelements(shape): - r = 1 - for i in shape: - r *= i - return r - -def ggml_nbytes(shape, ftype): - x = ggml_nelements(shape) - t = WTYPES[ftype] - x *= GGML_TYPE_SIZE[t] - x //= GGML_BLCK_SIZE[t] - return x - -def parse_args(): - parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file') - parser.add_argument('dir_model', help='directory containing the model checkpoint') - parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1) - parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?') - return parser.parse_args() - -def get_n_parts(dim): - mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8} - n_parts = mappings.get(dim) - if n_parts is None: - print(f"Invalid dim: {dim}") - sys.exit(1) - - print(f"n_parts = {n_parts}\n") - return n_parts - -def load_hparams_and_tokenizer(dir_model): - # `dir_model` is something like `models/7B` or `models/7B/`. - # "tokenizer.model" is expected under model's parent dir. - # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found. - # Let's use the model's parent dir directly. 
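# (For example, dir_model="models/7B" or "models/7B/" both normalize to
#  "models/7B", whose dirname is "models" -- the directory that holds
#  tokenizer.model next to the 7B/ checkpoint folder.)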
- model_parent_dir = os.path.dirname(os.path.normpath(dir_model)) - fname_hparams = f"{dir_model}/params.json" - fname_tokenizer = f"{model_parent_dir}/tokenizer.model" - with open(fname_hparams, "r") as f: - hparams = json.load(f) - print(hparams) - tokenizer = SentencePieceProcessor(fname_tokenizer) - hparams.update({"vocab_size": tokenizer.vocab_size()}) - return hparams, tokenizer - -def write_header(fout, hparams, ftype): - keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"] - values = [ - 0x67676a74, # magic: ggjt in hex - 1, # file version - *[hparams[key] for key in keys], - hparams["dim"] // hparams["n_heads"], # rot (obsolete) - ftype - ] - fout.write(struct.pack("i" * len(values), *values)) - -def write_tokens(fout, tokenizer): - for i in range(tokenizer.vocab_size()): - if tokenizer.is_unknown(i): - text = " \u2047 ".encode() - elif tokenizer.is_control(i): - text = b"" - elif tokenizer.is_byte(i): - piece = tokenizer.id_to_piece(i) - if len(piece) != 6: - print(f"Invalid token: {piece}") - sys.exit(1) - byte_value = int(piece[3:-1], 16) - text = struct.pack("B", byte_value) - else: - text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode() - fout.write(struct.pack("i", len(text))) - fout.write(text) - fout.write(struct.pack("f", tokenizer.get_score(i))) - -def process_and_write_variables(fout, model, ftype, part_id, n_parts): - for name, datao in model.items(): - if name.endswith("freqs"): - continue - - # remove dimensions with a single element - data = datao.numpy().squeeze() - partshape = data.shape - n_dims = len(data.shape) - assert n_dims in (1, 2) - - print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}") - - # coerce single-dimensional tensors from float16 to float32 - ftype_cur = 1 - if ftype == 0 or n_dims == 1: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]] - type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]] - - # determine dimension along which multipart tensor is sharded - # - # split_dim 0 regex: - # - output.* - # - layers.*.attention.wq.weight - # - layers.*.attention.wk.weight - # - layers.*.attention.wv.weight - # - layers.*.feed_forward.w1.weight - # - layers.*.feed_forward.w3.weight - # - # split_dim 1 regex: - # - tok_embeddings.* - # - layers.*.attention.wo.weight - # - layers.*.feed_forward.w2.weight - # - if n_dims > 1: - split_dim = 1 - if "tok_embeddings" in name: - split_dim = 1 - elif "layers" in name: - if "attention.wo.weight" in name: - split_dim = 1 - elif "feed_forward.w2.weight" in name: - split_dim = 1 - else: - split_dim = 0 - elif "output" in name: - split_dim = 0 - - # output tensor header - fullshape = list(partshape) - if n_dims > 1: - fullshape[split_dim] *= n_parts - sname = name.encode() - fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur)) - for dim in reversed(fullshape): - fout.write(struct.pack("i", dim)) - fout.write(sname) - - # ensure tensor data is aligned - tensor_data_offset = fout.tell() - while tensor_data_offset % QK != 0: - fout.write(struct.pack("B", 0)) - tensor_data_offset += 1 - - # output unified mappable tensor data - if n_dims == 1 or n_parts == 1: - # copy tensor which we thankfully received in one piece - if part_id == 0: - data.tofile(fout) - elif split_dim == 0: - # reassemble multifile tensor containing some of the rows - rows_per_chunk = partshape[0] - current_row = part_id * rows_per_chunk - bytes_per_row = fullshape[1] // blck_size * type_size - offset = current_row 
* bytes_per_row - fout.seek(tensor_data_offset + offset) - data.tofile(fout) - elif split_dim == 1: - # reassemble multifile tensor containing some of the cols - cols_per_chunk = partshape[1] - current_col = part_id * cols_per_chunk - bytes_per_row = fullshape[1] // blck_size * type_size - offset_current_col = current_col // blck_size * type_size - for row in range(partshape[0]): - offset_row = row * bytes_per_row - offset = offset_row + offset_current_col - fout.seek(tensor_data_offset + offset) - data[row].tofile(fout) - - # advance file position to next tensor - fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur)) - -def main(): - args = parse_args() - dir_model = args.dir_model - ftype = args.ftype - ftype_str = ["f32", "f16"] - hparams, tokenizer = load_hparams_and_tokenizer(dir_model) - - print(args) - - # if only writing vocab to file - if args.vocab_only: - fname_model = f"{dir_model}/consolidated.00.pth" - fname_out = f"{dir_model}/ggml-vocab.bin" - print(f"Extracting only the vocab from '{fname_model}'\n") - with open(fname_out, "wb") as fout: - write_header(fout, hparams, ftype) - write_tokens(fout, tokenizer) - print(f"Done. Output file: {fname_out}\n") - return - - n_parts = get_n_parts(hparams["dim"]) - fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin" - - # we output a single file for ggml - with open(fname_out, "wb") as fout: - write_header(fout, hparams, ftype) - write_tokens(fout, tokenizer) - offset_of_tensors = fout.tell() - # the tensors we load could be split across multiple files - for part_id in range(n_parts): - fout.seek(offset_of_tensors) - print(f"Processing part {part_id+1} of {n_parts}\n") - fname_model = f"{dir_model}/consolidated.0{part_id}.pth" - model = torch.load(fname_model, map_location="cpu") - process_and_write_variables(fout, model, ftype, part_id, n_parts) - del model - - print(f"Done. 
Output file: {fname_out}\n") - -if __name__ == "__main__": - main() +parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file') +parser.add_argument('dir_model', help='directory containing the model checkpoint') +parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1) +args = parser.parse_args() +convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model]) diff --git a/convert-unversioned-ggml-to-ggml.py b/convert-unversioned-ggml-to-ggml.py deleted file mode 100644 index 5151d9081a6d39..00000000000000 --- a/convert-unversioned-ggml-to-ggml.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -# Original by https://github.com/eiz -# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818 -import argparse -import glob -import os -import struct -import sys -from sentencepiece import SentencePieceProcessor - -HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"] - -def parse_args(): - parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format') - parser.add_argument('dir_model', help='directory containing ggml .bin files') - parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file') - return parser.parse_args() - -def read_header(f_in): - struct_fmt = "i" * (3 + len(HPARAMS)) - struct_size = struct.calcsize(struct_fmt) - buf = f_in.read(struct_size) - return struct.unpack(struct_fmt, buf) - -def write_header(f_out, header): - (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header - - if magic != 0x67676d6c: - raise Exception('Invalid file magic. Must be an old style ggml file.') - - values = [ - 0x67676d66, # magic: ggml in hex - 1, # file version - vocab_size, - dim, - multiple_of, - n_heads, - n_layers, - rot, - ftype - ] - f_out.write(struct.pack("i" * len(values), *values)) - -def write_tokens(fout, tokenizer): - for i in range(tokenizer.vocab_size()): - if tokenizer.is_unknown(i): - text = " \u2047 ".encode() - elif tokenizer.is_control(i): - text = b"" - elif tokenizer.is_byte(i): - piece = tokenizer.id_to_piece(i) - if len(piece) != 6: - print(f"Invalid token: {piece}") - sys.exit(1) - byte_value = int(piece[3:-1], 16) - text = struct.pack("B", byte_value) - else: - text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode() - fout.write(struct.pack("i", len(text))) - fout.write(text) - fout.write(struct.pack("f", tokenizer.get_score(i))) - -def read_tokens(f_in, tokenizer): - for i in range(tokenizer.vocab_size()): - len_b = f_in.read(4) - (length,) = struct.unpack("i", len_b) - f_in.read(length) - -def copy_all_data(f_out, f_in): - while True: - buf = f_in.read(1024 * 1024) - if not buf: - break - f_out.write(buf) - -def convert_one_file(path_in, tokenizer): - path_tmp = f"{path_in}.tmp" - path_orig= f"{path_in}.orig" - print(f"converting {path_in}") - with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out: - write_header(f_out, read_header(f_in)) - read_tokens(f_in, tokenizer) - write_tokens(f_out, tokenizer) - copy_all_data(f_out, f_in) - os.rename(path_in, path_orig) - os.rename(path_tmp, path_in) - -def main(): - args = parse_args() - files = [] - files.extend(glob.glob(f"{args.dir_model}/*.bin")) - files.extend(glob.glob(f"{args.dir_model}/*.bin.*")) - - tokenizer = SentencePieceProcessor(args.tokenizer_model) - - for file in files: - convert_one_file(file, tokenizer) - -if __name__ == "__main__": - main() diff 
--git a/convert.py b/convert.py new file mode 100644 index 00000000000000..f35163f67d43eb --- /dev/null +++ b/convert.py @@ -0,0 +1,1143 @@ +import argparse +import concurrent.futures +import copy +import enum +import faulthandler +import functools +import io +import itertools +import json +import math +import mmap +import pickle +import re +import signal +import struct +import sys +import zipfile +from abc import ABCMeta, abstractmethod +from dataclasses import dataclass +from pathlib import Path +import numpy as np +from sentencepiece import SentencePieceProcessor # type: ignore +from typing import (IO, Any, Callable, Iterable, Literal, Optional, Sequence, + TypeVar, Union, List, Dict, Tuple, TYPE_CHECKING) +if TYPE_CHECKING: + from typing_extensions import TypeAlias + +if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): + faulthandler.register(signal.SIGUSR1) + +NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' + + +@dataclass(frozen=True) +class UnquantizedDataType: + name: str + + +DT_F16 = UnquantizedDataType('F16') +DT_F32 = UnquantizedDataType('F32') +DT_I32 = UnquantizedDataType('I32') +DT_BF16 = UnquantizedDataType('BF16') + + +@dataclass(frozen=True) +class QuantizedDataType: + groupsize: int + have_addends: bool + have_g_idx: bool + + +DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False) +DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False) + +DataType = Union[UnquantizedDataType, QuantizedDataType] + +DATA_TYPE_TO_FTYPE: Dict[DataType, int] = { + DT_F32: 0, + DT_F16: 1, + DT_Q4_0: 2, + DT_Q4_1: 3, +} + +FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \ + {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()} + +DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = { + DT_F16: np.dtype(np.float16), + DT_F32: np.dtype(np.float32), + DT_I32: np.dtype(np.int32), +} + +NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \ + {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()} + + +class GGMLFileType(enum.Enum): + AllF32 = 0 + MostlyF16 = 1 # except 1d tensors + MostlyQ4_0 = 2 # except 1d tensors + MostlyQ4_1 = 3 # except 1d tensors + PerLayerIsQ4_1 = 4 # but tok_embeddings.weight and output.weight are F16 + + def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType: + if len(tensor.shape) == 1: + # 1D tensors are always F32. 
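# (For example, under MostlyF16 a 2-D tensor such as
#  'layers.0.attention.wq.weight' maps to DT_F16 while the 1-D 'norm.weight'
#  stays DT_F32; under PerLayerIsQ4_1 that same wq tensor maps to DT_Q4_1 but
#  'output.weight' and 'tok_embeddings.weight' are kept as DT_F16.)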
+            return DT_F32
+        elif self == GGMLFileType.AllF32:
+            return DT_F32
+        elif self == GGMLFileType.MostlyF16:
+            return DT_F16
+        elif self == GGMLFileType.MostlyQ4_0:
+            return DT_Q4_0
+        elif self == GGMLFileType.MostlyQ4_1:
+            return DT_Q4_1
+        elif self == GGMLFileType.PerLayerIsQ4_1:
+            if name in ('output.weight', 'tok_embeddings.weight'):
+                return DT_F16
+            else:
+                return DT_Q4_1
+        else:
+            raise ValueError(self)
+
+
+def make_tensors_list() -> List[str]:
+    ret = [
+        'tok_embeddings.weight',
+        'norm.weight',
+        'output.weight',
+    ]
+    for i in range(80):  # maximum number of layers
+        ret += [
+            f'layers.{i}.attention.wq.weight',
+            f'layers.{i}.attention.wk.weight',
+            f'layers.{i}.attention.wv.weight',
+            f'layers.{i}.attention.wo.weight',
+            f'layers.{i}.attention_norm.weight',
+            f'layers.{i}.feed_forward.w1.weight',
+            f'layers.{i}.feed_forward.w2.weight',
+            f'layers.{i}.feed_forward.w3.weight',
+            f'layers.{i}.atttention_norm.weight',
+            f'layers.{i}.ffn_norm.weight',
+        ]
+    return ret
+
+
+TENSORS_LIST = make_tensors_list()
+TENSORS_SET = set(TENSORS_LIST)
+
+
+@dataclass
+class Params:
+    n_vocab: int
+    n_embd: int
+    n_mult: int
+    n_head: int
+    n_layer: int
+    file_type: GGMLFileType
+
+    @staticmethod
+    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
+        n_vocab, n_embd = model["tok_embeddings.weight"].shape
+
+        return Params(
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_mult=256,
+            n_head=n_embd // 128,
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
+            file_type=file_type,
+        )
+
+
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: Dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens))
+        else:
+            added_tokens = {}
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            text: bytes
+            if tokenizer.is_unknown(i):
+                text = " \u2047 ".encode("utf-8")
+            elif tokenizer.is_control(i):
+                text = b""
+            elif tokenizer.is_byte(i):
+                piece = tokenizer.id_to_piece(i)
+                if len(piece) != 6:
+                    raise Exception(f"Invalid token: {piece}")
+                byte_value = int(piece[3:-1], 16)
+                text = struct.pack("B", byte_value)
+            else:
+                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score
+
+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class GGMLVocab:
+    def __init__(self, tokens: List[Tuple[bytes, float]]):
+        self.tokens = tokens
+        self.vocab_size = len(tokens)
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        return self.tokens
+
+    def __repr__(self) -> str:
+        return f"<GGMLVocab with {self.vocab_size} tokens>"
+
+
+Vocab = Union[SentencePieceVocab, GGMLVocab]
+
+
+def permute(weights: NDArray, n_head: int) -> NDArray:
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+
+def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
+    # First reinterpret each row from a list of int32s containing 8 values each
+    # to a list of uint8s containing 2 values each.
+    qvalues_pack8 = qvalues_pack32.view(np.uint8)
+
+    # Then split out the two values per int8 (which requires an actual
+    # conversion because numpy doesn't natively support int4s).
+    qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
+    qvalues[:, 0::2] = qvalues_pack8 & 0xf
+    qvalues[:, 1::2] = qvalues_pack8 >> 4
+
+    assert addends is None or addends.shape == scales.shape
+    assert qvalues.shape[0] == scales.shape[0]
+    assert qvalues.shape[1] % scales.shape[1] == 0
+    if g_idx is None:
+        repeat_count = qvalues.shape[1] // scales.shape[1]
+        scales = scales[:, :, np.newaxis]
+        if addends is not None:
+            addends = addends[:, :, np.newaxis]
+        # Reshape so that the below computation broadcasts over scales and addends:
+        qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
+    else:
+        # In this case the scale and addend is selected for each column by g_idx:
+        assert addends is not None
+        scales = scales[:, g_idx]
+        addends = addends[:, g_idx]
+    if addends is None:
+        # Q4_0
+        qvalues = qvalues.view(np.int8)
+        qvalues -= 8
+    # And do the actual 'value = scale * qvalue + addend' computation.
+    values = scales * qvalues
+    if addends is not None:
+        values += addends
+    if g_idx is None:
+        values.shape = (values.shape[0], values.shape[1] * values.shape[2])
+    return values
+
+
+class Tensor(metaclass=ABCMeta):
+    data_type: DataType
+
+    @abstractmethod
+    def astype(self, data_type: DataType) -> 'Tensor': ...
+    @abstractmethod
+    def permute(self, n_head: int) -> 'Tensor': ...
+    @abstractmethod
+    def to_ggml(self) -> 'GGMLCompatibleTensor': ...
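# A quick sanity check of dequantize_q4 on a toy Q4_0 row (an illustrative
# sketch only, not part of the conversion path; it assumes the function above is
# in scope, e.g. via `from convert import dequantize_q4`):
#
# import numpy as np
#
# qs = np.arange(32, dtype=np.uint8) % 16                    # 32 four-bit codes, 0..15
# packed8 = (qs[0::2] | (qs[1::2] << 4)).astype(np.uint8)    # low nibble first, as above
# packed32 = packed8.view(np.uint32).reshape(1, -1)          # one row of 4 packed words
# scales = np.array([[0.5]], dtype=np.float32)               # one scale per 32-column group
#
# values = dequantize_q4(packed32, scales, addends=None, g_idx=None)
# assert np.allclose(values[0], 0.5 * (qs.astype(np.float32) - 8))   # Q4_0: value = scale * (q - 8)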
+ + +class UnquantizedTensor(Tensor): + def __init__(self, ndarray: NDArray) -> None: + assert isinstance(ndarray, np.ndarray) + self.ndarray = ndarray + self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] + + def astype(self, data_type: DataType) -> Tensor: + dtype = DATA_TYPE_TO_NUMPY[data_type] + return UnquantizedTensor(self.ndarray.astype(dtype)) + + def to_ggml(self) -> 'UnquantizedTensor': + return self + + def permute(self, n_head: int) -> 'UnquantizedTensor': + return UnquantizedTensor(permute(self.ndarray, n_head)) + + +def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray: + tensor = lazy_tensor.load() + assert isinstance(tensor, UnquantizedTensor) + + # double-check: + actual_shape = list(tensor.ndarray.shape) + assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape) + if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype: + if convert: + tensor.ndarray = tensor.ndarray.astype(expected_dtype) + else: + raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}') + + return tensor.ndarray + + +class GGMLQuantizedTensor(Tensor): + data_type: QuantizedDataType + + def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None: + rows, columns = shape + assert data_type in (DT_Q4_1, DT_Q4_0) # for now + assert isinstance(data_type, QuantizedDataType) # redundant, but mypy complains without this + assert columns % data_type.groupsize == 0 + words_in_block = 6 if data_type == DT_Q4_1 else 5 + self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block)) + self.shape = shape[:] + self.data_type = data_type + + def astype(self, data_type: DataType) -> Tensor: + if data_type == self.data_type: + return self + scales = self.ndarray[:, :, 0].view(np.float32) + if self.data_type.have_addends: + addends = self.ndarray[:, :, 1].view(np.float32) + else: + addends = None + qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8]) + + dq = dequantize_q4(qweights, scales, addends, g_idx=None) + return UnquantizedTensor(dq).astype(data_type) + + def to_ggml(self) -> 'GGMLQuantizedTensor': + return self + + def permute(self, n_head: int) -> 'GGMLQuantizedTensor': + return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type) + + +GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor] + + +class DeferredPermutedTensor(Tensor): + def __init__(self, base: Tensor, n_head: int) -> None: + self.base = base + self.n_head = n_head + self.data_type = self.base.data_type + + def astype(self, data_type: DataType) -> Tensor: + return self.base.astype(data_type).permute(self.n_head) + + def to_ggml(self) -> GGMLCompatibleTensor: + return self.base.to_ggml().permute(self.n_head) + + def permute(self, n_head: int) -> Tensor: + raise Exception("shouldn't permute twice") + + +class GPTQForLLaMaQuantizedTensor(Tensor): + def __init__(self, model: 'LazyModel', namebase: str) -> None: + qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32) + scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True) + + bias = model.get(f"{namebase}.bias") + if bias is not None: + # Q4_1 does not support bias; good thing the bias is always all zeros. 
+ assert not np.any(load_unquantized(bias)) + + if f"{namebase}.zeros" in model: + zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32) + else: + qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32) + assert qzeros.dtype == np.int32 + zeros = dequantize_q4(qzeros, scales, scales, g_idx=None) + assert zeros.dtype == np.float32 + + assert zeros.shape == scales.shape + + # Output is transposed compared to the input, and addends have their sign flipped. + # Scales and zeros similarly must be transposed but only for newer + # versions of GPTQ-for-LLaMa; the older versions can be identified by + # having shape (n_embd, 1). + qweight = qweight.T + if scales.shape[1] != 1: + scales = scales.T + zeros = zeros.T + + # Output also has signs flipped for the addends. + self.qweight = qweight + self.scales = scales + self.addends = -zeros + + self.g_idx: Optional[NDArray] + if f"{namebase}.g_idx" in model: + self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32) + assert self.g_idx.shape == (qweight.shape[1] * 8,) + else: + self.g_idx = None + + self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8] + self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True, + have_g_idx=(self.g_idx is not None)) + + def inspect(self, row: int, col: int) -> None: + '''For debugging.''' + qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf + if self.g_idx is not None: + group = self.g_idx[col] + else: + group = int(col // self.groupsize()) + scale = self.scales[row, group] + addend = self.addends[row, group] + with np.printoptions(precision=None, suppress=True): + print(f'scale:{scale} addend:{addend} qweight:{qweight}') + print('possible values:', np.arange(16) * scale + addend) + print('actual value:', qweight * scale + addend) + + def astype(self, data_type: DataType) -> Tensor: + if isinstance(data_type, QuantizedDataType): + assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False + return self.regroup(data_type.groupsize) + + dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx) + return UnquantizedTensor(dequantized).astype(data_type) + + def groupsize(self) -> int: + assert self.addends.shape == self.scales.shape + assert self.shape[1] % self.scales.shape[1] == 0 + return self.shape[1] // self.scales.shape[1] + + def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor': + # Old versions of GPTQ-for-LLaMa shared scales and addends between all the + # columns in a row. Newer versions share them between every set of N + # columns in a row, where N is the `groupsize` parameter, usually 128. The + # output format shares them between every set of 32 columns. To handle + # this, duplicate scales and addends for every smaller group. + # (In the above, 'row' and 'column' are in the sense of the output.) 
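# (Numerically: a 4096-column tensor quantized with groupsize 128 carries
#  4096 // 128 = 32 scales per output row; regrouping to 32 repeats each scale
#  and addend 128 // 32 = 4 times, giving the 4096 // 32 = 128 per-group values
#  that the ggml Q4_1 layout expects.)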
+ assert self.g_idx is None + old_groupsize = self.groupsize() + assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize + ret = copy.copy(self) + ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1) + ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1) + ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False) + return ret + + def permute(self, n_head: int) -> Tensor: + return DeferredPermutedTensor(self, n_head) + + def to_ggml(self) -> GGMLQuantizedTensor: + # The output format looks like this: + # For each row: + # For each group of 32 columns: + # - addend (float32, 4 bytes) + # - scale (float32, 4 bytes) + # - weights (int4 * 32, 16 bytes) + + if self.groupsize() != 32: + raise Exception("should have been regrouped before converting to ggml") + + # Since the output format is mixed between integers and floats, we have + # to hackily view the floats as int32s just so numpy will let us + # concatenate them. + addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis] + scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis] + + # Split into groups of 4 columns (i.e. 32 columns of quantized data): + grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4]) + + # And concatenate: + grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no') + + return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1) + + +@dataclass +class LazyTensor: + _load: Callable[[], Tensor] + shape: List[int] + data_type: DataType + description: str + + def load(self) -> Tensor: + ret = self._load() + assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description) + return ret + + def astype(self, data_type: DataType) -> 'LazyTensor': + self.validate_conversion_to(data_type) + + def load() -> Tensor: + return self.load().astype(data_type) + return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') + + def validate_conversion_to(self, data_type: DataType) -> None: + if data_type == self.data_type: + return + if isinstance(data_type, QuantizedDataType): + if not isinstance(self.data_type, QuantizedDataType): + raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})") + if self.data_type.have_g_idx: + sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML. For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n") + sys.exit(1) + assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends + + +LazyModel = Dict[str, LazyTensor] + + +@dataclass +class ModelPlus: + model: LazyModel + paths: List[Path] # Where this was read from. + format: Literal['ggml', 'torch', 'safetensors'] + vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab. + + +def merge_sharded(models: List[LazyModel]) -> LazyModel: + # Original LLaMA models have each file contain one part of each tensor. + # Use a dict instead of a set to preserve order. 
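# (For example, the two-file 13B checkpoint stores layers.0.attention.wq.weight
#  as two (2560, 5120) shards that get concatenated along axis 0 into
#  (5120, 5120), while tok_embeddings.weight arrives as two (32000, 2560) shards
#  that get concatenated along axis 1 into (32000, 5120).)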
+ names = {name: None for model in models for name in model} + + def convert(name: str) -> LazyTensor: + lazy_tensors: List[LazyTensor] = [model[name] for model in models] + if len(lazy_tensors) == 1: + # only one file; don't go through this procedure since there might + # be quantized tensors + return lazy_tensors[0] + if len(lazy_tensors[0].shape) == 1: + # the tensor is just duplicated in every file + return lazy_tensors[0] + if name.startswith('tok_embeddings.') or \ + name.endswith('.attention.wo.weight') or \ + name.endswith('.feed_forward.w2.weight'): + # split by columns + axis = 1 + else: + # split by rows + axis = 0 + concatenated_shape = list(lazy_tensors[0].shape) + concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors) + + def load() -> UnquantizedTensor: + ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] + concatenated: NDArray = np.concatenate(ndarrays, axis=axis) + return UnquantizedTensor(concatenated) + description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' + return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) + return {name: convert(name) for name in names} + + +def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus: + formats = set(mp.format for mp in models_plus) + assert len(formats) == 1, "different formats?" + format = formats.pop() + paths = [path for mp in models_plus for path in mp.paths] + # Use the first non-None vocab, if any. + try: + vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None) + except StopIteration: + vocab = None + + if any("model.embed_tokens.weight" in mp.model for mp in models_plus): + # Transformers models put different tensors in different files, but + # don't split indivdual tensors between files. 
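# (For example, a Transformers checkpoint split into
#  pytorch_model-00001-of-00002.bin and pytorch_model-00002-of-00002.bin keeps
#  every tensor whole inside one of the two files, so a plain dict update is
#  enough to merge them.)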
+ model: LazyModel = {} + for mp in models_plus: + model.update(mp.model) + else: + model = merge_sharded([mp.model for mp in models_plus]) + + return ModelPlus(model, paths, format, vocab) + + +def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().permute(n_head) + return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) + + +def convert_transformers_to_orig(model: LazyModel) -> LazyModel: + out: LazyModel = {} + out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] + out["norm.weight"] = model["model.norm.weight"] + out["output.weight"] = model["lm_head.weight"] + + n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128 + for i in itertools.count(): + if f"model.layers.{i}.self_attn.q_proj.weight" not in model: + break + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head) + out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] + + out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"] + out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"] + out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"] + + out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"] + out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"] + return out + + +def handle_quantization(model: LazyModel) -> LazyModel: + '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc. + (which resolve to UnquantizedTensors with the raw data) to one with entries + for 'foo.weight' (which resolve to QuantizedTensors). + ''' + def convert(name: str) -> Tuple[str, LazyTensor]: + if name.endswith(".qweight"): + namebase = name.rsplit('.', 1)[0] + orig_name = namebase + ".weight" + + lazy_tensor = model[name] + assert len(lazy_tensor.shape) == 2 + real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8] + + # Calculate type. This replicates the logic in + # GPTQForLLaMaQuantizedTensor (which is executed when the modelis + # actually loaded). + lazy_scales = model[f"{namebase}.scales"] + scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0] + assert real_shape[1] % scales_width == 0 + groupsize = real_shape[1] // scales_width + have_g_idx = f"{namebase}.g_idx" in model + data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx) + + def load() -> Tensor: + return GPTQForLLaMaQuantizedTensor(model, namebase) + + return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]')) + else: + return (name, model[name]) + return dict(convert(name) for name in model) + +# Functionality that simulates `torch.load` but where individual tensors are +# only loaded into memory on demand, not all at once. +# PyTorch can't do this natively as of time of writing: +# - https://github.com/pytorch/pytorch/issues/64327 +# This allows us to de-shard without multiplying RAM usage, and also +# conveniently drops the PyTorch dependency (though we still need numpy). 
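# The same pattern in miniature (an illustrative sketch, not code used by this
# script; _LazyArray and _concat_lazy are made-up names): wrap each tensor in a
# closure and only compose closures when merging shards, so nothing is read from
# disk until .load() is finally called.
#
# import numpy as np
# from dataclasses import dataclass
# from typing import Callable, List
#
# @dataclass
# class _LazyArray:
#     load: Callable[[], np.ndarray]   # performs the actual read when invoked
#     shape: List[int]
#
# def _concat_lazy(parts: List[_LazyArray], axis: int) -> _LazyArray:
#     shape = list(parts[0].shape)
#     shape[axis] = sum(p.shape[axis] for p in parts)
#     return _LazyArray(lambda: np.concatenate([p.load() for p in parts], axis=axis), shape)
#
# a = _LazyArray(lambda: np.ones((2, 3), dtype=np.float32), [2, 3])
# b = _LazyArray(lambda: np.zeros((2, 3), dtype=np.float32), [2, 3])
# merged = _concat_lazy([a, b], axis=0)      # no data touched yet
# assert merged.shape == [4, 3]
# assert merged.load().shape == (4, 3)       # reads happen only here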
+ + +@dataclass +class LazyStorageKind: + data_type: DataType + + +@dataclass +class LazyStorage: + load: Callable[[int, int], NDArray] + kind: LazyStorageKind + description: str + + +class LazyUnpickler(pickle.Unpickler): + def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): + super().__init__(fp) + self.data_base_path = data_base_path + self.zip_file = zip_file + + def persistent_load(self, pid: Any) -> Any: + assert pid[0] == 'storage' + assert isinstance(pid[1], LazyStorageKind) + data_type = pid[1].data_type + filename_stem = pid[2] + filename = self.data_base_path + '/' + filename_stem + info = self.zip_file.getinfo(filename) + + def load(offset: int, elm_count: int) -> NDArray: + dtype = DATA_TYPE_TO_NUMPY.get(data_type) + if dtype is None: + raise Exception("tensor stored in unsupported format") + fp = self.zip_file.open(info) + fp.seek(offset * dtype.itemsize) + size = elm_count * dtype.itemsize + data = fp.read(size) + assert len(data) == size + return np.frombuffer(data, dtype) + description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' + return LazyStorage(load=load, kind=pid[1], description=description) + + def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName] + requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: + assert isinstance(storage, LazyStorage) + + def load() -> UnquantizedTensor: + elm_count = stride[0] * size[0] + return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size)) + description = f'pickled storage_offset={storage_offset} in {storage.description}' + return LazyTensor(load, list(size), storage.kind.data_type, description) + + CLASSES: Dict[Any, Any] = { + ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2, + ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), + ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), + ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), + ('torch', 'IntStorage'): LazyStorageKind(DT_I32), + } + + def find_class(self, module: str, name: str) -> Any: + if not module.startswith('torch'): + return super().find_class(module, name) + return self.CLASSES[(module, name)] + + +def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: + zf = zipfile.ZipFile(outer_fp) + pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] + assert len(pickle_paths) == 1, pickle_paths + pickle_fp = zf.open(pickle_paths[0], 'r') + unpickler = LazyUnpickler(pickle_fp, + data_base_path=pickle_paths[0][:-4], + zip_file=zf) + model = unpickler.load() + as_dict = dict(model.items()) + return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) + + +SAFETENSORS_DATA_TYPES: Dict[str, DataType] = { + 'F16': DT_F16, + 'F32': DT_F32, + 'I32': DT_I32, +} + + +def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: + header_size, = struct.unpack(' LazyTensor: + data_type = SAFETENSORS_DATA_TYPES[info['dtype']] + numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] + shape: List[int] = info['shape'] + begin, end = info['data_offsets'] + assert 0 <= begin <= end <= len(byte_buf) + assert end - begin == math.prod(shape) * numpy_dtype.itemsize + buf = byte_buf[begin:end] + + def load() -> UnquantizedTensor: + return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) + description = f'safetensors begin={begin} end={end} type={data_type} path={path}' + return LazyTensor(load, shape, data_type, 
description) + model = {name: convert(info) for (name, info) in header.items()} + return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None) + + +def must_read(fp: IO[bytes], length: int) -> bytes: + ret = fp.read(length) + if len(ret) < length: + raise Exception("unexpectedly reached end of file") + return ret + + +def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus: + magic = must_read(fp, 4)[::-1] + if magic in (b'ggmf', b'ggjt'): + version, = struct.unpack("i", must_read(fp, 4)) + assert version == 1 + else: + assert magic == b'ggml' + version = None + n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28)) + + tokens: List[Tuple[bytes, float]] = [] + for i in range(n_vocab): + if i == 32000: + # HACK: GPT4All messed with the format without changing the magic + # number. Specifically, they changed the vocab section to contain + # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the + # extra pad token). Try to detect if we're reading a file like + # this. + orig_pos = fp.tell() + fp.seek(20, io.SEEK_CUR) + is_gpt4all = fp.read(21) == b'tok_embeddings.weight' + fp.seek(orig_pos) + if is_gpt4all: + break + + length, = struct.unpack("i", must_read(fp, 4)) + text = must_read(fp, length) + if magic != b'ggml': + score, = struct.unpack("f", must_read(fp, 4)) + tokens.append((text, score)) + vocab = GGMLVocab(tokens) if magic != b'ggml' else None + + model: LazyModel = {} + # Use mmap for the actual data to avoid race conditions with the file offset. + mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) + + def read_tensor() -> None: # this is a function so that variables captured in `load` don't change + shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12)) + assert 0 <= shape_len <= 3 + shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len))) + shape = shape[::-1] + name = must_read(fp, name_len).decode('utf-8') + data_type = FTYPE_TO_DATA_TYPE[ftype] + + if magic == b'ggjt': + fp.seek((fp.tell() + 31) & -32) + + if data_type == DT_Q4_1: + # See GPTQForLLaMaQuantizedTensor.ggml_ndarray() + size = 24 * (shape[1] // 32) * shape[0] + elif data_type == DT_Q4_0: + size = 20 * (shape[1] // 32) * shape[0] + else: + numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] + elm_count = math.prod(shape) + size = elm_count * numpy_dtype.itemsize + offset = fp.tell() + buf = mapped[offset:offset+size] + fp.seek(size, io.SEEK_CUR) + + def load() -> Tensor: + if isinstance(data_type, QuantizedDataType): + ndarray = np.frombuffer(buf, dtype=np.uint32) + return GGMLQuantizedTensor(ndarray, shape, data_type) + else: + return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) + description = f'ggml offset={offset} type={data_type} path={path}' + model[name] = LazyTensor(load, shape, data_type, description) + + while fp.read(1) != b'': + fp.seek(-1, io.SEEK_CUR) + read_tensor() + + return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab) + + +@functools.lru_cache(maxsize=None) +def lazy_load_file(path: Path) -> ModelPlus: + fp = open(path, 'rb') + first8 = fp.read(8) + fp.seek(0) + if first8[:2] == b'PK': + # A zip file, i.e. PyTorch format + return lazy_load_torch_file(fp, path) + elif first8[2:4] == b'gg': + # GGML format + return lazy_load_ggml_file(fp, path) + elif struct.unpack(' Iterable[Out]: + '''Parallel map, but with backpressure. 
If the caller doesn't call `next` + fast enough, this will stop calling `func` at some point rather than + letting results pile up in memory. Specifically, there is a max of one + output value buffered per thread.''' + with concurrent.futures.ThreadPoolExecutor() as executor: + futures: List[concurrent.futures.Future[Out]] = [] + items_rev = list(iterable)[::-1] + for i in range(min(concurrency, len(items_rev))): + futures.append(executor.submit(func, items_rev.pop())) + while futures: + result = futures.pop(0).result() + if items_rev: + futures.append(executor.submit(func, items_rev.pop())) + yield result + + +def check_vocab_size(params: Params, vocab: Vocab) -> None: + if params.n_vocab != vocab.vocab_size: + # GGMLVocab comes from the same file as the model so shouldn't mismatch: + assert isinstance(vocab, SentencePieceVocab) + if params.n_vocab == vocab.vocab_size_base: + print("Ignoring added_tokens.json since model matches vocab size without it.") + vocab.added_tokens_list = [] + vocab.vocab_size = vocab.vocab_size_base + return + msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" + if vocab.fname_added_tokens is not None: + msg += f" combined with {vocab.fname_added_tokens}" + msg += f" has {vocab.vocab_size})." + if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None: + msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." + raise Exception(msg) + + +class OutputFile: + def __init__(self, fname_out: Path) -> None: + self.fout = open(fname_out, "wb") + + def write_file_header(self, params: Params) -> None: + self.fout.write(b"ggjt"[::-1]) # magic + values = [ + 1, # file version + params.n_vocab, + params.n_embd, + params.n_mult, + params.n_head, + params.n_layer, + params.n_embd // params.n_head, # rot (obsolete) + params.file_type.value, + ] + self.fout.write(struct.pack("i" * len(values), *values)) + + def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None: + sname = name.encode('utf-8') + self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type])) + self.fout.write(struct.pack("i" * len(shape), *shape[::-1])) + self.fout.write(sname) + self.fout.seek((self.fout.tell() + 31) & -32) + + def write_vocab(self, vocab: Vocab) -> None: + for text, score in vocab.all_tokens(): + self.fout.write(struct.pack("i", len(text))) + self.fout.write(text) + self.fout.write(struct.pack("f", score)) + + @staticmethod + def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: + of = OutputFile(fname_out) + params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, + n_head=1, n_layer=0, file_type=GGMLFileType.AllF32) + of = OutputFile(fname_out) + of.write_file_header(params) + of.write_vocab(vocab) + of.fout.close() + + @staticmethod + def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None: + check_vocab_size(params, vocab) + of = OutputFile(fname_out) + of.write_file_header(params) + print("Writing vocab...") + of.write_vocab(vocab) + + def do_item(item: Tuple[str, LazyTensor]) -> NDArray: + name, lazy_tensor = item + return lazy_tensor.load().to_ggml().ndarray + + ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + size = ' x '.join(map(str, lazy_tensor.shape)) + print(f"[{i+1}/{len(model)}] Writing tensor {name}, size {size}...") + of.write_tensor_header(name, 
lazy_tensor.shape, lazy_tensor.data_type) + ndarray.tofile(of.fout) + of.fout.close() + + +def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType: + wq_type = model["layers.0.attention.wq.weight"].data_type + if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): + return GGMLFileType.AllF32 + if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): + return GGMLFileType.MostlyF16 + if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) and + wq_type.have_addends): + if isinstance(model["output.weight"].data_type, QuantizedDataType): + return GGMLFileType.MostlyQ4_1 + else: + return GGMLFileType.PerLayerIsQ4_1 + if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)): + return GGMLFileType.MostlyQ4_0 + name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} + raise Exception(f"Unexpected combination of types: {name_to_type}") + + +def do_necessary_conversions(model: LazyModel) -> LazyModel: + model = handle_quantization(model) + + if "lm_head.weight" in model: + model = convert_transformers_to_orig(model) + model = filter_and_sort_tensors(model) + + return model + + +def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: + return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) + for (name, tensor) in model.items()} + + +def nth_multifile_path(path: Path, n: int) -> Optional[Path]: + '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return + the nth path in the model. + ''' + # Support the following patterns: + patterns: List[Tuple[str, str]] = [ + # - x.00.pth, x.01.pth, etc. + (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), + # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. + (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'), + # x.bin, x.bin.1, etc. + (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}') + ] + for regex, replacement in patterns: + if re.search(regex, path.name): + new_path = path.with_name(re.sub(regex, replacement, path.name)) + if new_path.exists(): + return new_path + return None + + +def find_multifile_paths(path: Path) -> List[Path]: + '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return + the whole list of paths in the model. + ''' + ret: List[Path] = [] + for i in itertools.count(): + nth_path = nth_multifile_path(path, i) + if nth_path is None: + break + ret.append(nth_path) + if not ret: + # No matches. This should only happen if the file was named, e.g., + # foo.0, and there was no file named foo. Oh well, try to process it + # as a single file. + return [path] + return ret + + +def load_some_model(path: Path) -> ModelPlus: + '''Load a model of any supported format.''' + # Be extra-friendly and accept either a file or a directory: + if path.is_dir(): + globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"] + files = [file for glob in globs for file in path.glob(glob)] + if not files: + # Try GGML too, but with lower priority, since if both a non-GGML + # model and a GGML model exist in the same directory, we assume the + # latter was converted from the former. 
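# (So a directory containing both consolidated.00.pth and ggml-model-f16.bin
#  resolves to the .pth file; the GGML glob below is only consulted when none of
#  the globs above matched anything.)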
+ files = list(path.glob("ggml-model*.bin*")) + if not files: + raise Exception(f"Can't find model in directory {path}") + if len(files) > 1: + raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") + path = files[0] + + paths = find_multifile_paths(path) + models_plus: List[ModelPlus] = [] + for path in paths: + print(f"Loading model file {path}") + models_plus.append(lazy_load_file(path)) + + model_plus = merge_multifile_models(models_plus) + return model_plus + + +def filter_and_sort_tensors(model: LazyModel) -> LazyModel: + return {name: model[name] for name in TENSORS_LIST if name in model} + + +def load_vocab(path: Path) -> SentencePieceVocab: + # Be extra-friendly and accept either a file or a directory. Also, if it's + # a directory, it might be the model directory, and tokenizer.model might + # be in the parent of that. + if path.is_dir(): + path2 = path / "tokenizer.model" + # Use `.parent` instead of /.. to handle the symlink case better. + path3 = path.parent / "tokenizer.model" + if path2.exists(): + path = path2 + elif path3.exists(): + path = path3 + else: + raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir") + added_tokens_path = path.parent / "added_tokens.json" + print(f"Loading vocab file {path}") + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) + + +def default_outfile(model_paths: List[Path], params: Params) -> Path: + namestr = { + GGMLFileType.AllF32: "f32", + GGMLFileType.MostlyF16: "f16", + GGMLFileType.MostlyQ4_1: "q4_1", + GGMLFileType.PerLayerIsQ4_1: "q4_1", + }[params.file_type] + ret = model_paths[0].parent / f"ggml-model-{namestr}.bin" + if ret in model_paths: + sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input. 
Please explicitly specify a path using --outfile.\n") + sys.exit(1) + return ret + + +def do_dump_model(model_plus: ModelPlus) -> None: + print(f"model_plus.paths = {model_plus.paths!r}") + print(f"model_plus.format = {model_plus.format!r}") + print(f"model_plus.vocab = {model_plus.vocab!r}") + for name, lazy_tensor in model_plus.model.items(): + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") + + +def main(args_in: Optional[List[str]] = None) -> None: + parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + args = parser.parse_args(args_in) + + vocab: Vocab + if args.dump_single: + model_plus = lazy_load_file(args.model) + do_dump_model(model_plus) + elif args.vocab_only: + vocab = load_vocab(args.vocab_dir or args.model) + assert args.outfile, "need --outfile if using --vocab-only" + outfile = args.outfile + OutputFile.write_vocab_only(outfile, vocab) + print(f"Wrote {outfile}") + else: + model_plus = load_some_model(args.model) + if args.dump: + do_dump_model(model_plus) + return + if model_plus.vocab is not None and args.vocab_dir is None: + vocab = model_plus.vocab + else: + vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent + vocab = load_vocab(vocab_dir) + model = model_plus.model + model = do_necessary_conversions(model) + output_type = pick_output_type(model, args.outtype) + model = convert_to_output_type(model, output_type) + params = Params.guessed(model, output_type) + outfile = args.outfile or default_outfile(model_plus.paths, params) + OutputFile.write_all(outfile, params, model, vocab) + print(f"Wrote {outfile}") + + +if __name__ == '__main__': + main() diff --git a/migrate-ggml-2023-03-30-pr613.py b/migrate-ggml-2023-03-30-pr613.py deleted file mode 100644 index b6ef2476e052df..00000000000000 --- a/migrate-ggml-2023-03-30-pr613.py +++ /dev/null @@ -1,311 +0,0 @@ -# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic -# -# We caused a breaking change to the file format on 2023-03-30 in: -# https://github.com/ggerganov/llama.cpp/pull/613 -# -# (1) If you still have the Meta LLaMA .pth files, then close this -# file now; you can just run `convert-pth-to-ggml.py` again to -# migrate to the new format. The tool is easier to use too. It -# isn't necessary anymore to manage split output files because -# the new format always combines things into a single file. -# -# (2) If you deleted the Meta LLaMA .pth files due to save on disk -# space, then this tool is intended to help you. Please check -# out the instructions below. 
-# -# USAGE -# -# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT -# -# PREREQUISITES -# -# pip install numpy -# cd llama.cpp -# make -j4 -# -# EXAMPLE (7B MODEL) -# -# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights -# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin -# -# # check that it works -# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?' -# -# # you can delete the old files -# rm -f models/7B/ggml-model-f16.bin -# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin -# -# EXAMPLE (13B MODEL) -# -# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights -# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin -# -# # check that it works -# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?' -# -# # you can delete the old files -# rm -f models/13B/ggml-model-f16.bin* -# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin -# - -import argparse -import os -import sys -import json -import struct -import numpy as np - -QK = 32 - -GGML_TYPE_Q4_0 = 0 -GGML_TYPE_Q4_1 = 1 -GGML_TYPE_I8 = 2 -GGML_TYPE_I16 = 3 -GGML_TYPE_I32 = 4 -GGML_TYPE_F16 = 5 -GGML_TYPE_F32 = 6 - -WTYPE_NAMES = { - 0: "F32", - 1: "F16", - 2: "Q4_0", - 3: "Q4_1", -} - -WTYPES = { - 0: GGML_TYPE_F32, - 1: GGML_TYPE_F16, - 2: GGML_TYPE_Q4_0, - 3: GGML_TYPE_Q4_1, -} - -GGML_BLCK_SIZE = { - GGML_TYPE_Q4_0: QK, - GGML_TYPE_Q4_1: QK, - GGML_TYPE_I8: 1, - GGML_TYPE_I16: 1, - GGML_TYPE_I32: 1, - GGML_TYPE_F16: 1, - GGML_TYPE_F32: 1, -} - -GGML_TYPE_SIZE = { - GGML_TYPE_Q4_0: 4 + QK//2, - GGML_TYPE_Q4_1: 4*2 + QK//2, - GGML_TYPE_I8: 1, - GGML_TYPE_I16: 2, - GGML_TYPE_I32: 4, - GGML_TYPE_F16: 2, - GGML_TYPE_F32: 4, -} - -HPARAMS = [ - 'magic', # int32 - 'version', # int32 - 'n_vocab', # int32 - 'n_embd', # int32 - 'n_mult', # int32 - 'n_head', # int32 - 'n_layer', # int32 - 'n_rot', # int32 - 'f16', # int32 -] - -def read_hparams(fin): - struct_fmt = "i" * len(HPARAMS) - struct_size = struct.calcsize(struct_fmt) - buf = fin.read(struct_size) - ints = struct.unpack(struct_fmt, buf) - hparams = dict(zip(HPARAMS, ints)) - return hparams - -def write_hparams(fout, hparams): - struct_fmt = "i" * len(HPARAMS) - struct_size = struct.calcsize(struct_fmt) - ints = [hparams[h] for h in HPARAMS] - fout.write(struct.pack(struct_fmt, *ints)) - -def read_tokens(fin, hparams): - tokens = [] - for i in range(hparams['n_vocab']): - len_b = fin.read(4) - (length,) = struct.unpack("i", len_b) - word = fin.read(length) - score_b = fin.read(4) - (score,) = struct.unpack("f", score_b) - tokens.append((word, score)) - return tokens - -def write_tokens(fout, tokens): - for word, score in tokens: - fout.write(struct.pack("i", len(word))) - fout.write(word) - fout.write(struct.pack("f", score)) - -def ggml_nelements(shape): - r = 1 - for i in shape: - r *= i - return r - -def ggml_nbytes(shape, ftype): - x = ggml_nelements(shape) - t = WTYPES[ftype] - x *= GGML_TYPE_SIZE[t] - x //= GGML_BLCK_SIZE[t] - return x - -def copy_tensors(fin, fout, part_id, n_parts): - while True: - - b = fin.read(4) - if not b: break - (n_dims,) = struct.unpack("i", b) - b = fin.read(4) - (length,) = struct.unpack("i", b) - b = fin.read(4) - (ftype,) = struct.unpack("i", b) - - assert n_dims in (1, 2) - - partshape = list(range(n_dims)) - for i in range(n_dims): - b = fin.read(4) - partshape[i] = struct.unpack("i", b)[0] - partshape = 
list(reversed(partshape)) - - name = fin.read(length) - data = fin.read(ggml_nbytes(partshape, ftype)) - - blck_size = GGML_BLCK_SIZE[WTYPES[ftype]] - type_size = GGML_TYPE_SIZE[WTYPES[ftype]] - - print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}") - - # determine dimension along which multipart tensor is sharded - # - # split_dim 0 regex: - # - output.* - # - layers.*.attention.wq.weight - # - layers.*.attention.wk.weight - # - layers.*.attention.wv.weight - # - layers.*.feed_forward.w1.weight - # - layers.*.feed_forward.w3.weight - # - # split_dim 1 regex: - # - tok_embeddings.* - # - layers.*.attention.wo.weight - # - layers.*.feed_forward.w2.weight - # - if n_dims > 1: - split_dim = 1 - if b"tok_embeddings" in name: - split_dim = 1 - elif b"layers" in name: - if b"attention.wo.weight" in name: - split_dim = 1 - elif b"feed_forward.w2.weight" in name: - split_dim = 1 - else: - split_dim = 0 - elif b"output" in name: - split_dim = 0 - - # output tensor header - fullshape = list(partshape) - if n_dims > 1: - fullshape[split_dim] *= n_parts - fout.write(struct.pack("iii", n_dims, len(name), ftype)) - for dim in reversed(fullshape): - fout.write(struct.pack("i", dim)) - fout.write(name) - - # ensure tensor data is aligned - tensor_data_offset = fout.tell() - while tensor_data_offset % QK != 0: - fout.write(struct.pack("B", 0)) - tensor_data_offset += 1 - - # output unified mappable tensor data - if n_dims == 1 or n_parts == 1: - # copy tensor which we thankfully received in one piece - if part_id == 0: - fout.write(data) - elif split_dim == 0: - # reassemble multifile tensor containing some of the rows - rows_per_chunk = partshape[0] - current_row = part_id * rows_per_chunk - bytes_per_row = fullshape[1] // blck_size * type_size - offset = current_row * bytes_per_row - fout.seek(tensor_data_offset + offset) - fout.write(data) - elif split_dim == 1: - # reassemble multifile tensor containing some of the cols - cols_per_chunk = partshape[1] - current_col = part_id * cols_per_chunk - bpr = partshape[1] // blck_size * type_size - bytes_per_row = fullshape[1] // blck_size * type_size - offset_current_col = current_col // blck_size * type_size - for row in range(partshape[0]): - offset_row = row * bytes_per_row - offset = offset_row + offset_current_col - fout.seek(tensor_data_offset + offset) - fout.write(data[row * bpr:row * bpr + bpr]) - - # advance file position to next tensor - fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype)) - -def parse_args(): - parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format') - parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)') - parser.add_argument('fout_path', help='your new ggjt file name') - return parser.parse_args() - -def main(): - args = parse_args() - assert args.fin_path - assert args.fout_path - assert args.fin_path != args.fout_path - - with open(args.fin_path, "rb") as fin: - hparams = read_hparams(fin) - tokens = read_tokens(fin, hparams) - - if hparams['magic'] == 0x67676a74: # ggjt - print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n") - sys.exit(1) - - if hparams['magic'] != 0x67676d66: # ggmf - print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n") - sys.exit(1) - - hparams['magic'] = 0x67676a74 # ggjt - - # count number of multipart files by convention - n_parts = 1 - while True: - if os.path.exists(f"{args.fin_path}.{n_parts}"): - n_parts += 1 - 
else: - break - - # we output a single file for ggml - with open(args.fout_path, "wb") as fout: - write_hparams(fout, hparams) - write_tokens(fout, tokens) - offset_of_tensors = fout.tell() - # the tensors we load could be split across multiple files - for part_id in range(n_parts): - fout.seek(offset_of_tensors) - print(f"Processing part {part_id+1} of {n_parts}\n") - fin_path = args.fin_path - if part_id > 0: - fin_path += f".{part_id}" - with open(fin_path, "rb") as fin: - read_tokens(fin, read_hparams(fin)) - copy_tensors(fin, fout, part_id, n_parts) - - print(f"Done. Output file: {args.fout_path}\n") - -if __name__ == "__main__": - main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000000000..f3944951a486bf --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +numpy==1.24 +sentencepiece==0.1.97 From c14e0d2f23e6d1e785255f4da8c253c1b4723659 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 14 Apr 2023 13:31:15 +0300 Subject: [PATCH 034/773] ggml : always allocate buffers with size multiple of GGML_MEM_ALIGN --- ggml.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index d620cd11f95651..76694a61729525 100644 --- a/ggml.c +++ b/ggml.c @@ -3054,9 +3054,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { return NULL; } + const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1); + *ctx = (struct ggml_context) { - /*.mem_size =*/ params.mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size), + /*.mem_size =*/ mem_size, + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, /*.n_objects =*/ 0, @@ -3066,7 +3068,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.scratch_save =*/ { 0, 0, NULL, }, }; - GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure + GGML_ASSERT(ctx->mem_buffer != NULL); ggml_assert_aligned(ctx->mem_buffer); From 1623a6e9b46453bff30afd7d0f6c3fd188499c2f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 14 Apr 2023 13:31:29 +0300 Subject: [PATCH 035/773] ggml : minor --- ggml.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 76694a61729525..d99aca21a864da 100644 --- a/ggml.c +++ b/ggml.c @@ -7509,7 +7509,7 @@ static void ggml_compute_forward_rope_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, ((float)-2)/n_dims); + const float theta_scale = powf(10000.0, -2.0f/n_dims); for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 
0 : n_past); i2 < ne2; i2++) { @@ -7517,12 +7517,15 @@ static void ggml_compute_forward_rope_f32( for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; + float theta = (float)p; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); theta *= theta_scale; + const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -7583,7 +7586,7 @@ static void ggml_compute_forward_rope_f16( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, ((float)-2)/n_dims); + const float theta_scale = powf(10000.0, -2.0f/n_dims); for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { @@ -7591,12 +7594,15 @@ static void ggml_compute_forward_rope_f16( for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; + float theta = (float)p; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); theta *= theta_scale; + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); From 43ffdefb7424f79a3d510c199e2ea86684b4f824 Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Fri, 14 Apr 2023 14:23:21 +0200 Subject: [PATCH 036/773] py : fix flake8 and isort nitpicks (#960) --- convert.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/convert.py b/convert.py index f35163f67d43eb..056dc618daa481 100644 --- a/convert.py +++ b/convert.py @@ -18,10 +18,12 @@ from abc import ABCMeta, abstractmethod from dataclasses import dataclass from pathlib import Path +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, + Literal, Optional, Sequence, Tuple, TypeVar, Union) + import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore -from typing import (IO, Any, Callable, Iterable, Literal, Optional, Sequence, - TypeVar, Union, List, Dict, Tuple, TYPE_CHECKING) + if TYPE_CHECKING: from typing_extensions import TypeAlias @@ -684,7 +686,7 @@ def load(offset: int, elm_count: int) -> NDArray: description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' return LazyStorage(load=load, kind=pid[1], description=description) - def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName] + def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName] requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: assert isinstance(storage, LazyStorage) From a32f7acc9f54dba1c728cb1e596bd00bf3b4eb5f Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Fri, 14 Apr 2023 15:37:11 +0200 Subject: [PATCH 037/773] py : cleanup dependencies (#962) after #545 we do not need torch, tqdm and requests in the dependencies --- .devops/full.Dockerfile | 5 +++-- flake.nix | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index a75bc976f54c0f..491d67676f0cd8 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -5,9 +5,10 @@ FROM ubuntu:$UBUNTU_VERSION as build RUN apt-get update && \ apt-get 
install -y build-essential python3 python3-pip +COPY requirements.txt requirements.txt + RUN pip install --upgrade pip setuptools wheel \ - && pip install numpy requests sentencepiece tqdm \ - && pip install torch --index-url https://download.pytorch.org/whl/cpu + && pip install -r requirements.txt WORKDIR /app diff --git a/flake.nix b/flake.nix index 91d2edd79fb5c5..5363052b1058aa 100644 --- a/flake.nix +++ b/flake.nix @@ -10,7 +10,6 @@ inherit system; }; llama-python = pkgs.python310.withPackages (ps: with ps; [ - torch numpy sentencepiece ]); From c9a59b70a54e0bc05777df287feaea3dbe0310c4 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Fri, 14 Apr 2023 08:43:55 -0600 Subject: [PATCH 038/773] ggml : add unary and binary map operations (#874) * GGML map ops proof of concept. * Various cleanups. Add handling for task setting. Add handling for ggml_compute_backward. Rename functions to ggml_map_unary_f32 and ggml_map_binary_f32 Fix compiler warnings related to casting function pointers and `void *` Reorder functions and definitions based on the GGML op number. Use typedefs for map op function pointer types. * Fix position of map ops cases in ggml_compute_forward --- ggml.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- ggml.h | 18 +++++ 2 files changed, 237 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index d99aca21a864da..ce48b78adeecf2 100644 --- a/ggml.c +++ b/ggml.c @@ -2712,9 +2712,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "FLASH_ATTN", "FLASH_FF", + + "MAP_UNARY", + "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36"); +static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -2757,9 +2760,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_attn(x)", "flash_ff(x)", + + "f(x)", + "f(x,y)", }; -static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36"); +static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -4907,6 +4913,90 @@ struct ggml_tensor * ggml_flash_ff( return result; } +// ggml_map_unary + +struct ggml_tensor * ggml_map_unary_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); + *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; + struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_MAP_UNARY; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->opt[0] = addr_tensor; + + return result; +} + +struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun) { + return ggml_map_unary_impl_f32(ctx, a, fun, false); +} + +struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun) { + return ggml_map_unary_impl_f32(ctx, a, fun, true); +} + +// ggml_map_binary + +struct ggml_tensor * ggml_map_binary_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); + *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; + struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_MAP_BINARY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = addr_tensor; + + return result; +} + +struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun) { + return ggml_map_binary_impl_f32(ctx, a, b, fun, false); +} + +struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun) { + return ggml_map_binary_impl_f32(ctx, a, b, fun, true); +} + //////////////////////////////////////////////////////////////////////////////// void ggml_set_param( @@ -8875,6 +8965,111 @@ static void ggml_compute_forward_flash_ff( } } +// ggml_compute_forward_map_unary + +static void ggml_compute_forward_map_unary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + + +static void ggml_compute_forward_map_unary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_binary + +static void ggml_compute_forward_map_binary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == 
GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + assert(src1->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } +} + + +static void ggml_compute_forward_map_binary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -9024,6 +9219,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); } break; + case GGML_OP_MAP_UNARY: + { + const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); + ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun); + } + break; + case GGML_OP_MAP_BINARY: + { + const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data); + ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); + } + break; case GGML_OP_NONE: { // nop @@ -9283,6 +9490,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // not supported } break; + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + { + GGML_ASSERT(false); // not supported + } break; case GGML_OP_NONE: { // nop @@ -9775,6 +9987,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) work_size = MAX(work_size, cur); } break; + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + { + node->n_tasks = 1; + } break; case GGML_OP_NONE: { node->n_tasks = 1; diff --git a/ggml.h b/ggml.h index c06c09e060db5e..bdff0b4de454e1 100644 --- a/ggml.h +++ b/ggml.h @@ -253,6 +253,9 @@ enum ggml_op { GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, + GGML_OP_MAP_UNARY, + GGML_OP_MAP_BINARY, + GGML_OP_COUNT, }; @@ -652,6 +655,21 @@ struct ggml_tensor * ggml_flash_ff( struct ggml_tensor * c0, struct ggml_tensor * c1); +// Mapping operations +typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); +typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + +struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun); + +struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun); + // // automatic differentiation // From f4d277ae17247ee51129ef1a9ff74d377cc90b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Pazdiora?= Date: Fri, 14 Apr 2023 17:19:17 +0200 Subject: [PATCH 039/773] main : alternative instruct mode (Vicuna support, etc.) 
(#863) * Add support for configs, add configurable prefixes / suffixes, deprecate instruct mode, add stop prompt * Add multiline mode, update text input. * bugfix * update implementation * typos * Change --multiline implementation to be toggled by EOF. * bugfix * default multiline mode * add more configs * update formating * update formatting * apply suggestions --- configs/alpaca-native-enhanced.txt | 21 +++ configs/alpaca.txt | 9 + configs/chat-with-bob.txt | 15 ++ configs/llama.txt | 3 + configs/vicuna-simple.txt | 7 + configs/vicuna-stop.txt | 8 + configs/vicuna.txt | 9 + examples/common.cpp | 262 ++++++++++++++++++++++++----- examples/common.h | 30 +++- examples/main/main.cpp | 172 +++++++++++++------ prompts/alpaca.txt | 1 - prompts/chat-with-bob.txt | 7 - 12 files changed, 434 insertions(+), 110 deletions(-) create mode 100644 configs/alpaca-native-enhanced.txt create mode 100644 configs/alpaca.txt create mode 100644 configs/chat-with-bob.txt create mode 100644 configs/llama.txt create mode 100644 configs/vicuna-simple.txt create mode 100644 configs/vicuna-stop.txt create mode 100644 configs/vicuna.txt delete mode 100644 prompts/alpaca.txt delete mode 100644 prompts/chat-with-bob.txt diff --git a/configs/alpaca-native-enhanced.txt b/configs/alpaca-native-enhanced.txt new file mode 100644 index 00000000000000..109d315924d851 --- /dev/null +++ b/configs/alpaca-native-enhanced.txt @@ -0,0 +1,21 @@ +--ctx_size 2048 +--batch_size 16 +--repeat_penalty 1.15 +--temp 0.4 +--top_k 30 +--top_p 0.18 + +--interactive-first +--keep -1 + +--ins-prefix-bos +--ins-prefix "\n\nUser: " +--ins-suffix "\n\nAssistant: " +--reverse-prompt "User: " + +-p "You are an AI language model designed to assist the User by answering their questions, offering advice, and engaging in casual conversation in a friendly, helpful, and informative manner. You respond clearly, coherently, and you consider the conversation history. + +User: Hey, how's it going? + +Assistant: Hey there! I'm doing great, thank you. What can I help you with today? Let's have a fun chat!" + diff --git a/configs/alpaca.txt b/configs/alpaca.txt new file mode 100644 index 00000000000000..99a3ab47efc458 --- /dev/null +++ b/configs/alpaca.txt @@ -0,0 +1,9 @@ +--clean-interface +--interactive-first +--keep -1 +--ins-prefix-bos +--ins-prefix "\n\n### Instruction:\n\n" +--ins-suffix "\n\n### Response:\n\n" +--reverse-prompt "### Instruction:\n\n" + +-p "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n" diff --git a/configs/chat-with-bob.txt b/configs/chat-with-bob.txt new file mode 100644 index 00000000000000..0caa749a3dee26 --- /dev/null +++ b/configs/chat-with-bob.txt @@ -0,0 +1,15 @@ +--interactive-first +--keep -1 +--ins-prefix-bos +--ins-prefix "\nUser: " +--ins-suffix "\nBob: " +--reverse-prompt "User: " +--rm-trailing-space-workaround + +-p "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. + +User: Hello, Bob. +Bob: Hello. How may I help you today? +User: Please tell me the largest city in Europe. +Bob: Sure. The largest city in Europe is Moscow, the capital of Russia." 
+ diff --git a/configs/llama.txt b/configs/llama.txt new file mode 100644 index 00000000000000..9d23e75aceb50d --- /dev/null +++ b/configs/llama.txt @@ -0,0 +1,3 @@ +--interactive-first +--keep -1 +--temp 0.1 diff --git a/configs/vicuna-simple.txt b/configs/vicuna-simple.txt new file mode 100644 index 00000000000000..efa60d96a1fbf5 --- /dev/null +++ b/configs/vicuna-simple.txt @@ -0,0 +1,7 @@ +--interactive-first +--keep -1 +--ins-prefix-bos +--ins-prefix "\n### Human: " +--ins-suffix "\n### Assistant: " +--reverse-prompt "### Human: " +--rm-trailing-space-workaround diff --git a/configs/vicuna-stop.txt b/configs/vicuna-stop.txt new file mode 100644 index 00000000000000..911d067efd5b8c --- /dev/null +++ b/configs/vicuna-stop.txt @@ -0,0 +1,8 @@ +--interactive-first +--keep -1 +--ins-prefix-bos +--ins-prefix "\n### Human: " +--ins-suffix "\n### Assistant: " +--reverse-prompt "### Human: " +--stop-prompt "### Assistant: " +--rm-trailing-space-workaround diff --git a/configs/vicuna.txt b/configs/vicuna.txt new file mode 100644 index 00000000000000..6d811410ab3b61 --- /dev/null +++ b/configs/vicuna.txt @@ -0,0 +1,9 @@ +--interactive-first +--keep -1 +--ins-prefix-bos +--ins-prefix "\n### Human: " +--ins-suffix "\n### Assistant: " +--reverse-prompt "### Human: " +--rm-trailing-space-workaround + +-p "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." diff --git a/examples/common.cpp b/examples/common.cpp index 0772dbfe142ffe..eaa5aceea8eb0e 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -2,10 +2,13 @@ #include #include +#include #include +#include #include #include #include +#include #if defined (_WIN32) #include @@ -23,6 +26,43 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int #define CP_UTF8 65001 #endif +void split_args(const std::string & args_string, std::vector & output_args) +{ + std::string current_arg = ""; + bool in_quotes = false; + char quote_type; + + for (char c : args_string) { + if (c == '"' || c == '\'') { + if (!in_quotes) { + in_quotes = true; + quote_type = c; + } else if (quote_type == c) { + in_quotes = false; + } else { + current_arg += c; + } + } else if (in_quotes) { + current_arg += c; + } else if (std::isspace(c)) { + if (current_arg != "") { + output_args.push_back(current_arg); + current_arg = ""; + } + } else { + current_arg += c; + } + } + + if (current_arg != "") { + output_args.push_back(current_arg); + } +} + +std::string unescape(const std::string & str) { + return std::regex_replace(str, std::regex("\\\\n"), "\n"); +} + bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { // determine sensible default number of threads. // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. 
@@ -40,35 +80,66 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::string arg; gpt_params default_params; + // get additional arguments from config files + std::vector args; for (int i = 1; i < argc; i++) { arg = argv[i]; + if (arg == "--config") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::string args_string; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(args_string)); + if (args_string.back() == '\n') { + args_string.pop_back(); + } + split_args(args_string, args); + for (int j = 0; j < args.size(); j++) { + args[j] = unescape(args[j]); + } + } else { + args.emplace_back(argv[i]); + } + } + + // parse args + int args_c = static_cast(args.size()); + for (int i = 0; i < args_c && !invalid_param; i++) { + arg = args[i]; if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.seed = std::stoi(argv[i]); + params.seed = std::stoi(args[i]); } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.n_threads = std::stoi(argv[i]); + params.n_threads = std::stoi(args[i]); } else if (arg == "-p" || arg == "--prompt") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.prompt = argv[i]; + params.prompt = args[i]; } else if (arg == "-f" || arg == "--file") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - std::ifstream file(argv[i]); + std::ifstream file(args[i]); if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + fprintf(stderr, "error: failed to open file '%s'\n", args[i].c_str()); invalid_param = true; break; } @@ -77,80 +148,100 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.prompt.pop_back(); } } else if (arg == "-n" || arg == "--n_predict") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.n_predict = std::stoi(argv[i]); + params.n_predict = std::stoi(args[i]); } else if (arg == "--top_k") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.top_k = std::stoi(argv[i]); + params.top_k = std::stoi(args[i]); } else if (arg == "-c" || arg == "--ctx_size") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.n_ctx = std::stoi(argv[i]); + params.n_ctx = std::stoi(args[i]); } else if (arg == "--memory_f32") { params.memory_f16 = false; } else if (arg == "--top_p") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.top_p = std::stof(argv[i]); + params.top_p = std::stof(args[i]); } else if (arg == "--temp") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.temp = std::stof(argv[i]); + params.temp = std::stof(args[i]); } else if (arg == "--repeat_last_n") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.repeat_last_n = std::stoi(argv[i]); + params.repeat_last_n = std::stoi(args[i]); } else if (arg == "--repeat_penalty") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.repeat_penalty = std::stof(argv[i]); + params.repeat_penalty = std::stof(args[i]); } else if (arg == "-b" || arg == "--batch_size") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; 
} - params.n_batch = std::stoi(argv[i]); + params.n_batch = std::stoi(args[i]); params.n_batch = std::min(512, params.n_batch); } else if (arg == "--keep") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.n_keep = std::stoi(argv[i]); + params.n_keep = std::stoi(args[i]); } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.model = argv[i]; + params.model = args[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--embedding") { params.embedding = true; + } else if (arg == "--clean-interface") { + params.clean_interface = true; } else if (arg == "--interactive-start") { params.interactive = true; } else if (arg == "--interactive-first") { params.interactive_start = true; } else if (arg == "-ins" || arg == "--instruct") { - params.instruct = true; + fprintf(stderr, "\n\nWarning: instruct mode is deprecated! Use: \n" + "--clean-interface " + "--interactive-first " + "--keep -1 " + "--ins-prefix-bos " + "--ins-prefix \"\\n\\n### Instruction:\\n\\n\" " + "--ins-suffix \"\\n\\n### Response:\\n\\n\" " + "-r \"### Instruction:\\n\\n\" " + "\n\n"); + // params.instruct = true; + params.clean_interface = true; + params.interactive_start = true; + params.n_keep = -1; + params.instruct_prefix_bos = true; + params.instruct_prefix = "\n\n### Instruction:\n\n"; + params.instruct_suffix = "\n\n### Response:\n\n"; + params.antiprompt.push_back("### Instruction:\n\n"); } else if (arg == "--color") { params.use_color = true; + } else if (arg == "--disable-multiline") { + params.multiline_mode = false; } else if (arg == "--mlock") { params.use_mlock = true; } else if (arg == "--no-mmap") { @@ -160,65 +251,94 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.antiprompt.push_back(argv[i]); + params.antiprompt.push_back(args[i]); + } else if (arg == "--stop-prompt") { + if (++i >= args_c) { + invalid_param = true; + break; + } + params.stopprompt.push_back(args[i]); + } else if (arg == "--rm-trailing-space-workaround") { + params.rm_trailing_space_workaround = true; } else if (arg == "--perplexity") { params.perplexity = true; } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--n_parts") { - if (++i >= argc) { + if (++i >= args_c) { invalid_param = true; break; } - params.n_parts = std::stoi(argv[i]); + params.n_parts = std::stoi(args[i]); } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, default_params); + gpt_print_usage(argv[0], default_params); exit(0); } else if (arg == "--random-prompt") { params.random_prompt = true; } else if (arg == "--in-prefix") { - if (++i >= argc) { + if (++i >= args_c) { + invalid_param = true; + break; + } + params.input_prefix = args[i]; + } else if (arg == "--ins-prefix-bos") { + params.instruct_prefix_bos = true; + } else if (arg == "--ins-prefix") { + if (++i >= args_c) { invalid_param = true; break; } - params.input_prefix = argv[i]; + params.instruct_prefix = args[i]; + } else if (arg == "--ins-suffix-bos") { + params.instruct_suffix_bos = true; + } else if (arg == "--ins-suffix") { + if (++i >= args_c) { + invalid_param = true; + break; + } + params.instruct_suffix = args[i]; } else { fprintf(stderr, "error: unknown argument: 
%s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); + gpt_print_usage(argv[0], default_params); exit(1); } } if (invalid_param) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); + gpt_print_usage(argv[0], default_params); exit(1); } return true; } -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); +void gpt_print_usage(char * argv_0, const gpt_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv_0); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -i, --interactive run in interactive mode\n"); fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n"); - fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + fprintf(stderr, " --clean-interface hides input prefix & suffix and displays '>' instead\n"); fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n"); fprintf(stderr, " specified more than once for multiple prompts).\n"); fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); + fprintf(stderr, " --disable-multiline disable multiline mode (use Ctrl+D on Linux/Mac and Ctrl+Z then Return on Windows to toggle multiline)\n"); fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); fprintf(stderr, " prompt to start generation with (default: empty)\n"); fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); + fprintf(stderr, " --ins-prefix STRING (instruct) prefix user inputs with tokenized string (default: empty)\n"); + fprintf(stderr, " --ins-prefix-bos (instruct) prepend bos token to instruct prefix.\n"); + fprintf(stderr, " --ins-suffix STRING (instruct) suffix user inputs with tokenized string (default: empty)\n"); + fprintf(stderr, " --ins-suffix-bos (instruct) prepend bos token to instruct suffix.\n"); fprintf(stderr, " -f FNAME, --file FNAME\n"); fprintf(stderr, " prompt file to start generation.\n"); fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); @@ -328,3 +448,61 @@ void win32_utf8_encode(const std::wstring & wstr, std::string & str) { str = strTo; } #endif + +bool get_input_text(std::string & input_text, bool eof_toggled_multiline_mode) { + bool another_line = true; + bool is_eof_multiline_toggled = false; + do { + std::string line; +#if defined(_WIN32) + auto & stdcin = std::wcin; + std::wstring wline; + if (!std::getline(stdcin, wline)) { + // input stream is bad or EOF received + if (stdcin.bad()) { + fprintf(stderr, "%s: error: input stream bad\n", __func__); + return 1; + } + } + win32_utf8_encode(wline, line); +#else + auto & stdcin = std::cin; + if (!std::getline(stdcin, line)) { + // input stream is bad or EOF received + if (stdcin.bad()) { + fprintf(stderr, "%s: error: input stream bad\n", __func__); + return 1; + } + } +#endif + if (stdcin.eof()) { + stdcin.clear(); + stdcin.seekg(0, std::ios::beg); + if 
(!eof_toggled_multiline_mode) { + another_line = false; + } else { + is_eof_multiline_toggled = !is_eof_multiline_toggled; + if (is_eof_multiline_toggled) { + input_text += line; + continue; + } + } + } + if (!eof_toggled_multiline_mode) { + if (line.empty() || line.back() != '\\') { + another_line = false; + } else { + line.pop_back(); // Remove the continue character + } + } else { + if (!is_eof_multiline_toggled) { + another_line = false; + } + } + input_text += line; + if (another_line) { + input_text += '\n'; // Append the line to the result + } + } while (another_line); + return true; +} diff --git a/examples/common.h b/examples/common.h index 1ea6f744518111..df8e4c6ccb9901 100644 --- a/examples/common.h +++ b/examples/common.h @@ -14,14 +14,14 @@ // struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = 128; // new tokens to predict - int32_t repeat_last_n = 64; // last n tokens to penalize - int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) - int32_t n_ctx = 512; // context size - int32_t n_batch = 8; // batch size for prompt processing - int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); // max 4 threads (default) + int32_t n_predict = 128; // new tokens to predict + int32_t repeat_last_n = 64; // last n tokens to penalize + int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) + int32_t n_ctx = 512; // context size + int32_t n_batch = 8; // batch size for prompt processing + int32_t n_keep = 0; // number of tokens to keep from initial prompt (-1 for all) // sampling parameters int32_t top_k = 40; @@ -33,8 +33,15 @@ struct gpt_params { std::string prompt = ""; std::string input_prefix = ""; // string to prefix user inputs with + std::string instruct_prefix = ""; // prefix user inputs with tokenized string + bool instruct_prefix_bos = false; // prepend bos token to instruct prefix + std::string instruct_suffix = ""; // suffix user inputs with tokenized string + bool instruct_suffix_bos = false; // prepend bos token to instruct suffix std::vector antiprompt; // string upon seeing which more user input is prompted + std::vector stopprompt; // string upon seeing which more user input is prompted (without adding instruct prefixes and suffixes) + + bool rm_trailing_space_workaround = false; // workaround for removing trailing space from reverse/stop prompts bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided @@ -51,11 +58,14 @@ struct gpt_params { bool use_mlock = false; // use mlock to keep model in memory bool mem_test = false; // compute maximum memory usage bool verbose_prompt = false; // print prompt tokens before generation + + bool clean_interface = false; // hides input prefix & suffix and displays '>' + bool multiline_mode = true; // enables multi-line mode, to send input press CTRL+D on Linux/Max, Ctrl+Z then Return on Windows }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); -void gpt_print_usage(int argc, char ** argv, const gpt_params & params); +void gpt_print_usage(char * argv_0, const gpt_params & params); std::string gpt_random_prompt(std::mt19937 & rng); @@ -95,3 +105,5 @@ void set_console_color(console_state & con_st, console_color_t color); void 
win32_console_init(bool enable_color); void win32_utf8_encode(const std::wstring & wstr, std::string & str); #endif + +bool get_input_text(std::string & input_text, bool escape_newline_mode); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ba153cb82dcf67..68b4b2840858e7 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -30,7 +30,8 @@ static bool is_interacting = false; #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) void sigint_handler(int signo) { set_console_color(con_st, CONSOLE_COLOR_DEFAULT); - printf("\n"); // this also force flush stdout. + fflush(stdout); + fflush(stderr); if (signo == SIGINT) { if (!is_interacting) { is_interacting=true; @@ -89,6 +90,8 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } + bool instruct_mode = !params.instruct_prefix.empty() || !params.instruct_suffix.empty(); + // params.prompt = R"(// this function checks if the number n is prime //bool is_prime(int n) {)"; @@ -153,22 +156,20 @@ int main(int argc, char ** argv) { } // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) { + if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) { params.n_keep = (int)embd_inp.size(); } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); - - // in instruct mode, we inject a prefix and a suffix to each input by the user - if (params.instruct) { - params.interactive_start = true; - params.antiprompt.push_back("### Instruction:\n\n"); + const auto inp_pfx = ::llama_tokenize(ctx, params.instruct_prefix, params.instruct_prefix_bos); + std::string instruct_suffix = params.instruct_suffix; + if (params.rm_trailing_space_workaround) { + if (instruct_suffix.back() == ' ') { instruct_suffix.pop_back(); } } + const auto inp_sfx = ::llama_tokenize(ctx, instruct_suffix, params.instruct_suffix_bos); // enable interactive mode if reverse prompt or interactive start is specified - if (params.antiprompt.size() != 0 || params.interactive_start) { + if (params.antiprompt.size() != 0 || params.stopprompt.size() != 0 || params.interactive_start) { params.interactive = true; } @@ -210,10 +211,21 @@ int main(int argc, char ** argv) { fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); } } + if (params.stopprompt.size()) { + for (auto stopprompt : params.stopprompt) { + fprintf(stderr, "Stop prompt: '%s'\n", stopprompt.c_str()); + } + } if (!params.input_prefix.empty()) { fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); } + if (!params.instruct_prefix.empty()) { + fprintf(stderr, "Instruct prefix %s: '%s'\n", params.instruct_prefix_bos ? "(with bos token)" : "", params.instruct_prefix.c_str()); + } + if (!params.instruct_suffix.empty()) { + fprintf(stderr, "Instruct suffix %s: '%s'\n", params.instruct_suffix_bos ? 
"(with bos token)" : "", params.instruct_suffix.c_str()); + } } fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); @@ -229,12 +241,29 @@ int main(int argc, char ** argv) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) " - Press Ctrl+C to interject at any time.\n" #endif - " - Press Return to return control to LLaMa.\n" - " - If you want to submit another line, end your input in '\\'.\n\n"); + ); + if (params.multiline_mode) { + fprintf(stderr, " - Press Return to return control to LLaMa.\n" +#if defined (_WIN32) + " - [MULTILINE MODE] Press Ctrl+Z then Return (EOF) to toggle.\n\n"); +#else + " - [MULTILINE MODE] Press Ctrl+D (EOF) to toggle.\n\n"); +#endif + } + else { + fprintf(stderr, " - Press Return to return control to LLaMa.\n" + " - If you want to submit another line, end your input in '\\'.\n\n"); + } is_interacting = params.interactive_start; } - bool is_antiprompt = false; + struct Antiprompt { + bool any = false; + bool trailing_space = false; + size_t len; + bool is_stop_prompt = false; + } antiprompt; + bool input_noecho = false; int n_past = 0; @@ -304,7 +333,7 @@ int main(int argc, char ** argv) { } // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive && !params.instruct) { + if (id == llama_token_eos() && params.interactive && !instruct_mode) { id = llama_token_newline.front(); if (params.antiprompt.size() != 0) { // tokenize and inject first reverse prompt @@ -350,27 +379,72 @@ int main(int argc, char ** argv) { // check if we should prompt the user for more if (params.interactive && (int) embd_inp.size() <= n_consumed) { - // check for reverse prompt - if (params.antiprompt.size()) { + // check for reverse prompt or stop prompt + if (params.antiprompt.size() || params.stopprompt.size()) { std::string last_output; for (auto id : last_n_tokens) { last_output += llama_token_to_str(ctx, id); } - is_antiprompt = false; + antiprompt.any = false; + antiprompt.is_stop_prompt = false; // Check if each of the reverse prompts appears at the end of the output. - for (std::string & antiprompt : params.antiprompt) { - if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { + for (std::string & prompt : params.antiprompt) { + if (params.rm_trailing_space_workaround) { + antiprompt.trailing_space = prompt.back() == ' '; + antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0); + } + if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) { is_interacting = true; - is_antiprompt = true; + antiprompt.any = true; set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); fflush(stdout); break; } } + if (!antiprompt.any) { + for (std::string & prompt : params.stopprompt) { + if (params.rm_trailing_space_workaround) { + antiprompt.trailing_space = prompt.back() == ' '; + antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 
1 : 0); + } + if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) { + is_interacting = true; + antiprompt.any = true; + antiprompt.is_stop_prompt = true; + set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); + fflush(stdout); + break; + } + } + } } - if (n_past > 0 && is_interacting) { + if (n_past > 0 && is_interacting) + { + std::string buffer; + if (!params.clean_interface && !params.instruct_prefix.empty() && !antiprompt.any) { + // avoid printing again user's new line (TODO: try to revert enter press and print newline) + int i = params.instruct_prefix.front() == '\n' ? 1 : 0; + for (; i < inp_pfx.size(); i++) { + printf("%s", llama_token_to_str(ctx, inp_pfx[i])); + } + fflush(stdout); + } + if (params.rm_trailing_space_workaround) { + // add only if not stopprompt (as stopprompt could be used to pause + // assistant and then continue without input - adding back trailing + // space may mess it up.) + if (!antiprompt.is_stop_prompt && antiprompt.any && antiprompt.trailing_space) { + // add back removed trailing space to buffer(workaround) + buffer += ' '; + if (!params.clean_interface) { + printf("%s", buffer.c_str()); + } + fflush(stdout); + } + } + // potentially set color to indicate we are taking user input set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); @@ -379,49 +453,45 @@ int main(int argc, char ** argv) { signal(SIGINT, sigint_handler); #endif - if (params.instruct) { + if (params.clean_interface) { printf("\n> "); } - std::string buffer; if (!params.input_prefix.empty()) { buffer += params.input_prefix; printf("%s", buffer.c_str()); } - std::string line; - bool another_line = true; - do { -#if defined(_WIN32) - std::wstring wline; - if (!std::getline(std::wcin, wline)) { - // input stream is bad or EOF received - return 0; - } - win32_utf8_encode(wline, line); -#else - if (!std::getline(std::cin, line)) { - // input stream is bad or EOF received - return 0; - } -#endif - if (line.empty() || line.back() != '\\') { - another_line = false; - } else { - line.pop_back(); // Remove the continue character - } - buffer += line + '\n'; // Append the line to the result - } while (another_line); + if (!get_input_text(buffer, params.multiline_mode)) { + // input stream is bad + return 1; + } + if (!antiprompt.is_stop_prompt) { + buffer += "\n"; + } // done taking input, reset color set_console_color(con_st, CONSOLE_COLOR_DEFAULT); + if (!params.clean_interface && !params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) { + // avoid printing again user's new line (TODO: try to revert enter press and print newline) + int i = params.instruct_suffix.front() == '\n' ? 1 : 0; + for (; i < inp_sfx.size(); i++) { + printf("%s", llama_token_to_str(ctx, inp_sfx[i])); + } + // if (remove trailing space workaround) { + // We won't add back removed trailing space here, because assistant continues here, + // and it may mess up it's output (remove trailing space workaround). 
+ // } + fflush(stdout); + } + // Add tokens to embd only if the input buffer is non-empty // Entering a empty line lets the user pass control back if (buffer.length() > 1) { - // instruct mode: insert instruction prefix - if (params.instruct && !is_antiprompt) { + // insert input prefix + if (!params.instruct_prefix.empty() && !antiprompt.any) { n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } @@ -429,8 +499,8 @@ int main(int argc, char ** argv) { auto line_inp = ::llama_tokenize(ctx, buffer, false); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); - // instruct mode: insert response suffix - if (params.instruct) { + // insert response suffix + if (!params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) { embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } @@ -447,7 +517,7 @@ int main(int argc, char ** argv) { // end of text token if (!embd.empty() && embd.back() == llama_token_eos()) { - if (params.instruct) { + if (instruct_mode) { is_interacting = true; } else { fprintf(stderr, " [end of text]\n"); diff --git a/prompts/alpaca.txt b/prompts/alpaca.txt deleted file mode 100644 index 2224bdeb0bcd44..00000000000000 --- a/prompts/alpaca.txt +++ /dev/null @@ -1 +0,0 @@ -Below is an instruction that describes a task. Write a response that appropriately completes the request. diff --git a/prompts/chat-with-bob.txt b/prompts/chat-with-bob.txt deleted file mode 100644 index ad494d831f6fb8..00000000000000 --- a/prompts/chat-with-bob.txt +++ /dev/null @@ -1,7 +0,0 @@ -Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. - -User: Hello, Bob. -Bob: Hello. How may I help you today? -User: Please tell me the largest city in Europe. -Bob: Sure. The largest city in Europe is Moscow, the capital of Russia. -User: \ No newline at end of file From c56b7152690ca25cfd66b20210b3629e6c1e739b Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Fri, 14 Apr 2023 20:05:37 +0200 Subject: [PATCH 040/773] Expose type name from ggml (#970) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid duplication of type names in utils Co-authored-by: Håkon H. 
Hitland --- examples/quantize-stats/quantize-stats.cpp | 14 ++++++-------- ggml.c | 17 +++++++++++++++++ ggml.h | 2 ++ llama.cpp | 14 ++------------ 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index c786fe208c78d3..0503009319e4e4 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -16,9 +16,6 @@ #include #include -static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" }; -static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list"); - struct quantize_stats_params { std::string model = "models/7B/ggml-model-f16.bin"; bool verbose = false; @@ -224,7 +221,7 @@ int main(int argc, char ** argv) { break; } int j; - for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) { + for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) i)) != 0; j++) { // find match } if (j < GGML_TYPE_COUNT) { @@ -279,7 +276,7 @@ int main(int argc, char ** argv) { continue; } if (params.verbose) { - printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second)); + printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second)); } if (kv_tensor.second->type == GGML_TYPE_F16) { is_f16 = true; @@ -304,13 +301,14 @@ int main(int argc, char ** argv) { // loop throught quantization types for (int i = 0; i < GGML_TYPE_COUNT; i++) { + const ggml_type type = (ggml_type) i; if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { - printf("testing %s ...\n", type_strs[i]); + printf("testing %s ...\n", ggml_type_name(type)); } error_stats global_stats {}; @@ -322,7 +320,7 @@ int main(int argc, char ** argv) { if (params.verbose) { printf(" %s ...\n", kv_tensor.first.c_str()); } - std::string layer_name { type_strs[i] }; + std::string layer_name { ggml_type_name(type) }; layer_name += "::" + kv_tensor.first; test_roundtrip_on_layer( layer_name, @@ -337,7 +335,7 @@ int main(int argc, char ** argv) { ); } - print_error_stats(type_strs[i], global_stats, params.print_histogram); + print_error_stats(ggml_type_name(type), global_stats, params.print_histogram); } } diff --git a/ggml.c b/ggml.c index ce48b78adeecf2..1574d649846dd9 100644 --- a/ggml.c +++ b/ggml.c @@ -2671,6 +2671,18 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { }; static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated"); + +static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = "f32", + [GGML_TYPE_F16] = "f16", + [GGML_TYPE_Q4_0] = "q4_0", + [GGML_TYPE_Q4_1] = "q4_1", + [GGML_TYPE_I8] = "i8", + [GGML_TYPE_I16] = "i16", + [GGML_TYPE_I32] = "i32", +}; +static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_NAME is outdated"); + static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", @@ -2895,6 +2907,11 @@ float ggml_type_sizef(enum ggml_type type) { return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type]; } +const char * ggml_type_name(enum ggml_type type) { + return GGML_TYPE_NAME[type]; +} + + size_t ggml_element_size(const struct ggml_tensor * tensor) { return GGML_TYPE_SIZE[tensor->type]; } 
diff --git a/ggml.h b/ggml.h index bdff0b4de454e1..617298a95536dd 100644 --- a/ggml.h +++ b/ggml.h @@ -354,6 +354,8 @@ int ggml_blck_size (enum ggml_type type); size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float +const char * ggml_type_name(enum ggml_type type); + size_t ggml_element_size(const struct ggml_tensor * tensor); struct ggml_context * ggml_init(struct ggml_init_params params); diff --git a/llama.cpp b/llama.cpp index c7229568473dd1..be8c4cdc124569 100644 --- a/llama.cpp +++ b/llama.cpp @@ -269,16 +269,6 @@ static std::string llama_format_tensor_shape(const std::vector & ne) { return ret; } -static const char * llama_format_type(enum ggml_type type) { - switch (type) { - case GGML_TYPE_F32: return "f32"; - case GGML_TYPE_F16: return "f16"; - case GGML_TYPE_Q4_0: return "q4_0"; - case GGML_TYPE_Q4_1: return "q4_1"; - default: LLAMA_ASSERT(false); - } -} - static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml_type type) { size_t size = ggml_type_size(type); for (uint32_t dim : ne) { @@ -1582,7 +1572,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s printf("[%zu/%zu] %36s - %s, type = %6s, ", ++idx, model_loader->tensors_map.tensors.size(), tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), - llama_format_type(tensor.type)); + ggml_type_name(tensor.type)); // This used to be a regex, but has an extreme cost to compile times. bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? @@ -1615,7 +1605,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data[i] = ggml_fp16_to_fp32(f16_data[i]); } } else { - throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type)); + throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)); } printf("quantizing .. "); From 93265e988af32b8be314bfed334f795a3037555d Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Fri, 14 Apr 2023 19:39:48 +0000 Subject: [PATCH 041/773] make : fix dependencies, use auto variables (#983) --- Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 7db2466508d4eb..a1b99c6f9dfe29 100644 --- a/Makefile +++ b/Makefile @@ -140,44 +140,44 @@ default: main quantize perplexity embedding # ggml.o: ggml.c ggml.h - $(CC) $(CFLAGS) -c ggml.c -o ggml.o + $(CC) $(CFLAGS) -c $< -o $@ -llama.o: llama.cpp llama.h llama_util.h - $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o +llama.o: llama.cpp ggml.h llama.h llama_util.h + $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h - $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o + $(CXX) $(CXXFLAGS) -c $< -o $@ clean: rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult main: examples/main/main.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. 
====' @echo quantize: examples/quantize/quantize.cpp ggml.o llama.o - $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o - $(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) libllama.so: llama.o ggml.o - $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) # # Tests # -benchmark: ggml.o - $(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS) +benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o + $(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS) ./benchmark-q4_0-matmult .PHONY: tests From 489093548c89c67520109ab25c4df4a4614a32a0 Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Fri, 14 Apr 2023 21:46:49 +0200 Subject: [PATCH 042/773] py : bump sentencepiece to 0.1.98 to support Python 3.11 (#976) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f3944951a486bf..6c32cbd047b84a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ numpy==1.24 -sentencepiece==0.1.97 +sentencepiece==0.1.98 From c85e03d12e4b8af22cb13aa9c618dcd5935862fd Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Fri, 14 Apr 2023 21:58:43 +0200 Subject: [PATCH 043/773] Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982) This reverts commit f4d277ae17247ee51129ef1a9ff74d377cc90b1b. 
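
For reference, the revert below removes the configurable --ins-prefix/--ins-suffix machinery from examples/main and puts back the original hard-coded instruct mode, where each interactive input is wrapped with the fixed Alpaca-style markers. A simplified sketch of that restored flow (not the patch itself; llama_context, llama_token and the ::llama_tokenize() helper are used here as they appear in the surrounding example code, with common.h assumed to declare the helper):

#include <string>
#include <vector>
#include "common.h"
#include "llama.h"

// Wrap one line of user input the way instruct mode does after the revert:
// BOS + "### Instruction:" prefix, the tokenized input (with trailing newline),
// then the "### Response:" suffix.
static std::vector<llama_token> wrap_instruct_input(llama_context * ctx, const std::string & line) {
    const auto inp_pfx   = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);  // with BOS
    const auto inp_sfx   = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
    const auto line_toks = ::llama_tokenize(ctx, line + "\n", false);

    std::vector<llama_token> out;
    out.insert(out.end(), inp_pfx.begin(),   inp_pfx.end());
    out.insert(out.end(), line_toks.begin(), line_toks.end());
    out.insert(out.end(), inp_sfx.begin(),   inp_sfx.end());
    return out;
}
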
--- configs/alpaca-native-enhanced.txt | 21 --- configs/alpaca.txt | 9 - configs/chat-with-bob.txt | 15 -- configs/llama.txt | 3 - configs/vicuna-simple.txt | 7 - configs/vicuna-stop.txt | 8 - configs/vicuna.txt | 9 - examples/common.cpp | 262 +++++------------------------ examples/common.h | 30 +--- examples/main/main.cpp | 172 ++++++------------- prompts/alpaca.txt | 1 + prompts/chat-with-bob.txt | 7 + 12 files changed, 110 insertions(+), 434 deletions(-) delete mode 100644 configs/alpaca-native-enhanced.txt delete mode 100644 configs/alpaca.txt delete mode 100644 configs/chat-with-bob.txt delete mode 100644 configs/llama.txt delete mode 100644 configs/vicuna-simple.txt delete mode 100644 configs/vicuna-stop.txt delete mode 100644 configs/vicuna.txt create mode 100644 prompts/alpaca.txt create mode 100644 prompts/chat-with-bob.txt diff --git a/configs/alpaca-native-enhanced.txt b/configs/alpaca-native-enhanced.txt deleted file mode 100644 index 109d315924d851..00000000000000 --- a/configs/alpaca-native-enhanced.txt +++ /dev/null @@ -1,21 +0,0 @@ ---ctx_size 2048 ---batch_size 16 ---repeat_penalty 1.15 ---temp 0.4 ---top_k 30 ---top_p 0.18 - ---interactive-first ---keep -1 - ---ins-prefix-bos ---ins-prefix "\n\nUser: " ---ins-suffix "\n\nAssistant: " ---reverse-prompt "User: " - --p "You are an AI language model designed to assist the User by answering their questions, offering advice, and engaging in casual conversation in a friendly, helpful, and informative manner. You respond clearly, coherently, and you consider the conversation history. - -User: Hey, how's it going? - -Assistant: Hey there! I'm doing great, thank you. What can I help you with today? Let's have a fun chat!" - diff --git a/configs/alpaca.txt b/configs/alpaca.txt deleted file mode 100644 index 99a3ab47efc458..00000000000000 --- a/configs/alpaca.txt +++ /dev/null @@ -1,9 +0,0 @@ ---clean-interface ---interactive-first ---keep -1 ---ins-prefix-bos ---ins-prefix "\n\n### Instruction:\n\n" ---ins-suffix "\n\n### Response:\n\n" ---reverse-prompt "### Instruction:\n\n" - --p "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n" diff --git a/configs/chat-with-bob.txt b/configs/chat-with-bob.txt deleted file mode 100644 index 0caa749a3dee26..00000000000000 --- a/configs/chat-with-bob.txt +++ /dev/null @@ -1,15 +0,0 @@ ---interactive-first ---keep -1 ---ins-prefix-bos ---ins-prefix "\nUser: " ---ins-suffix "\nBob: " ---reverse-prompt "User: " ---rm-trailing-space-workaround - --p "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. - -User: Hello, Bob. -Bob: Hello. How may I help you today? -User: Please tell me the largest city in Europe. -Bob: Sure. The largest city in Europe is Moscow, the capital of Russia." 
- diff --git a/configs/llama.txt b/configs/llama.txt deleted file mode 100644 index 9d23e75aceb50d..00000000000000 --- a/configs/llama.txt +++ /dev/null @@ -1,3 +0,0 @@ ---interactive-first ---keep -1 ---temp 0.1 diff --git a/configs/vicuna-simple.txt b/configs/vicuna-simple.txt deleted file mode 100644 index efa60d96a1fbf5..00000000000000 --- a/configs/vicuna-simple.txt +++ /dev/null @@ -1,7 +0,0 @@ ---interactive-first ---keep -1 ---ins-prefix-bos ---ins-prefix "\n### Human: " ---ins-suffix "\n### Assistant: " ---reverse-prompt "### Human: " ---rm-trailing-space-workaround diff --git a/configs/vicuna-stop.txt b/configs/vicuna-stop.txt deleted file mode 100644 index 911d067efd5b8c..00000000000000 --- a/configs/vicuna-stop.txt +++ /dev/null @@ -1,8 +0,0 @@ ---interactive-first ---keep -1 ---ins-prefix-bos ---ins-prefix "\n### Human: " ---ins-suffix "\n### Assistant: " ---reverse-prompt "### Human: " ---stop-prompt "### Assistant: " ---rm-trailing-space-workaround diff --git a/configs/vicuna.txt b/configs/vicuna.txt deleted file mode 100644 index 6d811410ab3b61..00000000000000 --- a/configs/vicuna.txt +++ /dev/null @@ -1,9 +0,0 @@ ---interactive-first ---keep -1 ---ins-prefix-bos ---ins-prefix "\n### Human: " ---ins-suffix "\n### Assistant: " ---reverse-prompt "### Human: " ---rm-trailing-space-workaround - --p "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." diff --git a/examples/common.cpp b/examples/common.cpp index eaa5aceea8eb0e..0772dbfe142ffe 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -2,13 +2,10 @@ #include #include -#include #include -#include #include #include #include -#include #if defined (_WIN32) #include @@ -26,43 +23,6 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int #define CP_UTF8 65001 #endif -void split_args(const std::string & args_string, std::vector & output_args) -{ - std::string current_arg = ""; - bool in_quotes = false; - char quote_type; - - for (char c : args_string) { - if (c == '"' || c == '\'') { - if (!in_quotes) { - in_quotes = true; - quote_type = c; - } else if (quote_type == c) { - in_quotes = false; - } else { - current_arg += c; - } - } else if (in_quotes) { - current_arg += c; - } else if (std::isspace(c)) { - if (current_arg != "") { - output_args.push_back(current_arg); - current_arg = ""; - } - } else { - current_arg += c; - } - } - - if (current_arg != "") { - output_args.push_back(current_arg); - } -} - -std::string unescape(const std::string & str) { - return std::regex_replace(str, std::regex("\\\\n"), "\n"); -} - bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { // determine sensible default number of threads. // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. 
@@ -80,66 +40,35 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::string arg; gpt_params default_params; - // get additional arguments from config files - std::vector args; for (int i = 1; i < argc; i++) { arg = argv[i]; - if (arg == "--config") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::string args_string; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(args_string)); - if (args_string.back() == '\n') { - args_string.pop_back(); - } - split_args(args_string, args); - for (int j = 0; j < args.size(); j++) { - args[j] = unescape(args[j]); - } - } else { - args.emplace_back(argv[i]); - } - } - - // parse args - int args_c = static_cast(args.size()); - for (int i = 0; i < args_c && !invalid_param; i++) { - arg = args[i]; if (arg == "-s" || arg == "--seed") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.seed = std::stoi(args[i]); + params.seed = std::stoi(argv[i]); } else if (arg == "-t" || arg == "--threads") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.n_threads = std::stoi(args[i]); + params.n_threads = std::stoi(argv[i]); } else if (arg == "-p" || arg == "--prompt") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.prompt = args[i]; + params.prompt = argv[i]; } else if (arg == "-f" || arg == "--file") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - std::ifstream file(args[i]); + std::ifstream file(argv[i]); if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", args[i].c_str()); + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); invalid_param = true; break; } @@ -148,100 +77,80 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.prompt.pop_back(); } } else if (arg == "-n" || arg == "--n_predict") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.n_predict = std::stoi(args[i]); + params.n_predict = std::stoi(argv[i]); } else if (arg == "--top_k") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.top_k = std::stoi(args[i]); + params.top_k = std::stoi(argv[i]); } else if (arg == "-c" || arg == "--ctx_size") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.n_ctx = std::stoi(args[i]); + params.n_ctx = std::stoi(argv[i]); } else if (arg == "--memory_f32") { params.memory_f16 = false; } else if (arg == "--top_p") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.top_p = std::stof(args[i]); + params.top_p = std::stof(argv[i]); } else if (arg == "--temp") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.temp = std::stof(args[i]); + params.temp = std::stof(argv[i]); } else if (arg == "--repeat_last_n") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.repeat_last_n = std::stoi(args[i]); + params.repeat_last_n = std::stoi(argv[i]); } else if (arg == "--repeat_penalty") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.repeat_penalty = std::stof(args[i]); + params.repeat_penalty = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch_size") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; 
} - params.n_batch = std::stoi(args[i]); + params.n_batch = std::stoi(argv[i]); params.n_batch = std::min(512, params.n_batch); } else if (arg == "--keep") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.n_keep = std::stoi(args[i]); + params.n_keep = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.model = args[i]; + params.model = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--embedding") { params.embedding = true; - } else if (arg == "--clean-interface") { - params.clean_interface = true; } else if (arg == "--interactive-start") { params.interactive = true; } else if (arg == "--interactive-first") { params.interactive_start = true; } else if (arg == "-ins" || arg == "--instruct") { - fprintf(stderr, "\n\nWarning: instruct mode is deprecated! Use: \n" - "--clean-interface " - "--interactive-first " - "--keep -1 " - "--ins-prefix-bos " - "--ins-prefix \"\\n\\n### Instruction:\\n\\n\" " - "--ins-suffix \"\\n\\n### Response:\\n\\n\" " - "-r \"### Instruction:\\n\\n\" " - "\n\n"); - // params.instruct = true; - params.clean_interface = true; - params.interactive_start = true; - params.n_keep = -1; - params.instruct_prefix_bos = true; - params.instruct_prefix = "\n\n### Instruction:\n\n"; - params.instruct_suffix = "\n\n### Response:\n\n"; - params.antiprompt.push_back("### Instruction:\n\n"); + params.instruct = true; } else if (arg == "--color") { params.use_color = true; - } else if (arg == "--disable-multiline") { - params.multiline_mode = false; } else if (arg == "--mlock") { params.use_mlock = true; } else if (arg == "--no-mmap") { @@ -251,94 +160,65 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { - if (++i >= args_c) { - invalid_param = true; - break; - } - params.antiprompt.push_back(args[i]); - } else if (arg == "--stop-prompt") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.stopprompt.push_back(args[i]); - } else if (arg == "--rm-trailing-space-workaround") { - params.rm_trailing_space_workaround = true; + params.antiprompt.push_back(argv[i]); } else if (arg == "--perplexity") { params.perplexity = true; } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--n_parts") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.n_parts = std::stoi(args[i]); + params.n_parts = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argv[0], default_params); + gpt_print_usage(argc, argv, default_params); exit(0); } else if (arg == "--random-prompt") { params.random_prompt = true; } else if (arg == "--in-prefix") { - if (++i >= args_c) { - invalid_param = true; - break; - } - params.input_prefix = args[i]; - } else if (arg == "--ins-prefix-bos") { - params.instruct_prefix_bos = true; - } else if (arg == "--ins-prefix") { - if (++i >= args_c) { - invalid_param = true; - break; - } - params.instruct_prefix = args[i]; - } else if (arg == "--ins-suffix-bos") { - params.instruct_suffix_bos = true; - } else if (arg == "--ins-suffix") { - if (++i >= args_c) { + if (++i >= argc) { invalid_param = true; break; } - params.instruct_suffix = args[i]; + params.input_prefix = argv[i]; } else { fprintf(stderr, "error: unknown argument: 
%s\n", arg.c_str()); - gpt_print_usage(argv[0], default_params); + gpt_print_usage(argc, argv, default_params); exit(1); } } if (invalid_param) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - gpt_print_usage(argv[0], default_params); + gpt_print_usage(argc, argv, default_params); exit(1); } return true; } -void gpt_print_usage(char * argv_0, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv_0); +void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -i, --interactive run in interactive mode\n"); fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n"); - fprintf(stderr, " --clean-interface hides input prefix & suffix and displays '>' instead\n"); + fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n"); fprintf(stderr, " specified more than once for multiple prompts).\n"); fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); - fprintf(stderr, " --disable-multiline disable multiline mode (use Ctrl+D on Linux/Mac and Ctrl+Z then Return on Windows to toggle multiline)\n"); fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); fprintf(stderr, " prompt to start generation with (default: empty)\n"); fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); - fprintf(stderr, " --ins-prefix STRING (instruct) prefix user inputs with tokenized string (default: empty)\n"); - fprintf(stderr, " --ins-prefix-bos (instruct) prepend bos token to instruct prefix.\n"); - fprintf(stderr, " --ins-suffix STRING (instruct) suffix user inputs with tokenized string (default: empty)\n"); - fprintf(stderr, " --ins-suffix-bos (instruct) prepend bos token to instruct suffix.\n"); fprintf(stderr, " -f FNAME, --file FNAME\n"); fprintf(stderr, " prompt file to start generation.\n"); fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); @@ -448,61 +328,3 @@ void win32_utf8_encode(const std::wstring & wstr, std::string & str) { str = strTo; } #endif - -bool get_input_text(std::string & input_text, bool eof_toggled_multiline_mode) { - bool another_line = true; - bool is_eof_multiline_toggled = false; - do { - std::string line; -#if defined(_WIN32) - auto & stdcin = std::wcin; - std::wstring wline; - if (!std::getline(stdcin, wline)) { - // input stream is bad or EOF received - if (stdcin.bad()) { - fprintf(stderr, "%s: error: input stream bad\n", __func__); - return 1; - } - } - win32_utf8_encode(wline, line); -#else - auto & stdcin = std::cin; - if (!std::getline(stdcin, line)) { - // input stream is bad or EOF received - if (stdcin.bad()) { - fprintf(stderr, "%s: error: input stream bad\n", __func__); - return 1; - } - } -#endif - if (stdcin.eof()) { - stdcin.clear(); - stdcin.seekg(0, std::ios::beg); - if 
(!eof_toggled_multiline_mode) { - another_line = false; - } else { - is_eof_multiline_toggled = !is_eof_multiline_toggled; - if (is_eof_multiline_toggled) { - input_text += line; - continue; - } - } - } - if (!eof_toggled_multiline_mode) { - if (line.empty() || line.back() != '\\') { - another_line = false; - } else { - line.pop_back(); // Remove the continue character - } - } else { - if (!is_eof_multiline_toggled) { - another_line = false; - } - } - input_text += line; - if (another_line) { - input_text += '\n'; // Append the line to the result - } - } while (another_line); - return true; -} diff --git a/examples/common.h b/examples/common.h index df8e4c6ccb9901..1ea6f744518111 100644 --- a/examples/common.h +++ b/examples/common.h @@ -14,14 +14,14 @@ // struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); // max 4 threads (default) - int32_t n_predict = 128; // new tokens to predict - int32_t repeat_last_n = 64; // last n tokens to penalize - int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) - int32_t n_ctx = 512; // context size - int32_t n_batch = 8; // batch size for prompt processing - int32_t n_keep = 0; // number of tokens to keep from initial prompt (-1 for all) + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_predict = 128; // new tokens to predict + int32_t repeat_last_n = 64; // last n tokens to penalize + int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) + int32_t n_ctx = 512; // context size + int32_t n_batch = 8; // batch size for prompt processing + int32_t n_keep = 0; // number of tokens to keep from initial prompt // sampling parameters int32_t top_k = 40; @@ -33,15 +33,8 @@ struct gpt_params { std::string prompt = ""; std::string input_prefix = ""; // string to prefix user inputs with - std::string instruct_prefix = ""; // prefix user inputs with tokenized string - bool instruct_prefix_bos = false; // prepend bos token to instruct prefix - std::string instruct_suffix = ""; // suffix user inputs with tokenized string - bool instruct_suffix_bos = false; // prepend bos token to instruct suffix std::vector antiprompt; // string upon seeing which more user input is prompted - std::vector stopprompt; // string upon seeing which more user input is prompted (without adding instruct prefixes and suffixes) - - bool rm_trailing_space_workaround = false; // workaround for removing trailing space from reverse/stop prompts bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided @@ -58,14 +51,11 @@ struct gpt_params { bool use_mlock = false; // use mlock to keep model in memory bool mem_test = false; // compute maximum memory usage bool verbose_prompt = false; // print prompt tokens before generation - - bool clean_interface = false; // hides input prefix & suffix and displays '>' - bool multiline_mode = true; // enables multi-line mode, to send input press CTRL+D on Linux/Max, Ctrl+Z then Return on Windows }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); -void gpt_print_usage(char * argv_0, const gpt_params & params); +void gpt_print_usage(int argc, char ** argv, const gpt_params & params); std::string gpt_random_prompt(std::mt19937 & rng); @@ -105,5 +95,3 @@ void set_console_color(console_state & con_st, console_color_t color); void 
win32_console_init(bool enable_color); void win32_utf8_encode(const std::wstring & wstr, std::string & str); #endif - -bool get_input_text(std::string & input_text, bool escape_newline_mode); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 68b4b2840858e7..ba153cb82dcf67 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -30,8 +30,7 @@ static bool is_interacting = false; #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) void sigint_handler(int signo) { set_console_color(con_st, CONSOLE_COLOR_DEFAULT); - fflush(stdout); - fflush(stderr); + printf("\n"); // this also force flush stdout. if (signo == SIGINT) { if (!is_interacting) { is_interacting=true; @@ -90,8 +89,6 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - bool instruct_mode = !params.instruct_prefix.empty() || !params.instruct_suffix.empty(); - // params.prompt = R"(// this function checks if the number n is prime //bool is_prime(int n) {)"; @@ -156,20 +153,22 @@ int main(int argc, char ** argv) { } // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) { + if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) { params.n_keep = (int)embd_inp.size(); } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, params.instruct_prefix, params.instruct_prefix_bos); - std::string instruct_suffix = params.instruct_suffix; - if (params.rm_trailing_space_workaround) { - if (instruct_suffix.back() == ' ') { instruct_suffix.pop_back(); } + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + + // in instruct mode, we inject a prefix and a suffix to each input by the user + if (params.instruct) { + params.interactive_start = true; + params.antiprompt.push_back("### Instruction:\n\n"); } - const auto inp_sfx = ::llama_tokenize(ctx, instruct_suffix, params.instruct_suffix_bos); // enable interactive mode if reverse prompt or interactive start is specified - if (params.antiprompt.size() != 0 || params.stopprompt.size() != 0 || params.interactive_start) { + if (params.antiprompt.size() != 0 || params.interactive_start) { params.interactive = true; } @@ -211,21 +210,10 @@ int main(int argc, char ** argv) { fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); } } - if (params.stopprompt.size()) { - for (auto stopprompt : params.stopprompt) { - fprintf(stderr, "Stop prompt: '%s'\n", stopprompt.c_str()); - } - } if (!params.input_prefix.empty()) { fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); } - if (!params.instruct_prefix.empty()) { - fprintf(stderr, "Instruct prefix %s: '%s'\n", params.instruct_prefix_bos ? "(with bos token)" : "", params.instruct_prefix.c_str()); - } - if (!params.instruct_suffix.empty()) { - fprintf(stderr, "Instruct suffix %s: '%s'\n", params.instruct_suffix_bos ? 
"(with bos token)" : "", params.instruct_suffix.c_str()); - } } fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); @@ -241,29 +229,12 @@ int main(int argc, char ** argv) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) " - Press Ctrl+C to interject at any time.\n" #endif - ); - if (params.multiline_mode) { - fprintf(stderr, " - Press Return to return control to LLaMa.\n" -#if defined (_WIN32) - " - [MULTILINE MODE] Press Ctrl+Z then Return (EOF) to toggle.\n\n"); -#else - " - [MULTILINE MODE] Press Ctrl+D (EOF) to toggle.\n\n"); -#endif - } - else { - fprintf(stderr, " - Press Return to return control to LLaMa.\n" - " - If you want to submit another line, end your input in '\\'.\n\n"); - } + " - Press Return to return control to LLaMa.\n" + " - If you want to submit another line, end your input in '\\'.\n\n"); is_interacting = params.interactive_start; } - struct Antiprompt { - bool any = false; - bool trailing_space = false; - size_t len; - bool is_stop_prompt = false; - } antiprompt; - + bool is_antiprompt = false; bool input_noecho = false; int n_past = 0; @@ -333,7 +304,7 @@ int main(int argc, char ** argv) { } // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive && !instruct_mode) { + if (id == llama_token_eos() && params.interactive && !params.instruct) { id = llama_token_newline.front(); if (params.antiprompt.size() != 0) { // tokenize and inject first reverse prompt @@ -379,72 +350,27 @@ int main(int argc, char ** argv) { // check if we should prompt the user for more if (params.interactive && (int) embd_inp.size() <= n_consumed) { - // check for reverse prompt or stop prompt - if (params.antiprompt.size() || params.stopprompt.size()) { + // check for reverse prompt + if (params.antiprompt.size()) { std::string last_output; for (auto id : last_n_tokens) { last_output += llama_token_to_str(ctx, id); } - antiprompt.any = false; - antiprompt.is_stop_prompt = false; + is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. - for (std::string & prompt : params.antiprompt) { - if (params.rm_trailing_space_workaround) { - antiprompt.trailing_space = prompt.back() == ' '; - antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0); - } - if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) { + for (std::string & antiprompt : params.antiprompt) { + if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { is_interacting = true; - antiprompt.any = true; + is_antiprompt = true; set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); fflush(stdout); break; } } - if (!antiprompt.any) { - for (std::string & prompt : params.stopprompt) { - if (params.rm_trailing_space_workaround) { - antiprompt.trailing_space = prompt.back() == ' '; - antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 
1 : 0); - } - if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) { - is_interacting = true; - antiprompt.any = true; - antiprompt.is_stop_prompt = true; - set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); - fflush(stdout); - break; - } - } - } } - if (n_past > 0 && is_interacting) - { - std::string buffer; - if (!params.clean_interface && !params.instruct_prefix.empty() && !antiprompt.any) { - // avoid printing again user's new line (TODO: try to revert enter press and print newline) - int i = params.instruct_prefix.front() == '\n' ? 1 : 0; - for (; i < inp_pfx.size(); i++) { - printf("%s", llama_token_to_str(ctx, inp_pfx[i])); - } - fflush(stdout); - } - if (params.rm_trailing_space_workaround) { - // add only if not stopprompt (as stopprompt could be used to pause - // assistant and then continue without input - adding back trailing - // space may mess it up.) - if (!antiprompt.is_stop_prompt && antiprompt.any && antiprompt.trailing_space) { - // add back removed trailing space to buffer(workaround) - buffer += ' '; - if (!params.clean_interface) { - printf("%s", buffer.c_str()); - } - fflush(stdout); - } - } - + if (n_past > 0 && is_interacting) { // potentially set color to indicate we are taking user input set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); @@ -453,45 +379,49 @@ int main(int argc, char ** argv) { signal(SIGINT, sigint_handler); #endif - if (params.clean_interface) { + if (params.instruct) { printf("\n> "); } + std::string buffer; if (!params.input_prefix.empty()) { buffer += params.input_prefix; printf("%s", buffer.c_str()); } - if (!get_input_text(buffer, params.multiline_mode)) { - // input stream is bad - return 1; - } - if (!antiprompt.is_stop_prompt) { - buffer += "\n"; - } + std::string line; + bool another_line = true; + do { +#if defined(_WIN32) + std::wstring wline; + if (!std::getline(std::wcin, wline)) { + // input stream is bad or EOF received + return 0; + } + win32_utf8_encode(wline, line); +#else + if (!std::getline(std::cin, line)) { + // input stream is bad or EOF received + return 0; + } +#endif + if (line.empty() || line.back() != '\\') { + another_line = false; + } else { + line.pop_back(); // Remove the continue character + } + buffer += line + '\n'; // Append the line to the result + } while (another_line); // done taking input, reset color set_console_color(con_st, CONSOLE_COLOR_DEFAULT); - if (!params.clean_interface && !params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) { - // avoid printing again user's new line (TODO: try to revert enter press and print newline) - int i = params.instruct_suffix.front() == '\n' ? 1 : 0; - for (; i < inp_sfx.size(); i++) { - printf("%s", llama_token_to_str(ctx, inp_sfx[i])); - } - // if (remove trailing space workaround) { - // We won't add back removed trailing space here, because assistant continues here, - // and it may mess up it's output (remove trailing space workaround). 
- // } - fflush(stdout); - } - // Add tokens to embd only if the input buffer is non-empty // Entering a empty line lets the user pass control back if (buffer.length() > 1) { - // insert input prefix - if (!params.instruct_prefix.empty() && !antiprompt.any) { + // instruct mode: insert instruction prefix + if (params.instruct && !is_antiprompt) { n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } @@ -499,8 +429,8 @@ int main(int argc, char ** argv) { auto line_inp = ::llama_tokenize(ctx, buffer, false); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); - // insert response suffix - if (!params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) { + // instruct mode: insert response suffix + if (params.instruct) { embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } @@ -517,7 +447,7 @@ int main(int argc, char ** argv) { // end of text token if (!embd.empty() && embd.back() == llama_token_eos()) { - if (instruct_mode) { + if (params.instruct) { is_interacting = true; } else { fprintf(stderr, " [end of text]\n"); diff --git a/prompts/alpaca.txt b/prompts/alpaca.txt new file mode 100644 index 00000000000000..2224bdeb0bcd44 --- /dev/null +++ b/prompts/alpaca.txt @@ -0,0 +1 @@ +Below is an instruction that describes a task. Write a response that appropriately completes the request. diff --git a/prompts/chat-with-bob.txt b/prompts/chat-with-bob.txt new file mode 100644 index 00000000000000..ad494d831f6fb8 --- /dev/null +++ b/prompts/chat-with-bob.txt @@ -0,0 +1,7 @@ +Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. + +User: Hello, Bob. +Bob: Hello. How may I help you today? +User: Please tell me the largest city in Europe. +Bob: Sure. The largest city in Europe is Moscow, the capital of Russia. 
+User: \ No newline at end of file From 106faaf2971d6c89d6010279a9a95737772470ef Mon Sep 17 00:00:00 2001 From: katsu560 <118887472+katsu560@users.noreply.github.com> Date: Sat, 15 Apr 2023 14:51:11 +0900 Subject: [PATCH 044/773] cmake : add finding the OpenBLAS header file (#992) --- CMakeLists.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d5715d92aa8dfb..5a20de3a216957 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,6 +120,21 @@ if (LLAMA_OPENBLAS) add_compile_definitions(GGML_USE_OPENBLAS) add_link_options(${BLAS_LIBRARIES}) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas) + + # find header file + set(OPENBLAS_INCLUDE_SEARCH_PATHS + /usr/include + /usr/include/openblas + /usr/include/openblas-base + /usr/local/include + /usr/local/include/openblas + /usr/local/include/openblas-base + /opt/OpenBLAS/include + $ENV{OpenBLAS_HOME} + $ENV{OpenBLAS_HOME}/include + ) + find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + add_compile_options(-I${OPENBLAS_INC}) else() message(WARNING "OpenBLAS not found") endif() From c12b14b77fced0ce9a0e2d81f670c3a746dec251 Mon Sep 17 00:00:00 2001 From: Ivan Komarov Date: Sat, 15 Apr 2023 07:51:54 +0200 Subject: [PATCH 045/773] benchmark : fix result validation in benchmark-q4_0-matmult (#987) --- examples/benchmark/benchmark-q4_0-matmult.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c index 90f537fd8ae40c..84b06766c15dc7 100644 --- a/examples/benchmark/benchmark-q4_0-matmult.c +++ b/examples/benchmark/benchmark-q4_0-matmult.c @@ -24,7 +24,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) { float sum = 0; - if (tensor->type==6) { + if (tensor->type==GGML_TYPE_F32) { for (int j = 0; j < tensor->ne[1]; j++) { for (int k = 0; k < tensor->ne[0]; k++) { sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; From aa485cee334e84437e21681c14b6f80b65876d8b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 15 Apr 2023 14:25:45 +0300 Subject: [PATCH 046/773] ggml : use posix_memalign on non-Windows env --- ggml.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index 1574d649846dd9..cf6a81f43cfed5 100644 --- a/ggml.c +++ b/ggml.c @@ -118,7 +118,16 @@ typedef void* thread_ret_t; #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else -#define GGML_ALIGNED_MALLOC(size) aligned_alloc(GGML_MEM_ALIGN, size) +inline static void* ggml_aligned_malloc(size_t size) { + void* aligned_memory = NULL; + int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); + if (result != 0) { + // Handle allocation failure + return NULL; + } + return aligned_memory; +} +#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) #define GGML_ALIGNED_FREE(ptr) free(ptr) #endif @@ -531,31 +540,31 @@ inline static float vaddvq_f32(float32x4_t v) { return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); } -inline float vminvq_f32(float32x4_t v) { +float vminvq_f32(float32x4_t v) { return MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); } -inline float vmaxvq_f32(float32x4_t v) { +float vmaxvq_f32(float32x4_t v) { return MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); } -inline int8x8_t 
vzip1_s8(int8x8_t a, int8x8_t b) { +int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) { return vget_low_s8(vcombine_s8(a, b)); } -inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) { +int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) { return vget_high_s8(vcombine_s8(a, b)); } -inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { +uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { return vget_low_u8(vcombine_u8(a, b)); } -inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { +uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { return vget_high_u8(vcombine_u8(a, b)); } From e95b6554b493e71a0275764342e09bd5784a7026 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 15 Apr 2023 17:53:22 +0300 Subject: [PATCH 047/773] ggml : add Q8_0 quantization for intermediate results (#951) * ggml : add Q8_0 quantization for intermediate results * quantize-stats : fix test + add it to Makefile default * Q8: use int8_t, AVX/AVX2 optimizations * ggml : fix quantize_row_q8_0() ARM_NEON rounding * minor : updates after rebase to latest master * quantize-stats : delete obsolete strings * ggml : fix q4_1 dot func --------- Co-authored-by: Stephan Walter --- Makefile | 2 +- ggml.c | 456 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- ggml.h | 2 + 3 files changed, 442 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index a1b99c6f9dfe29..e7470d51a22945 100644 --- a/Makefile +++ b/Makefile @@ -133,7 +133,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize perplexity embedding +default: main quantize quantize-stats perplexity embedding # # Build library diff --git a/ggml.c b/ggml.c index cf6a81f43cfed5..54b9f764f78393 100644 --- a/ggml.c +++ b/ggml.c @@ -575,7 +575,7 @@ uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) typedef struct { - float d; // delta + float d; // delta uint8_t qs[QK / 2]; // nibbles / quants } block_q4_0; static_assert(sizeof(block_q4_0) == sizeof(float) + QK / 2, "wrong q4_0 block size/padding"); @@ -584,12 +584,19 @@ static_assert(sizeof(block_q4_0) == sizeof(float) + QK / 2, "wrong q4_0 block si // blocks of QK elements // represented with 2 floats (delta + min) and QK/2 8-bit ints (i.e QK 4-bit unsigned integer factors) typedef struct { - float d; - float m; + float d; // delta + float m; // min uint8_t qs[QK / 2]; // nibbles / quants } block_q4_1; static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding"); +typedef struct { + float d; // delta + int8_t qs[QK]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(float) + QK, "wrong q8_0 block size/padding"); + + // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { assert(k % QK == 0); @@ -1042,6 +1049,157 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int #endif } +// reference implementation for deterministic creation of model files +static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { + assert(k % QK == 0); + const int nb = k / QK; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int l = 0; l < QK; l++) { + const float v = x[i*QK + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = d; + + for (int l = 0; l < QK; ++l) { + const float v = x[i*QK + l]*id; + y[i].qs[l] = roundf(v); + } + } +} + +static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(k % QK == 0); + const int nb = k / QK; + + block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); + for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); + + for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); + for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); + for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + for (int l = 0; l < 8; l++) { + const float32x4_t v = vmulq_n_f32(srcv[l], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + } + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + // scalar + quantize_row_q8_0_reference(x, y, k); +#endif +} + static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { assert(k % QK == 0); const int nb = k / QK; @@ -2344,12 +2502,12 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4); sum00 += x0->m*y0->m; - sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h)); - sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h)); + sum01 += y0->m*x0->d*((uint16_t)vaddvq_u8(v0_0l) + (uint16_t)vaddvq_u8(v0_0h)); + sum10 += x0->m*y0->d*((uint16_t)vaddvq_u8(v1_0l) + (uint16_t)vaddvq_u8(v1_0h)); sum00 += x1->m*y1->m; - sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h)); - sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h)); + sum01 += y1->m*x1->d*((uint16_t)vaddvq_u8(v0_1l) + (uint16_t)vaddvq_u8(v0_1h)); + sum10 += x1->m*y1->d*((uint16_t)vaddvq_u8(v1_1l) + (uint16_t)vaddvq_u8(v1_1h)); #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t @@ -2417,6 +2575,209 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest *s = sumf; } +static void ggml_vec_dot_q4_0_q8_0(const 
int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / QK; + + assert(n % QK == 0); + assert(nb % 2 == 0); + + const block_q4_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + + float sumf = 0.0; + +#if defined(__ARM_NEON) + float sum0 = 0.0f; + float sum1 = 0.0f; + + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + // interleave + const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h); + const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h); + const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h); + const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls); + int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls); + + p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs); + p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs); + + sum0 += x0->d*y0->d*vaddvq_s32(p_0); + sum1 += x1->d*y1->d*vaddvq_s32(p_1); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); + + const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h); + const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h); + + const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h); + const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h); + + const int16x8_t p_0 = vaddq_s16(pl_0, ph_0); + const int16x8_t p_1 = vaddq_s16(pl_1, ph_1); + + sum0 += x0->d*y0->d*vaddvq_s16(p_0); + sum1 += x1->d*y1->d*vaddvq_s16(p_1); +#endif + } + + sumf = sum0 + sum1; +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); + + __m256i bx = bytesFromNibbles(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
+ const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(bx, bx); + + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(by, bx); + + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + + const __m256i ones = _mm256_set1_epi16(1); + __m256i xy_q = _mm256_madd_epi16(ones, dot); + + /* Convert to vectore of 8 int32_t to 8 floats */ + __m256 q = _mm256_cvtepi32_ps( xy_q ); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm256_extractf128_ps( acc, 1 ); + res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); + res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); + res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); + + sumf = _mm_cvtss_f32( res ); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); + + __m128i i32[2]; + for (int j = 0; j < 2; ++j) { + // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes + __m128i bx = bytesFromNibbles( x[i].qs + 8*j ); + __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m128i off = _mm_set1_epi8( 8 ); + bx = _mm_sub_epi8( bx, off ); + + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(bx, bx); + + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(by, bx); + + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + + const __m128i ones = _mm_set1_epi16(1); + i32[j] = _mm_madd_epi16(ones, dot); + } + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); + // Apply the scale, and accumulate + acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm256_extractf128_ps( acc, 1 ); + res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); + res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); + res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); + + sumf = _mm_cvtss_f32( res ); +#else + // scalar + for (int i = 0; i < nb; i++) { + const float d0 = x[i].d; + const float d1 = y[i].d; + + const uint8_t * restrict p0 = x[i].qs; + const int8_t * restrict p1 = y[i].qs; + + int sumi = 0; + for (int j = 0; j < QK/2; j++) { + const uint8_t v0 = p0[j]; + + const int i0 = (int8_t) (v0 & 0xf) - 8; + const int i1 = (int8_t) (v0 >> 4) - 8; + + const int i2 = p1[2*j + 0]; + const int i3 = p1[2*j + 1]; + + sumi += i0*i2 + i1*i3; + } + sumf += d0*d1*sumi; + } +#endif + + *s = sumf; +} + // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { @@ -2663,22 +3024,24 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = 1, [GGML_TYPE_Q4_0] = QK, [GGML_TYPE_Q4_1] = QK, + [GGML_TYPE_Q8_0] = QK, [GGML_TYPE_I8] = 1, [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, }; -static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE 
is outdated"); +static_assert(GGML_TYPE_COUNT == 8, "GGML_BLCK_SIZE is outdated"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = sizeof(float), [GGML_TYPE_F16] = sizeof(ggml_fp16_t), [GGML_TYPE_Q4_0] = sizeof(block_q4_0), [GGML_TYPE_Q4_1] = sizeof(block_q4_1), + [GGML_TYPE_Q8_0] = sizeof(block_q8_0), [GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), }; -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 8, "GGML_TYPE_SIZE is outdated"); static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { @@ -2686,11 +3049,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = "f16", [GGML_TYPE_Q4_0] = "q4_0", [GGML_TYPE_Q4_1] = "q4_1", + [GGML_TYPE_Q8_0] = "q8_0", [GGML_TYPE_I8] = "i8", [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", }; -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_NAME is outdated"); +static_assert(GGML_TYPE_COUNT == 8, "GGML_TYPE_NAME is outdated"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", @@ -3371,6 +3735,10 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { { GGML_ASSERT(false); } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(false); + } break; case GGML_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); @@ -3431,6 +3799,10 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { { GGML_ASSERT(false); } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(false); + } break; case GGML_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); @@ -3485,6 +3857,10 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { { GGML_ASSERT(false); } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(false); + } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3529,6 +3905,10 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { { GGML_ASSERT(false); } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(false); + } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3571,6 +3951,10 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { { GGML_ASSERT(false); } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(false); + } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3615,6 +3999,10 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { { GGML_ASSERT(false); } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(false); + } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -5437,6 +5825,7 @@ static void ggml_compute_forward_dup( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -5518,6 +5907,7 @@ static void ggml_compute_forward_add( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -5570,6 +5960,7 @@ static void ggml_compute_forward_sub( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -5622,6 +6013,7 @@ static void ggml_compute_forward_mul( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -5674,6 +6066,7 @@ static void ggml_compute_forward_div( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case 
GGML_TYPE_I16: case GGML_TYPE_I32: @@ -5722,6 +6115,7 @@ static void ggml_compute_forward_sqr( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -5770,6 +6164,7 @@ static void ggml_compute_forward_sqrt( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -5828,6 +6223,7 @@ static void ggml_compute_forward_sum( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -5905,6 +6301,7 @@ static void ggml_compute_forward_mean( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -5969,6 +6366,7 @@ static void ggml_compute_forward_repeat( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6017,6 +6415,7 @@ static void ggml_compute_forward_abs( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6065,6 +6464,7 @@ static void ggml_compute_forward_sgn( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6113,6 +6513,7 @@ static void ggml_compute_forward_neg( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6161,6 +6562,7 @@ static void ggml_compute_forward_step( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6209,6 +6611,7 @@ static void ggml_compute_forward_relu( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6274,6 +6677,7 @@ static void ggml_compute_forward_gelu( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6341,6 +6745,7 @@ static void ggml_compute_forward_silu( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6427,6 +6832,7 @@ static void ggml_compute_forward_norm( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6507,6 +6913,7 @@ static void ggml_compute_forward_rms_norm( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -6908,14 +7315,17 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .dequantize_row_q = dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, - .vec_dot_q = ggml_vec_dot_q4_0, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q4_0_q8_0, }, [GGML_TYPE_Q4_1] = { .dequantize_row_q = dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, + .quantize_row_q_dot = quantize_row_q4_1, .vec_dot_q = ggml_vec_dot_q4_1, }, + // TODO: GGML_TYPE_Q8_0 }; // For internal test use @@ -6971,8 +7381,8 @@ static void ggml_compute_forward_mul_mat_q_f32( GGML_ASSERT(ne3 == ne13); const enum ggml_type type = src0->type; 
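Before the lines that follow, it helps to see the shape of this change in isolation: during the GGML_TASK_INIT pass the f32 rows of src1 are quantized into Q8_0 blocks held in params->wdata, and the per-row results are then produced with the mixed Q4_0 x Q8_0 dot kernel, so each activation row is quantized once and reused against every weight row. The sketch below is only an illustration of that flow, not the ggml code itself: the block layouts mirror the ones in this patch series, but the helper names (row_to_q8_0, dot_q4_0_q8_0, mul_mat_q4_0_f32) are invented for the example, QK stands for the shared block size of 32 used at this point in the series, and threading, broadcast dimensions and error handling are omitted.

    // Minimal sketch (not the ggml implementation): quantize each f32 row of src1
    // once into Q8_0 scratch, then reuse that scratch row against every Q4_0 row of src0.
    #include <assert.h>
    #include <math.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define QK 32  // block size shared by Q4_0 and Q8_0 at this point in the series

    typedef struct { float d; uint8_t qs[QK/2]; } block_q4_0;  // delta + 32 packed nibbles
    typedef struct { float d; int8_t  qs[QK];   } block_q8_0;  // delta + 32 signed bytes

    // reference-style Q8_0 row quantization (same idea as the Q8_0 reference quantizer)
    static void row_to_q8_0(const float *x, block_q8_0 *y, int k) {
        assert(k % QK == 0);
        for (int i = 0; i < k/QK; i++) {
            float amax = 0.0f;  // absolute max of the block
            for (int l = 0; l < QK; l++) amax = fmaxf(amax, fabsf(x[i*QK + l]));
            const float d  = amax / 127.0f;
            const float id = d ? 1.0f/d : 0.0f;
            y[i].d = d;
            for (int l = 0; l < QK; l++) y[i].qs[l] = (int8_t) roundf(x[i*QK + l]*id);
        }
    }

    // scalar Q4_0 x Q8_0 block dot product (same math as the scalar fallback path)
    static float dot_q4_0_q8_0(const block_q4_0 *x, const block_q8_0 *y, int k) {
        float sumf = 0.0f;
        for (int i = 0; i < k/QK; i++) {
            int sumi = 0;
            for (int j = 0; j < QK/2; j++) {
                const int i0 = (x[i].qs[j] & 0xf) - 8;  // low nibble
                const int i1 = (x[i].qs[j] >>  4) - 8;  // high nibble
                sumi += i0*y[i].qs[2*j + 0] + i1*y[i].qs[2*j + 1];
            }
            sumf += x[i].d*y[i].d*sumi;
        }
        return sumf;
    }

    // mat-mul flow: src0 is ne01 rows of ne00 Q4_0 values, src1 is ne11 rows of ne00 floats
    static void mul_mat_q4_0_f32(const block_q4_0 *src0, const float *src1, float *dst,
                                 int ne00, int ne01, int ne11) {
        block_q8_0 *wdata = malloc((size_t)ne11*(ne00/QK)*sizeof(block_q8_0));  // NULL check omitted
        for (int r = 0; r < ne11; r++) {  // "INIT" pass: quantize each src1 row once
            row_to_q8_0(src1 + (size_t)r*ne00, wdata + (size_t)r*(ne00/QK), ne00);
        }
        for (int i = 0; i < ne01; i++) {  // "COMPUTE" pass: reuse the Q8_0 rows for every weight row
            for (int r = 0; r < ne11; r++) {
                dst[(size_t)r*ne01 + i] =
                    dot_q4_0_q8_0(src0 + (size_t)i*(ne00/QK), wdata + (size_t)r*(ne00/QK), ne00);
            }
        }
        free(wdata);
    }

The point of routing the activation side through Q8_0 rather than through the old quantize-to-Q4_0 path is that the inner loop stays in cheap integer arithmetic while the src1 rows lose far less precision, and the quantization cost is paid only once per src1 row instead of once per dot product.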
- quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; - vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; + quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; + vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); @@ -7041,12 +7451,12 @@ static void ggml_compute_forward_mul_mat_q_f32( if (params->type == GGML_TASK_INIT) { char * wdata = params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type]; + const size_t row_size = ne10*GGML_TYPE_SIZE[GGML_TYPE_Q8_0]/GGML_BLCK_SIZE[GGML_TYPE_Q8_0]; for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { - quantize_row_q((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); wdata += row_size; } } @@ -7072,7 +7482,7 @@ static void ggml_compute_forward_mul_mat_q_f32( const int ir1 = MIN(ir0 + dr, nr); void * wdata = params->wdata; - const size_t row_size = ne00*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type]; + const size_t row_size = ne00*GGML_TYPE_SIZE[GGML_TYPE_Q8_0]/GGML_BLCK_SIZE[GGML_TYPE_Q8_0]; for (int ir = ir0; ir < ir1; ++ir) { // src0 indices @@ -7120,6 +7530,7 @@ static void ggml_compute_forward_mul_mat( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: { ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); } break; @@ -7218,6 +7629,7 @@ static void ggml_compute_forward_scale( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -7383,6 +7795,7 @@ static void ggml_compute_forward_get_rows( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -7472,6 +7885,7 @@ static void ggml_compute_forward_diag_mask_inf( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -7566,6 +7980,7 @@ static void ggml_compute_forward_soft_max( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -7749,6 +8164,7 @@ static void ggml_compute_forward_rope( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -8017,6 +8433,7 @@ static void ggml_compute_forward_conv_1d_1s( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -8285,6 +8702,7 @@ static void ggml_compute_forward_conv_1d_2s( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -8770,6 +9188,7 @@ static void ggml_compute_forward_flash_attn( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -8981,6 +9400,7 @@ static void ggml_compute_forward_flash_ff( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -9030,6 +9450,7 @@ static void ggml_compute_forward_map_unary( } break; case GGML_TYPE_Q4_0: case 
GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -9085,6 +9506,7 @@ static void ggml_compute_forward_map_binary( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -9914,7 +10336,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else #endif { - cur = GGML_TYPE_SIZE[node->src0->type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[node->src0->type]; + cur = GGML_TYPE_SIZE[GGML_TYPE_Q8_0]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[GGML_TYPE_Q8_0]; } } else { GGML_ASSERT(false); diff --git a/ggml.h b/ggml.h index 617298a95536dd..241e96a1975b1c 100644 --- a/ggml.h +++ b/ggml.h @@ -204,6 +204,7 @@ enum ggml_type { GGML_TYPE_F16 = 1, GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, + GGML_TYPE_Q8_0 = 4, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -836,6 +837,7 @@ typedef struct { dequantize_row_q_t dequantize_row_q; quantize_row_q_t quantize_row_q; quantize_row_q_t quantize_row_q_reference; + quantize_row_q_t quantize_row_q_dot; vec_dot_q_t vec_dot_q; } quantize_fns_t; From 0ad964631f9b3970f1936008fcfb1eadef59c7ed Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Sat, 15 Apr 2023 16:25:38 +0000 Subject: [PATCH 048/773] Refactor ggml.c for future tensor types (#1001) --- ggml.c | 537 ++++++++++++++------------------------------------------- 1 file changed, 129 insertions(+), 408 deletions(-) diff --git a/ggml.c b/ggml.c index 54b9f764f78393..ccad76e8c018e3 100644 --- a/ggml.c +++ b/ggml.c @@ -427,8 +427,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // quantization // -#define QK 32 - // AVX routines provided by GH user Const-me // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600 #if __AVX2__ || __AVX512F__ @@ -571,44 +569,42 @@ uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { #endif #endif -// method 5 -// blocks of QK elements -// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) + +#define QK4_0 32 typedef struct { float d; // delta - uint8_t qs[QK / 2]; // nibbles / quants + uint8_t qs[QK4_0 / 2]; // nibbles / quants } block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(float) + QK / 2, "wrong q4_0 block size/padding"); +static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding"); -// method 4 -// blocks of QK elements -// represented with 2 floats (delta + min) and QK/2 8-bit ints (i.e QK 4-bit unsigned integer factors) +#define QK4_1 32 typedef struct { float d; // delta float m; // min - uint8_t qs[QK / 2]; // nibbles / quants + uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; -static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding"); +static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); +#define QK8_0 32 typedef struct { - float d; // delta - int8_t qs[QK]; // quants + float d; // delta + int8_t qs[QK8_0]; // quants } block_q8_0; -static_assert(sizeof(block_q8_0) == sizeof(float) + QK, "wrong q8_0 block size/padding"); +static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; - uint8_t pp[QK/2]; 
+ uint8_t pp[QK4_0/2]; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - for (int l = 0; l < QK; l++) { - const float v = x[i*QK + l]; + for (int l = 0; l < QK4_0; l++) { + const float v = x[i*QK4_0 + l]; amax = MAX(amax, fabsf(v)); } @@ -617,9 +613,9 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r y[i].d = d; - for (int l = 0; l < QK; l += 2) { - const float v0 = x[i*QK + l + 0]*id; - const float v1 = x[i*QK + l + 1]*id; + for (int l = 0; l < QK4_0; l += 2) { + const float v0 = x[i*QK4_0 + l + 0]*id; + const float v1 = x[i*QK4_0 + l + 1]*id; const uint8_t vi0 = (int8_t)roundf(v0) + 8; const uint8_t vi1 = (int8_t)roundf(v1) + 8; @@ -635,8 +631,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r } static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; block_q4_0 * restrict y = vy; @@ -886,19 +882,19 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int } static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_1 == 0); + const int nb = k / QK4_1; block_q4_1 * restrict y = vy; - uint8_t pp[QK/2]; + uint8_t pp[QK4_1/2]; for (int i = 0; i < nb; i++) { float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < QK; l++) { - const float v = x[i*QK + l]; + for (int l = 0; l < QK4_1; l++) { + const float v = x[i*QK4_1 + l]; if (v < min) min = v; if (v > max) max = v; } @@ -909,9 +905,9 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric y[i].d = d; y[i].m = min; - for (int l = 0; l < QK; l += 2) { - const float v0 = (x[i*QK + l + 0] - min)*id; - const float v1 = (x[i*QK + l + 1] - min)*id; + for (int l = 0; l < QK4_1; l += 2) { + const float v0 = (x[i*QK4_1 + l + 0] - min)*id; + const float v1 = (x[i*QK4_1 + l + 1] - min)*id; const uint8_t vi0 = roundf(v0); const uint8_t vi1 = roundf(v1); @@ -927,9 +923,9 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric } static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK == 0); + assert(k % QK4_1 == 0); - const int nb = k / QK; + const int nb = k / QK4_1; block_q4_1 * restrict y = vy; @@ -1013,7 +1009,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int float32x4_t minv[8]; float32x4_t maxv[8]; - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK + 4*l); + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK4_1 + 4*l); for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]); for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]); @@ -1051,14 +1047,14 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int // reference implementation for deterministic creation of model files static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - for (int l = 0; l < QK; l++) { - const float v = x[i*QK + l]; + for (int l = 0; l < QK8_0; l++) { + const float v = x[i*QK8_0 + l]; amax = MAX(amax, fabsf(v)); } @@ -1067,16 +1063,16 @@ static void quantize_row_q8_0_reference(const float * restrict x, 
block_q8_0 * r y[i].d = d; - for (int l = 0; l < QK; ++l) { - const float v = x[i*QK + l]*id; + for (int l = 0; l < QK8_0; ++l) { + const float v = x[i*QK8_0 + l]*id; y[i].qs[l] = roundf(v); } } } static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; block_q8_0 * restrict y = vy; @@ -1201,8 +1197,8 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int } static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; const block_q4_0 * restrict x = vx; @@ -1213,7 +1209,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 32) { + for (int l = 0; l < QK4_0; l += 32) { // Load 32x4-bit integers into 32x8-bit integers __m256i vx8 = bytesFromNibbles(pp+l/2); @@ -1235,7 +1231,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in // Scale and store for (int j = 0; j < 4; j++) { const __m256 result = _mm256_mul_ps(vf[j], d_v); - _mm256_storeu_ps(y + i * QK + l + j*8, result); + _mm256_storeu_ps(y + i * QK4_0 + l + j*8, result); } } } @@ -1245,7 +1241,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 16) { + for (int l = 0; l < QK4_0; l += 16) { // Load 16x4-bit integers into 8x8-bit integers const uint8x8_t v8 = vld1_u8(pp + l/2); @@ -1284,10 +1280,10 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in const float32x4_t r3 = vmulq_f32(vf_3, vd); // Store - vst1q_f32(y + i*QK + l + 0, r0); - vst1q_f32(y + i*QK + l + 4, r1); - vst1q_f32(y + i*QK + l + 8, r2); - vst1q_f32(y + i*QK + l + 12, r3); + vst1q_f32(y + i*QK4_0 + l + 0, r0); + vst1q_f32(y + i*QK4_0 + l + 4, r1); + vst1q_f32(y + i*QK4_0 + l + 8, r2); + vst1q_f32(y + i*QK4_0 + l + 12, r3); } } #else @@ -1297,7 +1293,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 2) { + for (int l = 0; l < QK4_0; l += 2) { const uint8_t vi = pp[l/2]; const int8_t vi0 = vi & 0xf; @@ -1308,19 +1304,19 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in //printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1); - y[i*QK + l + 0] = v0; - y[i*QK + l + 1] = v1; + y[i*QK4_0 + l + 0] = v0; + y[i*QK4_0 + l + 1] = v1; - assert(!isnan(y[i*QK + l + 0])); - assert(!isnan(y[i*QK + l + 1])); + assert(!isnan(y[i*QK4_0 + l + 0])); + assert(!isnan(y[i*QK4_0 + l + 1])); } } #endif } static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_1 == 0); + const int nb = k / QK4_1; const block_q4_1 * restrict x = vx; @@ -1331,7 +1327,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 32) { + for (int l = 0; l < QK4_1; l += 32) { // Load 32x4-bit integers into 32x8-bit integers __m256i vx8 = bytesFromNibbles(pp+l/2); @@ -1350,7 +1346,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in // Scale, add m and store for (int j = 0; j < 4; j++) 
{ const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m); - _mm256_storeu_ps(y + i * QK + l + j*8, result); + _mm256_storeu_ps(y + i * QK4_1 + l + j*8, result); } } } @@ -1361,7 +1357,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 16) { + for (int l = 0; l < QK4_1; l += 16) { // Load 16x4-bit integers into 8x8-bit integers const uint8x8_t v8 = vld1_u8(pp + l/2); @@ -1392,10 +1388,10 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd); // Store - vst1q_f32(y + i*QK + l + 0, r0); - vst1q_f32(y + i*QK + l + 4, r1); - vst1q_f32(y + i*QK + l + 8, r2); - vst1q_f32(y + i*QK + l + 12, r3); + vst1q_f32(y + i*QK4_1 + l + 0, r0); + vst1q_f32(y + i*QK4_1 + l + 4, r1); + vst1q_f32(y + i*QK4_1 + l + 8, r2); + vst1q_f32(y + i*QK4_1 + l + 12, r3); } } #else @@ -1405,7 +1401,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const uint8_t * restrict pp = x[i].qs; - for (int l = 0; l < QK; l += 2) { + for (int l = 0; l < QK4_1; l += 2) { const uint8_t vi = pp[l/2]; const int8_t vi0 = vi & 0xf; @@ -1414,11 +1410,11 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in const float v0 = vi0*d + m; const float v1 = vi1*d + m; - y[i*QK + l + 0] = v0; - y[i*QK + l + 1] = v1; + y[i*QK4_1 + l + 0] = v0; + y[i*QK4_1 + l + 1] = v1; - assert(!isnan(y[i*QK + l + 0])); - assert(!isnan(y[i*QK + l + 1])); + assert(!isnan(y[i*QK4_1 + l + 0])); + assert(!isnan(y[i*QK4_1 + l + 1])); } } #endif @@ -1980,7 +1976,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } -#if __AVX512F__ && QK == 32 +#if __AVX512F__ && QK4_0 == 32 static inline __m512 dot_q4_0_oneblock_avx512( __m512 acc, const block_q4_0 * restrict x, @@ -2048,9 +2044,9 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t } static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK; + const int nb = n / QK4_0; - assert(n % QK == 0); + assert(n % QK4_0 == 0); assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; @@ -2373,7 +2369,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const uint8_t * restrict p1 = y[i].qs; int sumi = 0; - for (int j = 0; j < QK/2; j++) { + for (int j = 0; j < QK4_0/2; j++) { const uint8_t v0 = p0[j]; const uint8_t v1 = p1[j]; @@ -2393,7 +2389,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest } static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK; + const int nb = n / QK4_1; const block_q4_1 * restrict x = vx; const block_q4_1 * restrict y = vy; @@ -2470,7 +2466,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); - sumf = _mm_cvtss_f32( res ) + acc_offset * QK; + sumf = _mm_cvtss_f32( res ) + acc_offset * QK4_1; #elif defined(__ARM_NEON) float sum00 = 0.0f; float sum01 = 0.0f; @@ -2544,7 +2540,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest #endif } - sumf = QK*sum00 + sum01 + sum10 + sum11; + sumf = QK4_1*sum00 + sum01 + sum10 + sum11; #else // scalar for (int i = 0; i < nb; i++) { @@ -2557,7 
+2553,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest const uint8_t * restrict p0 = x[i].qs; const uint8_t * restrict p1 = y[i].qs; - for (int j = 0; j < QK/2; j++) { + for (int j = 0; j < QK4_1/2; j++) { const uint8_t v0 = p0[j]; const uint8_t v1 = p1[j]; @@ -2576,9 +2572,9 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest } static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK; + const int nb = n / QK8_0; - assert(n % QK == 0); + assert(n % QK8_0 == 0); assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; @@ -2760,7 +2756,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int8_t * restrict p1 = y[i].qs; int sumi = 0; - for (int j = 0; j < QK/2; j++) { + for (int j = 0; j < QK8_0/2; j++) { const uint8_t v0 = p0[j]; const int i0 = (int8_t) (v0 & 0xf) - 8; @@ -3022,9 +3018,9 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = 1, [GGML_TYPE_F16] = 1, - [GGML_TYPE_Q4_0] = QK, - [GGML_TYPE_Q4_1] = QK, - [GGML_TYPE_Q8_0] = QK, + [GGML_TYPE_Q4_0] = QK4_0, + [GGML_TYPE_Q4_1] = QK4_1, + [GGML_TYPE_Q8_0] = QK8_0, [GGML_TYPE_I8] = 1, [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, @@ -3727,18 +3723,6 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { char * const data = tensor->data; switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q8_0: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); @@ -3774,7 +3758,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { ggml_vec_set_f32(nc, (float *)(data + i*n1), value); } } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3791,18 +3775,6 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { char * const data = tensor->data; switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q8_0: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); @@ -3838,7 +3810,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { ggml_vec_set_f32(nc, (float *)(data + i*n1), value); } } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3849,18 +3821,6 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q8_0: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3886,7 +3846,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3897,18 +3857,6 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { switch (tensor->type) { - case 
GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q8_0: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3934,7 +3882,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); ((float *)(tensor->data))[i] = value; } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3943,18 +3891,6 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q8_0: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -3980,7 +3916,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -3991,18 +3927,6 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { switch (tensor->type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(false); - } break; - case GGML_TYPE_Q8_0: - { - GGML_ASSERT(false); - } break; case GGML_TYPE_I8: { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); @@ -4028,7 +3952,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); ((float *)(tensor->data))[i] = value; } break; - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5823,13 +5747,7 @@ static void ggml_compute_forward_dup( { ggml_compute_forward_dup_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5905,14 +5823,7 @@ static void ggml_compute_forward_add( { ggml_compute_forward_add_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -5958,14 +5869,7 @@ static void ggml_compute_forward_sub( { ggml_compute_forward_sub_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6011,14 +5915,7 @@ static void ggml_compute_forward_mul( { ggml_compute_forward_mul_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6064,14 +5961,7 @@ static void ggml_compute_forward_div( { ggml_compute_forward_div_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case 
GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6113,14 +6003,7 @@ static void ggml_compute_forward_sqr( { ggml_compute_forward_sqr_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6162,14 +6045,7 @@ static void ggml_compute_forward_sqrt( { ggml_compute_forward_sqrt_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6221,14 +6097,7 @@ static void ggml_compute_forward_sum( { ggml_compute_forward_sum_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6299,14 +6168,7 @@ static void ggml_compute_forward_mean( { ggml_compute_forward_mean_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6364,14 +6226,7 @@ static void ggml_compute_forward_repeat( { ggml_compute_forward_repeat_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6413,14 +6268,7 @@ static void ggml_compute_forward_abs( { ggml_compute_forward_abs_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6462,14 +6310,7 @@ static void ggml_compute_forward_sgn( { ggml_compute_forward_sgn_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6511,14 +6352,7 @@ static void ggml_compute_forward_neg( { ggml_compute_forward_neg_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6560,14 +6394,7 @@ static void ggml_compute_forward_step( { ggml_compute_forward_step_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6609,14 +6436,7 @@ static void ggml_compute_forward_relu( { ggml_compute_forward_relu_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6675,14 +6495,7 @@ static void 
ggml_compute_forward_gelu( { ggml_compute_forward_gelu_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6743,14 +6556,7 @@ static void ggml_compute_forward_silu( { ggml_compute_forward_silu_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6830,14 +6636,7 @@ static void ggml_compute_forward_norm( { ggml_compute_forward_norm_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -6911,14 +6710,7 @@ static void ggml_compute_forward_rms_norm( { ggml_compute_forward_rms_norm_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -7542,10 +7334,7 @@ static void ggml_compute_forward_mul_mat( { ggml_compute_forward_mul_mat_f32(params, src0, src1, dst); } break; - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -7627,14 +7416,7 @@ static void ggml_compute_forward_scale( { ggml_compute_forward_scale_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -7807,10 +7589,7 @@ static void ggml_compute_forward_get_rows( { ggml_compute_forward_get_rows_f32(params, src0, src1, dst); } break; - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -7883,14 +7662,7 @@ static void ggml_compute_forward_diag_mask_inf( { ggml_compute_forward_diag_mask_inf_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -7978,14 +7750,7 @@ static void ggml_compute_forward_soft_max( { ggml_compute_forward_soft_max_f32(params, src0, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -8162,13 +7927,7 @@ static void ggml_compute_forward_rope( { ggml_compute_forward_rope_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -8431,13 +8190,7 @@ static void ggml_compute_forward_conv_1d_1s( { ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case 
GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -8700,13 +8453,7 @@ static void ggml_compute_forward_conv_1d_2s( { ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -9186,13 +8933,7 @@ static void ggml_compute_forward_flash_attn( { ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -9398,13 +9139,7 @@ static void ggml_compute_forward_flash_ff( { GGML_ASSERT(false); // TODO } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -9448,14 +9183,7 @@ static void ggml_compute_forward_map_unary( { ggml_compute_forward_map_unary_f32(params, src0, dst, fun); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -9504,14 +9232,7 @@ static void ggml_compute_forward_map_binary( { ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_F16: - case GGML_TYPE_COUNT: + default: { GGML_ASSERT(false); } break; @@ -11511,16 +11232,16 @@ enum ggml_opt_result ggml_opt( //////////////////////////////////////////////////////////////////////////////// size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; for (int j = 0; j < n; j += k) { - block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK; + block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK4_0; quantize_row_q4_0_reference(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK; l += 2) { + for (int l = 0; l < QK4_0; l += 2) { const uint8_t vi0 = y[i].qs[l/2] & 0xF; const uint8_t vi1 = y[i].qs[l/2] >> 4; @@ -11530,20 +11251,20 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * } } - return (n/QK*sizeof(block_q4_0)); + return (n/QK4_0*sizeof(block_q4_0)); } size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK == 0); - const int nb = k / QK; + assert(k % QK4_1 == 0); + const int nb = k / QK4_1; for (int j = 0; j < n; j += k) { - block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK; + block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK4_1; quantize_row_q4_1_reference(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK; l += 2) { + for (int l = 0; l < QK4_1; l += 2) { const uint8_t vi0 = y[i].qs[l/2] & 0xF; const uint8_t vi1 = y[i].qs[l/2] >> 4; @@ -11553,7 +11274,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * } } - return (n/QK*sizeof(block_q4_1)); + return (n/QK4_1*sizeof(block_q4_1)); } //////////////////////////////////////////////////////////////////////////////// From 
2f7c8e014e3c0ceaf39688845c2ff6f919fb03b7 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Sat, 15 Apr 2023 18:28:56 +0000 Subject: [PATCH 049/773] Fix potential int8 overflow in non-SIMD vec_dot (#986) --- ggml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index ccad76e8c018e3..69974989c08f82 100644 --- a/ggml.c +++ b/ggml.c @@ -2373,11 +2373,11 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const uint8_t v0 = p0[j]; const uint8_t v1 = p1[j]; - const int8_t i0 = (int8_t) (v0 & 0xf) - 8; - const int8_t i1 = (int8_t) (v0 >> 4) - 8; + const int i0 = (v0 & 0xf) - 8; + const int i1 = (v0 >> 4) - 8; - const int8_t i2 = (int8_t) (v1 & 0xf) - 8; - const int8_t i3 = (int8_t) (v1 >> 4) - 8; + const int i2 = (v1 & 0xf) - 8; + const int i3 = (v1 >> 4) - 8; sumi += i0*i2 + i1*i3; } From 74f5899df4a6083fc467b620baa1cf821e37799d Mon Sep 17 00:00:00 2001 From: comex Date: Sat, 15 Apr 2023 14:53:21 -0700 Subject: [PATCH 050/773] convert.py: Fix loading safetensors and ggml format on Windows (#991) Calling `mmap.mmap` on Windows apparently resets the file offset of the raw file object (and makes the BufferedReader return a *negative* file offset). For safetensors, avoid using the file offset after calling mmap. For GGML format, explicitly save and restore the offset. Fixes #966. --- convert.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 056dc618daa481..4e28a45ebb4e8b 100644 --- a/convert.py +++ b/convert.py @@ -735,7 +735,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size)) # Use mmap for the actual data to avoid race conditions with the file offset. mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) - byte_buf = mapped[fp.tell():] + byte_buf = mapped[8 + header_size:] def convert(info: Dict[str, Any]) -> LazyTensor: data_type = SAFETENSORS_DATA_TYPES[info['dtype']] @@ -761,7 +761,7 @@ def must_read(fp: IO[bytes], length: int) -> bytes: return ret -def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus: +def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus: magic = must_read(fp, 4)[::-1] if magic in (b'ggmf', b'ggjt'): version, = struct.unpack("i", must_read(fp, 4)) @@ -795,7 +795,9 @@ def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus: model: LazyModel = {} # Use mmap for the actual data to avoid race conditions with the file offset. 
+ off = fp.raw.tell() mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) + fp.raw.seek(off) # needed on Windows def read_tensor() -> None: # this is a function so that variables captured in `load` don't change shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12)) From 2d3481c72125cd388258864c7ad8d7d36777bad7 Mon Sep 17 00:00:00 2001 From: nanahi <130121847+na-na-hi@users.noreply.github.com> Date: Sun, 16 Apr 2023 17:13:42 +0800 Subject: [PATCH 051/773] Fix msys2 build error and warnings (#1009) --- llama.cpp | 1 + llama_util.h | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index be8c4cdc124569..a0d7e5137c8689 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9,6 +9,7 @@ #include "ggml.h" #include +#include #include #include #include diff --git a/llama_util.h b/llama_util.h index 653bf71388bff2..d2110ebb4f6422 100755 --- a/llama_util.h +++ b/llama_util.h @@ -43,8 +43,12 @@ } while (0) #ifdef __GNUC__ +#ifdef __MINGW32__ +__attribute__((format(gnu_printf, 1, 2))) +#else __attribute__((format(printf, 1, 2))) #endif +#endif static std::string format(const char * fmt, ...) { va_list ap, ap2; va_start(ap, fmt); @@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) { va_end(ap2); va_end(ap); return std::string(buf.data(), size); -}; +} struct llama_file { // use FILE * so we don't have to re-open the file to mmap From 489537e6cf6c93b74a029a11533dbcaa89791dcc Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Sun, 16 Apr 2023 12:13:00 +0200 Subject: [PATCH 052/773] examples: add missing include for time() (#1011) --- examples/embedding/embedding.cpp | 2 ++ examples/main/main.cpp | 1 + examples/perplexity/perplexity.cpp | 1 + 3 files changed, 4 insertions(+) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 2eda3ac01b9c6e..e10de619c9d5fa 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,6 +1,8 @@ #include "common.h" #include "llama.h" +#include + int main(int argc, char ** argv) { gpt_params params; params.model = "models/llama-7B/ggml-model.bin"; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ba153cb82dcf67..3e4b0034ee977b 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 38e3643b1ca5eb..19449e16e4d547 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2,6 +2,7 @@ #include "llama.h" #include +#include std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); From 3173a62eb9f90b94fb3184131032c1c8b7aa8d86 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 16 Apr 2023 13:58:48 +0300 Subject: [PATCH 053/773] stdout : vertical align outputs for better readibility --- convert.py | 5 +++-- llama.cpp | 14 +++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/convert.py b/convert.py index 4e28a45ebb4e8b..7b9f043b2c124b 100644 --- a/convert.py +++ b/convert.py @@ -951,8 +951,9 @@ def do_item(item: Tuple[str, LazyTensor]) -> NDArray: ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): - size = ' x '.join(map(str, lazy_tensor.shape)) - print(f"[{i+1}/{len(model)}] Writing tensor {name}, size {size}...") + size = ' x '.join(f"{dim:6d}" for dim in 
lazy_tensor.shape) + padi = len(str(len(model))) + print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}") of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type) ndarray.tofile(of.fout) of.fout.close() diff --git a/llama.cpp b/llama.cpp index a0d7e5137c8689..a6429a4e79203e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -262,12 +262,12 @@ static size_t checked_div(size_t a, size_t b) { } static std::string llama_format_tensor_shape(const std::vector & ne) { - std::string ret = "[" + std::to_string(ne.at(0)); + char buf[256]; + snprintf(buf, sizeof(buf), "%5u", ne.at(0)); for (size_t i = 1; i < ne.size(); i++) { - ret += " x " + std::to_string(ne.at(i)); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i)); } - ret += "]"; - return ret; + return buf; } static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml_type type) { @@ -942,8 +942,8 @@ static void llama_model_load_internal( ml->ggml_ctx = ctx; model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); - model.norm = ml->get_tensor("norm.weight", {n_embd}); - model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); + model.norm = ml->get_tensor("norm.weight", {n_embd}); + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { @@ -1570,7 +1570,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s tensor.data = read_data.addr; model_loader->load_data_for(tensor); - printf("[%zu/%zu] %36s - %s, type = %6s, ", + printf("[%4zu/%4zu] %36s - %16s, type = %6s, ", ++idx, model_loader->tensors_map.tensors.size(), tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), ggml_type_name(tensor.type)); From 47f61aaa5f76d04286792e2fbd0c95b659ab2af0 Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Sun, 16 Apr 2023 21:27:38 +0200 Subject: [PATCH 054/773] Fix: do not close file on mmap (#1017) --- llama_util.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_util.h b/llama_util.h index d2110ebb4f6422..c92c0cc7148d86 100755 --- a/llama_util.h +++ b/llama_util.h @@ -176,7 +176,6 @@ struct llama_mmap { flags |= MAP_POPULATE; #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - close(fd); if (addr == MAP_FAILED) { throw format("mmap failed: %s", strerror(errno)); } From f266259ad9a2bce5a34d919592310147af23f3dc Mon Sep 17 00:00:00 2001 From: Ivan Komarov Date: Mon, 17 Apr 2023 15:10:57 +0200 Subject: [PATCH 055/773] Speedup the AVX-512 implementation of ggml_vec_dot_q4_0() (#933) --- CMakeLists.txt | 22 ++++- ggml.c | 235 ++++++++++++++++++++++++++++++++++++++++++------- ggml.h | 2 + llama.cpp | 26 +++--- 4 files changed, 238 insertions(+), 47 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a20de3a216957..9189b6fc20bad5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" option(LLAMA_AVX "llama: enable AVX" ON) option(LLAMA_AVX2 "llama: enable AVX2" ON) option(LLAMA_AVX512 "llama: enable AVX512" OFF) +option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) +option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) option(LLAMA_FMA "llama: enable FMA" ON) # in MSVC F16C is implied with AVX2/AVX512 if (NOT MSVC) @@ -220,6 +222,16 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") if (MSVC) if (LLAMA_AVX512) 
add_compile_options(/arch:AVX512) + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + if (LLAMA_AVX512_VBMI) + add_compile_definitions(__AVX512VBMI__) + endif() + if (LLAMA_AVX512_VNNI) + add_compile_definitions(__AVX512VNNI__) + endif() elseif (LLAMA_AVX2) add_compile_options(/arch:AVX2) elseif (LLAMA_AVX) @@ -240,9 +252,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") endif() if (LLAMA_AVX512) add_compile_options(-mavx512f) - # add_compile_options(-mavx512cd) - # add_compile_options(-mavx512dq) - # add_compile_options(-mavx512bw) + add_compile_options(-mavx512bw) + endif() + if (LLAMA_AVX512_VBMI) + add_compile_options(-mavx512vbmi) + endif() + if (LLAMA_AVX512_VNNI) + add_compile_options(-mavx512vnni) endif() endif() else() diff --git a/ggml.c b/ggml.c index 69974989c08f82..2be2ce3e9b84ec 100644 --- a/ggml.c +++ b/ggml.c @@ -1977,33 +1977,187 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float } #if __AVX512F__ && QK4_0 == 32 -static inline __m512 dot_q4_0_oneblock_avx512( +static inline __m512i bytes_from_q4_0_twoblocks_avx512( const __m512i blocks ) { + // The 64 bytes of `blocks` contain two consecutive Q4_0 blocks loaded from memory: + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32| + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // | :. =_ () [] <> () Zz Yy| + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // |Xx Ww Vv Uu Tt Ss Rr Qq Pp Oo Nn Mm Ll Kk Jj Ii Hh Gg Ff Ee Dd Cc Bb Aa | + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // + // Bytes 04..19 (block #0) and 24..39 (block #1) both contain 32 nibbles (4-bit unsigned integers). + // We have exactly 64 nibbles, so we want to place each nibble into a separate byte. + // Bytes 00..03 and 20..23 contain scales, which are irrelevant to this function. + // Bytes 40..63 are masked when loading the data, so they are zeroed out. +#ifdef __AVX512VBMI__ + const __m512i byte_perm = _mm512_set_epi8( + 39, 38, 39, 38, 37, 36, 37, 36, 35, 34, 35, 34, 33, 32, 33, 32, + 31, 30, 31, 30, 29, 28, 29, 28, 27, 26, 27, 26, 25, 24, 25, 24, + 19, 18, 19, 18, 17, 16, 17, 16, 15, 14, 15, 14, 13, 12, 13, 12, + 11, 10, 11, 10, 9, 8, 9, 8, 7, 6, 7, 6, 5, 4, 5, 4 + ); + const __m512i permuted = _mm512_permutexvar_epi8( byte_perm, blocks ); + // After applying VPERMB, `permuted` looks like this: + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |:. =_ :. 
=_ () [] () [] <> () <> () Zz Yy Zz Yy Xx Ww Xx Ww Vv Uu Vv Uu Tt Ss Tt Ss Rr Qq Rr Qq| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |Pp Oo Pp Oo Nn Mm Nn Mm Ll Kk Ll Kk Jj Ii Jj Ii Hh Gg Hh Gg Ff Ee Ff Ee Dd Cc Dd Cc Bb Aa Bb Aa| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ +#else + const __m512i word_perm = _mm512_set_epi16( + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2 + ); + const __m512i permuted = _mm512_permutexvar_epi16( word_perm, blocks ); + // This is the fallback path for CPUs that don't support VPERMB. Since we permute 16-bit groups only, + // VPERMB can be replaced with VPERMW. We could always use VPERMW, but at least on Tiger Lake and + // Ice Lake VPERMW followed by a right shift is quite noticeably slower than VPERMB. +#endif + + // Shift every odd-numbered 16-bit group to the right by 4 bits. + const __mmask32 shift_mask = 0xaaaaaaaa; + const __m512i shifted = _mm512_mask_srai_epi16( permuted, shift_mask, permuted, 4 ); + // After applying VPSRAW, `shifted` looks like this (the "empty" nibbles are filled with zeroes): + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // | : .= :. =_ ( )[ () [] < >( <> () Z zY Zz Yy X xW Xx Ww V vU Vv Uu T tS Tt Ss R rQ Rr Qq + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // | P pO Pp Oo N nM Nn Mm L lK Ll Kk J jI Jj Ii H hG Hh Gg F fE Ff Ee D dC Dd Cc B bA Bb Aa| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + + // Now we just need to zero out the higher nibble in each byte, and we're done. + const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf ); + return _mm512_and_si512( low_nibble_mask, shifted ); + // The final result looks like this: + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // | : = . 
_ ( [ ) ] < ( > ) Z Y z y X W x w V U v u T S t s R Q r q| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ + // | P O p o N M n m L K l k J I j i H G h g F E f e D C d c B A b a| + // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ +} + +static inline __m512 dot_q4_0_twoblocks_avx512( __m512 acc, const block_q4_0 * restrict x, const block_q4_0 * restrict y, int i ) { - // Compute combined scale for the block - __m512 d = _mm512_set1_ps( x[i].d * y[i].d ); - - __m256i bx = bytesFromNibbles( x[i].qs ); - __m256i by = bytesFromNibbles( y[i].qs ); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - by = _mm256_sub_epi8( by, off ); - - // Sign-extend 16 signed bytes into int16_t - __m512i x32 = _mm512_cvtepi8_epi16( bx ); - __m512i y32 = _mm512_cvtepi8_epi16( by ); - // Compute products of int16_t integers, add pairwise - __m512i i64 = _mm512_madd_epi16( x32, y32 ); + // A pair of Q4_0 blocks spans 40 bytes, while an AVX-512 register has 64. The remaining 24 bytes + // can potentially be unaddressable, so we make sure to mask them out before the load, even though + // we don't use them at all. This might hurt the performance slightly, since the compiler is forced + // to use e.g. `VMOVDQU64 REG, MASK, [ADDR] + VPERMB ..., REG` instead of just `VPERMB ..., [ADDR]`. + const __mmask8 load_mask = 0x1f; + const __m512i blocks_0 = _mm512_maskz_loadu_epi64( load_mask, &x[i] ); + const __m512i blocks_1 = _mm512_maskz_loadu_epi64( load_mask, &y[i] ); + + // We want to multiply the scales, so we interpret both registers as 16 32-bit floats: + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 | + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // blocks_0_float + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | | | | | | | xx | xx | xx | xx | B | xx | xx | xx | xx | A | + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // blocks_1_float + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | | | | | | | xx | xx | xx | xx | D | xx | xx | xx | xx | C | + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + const __m512 blocks_0_float = _mm512_castsi512_ps( blocks_0 ); + const __m512 blocks_1_float = _mm512_castsi512_ps( blocks_1 ); + // We absolutely shouldn't touch the floats marked with `xx`: they contain some + // random data, which might very well underflow. At least on Intel, this leads + // to a huge penalty that can't be ignored (easily 100x or more) unless you + // compile your code with something like `-ffast-math` to enable FTZ/DAZ flags. + // (and ggml can't assume that you do)... + const __mmask16 scale_mul_mask = 0x21; +#ifdef __clang__ + // ...however, clang decides to optimize the multiplication mask away: + // https://godbolt.org/z/P8PqdsfvW + // gcc and MSVC do the sane thing. This horrible workaround forces clang to emit the mask. 
+ __m512i scales; + __asm__( + "vmulps %1, %2, %0%{%3%}" + : "=v" ( scales ) + : "vm" ( blocks_0_float ), "v" ( blocks_1_float ), "Yk" ( scale_mul_mask ) + ); +#else + const __m512 scales = _mm512_maskz_mul_ps( scale_mul_mask, blocks_0_float, blocks_1_float ); +#endif + const __m512i scale_perm = _mm512_set_epi32( + 5, 5, 5, 5, 5, 5, 5, 5, + 0, 0, 0, 0, 0, 0, 0, 0 + ); + const __m512 permuted_scales = _mm512_permutexvar_ps( scale_perm, scales ); + // After VMULPS and VPERMPS, `permuted_scales` looks like this: + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 | + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + // | B*D| B*D| B*D| B*D| B*D| B*D| B*D| B*D| A*C| A*C| A*C| A*C| A*C| A*C| A*C| A*C| + // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ + + const __m512i bytes_0 = bytes_from_q4_0_twoblocks_avx512( blocks_0 ); + const __m512i bytes_1 = bytes_from_q4_0_twoblocks_avx512( blocks_1 ); + + // Now we want to compute dot products of 4-element byte vectors and store them in + // 32-bit integers. That is (only one 4-element vector is shown for clarity): + // +----+----+----+----+ + // ... | 03 | 02 | 01 | 00 | + // +----+----+----+----+ + // bytes_0 + // +----+----+----+----+ + // ... | D | C | B | A | + // +----+----+----+----+ + // bytes_1 + // +----+----+----+----+ + // ... | H | G | F | E | + // +----+----+----+----+ + // final_res_int + // +----+----+----+----+ + // ... | A*E+B*F+C*G+D*H | + // +----+----+----+----+ + const __m512i plus_8 = _mm512_set1_epi8( 8 ); + const __m512i bytes_1_minus_8 = _mm512_sub_epi8( bytes_1, plus_8 ); + +#ifdef __AVX512VNNI__ + // We have VPDPBUSDS in AVX512-VNNI, which does exactly what we want, but with a catch: + // the *left* operand is supposed to be unsigned, while Q4_0 quantization subtracts 8 + // from each nibble, so they can be negative. So, instead of `(bytes_0 - 8) * (bytes_1 - 8)`, + // we compute `bytes_0 * (bytes_1 - 8) + bytes_1 * (-8) + 64`. VPDPBUSDS uses an accumulator, + // which means we only need 2 instructions. + const __m512i dot_init = _mm512_set1_epi32( 4 * 64 ); + const __m512i minus_8 = _mm512_set1_epi8( -8 ); + const __m512i prod_0 = _mm512_dpbusds_epi32( dot_init, bytes_1, minus_8 ); + const __m512i final_res_int = _mm512_dpbusds_epi32( prod_0, bytes_0, bytes_1_minus_8 ); +#else + // As a fallback, we have VPMADDUBSW in AVX512-BW, which uses 16-bit products instead of 32-bit ones. + // It has the same catch as VPDPBUSDS: the left operand should be unsigned. + // This is essentially the AVX-512 version of the AVX-2 trick used by GH user Const-me + // ref: https://gist.github.com/Const-me/4d30e1fc767ab314596e16e90f53b6f4#file-matmultest-cpp-L119 + const __m512i one = _mm512_set1_epi16( 1 ); + const __m512i prod_0 = _mm512_maddubs_epi16( bytes_0, bytes_1_minus_8 ); + const __m512i prod_1 = _mm512_maddubs_epi16( plus_8, bytes_1_minus_8 ); + const __m512i diff = _mm512_sub_epi16( prod_0, prod_1 ); + const __m512i final_res_int = _mm512_madd_epi16( diff, one ); +#endif - // Convert int32_t to float - __m512 p = _mm512_cvtepi32_ps( i64 ); - // Apply the scale, and accumulate - return _mm512_fmadd_ps( d, p, acc ); + // Finally, we multiply the permuted scales and the 32-bit dot products, then accumulate. 
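+    // For reference, both paths above rely on the same rearrangement: with the stored unsigned
+    // nibbles x, y in [0, 15] and the actual quant values (x - 8) and (y - 8),
+    //     (x - 8) * (y - 8) = x * (y - 8) - 8 * (y - 8) = x * (y - 8) - 8 * y + 64.
+    // The VNNI path folds the `- 8 * y` term into the first VPDPBUSDS call and the `+ 64` term
+    // (times the 4 byte pairs per 32-bit lane) into `dot_init`, while the VPMADDUBSW path
+    // computes x * (y - 8) and 8 * (y - 8) separately and subtracts them.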
+ const __m512 final_res_float = _mm512_cvtepi32_ps( final_res_int ); + return _mm512_fmadd_ps( permuted_scales, final_res_float, acc ); } #endif @@ -2135,25 +2289,26 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest __m512 acc0 = _mm512_setzero_ps(); __m512 acc1 = _mm512_setzero_ps(); - const int superblock_size = 8; + const int superblock_size = 16; + const int superblock_count = nb / superblock_size; for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) { int i = superblock_ix * superblock_size; - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+0 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+1 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+2 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+3 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+4 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+5 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+6 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+7 ); + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+0 ); + acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+2 ); + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+4 ); + acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+6 ); + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+8 ); + acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+10 ); + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+12 ); + acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+14 ); } // Remainders - for (int i = superblock_count * superblock_size; i < nb; ++i) { - acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i ); + for (int i = superblock_count * superblock_size; i < nb; i += 2) { + acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i ); } // Horizontal sum of all lanes of the accumulator @@ -11303,6 +11458,22 @@ int ggml_cpu_has_avx512(void) { #endif } +int ggml_cpu_has_avx512_vbmi(void) { +#if defined(__AVX512VBMI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vnni(void) { +#if defined(__AVX512VNNI__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_fma(void) { #if defined(__FMA__) return 1; diff --git a/ggml.h b/ggml.h index 241e96a1975b1c..e693754c917fc3 100644 --- a/ggml.h +++ b/ggml.h @@ -808,6 +808,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * int ggml_cpu_has_avx(void); int ggml_cpu_has_avx2(void); int ggml_cpu_has_avx512(void); +int ggml_cpu_has_avx512_vbmi(void); +int ggml_cpu_has_avx512_vnni(void); int ggml_cpu_has_fma(void); int ggml_cpu_has_neon(void); int ggml_cpu_has_arm_fma(void); diff --git a/llama.cpp b/llama.cpp index a6429a4e79203e..3b916e5ade67d3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1915,18 +1915,20 @@ const char * llama_print_system_info(void) { static std::string s; s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + s += 
"AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; + s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; + s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; return s.c_str(); } From 69b740289f9b3756ea9dd2a23f241c6f688d88b9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 17 Apr 2023 16:16:23 +0300 Subject: [PATCH 056/773] ggml : avoid using ggml_fp16_to_fp32() and ggml_fp32_to_fp16() in ggml.c --- ggml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index 2be2ce3e9b84ec..995a2faabac848 100644 --- a/ggml.c +++ b/ggml.c @@ -8057,11 +8057,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = ggml_fp16_to_fp32(src[0]); - const float x1 = ggml_fp16_to_fp32(src[1]); + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[1]); - dst_data[0] = ggml_fp32_to_fp16(x0*cos_theta - x1*sin_theta); - dst_data[1] = ggml_fp32_to_fp16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } } From eb17a026fd23d1c1b612fa4600f7f5c58e501a28 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 17 Apr 2023 17:31:06 +0300 Subject: [PATCH 057/773] quantize-stats : fix bug in --type argument --- examples/quantize-stats/quantize-stats.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 0503009319e4e4..cd973e8aca6a51 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -221,7 +221,7 @@ int main(int argc, char ** argv) { break; } int j; - for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) i)) != 0; j++) { + for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) { // find match } if (j < GGML_TYPE_COUNT) { From efd05648c88a0923a55f56e7ce1b0f9c33410afb Mon Sep 17 00:00:00 2001 From: Arik Poznanski Date: Mon, 17 Apr 2023 17:41:53 +0300 Subject: [PATCH 058/773] llama : well-defined static initialization of complex objects (#927) * Replaced static initialization of complex objects with a initialization on first use. 
This prevents an undefined behavior on program run, for example, crash in Release build, works in Debug build * replaced use of auto with exact type to avoid using -std=c++14 * Made the assessors functions for static maps be static const --- llama.cpp | 72 +++++++++++++++++++++++--------------- tests/test-tokenizer-0.cpp | 20 ++++++----- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/llama.cpp b/llama.cpp index 3b916e5ade67d3..cdd8bd16bc0399 100644 --- a/llama.cpp +++ b/llama.cpp @@ -42,35 +42,51 @@ static const size_t MB = 1024*1024; // TODO: dynamically determine these sizes // needs modifications in ggml -static const std::map MEM_REQ_SCRATCH0 = { - { MODEL_7B, 512ull*MB }, - { MODEL_13B, 512ull*MB }, - { MODEL_30B, 512ull*MB }, - { MODEL_65B, 512ull*MB }, -}; +static const std::map & MEM_REQ_SCRATCH0() +{ + static std::map _MEM_REQ_SCRATCH0 = { + { MODEL_7B, 512ull * MB }, + { MODEL_13B, 512ull * MB }, + { MODEL_30B, 512ull * MB }, + { MODEL_65B, 512ull * MB }, + }; + return _MEM_REQ_SCRATCH0; +} -static const std::map MEM_REQ_SCRATCH1 = { - { MODEL_7B, 512ull*MB }, - { MODEL_13B, 512ull*MB }, - { MODEL_30B, 512ull*MB }, - { MODEL_65B, 512ull*MB }, +static const std::map & MEM_REQ_SCRATCH1() +{ + static std::map _MEM_REQ_SCRATCH1 = { + { MODEL_7B, 512ull * MB }, + { MODEL_13B, 512ull * MB }, + { MODEL_30B, 512ull * MB }, + { MODEL_65B, 512ull * MB }, + }; + return _MEM_REQ_SCRATCH1; }; // 2*n_embd*n_ctx*n_layer*sizeof(float16) -static const std::map MEM_REQ_KV_SELF = { - { MODEL_7B, 1026ull*MB }, - { MODEL_13B, 1608ull*MB }, - { MODEL_30B, 3124ull*MB }, - { MODEL_65B, 5120ull*MB }, +static const std::map & MEM_REQ_KV_SELF() +{ + static std::map _MEM_REQ_KV_SELF = { + { MODEL_7B, 1026ull * MB }, + { MODEL_13B, 1608ull * MB }, + { MODEL_30B, 3124ull * MB }, + { MODEL_65B, 5120ull * MB }, + }; + return _MEM_REQ_KV_SELF; }; // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled -static const std::map MEM_REQ_EVAL = { - { MODEL_7B, 768ull*MB }, - { MODEL_13B, 1024ull*MB }, - { MODEL_30B, 1280ull*MB }, - { MODEL_65B, 1536ull*MB }, +static const std::map & MEM_REQ_EVAL() +{ + static std::map _MEM_REQ_EVAL = { + { MODEL_7B, 768ull * MB }, + { MODEL_13B, 1024ull * MB }, + { MODEL_30B, 1280ull * MB }, + { MODEL_65B, 1536ull * MB }, + }; + return _MEM_REQ_EVAL; }; // default hparams (LLaMA 7B) @@ -899,13 +915,13 @@ static void llama_model_load_internal( const size_t mem_required = ctx_size + mmapped_size + - MEM_REQ_SCRATCH0.at(model.type) + - MEM_REQ_SCRATCH1.at(model.type) + - MEM_REQ_EVAL.at (model.type); + MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH1().at(model.type) + + MEM_REQ_EVAL().at(model.type); // this is the memory required by one llama_state const size_t mem_required_state = - scale*MEM_REQ_KV_SELF.at(model.type); + scale*MEM_REQ_KV_SELF().at(model.type); fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); @@ -1732,10 +1748,10 @@ struct llama_context * llama_init_from_file( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type)); + ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type)); - ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type)); + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); + 
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); } return ctx; diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 55b086daea59c0..b089845714bab4 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -5,13 +5,17 @@ #include #include -static const std::map> k_tests = { - { "Hello World", { 1, 10994, 2787, }, }, - { " Hello World", { 1, 15043, 2787, }, }, - { " Hello World!", { 1, 15043, 2787, 29991, }, }, - { " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - { "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - { "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, }, +static const std::map> & k_tests() +{ + static std::map> _k_tests = { + { "Hello World", { 1, 10994, 2787, }, }, + { " Hello World", { 1, 15043, 2787, }, }, + { " Hello World!", { 1, 15043, 2787, 29991, }, }, + { " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, + { "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, + { "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, }, + }; + return _k_tests; }; int main(int argc, char **argv) { @@ -47,7 +51,7 @@ int main(int argc, char **argv) { return 2; } - for (const auto & test_kv : k_tests) { + for (const auto & test_kv : k_tests()) { std::vector res(test_kv.first.size()); const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true); res.resize(n); From 315a95a4d30db726fb7d244dd3b9e90a83fb1616 Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Mon, 17 Apr 2023 17:28:55 +0200 Subject: [PATCH 059/773] Add LoRA support (#820) --- convert-lora-to-ggml.py | 124 +++++++++++ examples/common.cpp | 15 ++ examples/common.h | 7 +- examples/main/main.cpp | 11 + examples/perplexity/perplexity.cpp | 11 + ggml.c | 327 ++++++++++++++++++++++++++--- ggml.h | 6 + llama.cpp | 251 ++++++++++++++++++++++ llama.h | 12 ++ llama_util.h | 30 +-- 10 files changed, 753 insertions(+), 41 deletions(-) create mode 100644 convert-lora-to-ggml.py diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py new file mode 100644 index 00000000000000..8a2085c2511a1e --- /dev/null +++ b/convert-lora-to-ggml.py @@ -0,0 +1,124 @@ +import json +import os +import re +import struct +import sys +from typing import Any, Dict, Sequence, TextIO + +import torch + +from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType + +HF_SUBLAYER_TO_GGML = { + "self_attn.q_proj": "attention.wq", + "self_attn.k_proj": "attention.wk", + "self_attn.v_proj": "attention.wv", + "self_attn.o_proj": "attention.wo", + "mlp.gate_proj": "feed_forward.w1", + "mlp.down_proj": "feed_forward.w2", + "mlp.up_proj": "feed_forward.w3", + "input_layernorm": "attention_norm", + "post_attention_layernorm": "ffn_norm", + # "norm": "norm", + # "embed_tokens": "tok_embeddings", + # "lm_head": "output", +} + + +def translate_tensor_name(t: str) -> str: + match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t) + if match: + nn = match.group(1) + sub_layer = match.group(2) + lora_type = match.group(3) + + sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer) + if sub_layer_renamed is None: + print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}") + sys.exit(1) + + output_string = ( + f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" + ) + return output_string + else: + print(f"Error: 
unrecognized tensor {t}") + sys.exit(1) + + +def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None: + fout.write(b"ggla"[::-1]) # magic (ggml lora) + fout.write(struct.pack("i", 1)) # file version + fout.write(struct.pack("ii", params["r"], params["lora_alpha"])) + + +def write_tensor_header( + self, name: str, shape: Sequence[int], data_type: DataType +) -> None: + sname = name.encode("utf-8") + fout.write( + struct.pack( + "iii", + len(shape), + len(sname), + DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]], + ) + ) + fout.write(struct.pack("i" * len(shape), *shape[::-1])) + fout.write(sname) + fout.seek((fout.tell() + 31) & -32) + + +if len(sys.argv) != 2: + print(f"Usage: python {sys.argv[0]} ") + print( + "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" + ) + sys.exit(1) + +input_json = os.path.join(sys.argv[1], "adapter_config.json") +input_model = os.path.join(sys.argv[1], "adapter_model.bin") +output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") + +model = torch.load(input_model, map_location="cpu") + +with open(input_json, "r") as f: + params = json.load(f) + +if params["peft_type"] != "LORA": + print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") + sys.exit(1) + +if params["fan_in_fan_out"] == True: + print("Error: param fan_in_fan_out is not supported") + sys.exit(1) + +if params["bias"] is not None and params["bias"] != "none": + print("Error: param bias is not supported") + sys.exit(1) + +# TODO: these seem to be layers that have been trained but without lora. +# doesn't seem widely used but eventually should be supported +if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: + print("Error: param modules_to_save is not supported") + sys.exit(1) + +with open(output_path, "wb") as fout: + fout.truncate() + + write_file_header(fout, params) + for k, v in model.items(): + if k.endswith("lora_A.weight"): + if v.dtype != torch.float16 and v.dtype != torch.float32: + v = v.float() + v = v.T + else: + v = v.float() + + t = v.numpy() + tname = translate_tensor_name(k) + print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") + write_tensor_header(fout, tname, t.shape, t.dtype) + t.tofile(fout) + +print(f"Converted {input_json} and {input_model} to {output_path}") diff --git a/examples/common.cpp b/examples/common.cpp index 0772dbfe142ffe..a0b6f10ad8c8bf 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.model = argv[i]; + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--embedding") { @@ -242,6 +255,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { } fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", 
params.model.c_str()); fprintf(stderr, "\n"); diff --git a/examples/common.h b/examples/common.h index 1ea6f744518111..cbbc2dfab16de1 100644 --- a/examples/common.h +++ b/examples/common.h @@ -31,11 +31,12 @@ struct gpt_params { std::string model = "models/lamma-7B/ggml-model.bin"; // model path std::string prompt = ""; - std::string input_prefix = ""; // string to prefix user inputs with - - + std::string input_prefix = ""; // string to prefix user inputs with std::vector antiprompt; // string upon seeing which more user input is prompted + std::string lora_adapter = ""; // lora adapter path + std::string lora_base = ""; // base model path for the lora adapter + bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 3e4b0034ee977b..b7b3c419655f66 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -114,6 +114,17 @@ int main(int argc, char ** argv) { } } + if (!params.lora_adapter.empty()) { + int err = llama_apply_lora_from_file(ctx, + params.lora_adapter.c_str(), + params.lora_base.empty() ? NULL : params.lora_base.c_str(), + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + return 1; + } + } + // print system information { fprintf(stderr, "\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 19449e16e4d547..80792ea0d95d0a 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -134,6 +134,17 @@ int main(int argc, char ** argv) { } } + if (!params.lora_adapter.empty()) { + int err = llama_apply_lora_from_file(ctx, + params.lora_adapter.c_str(), + params.lora_base.empty() ? 
NULL : params.lora_base.c_str(), + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + return 1; + } + } + // print system information { fprintf(stderr, "\n"); diff --git a/ggml.c b/ggml.c index 995a2faabac848..acdba0333c8ea5 100644 --- a/ggml.c +++ b/ggml.c @@ -1420,6 +1420,34 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in #endif } +static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); + +static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { + [GGML_TYPE_Q4_0] = { + .dequantize_row_q = dequantize_row_q4_0, + .quantize_row_q = quantize_row_q4_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q4_0_q8_0, + }, + [GGML_TYPE_Q4_1] = { + .dequantize_row_q = dequantize_row_q4_1, + .quantize_row_q = quantize_row_q4_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, + .quantize_row_q_dot = quantize_row_q4_1, + .vec_dot_q = ggml_vec_dot_q4_1, + }, + // TODO: GGML_TYPE_Q8_0 +}; + +// For internal test use +quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { + GGML_ASSERT(i < GGML_TYPE_COUNT); + return quantize_fns[i]; +} + + // // simd mappings // @@ -5588,6 +5616,26 @@ static void ggml_compute_forward_dup_f16( } } } + } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) { + quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + size_t id = 0; + uint8_t * dst_ptr = (uint8_t *) dst->data; + size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + float * src0_f32 = (float *) params->wdata; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + // convert to f32 and quantize + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); + } + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += dst_row_size; + } + } + } } else { GGML_ASSERT(false); // TODO: implement } @@ -5780,6 +5828,21 @@ static void ggml_compute_forward_dup_f32( } } } + } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) { + quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + size_t id = 0; + uint8_t * dst_ptr = (uint8_t *) dst->data; + size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + quantize_row_q(src0_ptr, dst_ptr + id, ne00); + id += dst_row_size; + } + } + } } else { GGML_ASSERT(false); // TODO: implement } @@ -5968,6 +6031,212 @@ static void ggml_compute_forward_add_f32( } } +static void ggml_compute_forward_add_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + 
const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + if (nb10 == sizeof(float)) { + for (int j = ith; j < n; j += nth) { + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); + for (int i = 0; i < nc; i++) { + float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + if (nb10 == sizeof(ggml_fp16_t)) { + for (int j = ith; j < n; j += nth) { + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); + for (int i = 0; i < nc; i++) { + ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr)); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + //const int64_t ne10 = src1->ne[0]; + //const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + //const int64_t ne0 = dst->ne[0]; + //const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = 
dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + const enum ggml_type type = src0->type; + dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float*) params->wdata + ne00 * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + // src1 and dst are same shape as src0 => same indices + const int i13 = i03; + const int i12 = i02; + const int i11 = i01; + + const int i3 = i03; + const int i2 = i02; + const int i1 = i01; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); + void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0)); + + assert(ne00 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne00); + // add src1 + ggml_vec_acc_f32(ne00, wdata, src1_row); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne00); + } +} + static void ggml_compute_forward_add( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -5978,6 +6247,23 @@ static void ggml_compute_forward_add( { ggml_compute_forward_add_f32(params, src0, src1, dst); } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + { + ggml_compute_forward_add_q_f32(params, src0, src1, dst); + } break; default: { GGML_ASSERT(false); @@ -7257,30 +7543,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } -static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { - [GGML_TYPE_Q4_0] = { - .dequantize_row_q = dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_0_q8_0, - }, - [GGML_TYPE_Q4_1] = { - .dequantize_row_q = dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, - .quantize_row_q_dot = quantize_row_q4_1, - .vec_dot_q = ggml_vec_dot_q4_1, - }, - // TODO: GGML_TYPE_Q8_0 -}; - -// For internal test use -quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { - GGML_ASSERT(i 
< GGML_TYPE_COUNT); - return quantize_fns[i]; -} - static void ggml_compute_forward_mul_mat_q_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10137,13 +10399,29 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) struct ggml_tensor * node = cgraph->nodes[i]; switch (node->op) { + case GGML_OP_CPY: case GGML_OP_DUP: { node->n_tasks = 1; + + size_t cur = 0; + if (node->type == GGML_TYPE_Q4_0 || node->type == GGML_TYPE_Q4_1) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0]; + } + + work_size = MAX(work_size, cur); } break; case GGML_OP_ADD: { node->n_tasks = n_threads; + + size_t cur = 0; + + if (node->src0->type == GGML_TYPE_Q4_0 || node->src0->type == GGML_TYPE_Q4_1) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; + } + + work_size = MAX(work_size, cur); } break; case GGML_OP_SUB: case GGML_OP_MUL: @@ -10224,7 +10502,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = n_threads; } break; - case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: diff --git a/ggml.h b/ggml.h index e693754c917fc3..59de0cb12572ab 100644 --- a/ggml.h +++ b/ggml.h @@ -430,6 +430,12 @@ struct ggml_tensor * ggml_add( struct ggml_tensor * a, struct ggml_tensor * b); + +struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + struct ggml_tensor * ggml_sub( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/llama.cpp b/llama.cpp index cdd8bd16bc0399..db71c03bdd4a1f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,6 +1,8 @@ // Defines fileno on msys: #ifndef _GNU_SOURCE #define _GNU_SOURCE +#include +#include #endif #include "llama_util.h" @@ -633,6 +635,7 @@ struct llama_model_loader { throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); } + return get_tensor_for(lt); } @@ -1774,6 +1777,254 @@ int llama_model_quantize( } } +int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + + auto & model = ctx->model; + + const int64_t t_start_lora_us = ggml_time_us(); + + auto fin = std::ifstream(path_lora, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora); + return 1; + } + + // verify magic and version + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != 'ggla') { + fprintf(stderr, "%s: bad file magic\n", __func__); + return 1; + } + uint32_t format_version; + fin.read((char *) &format_version, sizeof(format_version)); + + if (format_version != 1) { + fprintf(stderr, "%s: unsupported file version\n", __func__ ); + return 1; + } + } + + int32_t lora_r; + int32_t lora_alpha; + fin.read((char *) &lora_r, sizeof(lora_r)); + fin.read((char *) &lora_alpha, sizeof(lora_alpha)); + float scaling = (float)lora_alpha / (float)lora_r; + + fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); + + + // create a temporary ggml context to store the lora tensors + // todo: calculate size from biggest possible tensor + std::vector lora_buf(1024ull * 1024ull * 1024ull); + struct ggml_init_params params; + params.mem_size = lora_buf.size(); + params.mem_buffer = lora_buf.data(); + 
params.no_alloc = false; + + ggml_context * lora_ctx = ggml_init(params); + std::unordered_map lora_tensors; + + // create a name -> tensor map of the model to accelerate lookups + std::unordered_map model_tensors; + for (auto & kv: model.tensors_by_name) { + model_tensors.insert(kv); + } + + + // load base model + std::unique_ptr model_loader; + ggml_context * base_ctx = NULL; + llama_buffer base_buf; + if (path_base_model) { + fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); + model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); + + size_t ctx_size, mmapped_size; + model_loader->calc_sizes(&ctx_size, &mmapped_size); + base_buf.resize(ctx_size); + + ggml_init_params base_params; + base_params.mem_size = base_buf.size; + base_params.mem_buffer = base_buf.addr; + base_params.no_alloc = model_loader->use_mmap; + + base_ctx = ggml_init(base_params); + + model_loader->ggml_ctx = base_ctx; + + // maybe this should in llama_model_loader + if (model_loader->use_mmap) { + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); + } + } + + // read tensors and apply + bool warned = false; + int n_tensors = 0; + while (true) { + int32_t n_dims; + int32_t length; + int32_t ftype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ftype), sizeof(ftype)); + if (fin.eof()) { + break; + } + + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + } + + std::string name(length, 0); + fin.read(&name[0], length); + + // check for lora suffix and get the type of tensor + const std::string lora_suffix = ".lora"; + size_t pos = name.rfind(lora_suffix); + if (pos == std::string::npos) { + fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); + return 1; + } + + std::string lora_type = name.substr(pos + lora_suffix.length()); + std::string base_name = name; + base_name.erase(pos); + // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); + + if (model_tensors.find(base_name.data()) == model_tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); + return 1; + } + + // create ggml tensor + ggml_type wtype; + switch (ftype) { + case 0: wtype = GGML_TYPE_F32; break; + case 1: wtype = GGML_TYPE_F16; break; + default: + { + fprintf(stderr, "%s: invalid tensor data type '%d'\n", + __func__, ftype); + return false; + } + } + ggml_tensor* lora_tensor; + if (n_dims == 2) { + lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); + } + else { + fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims); + return 1; + } + + // load tensor data + size_t offset = fin.tellg(); + size_t tensor_data_size = ggml_nbytes(lora_tensor); + offset = (offset + 31) & -32; + fin.seekg(offset); + fin.read((char*)lora_tensor->data, tensor_data_size); + + lora_tensors[name] = lora_tensor; + + // check if we have both A and B tensors and apply + if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && + lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + + ggml_tensor * dest_t = model_tensors[base_name]; + ggml_tensor * base_t; + if (model_loader) { + // load from base model + if (model_loader->tensors_map.name_to_idx.find(base_name) == 
model_loader->tensors_map.name_to_idx.end()) { + fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; + } + size_t idx = model_loader->tensors_map.name_to_idx[base_name]; + llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }); + lt.data = (uint8_t *) lt.ggml_tensor->data; + model_loader->load_data_for(lt); + lt.ggml_tensor->data = lt.data; + } + else { + base_t = dest_t; + } + + if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) { + if (!warned) { + fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + } + + ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; + ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; + + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + return 1; + } + + // w = w + BA*s + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + BA = ggml_scale(lora_ctx, BA, scale_tensor); + } + + ggml_tensor * r; + if (base_t == dest_t) { + r = ggml_add_inplace(lora_ctx, dest_t, BA); + } + else { + r = ggml_add(lora_ctx, base_t, BA); + r = ggml_cpy(lora_ctx, r, dest_t); + } + + struct ggml_cgraph gf = ggml_build_forward(r); + gf.n_threads = n_threads; + ggml_graph_compute(lora_ctx, &gf); + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + lora_tensors.clear(); + + n_tensors++; + if (n_tensors % 4 == 0) + fprintf(stderr, "."); + } + } + + // TODO: this should be in a destructor, it will leak on failure + ggml_free(lora_ctx); + if (base_ctx) { + ggml_free(base_ctx); + } + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; +} + +int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str()); + return 1; + } +} + // Returns the KV cache that will contain the context for the // ongoing prediction with the model. const uint8_t * llama_get_kv_cache(struct llama_context * ctx) { diff --git a/llama.h b/llama.h index 19221759376858..c35193a8a80de4 100644 --- a/llama.h +++ b/llama.h @@ -96,6 +96,18 @@ extern "C" { const char * fname_out, enum llama_ftype ftype); + // Apply a LoRA adapter to a loaded model + // path_base_model is the path to a higher quality model to use as a base for + // the layers modified by the adapter. Can be NULL to use the current loaded model. 
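+    // The adapter is applied in place as  W <- W + (lora_alpha / lora_r) * B * A  for each
+    // weight matrix W it covers, where A and B are the low-rank factors stored in the file.
+    // Typical call (path and thread count are illustrative; the file name is the default
+    // written by convert-lora-to-ggml.py):
+    //   llama_apply_lora_from_file(ctx, "lora/ggml-adapter-model.bin", NULL, 4);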
+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter + // will be applied on top of the previous one + // Returns 0 on success + LLAMA_API int llama_apply_lora_from_file( + struct llama_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads); + // Returns the KV cache that will contain the context for the // ongoing prediction with the model. LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx); diff --git a/llama_util.h b/llama_util.h index c92c0cc7148d86..ee9b2a6f366b18 100755 --- a/llama_util.h +++ b/llama_util.h @@ -168,7 +168,7 @@ struct llama_mmap { #ifdef _POSIX_MAPPED_FILES static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file) { + llama_mmap(struct llama_file * file, bool prefetch = true) { size = file->size; int fd = fileno(file->fp); int flags = MAP_SHARED; @@ -180,10 +180,12 @@ struct llama_mmap { throw format("mmap failed: %s", strerror(errno)); } - // Advise the kernel to preload the mapped memory - if (madvise(addr, file->size, MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); + if (prefetch) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, file->size, MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } } } @@ -193,7 +195,7 @@ struct llama_mmap { #elif defined(_WIN32) static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file) { + llama_mmap(struct llama_file * file, bool prefetch = true) { size = file->size; HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); @@ -215,13 +217,15 @@ struct llama_mmap { } #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 - // Advise the kernel to preload the mapped memory - WIN32_MEMORY_RANGE_ENTRY range; - range.VirtualAddress = addr; - range.NumberOfBytes = (SIZE_T)size; - if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); + if (prefetch) { + // Advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } } #else #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") From 4ad73137a1aadc40a62f7101aab883f69e7172c6 Mon Sep 17 00:00:00 2001 From: Cameron Date: Mon, 17 Apr 2023 11:26:23 -0700 Subject: [PATCH 060/773] add 4_0 to default outfile namestr dict (#1031) this came up when trying to convert the gpt4all-lora-unfiltered-quantized.bin file --- convert.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 7b9f043b2c124b..7f7ae05fa66711 100644 --- a/convert.py +++ b/convert.py @@ -1085,6 +1085,7 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", + GGMLFileType.MostlyQ4_0: "q4_0", GGMLFileType.MostlyQ4_1: "q4_1", GGMLFileType.PerLayerIsQ4_1: "q4_1", }[params.file_type] @@ -1108,7 +1109,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't 
convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)") + parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") From e9298af3896b536d0c6d740447dc764e56b22102 Mon Sep 17 00:00:00 2001 From: Atsushi Tatsuma Date: Tue, 18 Apr 2023 04:34:35 +0900 Subject: [PATCH 061/773] readme : add Ruby bindings (#1029) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 78215c9ce40527..4f451f32d23847 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ New features will probably be added mostly through community contributions. - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) +- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) **UI:** From 42747220b4cac548b6e3059b66b3e960b517cfa4 Mon Sep 17 00:00:00 2001 From: Ivan Komarov Date: Tue, 18 Apr 2023 03:15:50 +0200 Subject: [PATCH 062/773] Do not close file after mmap (Windows version) (#1034) --- llama_util.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_util.h b/llama_util.h index ee9b2a6f366b18..eba14656a95096 100755 --- a/llama_util.h +++ b/llama_util.h @@ -202,7 +202,6 @@ struct llama_mmap { HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); DWORD error = GetLastError(); - CloseHandle(hFile); if (hMapping == NULL) { throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()); From 5af8e32238c7d9c4cdb7fc640472c9a26538b9da Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 17 Apr 2023 18:00:10 +0300 Subject: [PATCH 063/773] ci : do not run on drafts --- .github/workflows/build.yml | 14 ++++++++++++++ .github/workflows/docker.yml | 2 ++ 2 files changed, 16 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 88e70e4959914f..2208f42f77929e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,6 +8,8 @@ on: required: true type: boolean push: + branches: + - master paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp'] pull_request: types: [opened, synchronize, edited, reopened, review_requested, ready_for_review] @@ -18,6 +20,8 @@ env: jobs: ubuntu-latest-make: + if: github.event.pull_request.draft == false + runs-on: ubuntu-latest steps: @@ -37,6 +41,8 @@ jobs: make ubuntu-latest-cmake: + if: github.event.pull_request.draft == false + runs-on: ubuntu-latest steps: @@ -65,6 +71,8 @@ jobs: ctest --verbose ubuntu-latest-cmake-sanitizer: + if: github.event.pull_request.draft == false + runs-on: ubuntu-latest continue-on-error: true @@ -101,6 +109,8 @@ jobs: ctest --verbose macOS-latest-make: + if: github.event.pull_request.draft == false + runs-on: macos-latest steps: @@ -119,6 +129,8 @@ jobs: make macOS-latest-cmake: + if: github.event.pull_request.draft == false 
+ runs-on: macOS-latest steps: @@ -146,6 +158,8 @@ jobs: ctest --verbose windows-latest-cmake: + if: github.event.pull_request.draft == false + runs-on: windows-latest strategy: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 28402c9336156a..379fbd7ad35f11 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -18,6 +18,8 @@ on: jobs: push_to_registry: name: Push Docker image to Docker Hub + if: github.event.pull_request.draft == false + runs-on: ubuntu-latest env: COMMIT_SHA: ${{ github.sha }} From 7faa7460f03bdd88becf1e659cf359f274055404 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Apr 2023 20:10:26 +0300 Subject: [PATCH 064/773] readme : update hot topics about new LoRA functionality --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4f451f32d23847..c6f24d032307d6 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820) - [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915) - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784) From 5ecff35151156118c2df74899637ad34ee384b9b Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Tue, 18 Apr 2023 21:00:14 +0200 Subject: [PATCH 065/773] Adding a simple program to measure speed of dot products (#1041) On my Mac, the direct Q4_1 product is marginally slower (~69 vs ~55 us for Q4_0). The SIMD-ified ggml version is now almost 2X slower (~121 us). On a Ryzen 7950X CPU, the direct product for Q4_1 quantization is faster than the AVX2 implementation (~60 vs ~62 us). 
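For context, both formats pack 32 weights per block: Q4_0 stores a float scale d plus 16 bytes of
nibbles and reconstructs each weight as d * (q - 8), while Q4_1 stores a scale d and a minimum m
and reconstructs d * q + m. A minimal scalar sketch of the Q4_0 reconstruction (the helper name is
illustrative; the block layout matches the structs copied into vdot.cpp below):

    // Expand one Q4_0 block into 32 floats: low nibble first, then high nibble of each byte.
    static void dequantize_block_q4_0(const block_q4_0 * b, float * out) {
        for (int j = 0; j < QK4_0 / 2; ++j) {
            const uint8_t v = b->qs[j];
            out[2*j + 0] = b->d * (float)((int)(v & 0x0f) - 8);
            out[2*j + 1] = b->d * (float)((int)(v >> 4) - 8);
        }
    }

The "direct" dot products below fold this expansion into the accumulation loop instead of
materializing the floats first.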
--------- Co-authored-by: Iwan Kawrakow --- CMakeLists.txt | 1 + Makefile | 5 +- pocs/CMakeLists.txt | 12 ++ pocs/vdot/CMakeLists.txt | 4 + pocs/vdot/vdot.cpp | 305 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 326 insertions(+), 1 deletion(-) create mode 100644 pocs/CMakeLists.txt create mode 100644 pocs/vdot/CMakeLists.txt create mode 100644 pocs/vdot/vdot.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9189b6fc20bad5..ed9a3aa3822901 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -305,4 +305,5 @@ endif () if (LLAMA_BUILD_EXAMPLES) add_subdirectory(examples) + add_subdirectory(pocs) endif() diff --git a/Makefile b/Makefile index e7470d51a22945..071d9561ac57b2 100644 --- a/Makefile +++ b/Makefile @@ -133,7 +133,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize quantize-stats perplexity embedding +default: main quantize quantize-stats perplexity embedding vdot # # Build library @@ -169,6 +169,9 @@ perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +vdot: pocs/vdot/vdot.cpp ggml.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + libllama.so: llama.o ggml.o $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt new file mode 100644 index 00000000000000..03e1d2c04be65d --- /dev/null +++ b/pocs/CMakeLists.txt @@ -0,0 +1,12 @@ +# dependencies + +find_package(Threads REQUIRED) + +# third-party + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +if (EMSCRIPTEN) +else() + add_subdirectory(vdot) +endif() diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt new file mode 100644 index 00000000000000..cbc85223650ea5 --- /dev/null +++ b/pocs/vdot/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET vdot) +add_executable(${TARGET} vdot.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp new file mode 100644 index 00000000000000..26bf50c9ac2e46 --- /dev/null +++ b/pocs/vdot/vdot.cpp @@ -0,0 +1,305 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +constexpr int kVecSize = 1 << 18; + +float drawFromGaussianPdf(std::mt19937& rndm) { + constexpr double kScale = 1./(1. 
+ std::mt19937::max()); + constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale; + static float lastX; + static bool haveX = false; + if (haveX) { haveX = false; return lastX; } + auto r = sqrt(-2*log(1 - kScale*rndm())); + auto phi = kTwoPiTimesScale * rndm(); + lastX = r*sin(phi); + haveX = true; + return r*cos(phi); +} +void fillRandomGaussianFloats(std::vector& values, std::mt19937& rndm, float mean = 0) { + for (auto& v : values) v = mean + drawFromGaussianPdf(rndm); +} + +// Copy-pasted from ggml.c +#define QK4_0 32 +typedef struct { + float d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + float d; // delta + float m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); + +// Copy-pasted from ggml.c +#define QK8_0 32 +typedef struct { + float d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); + +// "Scalar" dot product between the quantized vector x and float vector y +inline double dot(int n, const block_q4_0* x, const float* y) { + const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}; + constexpr uint32_t kMask1 = 0x0f0f0f0f; + uint32_t u1, u2; + auto q1 = (const uint8_t*)&u1; + auto q2 = (const uint8_t*)&u2; + double sum = 0; + for (int i=0; id; + auto u = (const uint32_t*)x->qs; + float s = 0; + for (int k=0; k<4; ++k) { + u1 = u[k] & kMask1; + u2 = (u[k] >> 4) & kMask1; + s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] + + y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] + + y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] + + y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]]; + y += 8; + } + sum += s*d; + ++x; + } + return sum; +} +// Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product), +// but about the same on X86_64 (Ryzen 7950X CPU). 
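+// dot3 trades the per-nibble masking and shifting of dot() for a single 256-entry table (2 KB)
+// indexed by the raw byte: each entry holds the centered quant values of the low and high nibble
+// as a pair of floats, so both values of a byte come from one lookup and the scale d is still
+// applied once per block.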
+inline double dot3(int n, const block_q4_0* x, const float* y) { + const static std::pair kValues[256] = { + {-8.f, -8.f}, {-7.f, -8.f}, {-6.f, -8.f}, {-5.f, -8.f}, {-4.f, -8.f}, {-3.f, -8.f}, {-2.f, -8.f}, {-1.f, -8.f}, + { 0.f, -8.f}, { 1.f, -8.f}, { 2.f, -8.f}, { 3.f, -8.f}, { 4.f, -8.f}, { 5.f, -8.f}, { 6.f, -8.f}, { 7.f, -8.f}, + {-8.f, -7.f}, {-7.f, -7.f}, {-6.f, -7.f}, {-5.f, -7.f}, {-4.f, -7.f}, {-3.f, -7.f}, {-2.f, -7.f}, {-1.f, -7.f}, + { 0.f, -7.f}, { 1.f, -7.f}, { 2.f, -7.f}, { 3.f, -7.f}, { 4.f, -7.f}, { 5.f, -7.f}, { 6.f, -7.f}, { 7.f, -7.f}, + {-8.f, -6.f}, {-7.f, -6.f}, {-6.f, -6.f}, {-5.f, -6.f}, {-4.f, -6.f}, {-3.f, -6.f}, {-2.f, -6.f}, {-1.f, -6.f}, + { 0.f, -6.f}, { 1.f, -6.f}, { 2.f, -6.f}, { 3.f, -6.f}, { 4.f, -6.f}, { 5.f, -6.f}, { 6.f, -6.f}, { 7.f, -6.f}, + {-8.f, -5.f}, {-7.f, -5.f}, {-6.f, -5.f}, {-5.f, -5.f}, {-4.f, -5.f}, {-3.f, -5.f}, {-2.f, -5.f}, {-1.f, -5.f}, + { 0.f, -5.f}, { 1.f, -5.f}, { 2.f, -5.f}, { 3.f, -5.f}, { 4.f, -5.f}, { 5.f, -5.f}, { 6.f, -5.f}, { 7.f, -5.f}, + {-8.f, -4.f}, {-7.f, -4.f}, {-6.f, -4.f}, {-5.f, -4.f}, {-4.f, -4.f}, {-3.f, -4.f}, {-2.f, -4.f}, {-1.f, -4.f}, + { 0.f, -4.f}, { 1.f, -4.f}, { 2.f, -4.f}, { 3.f, -4.f}, { 4.f, -4.f}, { 5.f, -4.f}, { 6.f, -4.f}, { 7.f, -4.f}, + {-8.f, -3.f}, {-7.f, -3.f}, {-6.f, -3.f}, {-5.f, -3.f}, {-4.f, -3.f}, {-3.f, -3.f}, {-2.f, -3.f}, {-1.f, -3.f}, + { 0.f, -3.f}, { 1.f, -3.f}, { 2.f, -3.f}, { 3.f, -3.f}, { 4.f, -3.f}, { 5.f, -3.f}, { 6.f, -3.f}, { 7.f, -3.f}, + {-8.f, -2.f}, {-7.f, -2.f}, {-6.f, -2.f}, {-5.f, -2.f}, {-4.f, -2.f}, {-3.f, -2.f}, {-2.f, -2.f}, {-1.f, -2.f}, + { 0.f, -2.f}, { 1.f, -2.f}, { 2.f, -2.f}, { 3.f, -2.f}, { 4.f, -2.f}, { 5.f, -2.f}, { 6.f, -2.f}, { 7.f, -2.f}, + {-8.f, -1.f}, {-7.f, -1.f}, {-6.f, -1.f}, {-5.f, -1.f}, {-4.f, -1.f}, {-3.f, -1.f}, {-2.f, -1.f}, {-1.f, -1.f}, + { 0.f, -1.f}, { 1.f, -1.f}, { 2.f, -1.f}, { 3.f, -1.f}, { 4.f, -1.f}, { 5.f, -1.f}, { 6.f, -1.f}, { 7.f, -1.f}, + {-8.f, 0.f}, {-7.f, 0.f}, {-6.f, 0.f}, {-5.f, 0.f}, {-4.f, 0.f}, {-3.f, 0.f}, {-2.f, 0.f}, {-1.f, 0.f}, + { 0.f, 0.f}, { 1.f, 0.f}, { 2.f, 0.f}, { 3.f, 0.f}, { 4.f, 0.f}, { 5.f, 0.f}, { 6.f, 0.f}, { 7.f, 0.f}, + {-8.f, 1.f}, {-7.f, 1.f}, {-6.f, 1.f}, {-5.f, 1.f}, {-4.f, 1.f}, {-3.f, 1.f}, {-2.f, 1.f}, {-1.f, 1.f}, + { 0.f, 1.f}, { 1.f, 1.f}, { 2.f, 1.f}, { 3.f, 1.f}, { 4.f, 1.f}, { 5.f, 1.f}, { 6.f, 1.f}, { 7.f, 1.f}, + {-8.f, 2.f}, {-7.f, 2.f}, {-6.f, 2.f}, {-5.f, 2.f}, {-4.f, 2.f}, {-3.f, 2.f}, {-2.f, 2.f}, {-1.f, 2.f}, + { 0.f, 2.f}, { 1.f, 2.f}, { 2.f, 2.f}, { 3.f, 2.f}, { 4.f, 2.f}, { 5.f, 2.f}, { 6.f, 2.f}, { 7.f, 2.f}, + {-8.f, 3.f}, {-7.f, 3.f}, {-6.f, 3.f}, {-5.f, 3.f}, {-4.f, 3.f}, {-3.f, 3.f}, {-2.f, 3.f}, {-1.f, 3.f}, + { 0.f, 3.f}, { 1.f, 3.f}, { 2.f, 3.f}, { 3.f, 3.f}, { 4.f, 3.f}, { 5.f, 3.f}, { 6.f, 3.f}, { 7.f, 3.f}, + {-8.f, 4.f}, {-7.f, 4.f}, {-6.f, 4.f}, {-5.f, 4.f}, {-4.f, 4.f}, {-3.f, 4.f}, {-2.f, 4.f}, {-1.f, 4.f}, + { 0.f, 4.f}, { 1.f, 4.f}, { 2.f, 4.f}, { 3.f, 4.f}, { 4.f, 4.f}, { 5.f, 4.f}, { 6.f, 4.f}, { 7.f, 4.f}, + {-8.f, 5.f}, {-7.f, 5.f}, {-6.f, 5.f}, {-5.f, 5.f}, {-4.f, 5.f}, {-3.f, 5.f}, {-2.f, 5.f}, {-1.f, 5.f}, + { 0.f, 5.f}, { 1.f, 5.f}, { 2.f, 5.f}, { 3.f, 5.f}, { 4.f, 5.f}, { 5.f, 5.f}, { 6.f, 5.f}, { 7.f, 5.f}, + {-8.f, 6.f}, {-7.f, 6.f}, {-6.f, 6.f}, {-5.f, 6.f}, {-4.f, 6.f}, {-3.f, 6.f}, {-2.f, 6.f}, {-1.f, 6.f}, + { 0.f, 6.f}, { 1.f, 6.f}, { 2.f, 6.f}, { 3.f, 6.f}, { 4.f, 6.f}, { 5.f, 6.f}, { 6.f, 6.f}, { 7.f, 6.f}, + {-8.f, 7.f}, {-7.f, 7.f}, {-6.f, 7.f}, {-5.f, 7.f}, {-4.f, 7.f}, {-3.f, 7.f}, {-2.f, 7.f}, {-1.f, 7.f}, + { 0.f, 7.f}, { 1.f, 7.f}, { 
2.f, 7.f}, { 3.f, 7.f}, { 4.f, 7.f}, { 5.f, 7.f}, { 6.f, 7.f}, { 7.f, 7.f} + }; + double sum = 0; + for (int i=0; id; + auto q = x->qs; + float s = 0; + for (int k=0; k<4; ++k) { + s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second + + y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second + + y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second + + y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second; + y += 8; q += 4; + } + sum += s*d; + ++x; + } + return sum; +} + +inline double dot41(int n, const block_q4_1* x, const float* y) { + const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}; + constexpr uint32_t kMask1 = 0x0f0f0f0f; + uint32_t u1, u2; + auto q1 = (const uint8_t*)&u1; + auto q2 = (const uint8_t*)&u2; + double sum = 0; + for (int i=0; iqs; + float s = 0, s1 = 0; + for (int k=0; k<4; ++k) { + u1 = u[k] & kMask1; + u2 = (u[k] >> 4) & kMask1; + s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] + + y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] + + y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] + + y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]]; + s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7]; + y += 8; + } + sum += s*x->d + s1*x->m; + ++x; + } + return sum; +} + +// Copy-pasted from ggml.c +static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int l = 0; l < QK8_0; l++) { + const float v = x[i*QK8_0 + l]; + amax = std::max(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + for (int l = 0; l < QK8_0; ++l) { + const float v = x[i*QK8_0 + l]*id; + y[i].qs[l] = roundf(v); + } + } +} + +// Copy-pasted from ggml.c +static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) { + const int nb = n / QK8_0; + const block_q4_0* x = (const block_q4_0*)vx; + const block_q8_0* y = (const block_q8_0*)vy; + float sumf = 0; + for (int i = 0; i < nb; i++) { + const float d0 = x[i].d; + const float d1 = y[i].d; + + const uint8_t * p0 = x[i].qs; + const int8_t * p1 = y[i].qs; + + int sumi = 0; + for (int j = 0; j < QK8_0/2; j++) { + const uint8_t v0 = p0[j]; + + const int i0 = (int8_t) (v0 & 0xf) - 8; + const int i1 = (int8_t) (v0 >> 4) - 8; + + const int i2 = p1[2*j + 0]; + const int i3 = p1[2*j + 1]; + + sumi += i0*i2 + i1*i3; + } + sumf += d0*d1*sumi; + } + *s = sumf; +} + +int main(int argc, char** argv) { + + int nloop = argc > 1 ? atoi(argv[1]) : 10; + bool scalar = argc > 2 ? atoi(argv[2]) : false; + bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false; + + if (scalar && useQ4_1) { + printf("It is not possible to use Q4_1 quantization and scalar implementations\n"); + return 1; + } + + std::mt19937 rndm(1234); + + std::vector x1(kVecSize), y1(kVecSize); + int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64); + int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64); + + auto funcs = useQ4_1 ? 
ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0); + + std::vector q40; + std::vector q41; + if (useQ4_1) q41.resize(n4); + else q40.resize(n4); + std::vector q8(n8); + std::vector H(16, 0); + double sumt = 0, sumt2 = 0, maxt = 0; + double sumqt = 0, sumqt2 = 0, maxqt = 0; + double sum = 0, sumq = 0, exactSum = 0; + for (int iloop=0; iloop(t2-t1).count(); + sumt += t; sumt2 += t*t; maxt = std::max(maxt, t); + + // And now measure the time needed to quantize y and perform the dot product with the quantized y + t1 = std::chrono::high_resolution_clock::now(); + float result; + if (scalar) { + quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize); + dot_q4_q8(kVecSize, &result, q40.data(), q8.data()); + } + else { + funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize); + if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data()); + else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data()); + } + sumq += result; + t2 = std::chrono::high_resolution_clock::now(); + t = 1e-3*std::chrono::duration_cast(t2-t1).count(); + sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t); + + } + + // Report the time (and the average of the dot products so the compiler does not come up with the idea + // of optimizing away the function calls after figuring that the result is not used). + sum /= nloop; sumq /= nloop; + exactSum /= nloop; + printf("Exact result: = %g\n",exactSum); + printf(" = %g, %g\n",sum,sumq); + sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt; + if (sumt2 > 0) sumt2 = sqrt(sumt2); + printf("time = %g +/- %g us. maxt = %g us\n",sumt,sumt2,maxt); + sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt; + if (sumqt2 > 0) sumqt2 = sqrt(sumqt2); + printf("timeq = %g +/- %g us. maxt = %g us\n",sumqt,sumqt2,maxqt); + return 0; +} From dcdd65e2969bc03c91a1ebd1160162d5054e6923 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Apr 2023 22:59:17 +0300 Subject: [PATCH 066/773] ggml : optimize ggml_vec_dot_q4_0_q8_0() using vectorized accumulators --- ggml.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/ggml.c b/ggml.c index acdba0333c8ea5..66a8c5bb51f2e1 100644 --- a/ggml.c +++ b/ggml.c @@ -2766,8 +2766,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * float sumf = 0.0; #if defined(__ARM_NEON) - float sum0 = 0.0f; - float sum1 = 0.0f; + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); for (int i = 0; i < nb; i += 2) { const block_q4_0 * restrict x0 = &x[i + 0]; @@ -2807,14 +2807,17 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls); - int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls); - - p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs); - p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs); + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs); - sum0 += x0->d*y0->d*vaddvq_s32(p_0); - sum1 += x1->d*y1->d*vaddvq_s32(p_1); +#if 0 + // note: this is faster for 4-6 threads by slower for more threads + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); +#else + sumv0 = vaddq_f32(sumv0, vmulq_f32(vcvtq_f32_s32(p_0), vdupq_n_f32(x0->d*y0->d))); + sumv1 = 
vaddq_f32(sumv1, vmulq_f32(vcvtq_f32_s32(p_1), vdupq_n_f32(x1->d*y1->d))); +#endif #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); @@ -2826,21 +2829,17 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); - const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h); - const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h); - - const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h); - const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h); + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - const int16x8_t p_0 = vaddq_s16(pl_0, ph_0); - const int16x8_t p_1 = vaddq_s16(pl_1, ph_1); - - sum0 += x0->d*y0->d*vaddvq_s16(p_0); - sum1 += x1->d*y1->d*vaddvq_s16(p_1); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); #endif } - sumf = sum0 + sum1; + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); From 4caebf6d408b91c2d29d0abc7b1e867b5de64db5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Apr 2023 23:00:08 +0300 Subject: [PATCH 067/773] gitignore : vdot --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index ba5cbf1ed4374e..631f2360ba1c31 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ models/* /perplexity /embedding /benchmark-q4_0-matmult +/vdot /Pipfile arm_neon.h From 50a8a2af97cb92e53e7a3195aa201c3d87da5415 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Apr 2023 23:11:23 +0300 Subject: [PATCH 068/773] ggml : scratch that - vmlaq_n_f32 is always better Had a background process that was messing with the timings --- ggml.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ggml.c b/ggml.c index 66a8c5bb51f2e1..5fcb81cb8513f6 100644 --- a/ggml.c +++ b/ggml.c @@ -2810,14 +2810,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs); -#if 0 - // note: this is faster for 4-6 threads by slower for more threads sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); -#else - sumv0 = vaddq_f32(sumv0, vmulq_f32(vcvtq_f32_s32(p_0), vdupq_n_f32(x0->d*y0->d))); - sumv1 = vaddq_f32(sumv1, vmulq_f32(vcvtq_f32_s32(p_1), vdupq_n_f32(x1->d*y1->d))); -#endif #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); From 77a73403ca8eaced2590559d0f9cebd2b3649d32 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Apr 2023 23:54:57 +0300 Subject: [PATCH 069/773] ggml : add new Q4_2 quantization (ARM only) (#1046) * ggml : Q4_2 ARM * ggml : add ggml_is_quantized() * llama : update llama_type_name() with Q4_2 entry * ggml : speed-up q4_2 - 4 threads: ~100ms -> ~90ms - 8 
threads: ~55ms -> ~50ms * ggml : optimize q4_2 using vmlaq_n_f32 + vmulq_n_f32 --- examples/quantize/quantize.cpp | 1 + ggml.c | 282 +++++++++++++++++++++++++++++++-- ggml.h | 4 +- llama.cpp | 10 +- llama.h | 1 + 5 files changed, 287 insertions(+), 11 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 5c9e2ad9420b3a..59cb6744016cb7 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -14,6 +14,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0); fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1); + fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2); return 1; } diff --git a/ggml.c b/ggml.c index 5fcb81cb8513f6..40a9d0b2f8ecf3 100644 --- a/ggml.c +++ b/ggml.c @@ -585,6 +585,13 @@ typedef struct { } block_q4_1; static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); +#define QK4_2 16 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qs[QK4_2 / 2]; // nibbles / quants +} block_q4_2; +static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); + #define QK8_0 32 typedef struct { float d; // delta @@ -1045,6 +1052,49 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int #endif } +// reference implementation for deterministic creation of model files +static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { + assert(k % QK4_2 == 0); + + const int nb = k / QK4_2; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int l = 0; l < QK4_2; l++) { + const float v = x[i*QK4_2 + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 3) - 1); + + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int l = 0; l < QK4_2; l += 2) { + const float v0 = x[i*QK4_2 + l + 0]*id; + const float v1 = x[i*QK4_2 + l + 1]*id; + + const uint8_t vi0 = (uint8_t)(v0 + 8.5f); + const uint8_t vi1 = (uint8_t)(v1 + 8.5f); + + assert(vi0 < 16); + assert(vi1 < 16); + + y[i].qs[l/2] = vi0 | (vi1 << 4); + } + } +} + +static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) { + assert(k % QK4_2 == 0); + + block_q4_2 * restrict y = vy; + + quantize_row_q4_2_reference(x, y, k); +} + // reference implementation for deterministic creation of model files static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { assert(k % QK8_0 == 0); @@ -1064,7 +1114,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r y[i].d = d; for (int l = 0; l < QK8_0; ++l) { - const float v = x[i*QK8_0 + l]*id; + const float v = x[i*QK8_0 + l]*id; y[i].qs[l] = roundf(v); } } @@ -1420,8 +1470,39 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in #endif } +static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, int k) { + assert(k % QK4_2 == 0); + const int nb = k / QK4_2; + + const block_q4_2 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict pp = x[i].qs; + + for (int l = 0; l < QK4_2; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = (vi0 - 8)*d; + const float v1 = (vi1 - 8)*d; + + y[i*QK4_2 + l + 0] = v0; + y[i*QK4_2 + l + 1] = v1; + + assert(!isnan(y[i*QK4_2 + l + 0])); + assert(!isnan(y[i*QK4_2 + l + 1])); + } + } +} + static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +//static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { @@ -1438,6 +1519,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .quantize_row_q_dot = quantize_row_q4_1, .vec_dot_q = ggml_vec_dot_q4_1, }, + [GGML_TYPE_Q4_2] = { + .dequantize_row_q = dequantize_row_q4_2, + .quantize_row_q = quantize_row_q4_2, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q4_2_q8_0, + }, // TODO: GGML_TYPE_Q8_0 }; @@ -2950,6 +3038,136 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * *s = sumf; } +static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / QK8_0; + + assert(n % QK8_0 == 0); + assert(nb % 2 == 0); + assert(QK8_0 == 2*QK4_2); + + const block_q4_2 * restrict x = vx; + const block_q8_0 * restrict y = vy; + + float sumf = 0.0; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i += 2) { + const block_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0]; + const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1]; + const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0]; + const 
block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); + const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs)); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // interleave + const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs); + const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs); + const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs); + const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); + + sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( + vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)), + vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d); + + sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( + vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)), + vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d); +#endif + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#else + // scalar + for (int i = 0; i < nb; i++) { + const uint8_t * restrict x0 = x[2*i + 0].qs; + const uint8_t * restrict x1 = x[2*i + 1].qs; + const int8_t * restrict y0 = y[i].qs; + + const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); + const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); + + int sumi_0 = 0; + int sumi_1 = 0; + + for (int j = 0; j < 
QK8_0/4; j++) { + const uint8_t v0 = x0[j]; + const uint8_t v1 = x1[j]; + + const int i0_0 = (int8_t) (v0 & 0xf) - 8; + const int i1_0 = (int8_t) (v0 >> 4) - 8; + + const int i0_1 = (int8_t) (v1 & 0xf) - 8; + const int i1_1 = (int8_t) (v1 >> 4) - 8; + + const int i2_0 = y0[2*j + 0]; + const int i3_0 = y0[2*j + 1]; + + const int i2_1 = y0[2*(j + QK8_0/4) + 0]; + const int i3_1 = y0[2*(j + QK8_0/4) + 1]; + + sumi_0 += i0_0*i2_0 + i1_0*i3_0; + sumi_1 += i0_1*i2_1 + i1_1*i3_1; + } + + sumf += (d0 * y[i].d) * sumi_0; + sumf += (d1 * y[i].d) * sumi_1; + } +#endif + + *s = sumf; +} + // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { @@ -3196,24 +3414,26 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = 1, [GGML_TYPE_Q4_0] = QK4_0, [GGML_TYPE_Q4_1] = QK4_1, + [GGML_TYPE_Q4_2] = QK4_2, [GGML_TYPE_Q8_0] = QK8_0, [GGML_TYPE_I8] = 1, [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, }; -static_assert(GGML_TYPE_COUNT == 8, "GGML_BLCK_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 9, "GGML_BLCK_SIZE is outdated"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = sizeof(float), [GGML_TYPE_F16] = sizeof(ggml_fp16_t), [GGML_TYPE_Q4_0] = sizeof(block_q4_0), [GGML_TYPE_Q4_1] = sizeof(block_q4_1), + [GGML_TYPE_Q4_2] = sizeof(block_q4_2), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), [GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), }; -static_assert(GGML_TYPE_COUNT == 8, "GGML_TYPE_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_SIZE is outdated"); static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { @@ -3221,12 +3441,26 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = "f16", [GGML_TYPE_Q4_0] = "q4_0", [GGML_TYPE_Q4_1] = "q4_1", + [GGML_TYPE_Q4_2] = "q4_2", [GGML_TYPE_Q8_0] = "q8_0", [GGML_TYPE_I8] = "i8", [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", }; -static_assert(GGML_TYPE_COUNT == 8, "GGML_TYPE_NAME is outdated"); +static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_NAME is outdated"); + +static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = false, + [GGML_TYPE_F16] = false, + [GGML_TYPE_Q4_0] = true, + [GGML_TYPE_Q4_1] = true, + [GGML_TYPE_Q4_2] = true, + [GGML_TYPE_Q8_0] = true, + [GGML_TYPE_I8] = false, + [GGML_TYPE_I16] = false, + [GGML_TYPE_I32] = false, +}; +static_assert(GGML_TYPE_COUNT == 9, "GGML_IS_QUANTIZED is outdated"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", @@ -3488,6 +3722,10 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct (t0->ne[3] == t1->ne[3]); } +static inline bool ggml_is_quantized(enum ggml_type type) { + return GGML_IS_QUANTIZED[type]; +} + static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -5609,7 +5847,7 @@ static void ggml_compute_forward_dup_f16( } } } - } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) { + } else if (ggml_is_quantized(dst->type)) { quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; size_t id = 0; uint8_t * dst_ptr = (uint8_t *) dst->data; @@ -5821,7 +6059,7 @@ static void ggml_compute_forward_dup_f32( } } } - } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) { + } else if (ggml_is_quantized(dst->type)) { quantize_row_q_t const quantize_row_q 
= quantize_fns[dst->type].quantize_row_q; size_t id = 0; uint8_t * dst_ptr = (uint8_t *) dst->data; @@ -6184,7 +6422,7 @@ static void ggml_compute_forward_add_q_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1); + GGML_ASSERT(ggml_is_quantized(src0->type)); GGML_ASSERT(dst->type == src0->type); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -6254,6 +6492,7 @@ static void ggml_compute_forward_add( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_2: { ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; @@ -7732,6 +7971,7 @@ static void ggml_compute_forward_mul_mat( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_2: case GGML_TYPE_Q8_0: { ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); @@ -7987,6 +8227,7 @@ static void ggml_compute_forward_get_rows( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_2: case GGML_TYPE_Q8_0: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); @@ -10398,7 +10639,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) node->n_tasks = 1; size_t cur = 0; - if (node->type == GGML_TYPE_Q4_0 || node->type == GGML_TYPE_Q4_1) { + if (ggml_is_quantized(node->type)) { cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0]; } @@ -10410,7 +10651,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; - if (node->src0->type == GGML_TYPE_Q4_0 || node->src0->type == GGML_TYPE_Q4_1) { + if (ggml_is_quantized(node->src0->type)) { cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; } @@ -11702,6 +11943,29 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_1*sizeof(block_q4_1)); } +size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_2 == 0); + const int nb = k / QK4_2; + + for (int j = 0; j < n; j += k) { + block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2; + + quantize_row_q4_2_reference(src + j, y, k); + + for (int i = 0; i < nb; i++) { + for (int l = 0; l < QK4_2; l += 2) { + const uint8_t vi0 = y[i].qs[l/2] & 0xF; + const uint8_t vi1 = y[i].qs[l/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_2*sizeof(block_q4_2)); +} + //////////////////////////////////////////////////////////////////////////////// int ggml_cpu_has_avx(void) { diff --git a/ggml.h b/ggml.h index 59de0cb12572ab..603be84531b145 100644 --- a/ggml.h +++ b/ggml.h @@ -204,7 +204,8 @@ enum ggml_type { GGML_TYPE_F16 = 1, GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, - GGML_TYPE_Q8_0 = 4, + GGML_TYPE_Q4_2 = 4, + GGML_TYPE_Q8_0 = 5, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -806,6 +807,7 @@ enum ggml_opt_result ggml_opt( size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); // // system info diff --git a/llama.cpp b/llama.cpp index db71c03bdd4a1f..f14324fcd1eaba 100644 --- a/llama.cpp +++ b/llama.cpp @@ -478,6 +478,7 @@ struct llama_file_loader { case GGML_TYPE_F16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_2: break; default: { throw format("unrecognized tensor type %u\n", shard.type); @@ -550,6 +551,7 @@ struct llama_file_saver { case GGML_TYPE_F16: case GGML_TYPE_Q4_0: case 
GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_2: break; default: LLAMA_ASSERT(false); } @@ -838,6 +840,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: return "mostly Q4_1, some F16"; + case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; default: return "unknown, may not work"; } } @@ -1571,6 +1574,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s switch (ftype) { case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; + case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; default: throw format("invalid output file type %d\n", ftype); }; @@ -1644,6 +1648,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s { new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); } break; + case GGML_TYPE_Q4_2: + { + new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; default: LLAMA_ASSERT(false); } @@ -1955,7 +1963,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * base_t = dest_t; } - if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) { + if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) { if (!warned) { fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " "use a f16 or f32 base model with --lora-base\n", __func__); diff --git a/llama.h b/llama.h index c35193a8a80de4..208b03d18056cf 100644 --- a/llama.h +++ b/llama.h @@ -72,6 +72,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors }; LLAMA_API struct llama_context_params llama_context_default_params(); From 66674012389f9537140044290f8517bc4a5e0b74 Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Wed, 19 Apr 2023 00:53:24 +0200 Subject: [PATCH 070/773] Multi-threaded ggml_cpy (#1035) * Multi-threaded ggml_cpy * Update ggml.c Co-authored-by: Georgi Gerganov * Also fix wdata offset in ggml_compute_forward_add_q_f32 --------- Co-authored-by: Georgi Gerganov --- ggml.c | 302 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 238 insertions(+), 64 deletions(-) diff --git a/ggml.c b/ggml.c index 40a9d0b2f8ecf3..f4b8fc2829463d 100644 --- a/ggml.c +++ b/ggml.c @@ -5766,7 +5766,6 @@ static void ggml_compute_forward_dup_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -5778,6 +5777,11 @@ static void ggml_compute_forward_dup_f16( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; const size_t nb02 = src0->nb[2]; @@ -5788,19 +5792,40 @@ static void ggml_compute_forward_dup_f16( const size_t nb2 = dst->nb[2]; const size_t nb3 = 
dst->nb[3]; + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); + // parallelize by elements + const int ne = ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + memcpy( + ((char *) dst->data + ie0*nb0), + ((char *) src0->data + ie0*nb00), + (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); + return; } + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + if (src0->type == dst->type && - src0->ne[0] == dst->ne[0] && - src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + ne00 == ne0 && + nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { memcpy( ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), @@ -5814,21 +5839,21 @@ static void ggml_compute_forward_dup_f16( // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy if (ggml_is_contiguous(dst)) { - if (src0->nb[0] == sizeof(ggml_fp16_t)) { + if (nb00 == sizeof(ggml_fp16_t)) { if (dst->type == GGML_TYPE_F16) { size_t id = 0; - const size_t rs = ne00*nb00; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; - - memcpy(dst_ptr, src0_ptr, rs); - - id++; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; } + id += rs * (ne01 - ir1); } } } else if (dst->type == GGML_TYPE_F32) { @@ -5837,34 +5862,39 @@ static void ggml_compute_forward_dup_f16( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); id++; } } + id += ne00 * (ne01 - ir1); } } } else if (ggml_is_quantized(dst->type)) { quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + size_t id = 0; - uint8_t * dst_ptr = (uint8_t *) dst->data; - size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); - float * src0_f32 = (float *) params->wdata; + size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += rs * 
ir0; + for (int i01 = ir0; i01 < ir1; i01++) { const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - // convert to f32 and quantize + for (int i00 = 0; i00 < ne00; i00++) { src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); } + quantize_row_q(src0_f32, dst_ptr + id, ne00); - id += dst_row_size; + id += rs; } + id += rs * (ne01 - ir1); } } } else { @@ -5879,7 +5909,8 @@ static void ggml_compute_forward_dup_f16( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -5887,6 +5918,7 @@ static void ggml_compute_forward_dup_f16( id++; } } + id += ne00 * (ne01 - ir1); } } } else if (dst->type == GGML_TYPE_F16) { @@ -5895,7 +5927,8 @@ static void ggml_compute_forward_dup_f16( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -5903,6 +5936,7 @@ static void ggml_compute_forward_dup_f16( id++; } } + id += ne00 * (ne01 - ir1); } } } else { @@ -5921,7 +5955,20 @@ static void ggml_compute_forward_dup_f16( if (dst->type == GGML_TYPE_F16) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); @@ -5942,25 +5989,51 @@ static void ggml_compute_forward_dup_f16( } } } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } } } } else if (dst->type == GGML_TYPE_F32) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); - if (++i10 == ne00) { + if (++i10 == ne0) { i10 = 0; - if (++i11 == ne01) { + if (++i11 == ne1) { i11 = 0; - if (++i12 == ne02) { + if (++i12 == ne2) { i12 = 0; - if (++i13 == ne03) { + if (++i13 == ne3) { i13 = 0; } } @@ -5968,6 +6041,19 @@ static void ggml_compute_forward_dup_f16( } } } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } } } } else { @@ -5979,7 
+6065,6 @@ static void ggml_compute_forward_dup_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -5991,6 +6076,11 @@ static void ggml_compute_forward_dup_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; const size_t nb02 = src0->nb[2]; @@ -6001,19 +6091,40 @@ static void ggml_compute_forward_dup_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); + // parallelize by elements + const int ne = ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + memcpy( + ((char *) dst->data + ie0*nb0), + ((char *) src0->data + ie0*nb00), + (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); + return; } + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + if (src0->type == dst->type && - src0->ne[0] == dst->ne[0] && - src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + ne00 == ne0 && + nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { memcpy( ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), @@ -6026,21 +6137,21 @@ static void ggml_compute_forward_dup_f32( if (ggml_is_contiguous(dst)) { // TODO: simplify - if (src0->nb[0] == sizeof(float)) { + if (nb00 == sizeof(float)) { if (dst->type == GGML_TYPE_F32) { size_t id = 0; - const size_t rs = ne00*nb00; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; - - memcpy(dst_ptr, src0_ptr, rs); - - id++; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; } + id += rs * (ne01 - ir1); } } } else if (dst->type == GGML_TYPE_F16) { @@ -6049,7 +6160,8 @@ static void ggml_compute_forward_dup_f32( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -6057,21 +6169,25 @@ static void ggml_compute_forward_dup_f32( id++; } } + id += ne00 * (ne01 - ir1); } } } else if (ggml_is_quantized(dst->type)) { 
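        // Row partitioning used throughout this patch: with nr rows and nth threads,
        //   dr  = (nr + nth - 1) / nth   // rows per thread, rounded up
        //   ir0 = dr * ith               // first row owned by thread ith
        //   ir1 = MIN(ir0 + dr, nr)      // one past the last owned row
        // For example, nr = 10 and nth = 4 give dr = 3 and the ranges [0,3), [3,6), [6,9), [9,10).
        // The "id += rs * ir0" and "id += rs * (ne01 - ir1)" updates below skip the output
        // regions belonging to rows handled by other threads, so each thread writes only its
        // own contiguous slice of dst.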
quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + size_t id = 0; - uint8_t * dst_ptr = (uint8_t *) dst->data; - size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); quantize_row_q(src0_ptr, dst_ptr + id, ne00); - id += dst_row_size; + id += rs; } + id += rs * (ne01 - ir1); } } } else { @@ -6086,7 +6202,8 @@ static void ggml_compute_forward_dup_f32( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -6094,6 +6211,7 @@ static void ggml_compute_forward_dup_f32( id++; } } + id += ne00 * (ne01 - ir1); } } } else if (dst->type == GGML_TYPE_F16) { @@ -6102,7 +6220,8 @@ static void ggml_compute_forward_dup_f32( for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { - for (int i01 = 0; i01 < ne01; i01++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); @@ -6110,6 +6229,7 @@ static void ggml_compute_forward_dup_f32( id++; } } + id += ne00 * (ne01 - ir1); } } } else { @@ -6121,6 +6241,7 @@ static void ggml_compute_forward_dup_f32( } // dst counters + int64_t i10 = 0; int64_t i11 = 0; int64_t i12 = 0; @@ -6129,20 +6250,34 @@ static void ggml_compute_forward_dup_f32( if (dst->type == GGML_TYPE_F32) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + i11++; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); memcpy(dst_ptr, src0_ptr, sizeof(float)); - if (++i10 == dst->ne[0]) { + if (++i10 == ne0) { i10 = 0; - if (++i11 == dst->ne[1]) { + if (++i11 == ne1) { i11 = 0; - if (++i12 == dst->ne[2]) { + if (++i12 == ne2) { i12 = 0; - if (++i13 == dst->ne[3]) { + if (++i13 == ne3) { i13 = 0; } } @@ -6150,25 +6285,51 @@ static void ggml_compute_forward_dup_f32( } } } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } } } } else if (dst->type == GGML_TYPE_F16) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { const char * src0_ptr = ((char *) 
src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); - if (++i10 == dst->ne[0]) { + if (++i10 == ne0) { i10 = 0; - if (++i11 == dst->ne[1]) { + if (++i11 == ne1) { i11 = 0; - if (++i12 == dst->ne[2]) { + if (++i12 == ne2) { i12 = 0; - if (++i13 == dst->ne[3]) { + if (++i13 == ne3) { i13 = 0; } } @@ -6176,6 +6337,19 @@ static void ggml_compute_forward_dup_f32( } } } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } } } } else { @@ -6436,7 +6610,7 @@ static void ggml_compute_forward_add_q_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - float * wdata = (float*) params->wdata + ne00 * ith; + float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; for (int ir = ir0; ir < ir1; ++ir) { // src0 indices @@ -10636,11 +10810,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_CPY: case GGML_OP_DUP: { - node->n_tasks = 1; + node->n_tasks = n_threads; size_t cur = 0; if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0]; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; } work_size = MAX(work_size, cur); From 8944a1329648c57bb7d66851170938230587a52c Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Wed, 19 Apr 2023 11:22:45 +0200 Subject: [PATCH 071/773] Add NVIDIA cuBLAS support (#1044) --- CMakeLists.txt | 21 +++++ Makefile | 4 + ggml.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++--- ggml.h | 1 + llama.cpp | 2 +- 5 files changed, 221 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ed9a3aa3822901..8eadea4fd48627 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,6 +66,7 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF) +option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -142,6 +143,26 @@ if (LLAMA_OPENBLAS) endif() endif() +if (LLAMA_CUBLAS) + cmake_minimum_required(VERSION 3.17) + + find_package(CUDAToolkit) + if (CUDAToolkit_FOUND) + message(STATUS "cuBLAS found") + + add_compile_definitions(GGML_USE_CUBLAS) + + if (LLAMA_STATIC) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + else() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() + + else() + message(WARNING "cuBLAS not found") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags diff --git a/Makefile b/Makefile index 071d9561ac57b2..deb0d00090f5af 100644 --- a/Makefile +++ b/Makefile @@ -97,6 +97,10 @@ ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas LDFLAGS += -lopenblas endif +ifdef LLAMA_CUBLAS + CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include + LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64 +endif ifdef LLAMA_GPROF CFLAGS += -pg CXXFLAGS += -pg diff --git a/ggml.c b/ggml.c index f4b8fc2829463d..13c1548fee895a 100644 --- a/ggml.c +++ b/ggml.c @@ -142,10 +142,46 @@ inline static void* 
ggml_aligned_malloc(size_t size) { } \ } while (0) -#ifdef GGML_USE_ACCELERATE +#if defined(GGML_USE_ACCELERATE) #include -#elif GGML_USE_OPENBLAS +#elif defined(GGML_USE_OPENBLAS) #include +#elif defined(GGML_USE_CUBLAS) +#include +#include +#define CUDA_CHECK(err) \ + do { \ + cudaError_t err_ = (err); \ + if (err_ != cudaSuccess) { \ + printf("CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + cudaGetErrorString(err_)); \ + exit(1); \ + } \ + } while (0) + +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + printf("cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +static cublasHandle_t cublasH = NULL; +static cudaStream_t cudaStream = NULL; +static void init_cublas(void) { + if (cublasH == NULL) { + // create cublas handle, bind a stream + CUBLAS_CHECK(cublasCreate(&cublasH)); + + CUDA_CHECK(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking)); + CUBLAS_CHECK(cublasSetStream(cublasH, cudaStream)); + + // configure logging to stdout + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL)); + } +} #endif #undef MIN @@ -3836,6 +3872,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } + // initialize cuBLAS + #if defined(GGML_USE_CUBLAS) + init_cublas(); + #endif + is_first_call = false; } @@ -7567,7 +7608,7 @@ static void ggml_compute_forward_rms_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -7607,7 +7648,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -7664,7 +7705,7 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -7678,6 +7719,21 @@ static void ggml_compute_forward_mul_mat_f32( return; } +#if defined(GGML_USE_CUBLAS) + float *d_X = NULL; + float *d_Y = NULL; + float *d_D = NULL; + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne10; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(float) * x_ne)); + CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne)); + CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne)); +#endif + for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); @@ -7685,15 +7741,37 @@ static void ggml_compute_forward_mul_mat_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); +#if defined(GGML_USE_CUBLAS) + // copy data to device + CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, 
cudaMemcpyHostToDevice, cudaStream)); + CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream)); + + // compute + CUBLAS_CHECK( + cublasSgemm(cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, d_X, ne00, + d_Y, ne10, + &beta, d_D, ne01)); + + // copy data to host + CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + CUDA_CHECK(cudaStreamSynchronize(cudaStream)); +#else // zT = y * xT cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); +#endif } } - +#if defined(GGML_USE_CUBLAS) + CUDA_CHECK(cudaFree(d_X)); + CUDA_CHECK(cudaFree(d_Y)); + CUDA_CHECK(cudaFree(d_D)); +#endif //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; @@ -7823,7 +7901,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -7839,10 +7917,37 @@ static void ggml_compute_forward_mul_mat_f16_f32( return; } - float * const wdata = params->wdata; +#if defined(GGML_USE_CUBLAS) + ggml_fp16_t * const wdata = params->wdata; + float *d_X = NULL; + float *d_Y = NULL; + float *d_D = NULL; + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne10; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(ggml_fp16_t) * x_ne)); + CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne)); + CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne)); +#else + float * const wdata = params->wdata; +#endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { +#if defined(GGML_USE_CUBLAS) + // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16 + { + size_t id = 0; + for (int64_t i01 = 0; i01 < ne11; ++i01) { + for (int64_t i00 = 0; i00 < ne10; ++i00) { + wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)); + } + } + } +#else { size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { @@ -7851,7 +7956,32 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } } +#endif +#if defined(GGML_USE_CUBLAS) + const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03); + const ggml_fp16_t * y = (ggml_fp16_t *) wdata; + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // copy data to device + CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, cudaStream)); + CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, cudaStream)); + + // compute + CUBLAS_CHECK( + cublasGemmEx(cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, d_X, CUDA_R_16F, ne00, + d_Y, CUDA_R_16F, ne10, + &beta, d_D, CUDA_R_32F, ne01, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT)); + + // copy data to host + CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + CUDA_CHECK(cudaStreamSynchronize(cudaStream)); +#else const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -7863,9 +7993,15 @@ static void ggml_compute_forward_mul_mat_f16_f32( 1.0f, 
y, ne10, x, ne00, 0.0f, d, ne01); +#endif } } +#if defined(GGML_USE_CUBLAS) + CUDA_CHECK(cudaFree(d_X)); + CUDA_CHECK(cudaFree(d_Y)); + CUDA_CHECK(cudaFree(d_D)); +#endif /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; @@ -8017,7 +8153,7 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -8034,6 +8170,21 @@ static void ggml_compute_forward_mul_mat_q_f32( float * const wdata = params->wdata; dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; +#if defined(GGML_USE_CUBLAS) + float *d_X = NULL; + float *d_Y = NULL; + float *d_D = NULL; + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne10; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(float) * x_ne)); + CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne)); + CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne)); +#endif + for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { { @@ -8049,15 +8200,38 @@ static void ggml_compute_forward_mul_mat_q_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); +#if defined(GGML_USE_CUBLAS) + // copy data to device + CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, cudaStream)); + CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream)); + + // compute + CUBLAS_CHECK( + cublasSgemm(cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, d_X, ne00, + d_Y, ne10, + &beta, d_D, ne01)); + + // copy data to host + CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + CUDA_CHECK(cudaStreamSynchronize(cudaStream)); +#else // zT = y * xT cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); +#endif } } +#if defined(GGML_USE_CUBLAS) + CUDA_CHECK(cudaFree(d_X)); + CUDA_CHECK(cudaFree(d_Y)); + CUDA_CHECK(cudaFree(d_D)); +#endif //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; @@ -10874,7 +11048,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning @@ -10891,7 +11065,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; } else if (quantize_fns[node->src0->type].vec_dot_q && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 
1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); @@ -12231,7 +12405,15 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_cublas(void) { +#if defined(GGML_USE_CUBLAS) return 1; #else return 0; diff --git a/ggml.h b/ggml.h index 603be84531b145..570147fc246587 100644 --- a/ggml.h +++ b/ggml.h @@ -825,6 +825,7 @@ int ggml_cpu_has_f16c(void); int ggml_cpu_has_fp16_va(void); int ggml_cpu_has_wasm_simd(void); int ggml_cpu_has_blas(void); +int ggml_cpu_has_cublas(void); int ggml_cpu_has_sse3(void); int ggml_cpu_has_vsx(void); diff --git a/llama.cpp b/llama.cpp index f14324fcd1eaba..3ff5dc1e148009 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1069,7 +1069,7 @@ static bool llama_eval_internal( // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads; + gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, tokens, N*ggml_element_size(embd)); From f3d4edf504c19ee9d1381c5727fe38667205d979 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Wed, 19 Apr 2023 16:06:37 +0000 Subject: [PATCH 072/773] ggml : Q4 cleanup - remove 4-bit dot product code (#1061) * Q4 cleanup * Remove unused AVX512 Q4_0 code --- CMakeLists.txt | 1 - Makefile | 2 +- ggml.c | 541 +------------------------------------------------ 3 files changed, 9 insertions(+), 535 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8eadea4fd48627..d7aa051da4ac64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,7 +174,6 @@ if (LLAMA_ALL_WARNINGS) -Wshadow -Wstrict-prototypes -Wpointer-arith - -Wno-unused-function ) set(cxx_flags -Wall diff --git a/Makefile b/Makefile index deb0d00090f5af..d9a2d836babeed 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ CXXFLAGS = -I. 
-I./examples -O3 -DNDEBUG -std=c++11 -fPIC LDFLAGS = # warnings -CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function +CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar # OS specific diff --git a/ggml.c b/ggml.c index 13c1548fee895a..7728794743c711 100644 --- a/ggml.c +++ b/ggml.c @@ -1562,7 +1562,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .quantize_row_q_dot = quantize_row_q8_0, .vec_dot_q = ggml_vec_dot_q4_2_q8_0, }, - // TODO: GGML_TYPE_Q8_0 + [GGML_TYPE_Q8_0] = { + .dequantize_row_q = NULL, // TODO + .quantize_row_q = quantize_row_q8_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = NULL, // TODO + }, }; // For internal test use @@ -2128,191 +2134,6 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } -#if __AVX512F__ && QK4_0 == 32 -static inline __m512i bytes_from_q4_0_twoblocks_avx512( const __m512i blocks ) { - // The 64 bytes of `blocks` contain two consecutive Q4_0 blocks loaded from memory: - // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32| - // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - // | :. =_ () [] <> () Zz Yy| - // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| - // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - // |Xx Ww Vv Uu Tt Ss Rr Qq Pp Oo Nn Mm Ll Kk Jj Ii Hh Gg Ff Ee Dd Cc Bb Aa | - // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ - // - // Bytes 04..19 (block #0) and 24..39 (block #1) both contain 32 nibbles (4-bit unsigned integers). - // We have exactly 64 nibbles, so we want to place each nibble into a separate byte. - // Bytes 00..03 and 20..23 contain scales, which are irrelevant to this function. - // Bytes 40..63 are masked when loading the data, so they are zeroed out. -#ifdef __AVX512VBMI__ - const __m512i byte_perm = _mm512_set_epi8( - 39, 38, 39, 38, 37, 36, 37, 36, 35, 34, 35, 34, 33, 32, 33, 32, - 31, 30, 31, 30, 29, 28, 29, 28, 27, 26, 27, 26, 25, 24, 25, 24, - 19, 18, 19, 18, 17, 16, 17, 16, 15, 14, 15, 14, 13, 12, 13, 12, - 11, 10, 11, 10, 9, 8, 9, 8, 7, 6, 7, 6, 5, 4, 5, 4 - ); - const __m512i permuted = _mm512_permutexvar_epi8( byte_perm, blocks ); - // After applying VPERMB, `permuted` looks like this: - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // |:. =_ :. 
=_ () [] () [] <> () <> () Zz Yy Zz Yy Xx Ww Xx Ww Vv Uu Vv Uu Tt Ss Tt Ss Rr Qq Rr Qq| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // |Pp Oo Pp Oo Nn Mm Nn Mm Ll Kk Ll Kk Jj Ii Jj Ii Hh Gg Hh Gg Ff Ee Ff Ee Dd Cc Dd Cc Bb Aa Bb Aa| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ -#else - const __m512i word_perm = _mm512_set_epi16( - 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, - 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2 - ); - const __m512i permuted = _mm512_permutexvar_epi16( word_perm, blocks ); - // This is the fallback path for CPUs that don't support VPERMB. Since we permute 16-bit groups only, - // VPERMB can be replaced with VPERMW. We could always use VPERMW, but at least on Tiger Lake and - // Ice Lake VPERMW followed by a right shift is quite noticeably slower than VPERMB. -#endif - - // Shift every odd-numbered 16-bit group to the right by 4 bits. - const __mmask32 shift_mask = 0xaaaaaaaa; - const __m512i shifted = _mm512_mask_srai_epi16( permuted, shift_mask, permuted, 4 ); - // After applying VPSRAW, `shifted` looks like this (the "empty" nibbles are filled with zeroes): - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // | : .= :. =_ ( )[ () [] < >( <> () Z zY Zz Yy X xW Xx Ww V vU Vv Uu T tS Tt Ss R rQ Rr Qq - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // | P pO Pp Oo N nM Nn Mm L lK Ll Kk J jI Jj Ii H hG Hh Gg F fE Ff Ee D dC Dd Cc B bA Bb Aa| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - - // Now we just need to zero out the higher nibble in each byte, and we're done. - const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf ); - return _mm512_and_si512( low_nibble_mask, shifted ); - // The final result looks like this: - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // | : = . 
_ ( [ ) ] < ( > ) Z Y z y X W x w V U v u T S t s R Q r q| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ - // | P O p o N M n m L K l k J I j i H G h g F E f e D C d c B A b a| - // +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+ -} - -static inline __m512 dot_q4_0_twoblocks_avx512( - __m512 acc, - const block_q4_0 * restrict x, - const block_q4_0 * restrict y, - int i -) { - // A pair of Q4_0 blocks spans 40 bytes, while an AVX-512 register has 64. The remaining 24 bytes - // can potentially be unaddressable, so we make sure to mask them out before the load, even though - // we don't use them at all. This might hurt the performance slightly, since the compiler is forced - // to use e.g. `VMOVDQU64 REG, MASK, [ADDR] + VPERMB ..., REG` instead of just `VPERMB ..., [ADDR]`. - const __mmask8 load_mask = 0x1f; - const __m512i blocks_0 = _mm512_maskz_loadu_epi64( load_mask, &x[i] ); - const __m512i blocks_1 = _mm512_maskz_loadu_epi64( load_mask, &y[i] ); - - // We want to multiply the scales, so we interpret both registers as 16 32-bit floats: - // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - // | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 | - // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - // blocks_0_float - // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - // | | | | | | | xx | xx | xx | xx | B | xx | xx | xx | xx | A | - // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - // blocks_1_float - // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - // | | | | | | | xx | xx | xx | xx | D | xx | xx | xx | xx | C | - // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - const __m512 blocks_0_float = _mm512_castsi512_ps( blocks_0 ); - const __m512 blocks_1_float = _mm512_castsi512_ps( blocks_1 ); - // We absolutely shouldn't touch the floats marked with `xx`: they contain some - // random data, which might very well underflow. At least on Intel, this leads - // to a huge penalty that can't be ignored (easily 100x or more) unless you - // compile your code with something like `-ffast-math` to enable FTZ/DAZ flags. - // (and ggml can't assume that you do)... - const __mmask16 scale_mul_mask = 0x21; -#ifdef __clang__ - // ...however, clang decides to optimize the multiplication mask away: - // https://godbolt.org/z/P8PqdsfvW - // gcc and MSVC do the sane thing. This horrible workaround forces clang to emit the mask. 
- __m512i scales; - __asm__( - "vmulps %1, %2, %0%{%3%}" - : "=v" ( scales ) - : "vm" ( blocks_0_float ), "v" ( blocks_1_float ), "Yk" ( scale_mul_mask ) - ); -#else - const __m512 scales = _mm512_maskz_mul_ps( scale_mul_mask, blocks_0_float, blocks_1_float ); -#endif - const __m512i scale_perm = _mm512_set_epi32( - 5, 5, 5, 5, 5, 5, 5, 5, - 0, 0, 0, 0, 0, 0, 0, 0 - ); - const __m512 permuted_scales = _mm512_permutexvar_ps( scale_perm, scales ); - // After VMULPS and VPERMPS, `permuted_scales` looks like this: - // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - // | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 | - // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - // | B*D| B*D| B*D| B*D| B*D| B*D| B*D| B*D| A*C| A*C| A*C| A*C| A*C| A*C| A*C| A*C| - // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ - - const __m512i bytes_0 = bytes_from_q4_0_twoblocks_avx512( blocks_0 ); - const __m512i bytes_1 = bytes_from_q4_0_twoblocks_avx512( blocks_1 ); - - // Now we want to compute dot products of 4-element byte vectors and store them in - // 32-bit integers. That is (only one 4-element vector is shown for clarity): - // +----+----+----+----+ - // ... | 03 | 02 | 01 | 00 | - // +----+----+----+----+ - // bytes_0 - // +----+----+----+----+ - // ... | D | C | B | A | - // +----+----+----+----+ - // bytes_1 - // +----+----+----+----+ - // ... | H | G | F | E | - // +----+----+----+----+ - // final_res_int - // +----+----+----+----+ - // ... | A*E+B*F+C*G+D*H | - // +----+----+----+----+ - const __m512i plus_8 = _mm512_set1_epi8( 8 ); - const __m512i bytes_1_minus_8 = _mm512_sub_epi8( bytes_1, plus_8 ); - -#ifdef __AVX512VNNI__ - // We have VPDPBUSDS in AVX512-VNNI, which does exactly what we want, but with a catch: - // the *left* operand is supposed to be unsigned, while Q4_0 quantization subtracts 8 - // from each nibble, so they can be negative. So, instead of `(bytes_0 - 8) * (bytes_1 - 8)`, - // we compute `bytes_0 * (bytes_1 - 8) + bytes_1 * (-8) + 64`. VPDPBUSDS uses an accumulator, - // which means we only need 2 instructions. - const __m512i dot_init = _mm512_set1_epi32( 4 * 64 ); - const __m512i minus_8 = _mm512_set1_epi8( -8 ); - const __m512i prod_0 = _mm512_dpbusds_epi32( dot_init, bytes_1, minus_8 ); - const __m512i final_res_int = _mm512_dpbusds_epi32( prod_0, bytes_0, bytes_1_minus_8 ); -#else - // As a fallback, we have VPMADDUBSW in AVX512-BW, which uses 16-bit products instead of 32-bit ones. - // It has the same catch as VPDPBUSDS: the left operand should be unsigned. - // This is essentially the AVX-512 version of the AVX-2 trick used by GH user Const-me - // ref: https://gist.github.com/Const-me/4d30e1fc767ab314596e16e90f53b6f4#file-matmultest-cpp-L119 - const __m512i one = _mm512_set1_epi16( 1 ); - const __m512i prod_0 = _mm512_maddubs_epi16( bytes_0, bytes_1_minus_8 ); - const __m512i prod_1 = _mm512_maddubs_epi16( plus_8, bytes_1_minus_8 ); - const __m512i diff = _mm512_sub_epi16( prod_0, prod_1 ); - const __m512i final_res_int = _mm512_madd_epi16( diff, one ); -#endif - - // Finally, we multiply the permuted scales and the 32-bit dot products, then accumulate. 
- const __m512 final_res_float = _mm512_cvtepi32_ps( final_res_int ); - return _mm512_fmadd_ps( permuted_scales, final_res_float, acc ); -} -#endif - inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { ggml_float sumf = 0.0; @@ -2349,352 +2170,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t *s = sumf; } -static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK4_0; - - assert(n % QK4_0 == 0); - assert(nb % 2 == 0); - - const block_q4_0 * restrict x = vx; - const block_q4_0 * restrict y = vy; - - float sumf = 0.0; - -#if defined(__ARM_NEON) - float sum0 = 0.0f; - float sum1 = 0.0f; - - for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict y0 = &y[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q4_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0xf); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v1_0 = vld1q_u8(y0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - const uint8x16_t v1_1 = vld1q_u8(y1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); - const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4)); - - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b)); - const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4)); - - // sub 8 - const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); - - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls); - int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls); - - p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs); - p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs); - - sum0 += x0->d*y0->d*vaddvq_s32(p_0); - sum1 += x1->d*y1->d*vaddvq_s32(p_1); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); - - const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h); - const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h); - - const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h); - const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h); - - const int16x8_t p_0 = vaddq_s16(pl_0, ph_0); - const int16x8_t p_1 = vaddq_s16(pl_1, ph_1); - - sum0 += 
x0->d*y0->d*vaddvq_s16(p_0); - sum1 += x1->d*y1->d*vaddvq_s16(p_1); -#endif - } - - sumf = sum0 + sum1; -#elif defined(__AVX512F__) - // Initialize accumulator with zeros - __m512 acc0 = _mm512_setzero_ps(); - __m512 acc1 = _mm512_setzero_ps(); - - const int superblock_size = 16; - - const int superblock_count = nb / superblock_size; - - for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) { - int i = superblock_ix * superblock_size; - - acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+0 ); - acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+2 ); - acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+4 ); - acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+6 ); - acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+8 ); - acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+10 ); - acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+12 ); - acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+14 ); - } - - // Remainders - for (int i = superblock_count * superblock_size; i < nb; i += 2) { - acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i ); - } - - // Horizontal sum of all lanes of the accumulator - sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 ); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - /* Prepare the constants we will need during execution */ - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - const __m256i offset_8 = _mm256_set1_epi16( 8 ); - -#define UNROLL_COUNT 8 - // make sure we only unroll multiples of the block count - assert(nb % UNROLL_COUNT == 0); - - // Main loop - for (int i = 0; i < nb; i+=UNROLL_COUNT) { - // This loop will be unrolled by the compiler - for (int u=0;u we now have a vector of 8 int_32t */ - __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); - - /* Convert to vectore of 8 int32_t to 8 floats */ - __m256 q = _mm256_cvtepi32_ps( xy_q ); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( scale, q, acc ); - } - } - - // Return horizontal sum of the acc vector - __m128 res = _mm256_extractf128_ps( acc, 1 ); - res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); - res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); - res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); - - sumf = _mm_cvtss_f32( res ); -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - - __m128i i32[2]; - for (int j = 0; j < 2; ++j) { - // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes - __m128i bx = bytesFromNibbles( x[i].qs + 8*j ); - __m128i by = bytesFromNibbles( y[i].qs + 8*j ); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
- const __m128i off = _mm_set1_epi8( 8 ); - bx = _mm_sub_epi8( bx, off ); - by = _mm_sub_epi8( by, off ); - - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(bx, bx); - - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(by, bx); - - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - - const __m128i ones = _mm_set1_epi16(1); - i32[j] = _mm_madd_epi16(ones, dot); - } - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); - // Apply the scale, and accumulate - acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); - } - - // Return horizontal sum of the acc vector - __m128 res = _mm256_extractf128_ps( acc, 1 ); - res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); - res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); - res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); - - sumf = _mm_cvtss_f32( res ); -#elif defined(__wasm_simd128__) - // wasm simd - float sum0 = 0.0f; - float sum1 = 0.0f; - - for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict y0 = &y[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q4_0 * restrict y1 = &y[i + 1]; - - const v128_t m4b = wasm_u8x16_splat(0xf); - const v128_t s8b = wasm_i8x16_splat(0x8); - - const v128_t v0_0 = wasm_v128_load(x0->qs); - const v128_t v0_1 = wasm_v128_load(y0->qs); - const v128_t v1_0 = wasm_v128_load(x1->qs); - const v128_t v1_1 = wasm_v128_load(y1->qs); - - // 4-bit -> 8-bit - const v128_t v0_0l = wasm_v128_and(v0_0, m4b); - const v128_t v1_0l = wasm_v128_and(v1_0, m4b); - - const v128_t v0_0h = wasm_u8x16_shr(v0_0, 4); - const v128_t v1_0h = wasm_u8x16_shr(v1_0, 4); - - const v128_t v0_1l = wasm_v128_and(v0_1, m4b); - const v128_t v1_1l = wasm_v128_and(v1_1, m4b); - - const v128_t v0_1h = wasm_u8x16_shr(v0_1, 4); - const v128_t v1_1h = wasm_u8x16_shr(v1_1, 4); - - // sub 8 - const v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b); - const v128_t v1_0ls = wasm_i8x16_sub(v1_0l, s8b); - - const v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b); - const v128_t v1_0hs = wasm_i8x16_sub(v1_0h, s8b); - - const v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b); - const v128_t v1_1ls = wasm_i8x16_sub(v1_1l, s8b); - - const v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b); - const v128_t v1_1hs = wasm_i8x16_sub(v1_1h, s8b); - - // dot product into int16x8_t - const v128_t pl0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0ls), wasm_i16x8_extend_low_i8x16(v1_0ls)); - const v128_t pl0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0ls), wasm_i16x8_extend_high_i8x16(v1_0ls)); - - const v128_t ph0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0hs), wasm_i16x8_extend_low_i8x16(v1_0hs)); - const v128_t ph0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0hs), wasm_i16x8_extend_high_i8x16(v1_0hs)); - - const v128_t pl1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1ls), wasm_i16x8_extend_low_i8x16(v1_1ls)); - const v128_t pl1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1ls), wasm_i16x8_extend_high_i8x16(v1_1ls)); - - const v128_t ph1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1hs), wasm_i16x8_extend_low_i8x16(v1_1hs)); - const v128_t ph1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1hs), wasm_i16x8_extend_high_i8x16(v1_1hs)); - - const v128_t pl_0 = wasm_i16x8_add(pl0l, pl0h); - const v128_t ph_0 = wasm_i16x8_add(ph0l, ph0h); - - const v128_t pl_1 = wasm_i16x8_add(pl1l, pl1h); - const v128_t ph_1 = wasm_i16x8_add(ph1l, ph1h); - - const v128_t 
p_0 = wasm_i16x8_add(pl_0, ph_0); - const v128_t p_1 = wasm_i16x8_add(pl_1, ph_1); - - sum0 += x0->d * y0->d * ( - wasm_i16x8_extract_lane(p_0, 0) + wasm_i16x8_extract_lane(p_0, 1) + - wasm_i16x8_extract_lane(p_0, 2) + wasm_i16x8_extract_lane(p_0, 3) + - wasm_i16x8_extract_lane(p_0, 4) + wasm_i16x8_extract_lane(p_0, 5) + - wasm_i16x8_extract_lane(p_0, 6) + wasm_i16x8_extract_lane(p_0, 7)); - sum1 += x1->d * y1->d * ( - wasm_i16x8_extract_lane(p_1, 0) + wasm_i16x8_extract_lane(p_1, 1) + - wasm_i16x8_extract_lane(p_1, 2) + wasm_i16x8_extract_lane(p_1, 3) + - wasm_i16x8_extract_lane(p_1, 4) + wasm_i16x8_extract_lane(p_1, 5) + - wasm_i16x8_extract_lane(p_1, 6) + wasm_i16x8_extract_lane(p_1, 7)); - } - - sumf = sum0 + sum1; -#else - // scalar - for (int i = 0; i < nb; i++) { - const float d0 = x[i].d; - const float d1 = y[i].d; - - const uint8_t * restrict p0 = x[i].qs; - const uint8_t * restrict p1 = y[i].qs; - - int sumi = 0; - for (int j = 0; j < QK4_0/2; j++) { - const uint8_t v0 = p0[j]; - const uint8_t v1 = p1[j]; - - const int i0 = (v0 & 0xf) - 8; - const int i1 = (v0 >> 4) - 8; - - const int i2 = (v1 & 0xf) - 8; - const int i3 = (v1 >> 4) - 8; - - sumi += i0*i2 + i1*i3; - } - sumf += d0 * d1 * sumi; - } -#endif - - *s = sumf; -} - static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK4_1; @@ -11064,7 +10539,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; - } else if (quantize_fns[node->src0->type].vec_dot_q && node->src1->type == GGML_TYPE_F32) { + } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; From 7cd5c4a3e9106151d48f328bb3c94c298a211f18 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Apr 2023 19:07:54 +0300 Subject: [PATCH 073/773] readme : add warning about Q4_2 and Q4_3 --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index c6f24d032307d6..8e1945cff0c32d 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ +**Warnings** + +- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalize + **Hot topics:** - [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820) From 884e7d7a2bfd7325b107442d6758983f5886ed3d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Apr 2023 20:10:08 +0300 Subject: [PATCH 074/773] ggml : use 8-bit precision for Q4_1 intermediate results (#1047) * ggml : use 8-bit precision for Q4_1 intermediate results (ARM) * ggml : optimize ggml_vec_dot_q4_1_q8_0() via vmalq_n_f32 56 ms/token with Q4_1 ! 
* ggml : AVX2 implementation of ggml_vec_dot_q4_1_q8_0 (#1051) * gitignore : ignore ppl-*.txt files --------- Co-authored-by: slaren <2141330+slaren@users.noreply.github.com> --- .gitignore | 15 ++- ggml.c | 371 ++++++++++++++++++++++++++--------------------------- 2 files changed, 192 insertions(+), 194 deletions(-) diff --git a/.gitignore b/.gitignore index 631f2360ba1c31..e52d479eeafa8e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,15 @@ *.o *.a +.DS_Store +.build/ .cache/ +.direnv/ +.envrc +.swiftpm +.venv .vs/ .vscode/ -.DS_Store -.build/ build/ build-em/ build-debug/ @@ -30,12 +34,9 @@ models/* arm_neon.h compile_commands.json -.envrc -.direnv/ - -.venv __pycache__ -.swiftpm zig-out/ zig-cache/ + +ppl-*.txt diff --git a/ggml.c b/ggml.c index 7728794743c711..3b38eaad367361 100644 --- a/ggml.c +++ b/ggml.c @@ -550,6 +550,18 @@ inline static uint16_t vaddvq_u8(uint8x16_t v) { (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); } +inline static int16_t vaddvq_s8(int8x16_t v) { + return + (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) + + (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) + + (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) + + (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) + + (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) + + (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) + + (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) + + (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15); +} + inline static int32_t vaddvq_s16(int16x8_t v) { return (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + @@ -1535,9 +1547,8 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in } } -static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -//static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { @@ -1552,8 +1563,8 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .dequantize_row_q = dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, - .quantize_row_q_dot = quantize_row_q4_1, - .vec_dot_q = ggml_vec_dot_q4_1, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q4_1_q8_0, }, [GGML_TYPE_Q4_2] = { .dequantize_row_q = dequantize_row_q4_2, @@ -2170,189 +2181,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t *s = sumf; } -static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK4_1; - - const block_q4_1 * restrict x = vx; - const block_q4_1 * restrict y = vy; - - float sumf = 0.0; - -#if defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - // Accumulator for constant offsets - float acc_offset = 0.0f; - - // Main loop - for (int i = 0; i < nb; ++i) { - const float * d0 = &x[i].d; - const float * d1 = &y[i].d; - - const float 
* m0 = &x[i].m; - const float * m1 = &y[i].m; - - const __m256 d0v = _mm256_broadcast_ss( d0 ); - const __m256 d1v = _mm256_broadcast_ss( d1 ); - const __m256 m0v = _mm256_broadcast_ss( m0 ); - const __m256 m1v = _mm256_broadcast_ss( m1 ); - - // Compute combined scale for the block - const __m256 scale_01 = _mm256_mul_ps( d0v, d1v ); - - // Compute cross scales for the block - const __m256 scale_0 = _mm256_mul_ps( d0v, m1v ); - const __m256 scale_1 = _mm256_mul_ps( m0v, d1v ); - const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0xAA /* 0b10101010 */ ); - - // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - __m256i bx = bytesFromNibbles( x[i].qs ); - __m256i by = bytesFromNibbles( y[i].qs ); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. - - // Sign-extend first 16 signed bytes into int16_t - __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) ); - __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); - // Compute products of int16_t integers, add pairwise - __m256i i32 = _mm256_madd_epi16( x16, y16 ); - - // Sign-extend last 16 signed bytes into int16_t vectors - __m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) ); - __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); - // Accumulate products of int16_t integers - i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) ); - - // compute sums of unsigned bytes in bx, by in blocks of 8. - // This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000, - // which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400. - // so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ] - __m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() ); - __m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() ); - __m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) ); - __m256 sums = _mm256_cvtepi32_ps( sumsi ); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps( i32 ); - // Apply the scale, and accumulate - // acc += d0*d1*x*y + d0*m1*x + d1*m0*y - acc = _mm256_fmadd_ps( scale_01, p, acc ); - acc = _mm256_fmadd_ps( cross_scales, sums, acc ); - // acc_offset += m0*m1 (for each entry in the block) - acc_offset += (*m0)*(*m1); - } - - // Return horizontal sum of the acc vector - __m128 res = _mm256_extractf128_ps( acc, 1 ); - res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); - res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); - res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); - - sumf = _mm_cvtss_f32( res ) + acc_offset * QK4_1; -#elif defined(__ARM_NEON) - float sum00 = 0.0f; - float sum01 = 0.0f; - float sum10 = 0.0f; - float sum11 = 0.0f; - - for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict y0 = &y[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q4_1 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0xf); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v1_0 = vld1q_u8(y0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - const uint8x16_t v1_1 = vld1q_u8(y1->qs); - - // 4-bit -> 8-bit - const uint8x16_t v0_0l = vandq_u8(v0_0, m4b); - const uint8x16_t v1_0l = vandq_u8(v1_0, m4b); - const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4); - const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4); - - const uint8x16_t v0_1l = vandq_u8(v0_1, m4b); - const uint8x16_t v1_1l = vandq_u8(v1_1, m4b); - const uint8x16_t v0_1h = 
vshrq_n_u8(v0_1, 4); - const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4); - - sum00 += x0->m*y0->m; - sum01 += y0->m*x0->d*((uint16_t)vaddvq_u8(v0_0l) + (uint16_t)vaddvq_u8(v0_0h)); - sum10 += x0->m*y0->d*((uint16_t)vaddvq_u8(v1_0l) + (uint16_t)vaddvq_u8(v1_0h)); - - sum00 += x1->m*y1->m; - sum01 += y1->m*x1->d*((uint16_t)vaddvq_u8(v0_1l) + (uint16_t)vaddvq_u8(v0_1h)); - sum10 += x1->m*y1->d*((uint16_t)vaddvq_u8(v1_1l) + (uint16_t)vaddvq_u8(v1_1h)); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - uint32x4_t p_0 = vdotq_u32(vdupq_n_u32(0), v0_0l, v1_0l); - uint32x4_t p_1 = vdotq_u32(vdupq_n_u32(0), v0_1l, v1_1l); - - p_0 = vdotq_u32(p_0, v0_0h, v1_0h); - p_1 = vdotq_u32(p_1, v0_1h, v1_1h); - - sum11 += x0->d*y0->d*vaddvq_u32(p_0); - sum11 += x1->d*y1->d*vaddvq_u32(p_1); -#else - const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l)); - const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); - const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h)); - const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h)); - - const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l)); - const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l)); - const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h)); - const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h)); - - const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h); - const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h); - - const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h); - const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h); - - const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0); - const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1); - - sum11 += x0->d*y0->d*vaddvq_u16(p_0); - sum11 += x1->d*y1->d*vaddvq_u16(p_1); -#endif - } - - sumf = QK4_1*sum00 + sum01 + sum10 + sum11; -#else - // scalar - for (int i = 0; i < nb; i++) { - const float d0 = x[i].d; - const float d1 = y[i].d; - - const float m0 = x[i].m; - const float m1 = y[i].m; - - const uint8_t * restrict p0 = x[i].qs; - const uint8_t * restrict p1 = y[i].qs; - - for (int j = 0; j < QK4_1/2; j++) { - const uint8_t v0 = p0[j]; - const uint8_t v1 = p1[j]; - - const float f0 = d0*(v0 & 0xf) + m0; - const float f1 = d0*(v0 >> 4) + m0; - - const float f2 = d1*(v1 & 0xf) + m1; - const float f3 = d1*(v1 >> 4) + m1; - - sumf += f0*f2 + f1*f3; - } - } -#endif - - *s = sumf; -} - static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_0; @@ -2549,6 +2377,175 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * *s = sumf; } +static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / QK8_0; + + assert(n % QK8_0 == 0); + assert(nb % 2 == 0); + + const block_q4_1 * restrict x = vx; + const block_q8_0 * restrict y = vy; + + float sumf = 0.0; + + // TODO: add AVX / WASM SIMD / etc +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i += 2) { + const block_q4_1 * restrict x0 = &x[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = 
vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + // interleave + const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h); + const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h); + const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h); + const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h); + + const int16x8_t s0i = vaddq_s16( + vaddq_s16(vmovl_s8(vget_low_s8(v1_0ls)), vmovl_s8(vget_high_s8(v1_0ls))), + vaddq_s16(vmovl_s8(vget_low_s8(v1_0hs)), vmovl_s8(vget_high_s8(v1_0hs)))); + + const int16x8_t s1i = vaddq_s16( + vaddq_s16(vmovl_s8(vget_low_s8(v1_1ls)), vmovl_s8(vget_high_s8(v1_1ls))), + vaddq_s16(vmovl_s8(vget_low_s8(v1_1hs)), vmovl_s8(vget_high_s8(v1_1hs)))); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddl_s16(vget_low_s16(s0i), vget_high_s16(s0i))), x0->m*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddl_s16(vget_low_s16(s1i), vget_high_s16(s1i))), x1->m*y1->d); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0ls), v0_0h, v1_0hs); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1ls), v0_1h, v1_1hs); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0ls)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0ls)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0hs)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0hs)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1ls)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1ls)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1hs)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1hs)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); +#endif + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + const float * d0 = &x[i].d; + const float * d1 = &y[i].d; + const float * m0 = &x[i].m; + + const __m256 d0v = _mm256_broadcast_ss( d0 ); + const __m256 d1v = _mm256_broadcast_ss( d1 ); + const __m256 m0v = _mm256_broadcast_ss( m0 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + const __m256 d1m0 = _mm256_mul_ps( d1v, m0v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i bx = bytesFromNibbles( x[i].qs ); + const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); + + // Get absolute values of x vectors + 
const __m256i ax = _mm256_sign_epi8( bx, bx ); + + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8( by, bx ); + + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16( ax, sy ); + const __m256i ones = _mm256_set1_epi16( 1 ); + const __m256i xy_q = _mm256_madd_epi16( ones, dot ); + + // Convert to vector of 8 int32_t to 8 floats + const __m256 xy = _mm256_cvtepi32_ps( xy_q ); + + // Accumulate d0*d1*x*y + acc = _mm256_fmadd_ps( d0d1, xy, acc ); + + // Compute sum of y values + const __m256i y16_l = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); + const __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); + const __m256i ysumi = _mm256_madd_epi16( _mm256_add_epi16(y16_l, y16_h), ones ); + const __m256 ysum = _mm256_cvtepi32_ps( ysumi ); + + // Accumulate d1*m0*y + acc = _mm256_fmadd_ps( d1m0, ysum, acc ); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm256_extractf128_ps( acc, 1 ); + res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); + res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); + res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); + + sumf = _mm_cvtss_f32( res ); +#else + // scalar + for (int i = 0; i < nb; i++) { + const float d0 = x[i].d; + const float m0 = x[i].m; + const float d1 = y[i].d; + + const uint8_t * restrict p0 = x[i].qs; + const int8_t * restrict p1 = y[i].qs; + + // TODO: this is very slow .. + for (int j = 0; j < QK8_0/2; j++) { + const uint8_t v0 = p0[j]; + + const float f0 = d0*(v0 & 0xf) + m0; + const float f1 = d0*(v0 >> 4) + m0; + + const float f2 = d1*p1[2*j + 0]; + const float f3 = d1*p1[2*j + 1]; + + sumf += f0*f2 + f1*f3; + } + } +#endif + + *s = sumf; +} + static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_0; From f7d05095b404b5500b4a702ea16f67fc22446e49 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Wed, 19 Apr 2023 20:20:14 +0200 Subject: [PATCH 075/773] Q4_2 quantization with rmse-optimized scale and quants (#1062) * Q4_2 quantization with rmse-optimized scale and quants For quantize-stats we get q4_2: rmse 0.00159301, maxerr 0.17480469, 95pct<0.0030, median<0.0012 For 7B perplexity with BLAS enabled we get 6.2038 after 655 chunks. Quantization is slow (~90 seconds on my Mac for 7B) as not multi-threaded as in PR #896. 
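The "rmse-optimized scale" reduces to an ordinary least-squares fit: for a fixed choice of 4-bit levels L[i], the scale d that minimizes sum_i (x[i] - d*L[i])^2 is d = sum(x[i]*L[i]) / sum(L[i]^2), which is why the candidate search in kquantize_q4_with_bounds below tracks sumlx and suml2 and keeps the scale with the largest sumlx^2/suml2. A minimal C sketch of that closed form (the helper name is illustrative, not code from this patch):

```c
#include <stddef.h>
#include <stdint.h>

// Least-squares scale for a fixed quant assignment: given weights x[0..n) and
// integer levels L[0..n) in [-8, 7], the d that minimizes
// sum_i (x[i] - d*L[i])^2 is sum(x*L) / sum(L*L).
// Illustrative sketch only, not part of the patch.
static float q4_best_scale_for_levels(size_t n, const float * x, const int8_t * L) {
    double sumlx = 0.0;
    double suml2 = 0.0;
    for (size_t i = 0; i < n; ++i) {
        sumlx += (double) x[i] * L[i];
        suml2 += (double) L[i] * L[i];
    }
    return suml2 > 0.0 ? (float) (sumlx / suml2) : 0.0f;
}
```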
* ggml : satisfy the sanitizer builds Not sure why this makes them fail * Better follow ggml conventions for function names * Fixed type as per reviewer comment --------- Co-authored-by: Iwan Kawrakow Co-authored-by: Georgi Gerganov --- ggml.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index 3b38eaad367361..431cdb9c907e4e 100644 --- a/ggml.c +++ b/ggml.c @@ -19,6 +19,7 @@ #include #include #include +#include // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 @@ -1135,12 +1136,94 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r } } +static inline int nearest_int(float fval) { + assert(fval <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates, + const float * restrict candidates, int8_t * restrict L) { + assert (nmin >= INT8_MIN); + assert (nmax <= INT8_MAX); + float amax = 0; + for (int i=0; i sumlxM2*suml2P) { + if (sumlxP2 > best*suml2P) { + best = sumlxP2/suml2P; bestScale = iscale; + } + } else { + if (sumlxM2 > best*suml2M) { + best = sumlxM2/suml2M; bestScale = -iscale; + } + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i Date: Wed, 19 Apr 2023 14:52:14 -0500 Subject: [PATCH 076/773] Minor: Readme fixed grammar, spelling, and misc updates (#1071) --- README.md | 63 ++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 8e1945cff0c32d..324d49f072de47 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Warnings** -- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalize +- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalized **Hot topics:** @@ -19,7 +19,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ## Description -The main goal is to run the model using 4-bit quantization on a MacBook +The main goal of llama.cpp is to run the llama model using 4-bit quantization on a MacBook. - Plain C/C++ implementation without dependencies - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework @@ -156,7 +156,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the step for the LLaMA-7B model. +Here are the steps for the LLaMA-7B model. ### Get the Code @@ -214,8 +214,7 @@ When running the larger models, make sure you have enough disk space to store al ### Memory/Disk Requirements -As the models are currently fully loaded into memory, you will need adequate disk space to save them -and sufficient RAM to load them. At the moment, memory and disk requirements are the same. +As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. | model | original size | quantized size (4-bit) | |-------|---------------|------------------------| @@ -227,18 +226,18 @@ and sufficient RAM to load them. 
At the moment, memory and disk requirements are ### Interactive mode If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter. -In this mode, you can always interrupt generation by pressing Ctrl+C and enter one or more lines of text which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt which makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. +In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. -Here is an example few-shot interaction, invoked with the command +Here is an example of a few-shot interaction, invoked with the command ```bash -# default arguments using 7B model +# default arguments using a 7B model ./examples/chat.sh -# advanced chat with 13B model +# advanced chat with a 13B model ./examples/chat-13B.sh -# custom arguments using 13B model +# custom arguments using a 13B model ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt ``` @@ -277,7 +276,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. ### Using [GPT4All](https://github.com/nomic-ai/gpt4all) - Obtain the `gpt4all-lora-quantized.bin` model -- It is distributed in the old `ggml` format which is now obsoleted +- It is distributed in the old `ggml` format, which is now obsoleted - You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py): @@ -291,7 +290,7 @@ convert the model from the old format to the new format with [./migrate-ggml-202 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data -- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.** +- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.** - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. - Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. 
@@ -303,29 +302,27 @@ convert the model from the old format to the new format with [./migrate-ggml-202 `shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS -- If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: - - LLaMA: - - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) - - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) - - GPT-3 - - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) - - GPT-3.5 / InstructGPT / ChatGPT: - - [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) +- If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: +- LLaMA: +- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) +- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) +- GPT-3 +- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) +- GPT-3.5 / InstructGPT / ChatGPT: +- [Aligning language models to follow instructions](https://openai.com/research/instruction-following) +- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) -### Perplexity (Measuring model quality) +### Perplexity (measuring model quality) -You can use the `perplexity` example to measure perplexity over the given prompt. For more background, -see https://huggingface.co/docs/transformers/perplexity. However, in general, lower perplexity is better for LLMs. +You can use the `perplexity` example to measure perplexity over the given prompt. For more background, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). However, in general, lower perplexity is better for LLMs. #### Latest measurements -The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well -compared to the baseline implementations. Quantization has a small negative impact to quality, but, as you can see, running +The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well compared to the baseline implementations. Quantization has a small negative impact on quality, but, as you can see, running 13B at q4_0 beats the 7B f16 model by a significant amount. -All measurements are done against wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context). 
-Note that the changing the context length will have a significant impact on perplexity (longer context = better perplexity). +All measurements are done against the wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context). +Note that changing the context length will have a significant impact on perplexity (longer context = better perplexity). ``` Perplexity - model options 5.5985 - 13B, q4_0 @@ -367,7 +364,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0 #### Prerequisites * Docker must be installed and running on your system. -* Create a folder to store big models & intermediate files (in ex. im using /llama/models) +* Create a folder to store big models & intermediate files (ex. /llama/models) #### Images We have two Docker images available for this project: @@ -381,17 +378,17 @@ The easiest way to download the models, convert them to ggml and optimize them i Replace `/path/to/models` below with the actual path where you downloaded the models. - ```bash +```bash docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B ``` -On complete, you are ready to play! +On completion, you are ready to play! ```bash docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` -or with light image: +or with a light image: ```bash docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 @@ -412,7 +409,7 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode - Always consider cross-compatibility with other operating systems and architectures - Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). 
Vertical alignment makes things more readable and easier to batch edit -- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a` +- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions ### Docs From 02d6988121510c067e06d498a273a351a888f5b9 Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Thu, 20 Apr 2023 03:14:14 +0200 Subject: [PATCH 077/773] Improve cuBLAS performance by dequantizing on the GPU (#1065) --- CMakeLists.txt | 31 ++++++++++--- Makefile | 24 +++++----- ggml-cuda.cu | 116 +++++++++++++++++++++++++++++++++++++++++++++++++ ggml-cuda.h | 11 +++++ ggml.c | 80 +++++++++++++++++++++++----------- 5 files changed, 221 insertions(+), 41 deletions(-) create mode 100644 ggml-cuda.cu create mode 100644 ggml-cuda.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d7aa051da4ac64..1f9fdd30f0830d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,6 +110,7 @@ if (APPLE AND LLAMA_ACCELERATE) message(WARNING "Accelerate framework not found") endif() endif() + if (LLAMA_OPENBLAS) if (LLAMA_STATIC) set(BLA_STATIC ON) @@ -150,6 +151,10 @@ if (LLAMA_CUBLAS) if (CUDAToolkit_FOUND) message(STATUS "cuBLAS found") + enable_language(CUDA) + + set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h) + add_compile_definitions(GGML_USE_CUBLAS) if (LLAMA_STATIC) @@ -241,21 +246,26 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") message(STATUS "x86 detected") if (MSVC) if (LLAMA_AVX512) - add_compile_options(/arch:AVX512) + add_compile_options($<$:/arch:AVX512>) + add_compile_options($<$:/arch:AVX512>) # MSVC has no compile-time flags enabling specific # AVX512 extensions, neither it defines the # macros corresponding to the extensions. # Do it manually. if (LLAMA_AVX512_VBMI) - add_compile_definitions(__AVX512VBMI__) + add_compile_definitions($<$:__AVX512VBMI__>) + add_compile_definitions($<$:__AVX512VBMI__>) endif() if (LLAMA_AVX512_VNNI) - add_compile_definitions(__AVX512VNNI__) + add_compile_definitions($<$:__AVX512VNNI__>) + add_compile_definitions($<$:__AVX512VNNI__>) endif() elseif (LLAMA_AVX2) - add_compile_options(/arch:AVX2) + add_compile_options($<$:/arch:AVX2>) + add_compile_options($<$:/arch:AVX2>) elseif (LLAMA_AVX) - add_compile_options(/arch:AVX) + add_compile_options($<$:/arch:AVX>) + add_compile_options($<$:/arch:AVX>) endif() else() if (LLAMA_F16C) @@ -292,7 +302,8 @@ endif() add_library(ggml OBJECT ggml.c - ggml.h) + ggml.h + ${GGML_CUDA_SOURCES}) target_include_directories(ggml PUBLIC .) 
target_compile_features(ggml PUBLIC c_std_11) # don't bump @@ -314,6 +325,14 @@ if (BUILD_SHARED_LIBS) target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) endif() +if (GGML_CUDA_SOURCES) + message(STATUS "GGML CUDA sources found, configuring CUDA architecture") + set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF) + set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") + set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF) +endif() + + # # programs, examples and tests # diff --git a/Makefile b/Makefile index d9a2d836babeed..4bf481aa28d096 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,6 @@ +# Define the default target now so that it is always the first target +default: main quantize quantize-stats perplexity embedding vdot + ifndef UNAME_S UNAME_S := $(shell uname -s) endif @@ -100,6 +103,9 @@ endif ifdef LLAMA_CUBLAS CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64 + OBJS += ggml-cuda.o +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + nvcc -arch=native -c -o $@ $< endif ifdef LLAMA_GPROF CFLAGS += -pg @@ -137,8 +143,6 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize quantize-stats perplexity embedding vdot - # # Build library # @@ -155,35 +159,35 @@ common.o: examples/common.cpp examples/common.h clean: rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult -main: examples/main/main.cpp ggml.o llama.o common.o +main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: examples/quantize/quantize.cpp ggml.o llama.o +quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o +quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o +embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -vdot: pocs/vdot/vdot.cpp ggml.o +vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -libllama.so: llama.o ggml.o +libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) # # Tests # -benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o +benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS) ./benchmark-q4_0-matmult diff --git a/ggml-cuda.cu b/ggml-cuda.cu new file mode 100644 index 00000000000000..7cd116602b9b09 --- /dev/null +++ b/ggml-cuda.cu @@ -0,0 +1,116 @@ +#include +#include +#include "ggml-cuda.h" + +typedef uint16_t ggml_fp16_t; +static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +#define QK4_0 32 +typedef struct { + float d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + float d; // delta + float m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; 
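These block structs pack two 4-bit quants per byte next to their scaling factors. For reference, here is a host-side dequantizer for the q4_0 layout that performs the same arithmetic as the CUDA kernels introduced below; it is a sketch for sanity-checking kernel output on the CPU, not part of the patch, and `block_q4_0_ref` / `dequantize_block_q4_0_ref` are illustrative names.

```cpp
#include <cstdint>

#define QK4_0 32

// Same layout as block_q4_0 above: one float delta followed by 16 bytes
// holding 32 quants, two 4-bit values per byte (low nibble first).
typedef struct {
    float   d;
    uint8_t qs[QK4_0 / 2];
} block_q4_0_ref;

// CPU reference of the q4_0 dequantization: y[l] = (q[l] - 8) * d
static void dequantize_block_q4_0_ref(const block_q4_0_ref * x, float * y) {
    const float d = x->d;
    for (int l = 0; l < QK4_0; l += 2) {
        const uint8_t vi = x->qs[l/2];
        y[l + 0] = ((vi & 0xf) - 8) * d;  // low nibble  -> even element
        y[l + 1] = ((vi >> 4)  - 8) * d;  // high nibble -> odd element
    }
}
```

Running this over a few blocks and comparing against the device output of `dequantize_block_q4_0` is a quick way to spot packing or indexing mistakes.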
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK4_2 16 +typedef struct { + __half d; // delta + uint8_t qs[QK4_2 / 2]; // nibbles / quants +} block_q4_2; +static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); + + +static __global__ void dequantize_block_q4_0(const void * vx, float * y) { + const block_q4_0 * x = (const block_q4_0 *) vx; + + const int i = blockIdx.x; + + const float d = x[i].d; + + const uint8_t * pp = x[i].qs; + + for (int l = 0; l < QK4_0; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = (vi0 - 8)*d; + const float v1 = (vi1 - 8)*d; + + y[i*QK4_0 + l + 0] = v0; + y[i*QK4_0 + l + 1] = v1; + } +} + +static __global__ void dequantize_block_q4_1(const void * vx, float * y) { + const block_q4_1 * x = (const block_q4_1 *) vx; + + const int i = blockIdx.x; + + const float d = x[i].d; + const float m = x[i].m; + + const uint8_t * pp = x[i].qs; + + for (int l = 0; l < QK4_1; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = vi0*d + m; + const float v1 = vi1*d + m; + + y[i*QK4_1 + l + 0] = v0; + y[i*QK4_1 + l + 1] = v1; + } +} + +static __global__ void dequantize_block_q4_2(const void * vx, float * y) { + const block_q4_2 * x = (const block_q4_2 *) vx; + + const int i = blockIdx.x; + + const float d = x[i].d; + + const uint8_t * pp = x[i].qs; + + for (int l = 0; l < QK4_2; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = (vi0 - 8)*d; + const float v1 = (vi1 - 8)*d; + + y[i*QK4_2 + l + 0] = v0; + y[i*QK4_2 + l + 1] = v1; + } +} + +extern "C" { + __host__ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { + const int nb = k / QK4_0; + dequantize_block_q4_0<<>>(vx, y); + } + + __host__ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { + const int nb = k / QK4_1; + dequantize_block_q4_1<<>>(vx, y); + } + + __host__ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { + const int nb = k / QK4_2; + dequantize_block_q4_2<<>>(vx, y); + } +} diff --git a/ggml-cuda.h b/ggml-cuda.h new file mode 100644 index 00000000000000..646caafc6f8a04 --- /dev/null +++ b/ggml-cuda.h @@ -0,0 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif + +void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); +void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream); +void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream); + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 431cdb9c907e4e..9a3430859f7e1b 100644 --- a/ggml.c +++ b/ggml.c @@ -150,23 +150,25 @@ inline static void* ggml_aligned_malloc(size_t size) { #elif defined(GGML_USE_CUBLAS) #include #include -#define CUDA_CHECK(err) \ - do { \ - cudaError_t err_ = (err); \ - if (err_ != cudaSuccess) { \ - printf("CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ - cudaGetErrorString(err_)); \ - exit(1); \ - } \ +#include "ggml-cuda.h" + +#define CUDA_CHECK(err) \ + do { \ + cudaError_t err_ = (err); \ + if (err_ != cudaSuccess) { \ + printf("CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + cudaGetErrorString(err_)); \ + exit(1); \ + } \ } while (0) -#define CUBLAS_CHECK(err) \ - do { \ - 
cublasStatus_t err_ = (err); \ - if (err_ != CUBLAS_STATUS_SUCCESS) { \ - printf("cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ - exit(1); \ - } \ +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + printf("cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(1); \ + } \ } while (0) static cublasHandle_t cublasH = NULL; @@ -177,6 +179,7 @@ static void init_cublas(void) { CUBLAS_CHECK(cublasCreate(&cublasH)); CUDA_CHECK(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking)); + CUBLAS_CHECK(cublasSetStream(cublasH, cudaStream)); // configure logging to stdout @@ -7311,7 +7314,6 @@ static void ggml_compute_forward_mul_mat_f32( // copy data to host CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); - CUDA_CHECK(cudaStreamSynchronize(cudaStream)); #else // zT = y * xT cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, @@ -7323,6 +7325,7 @@ static void ggml_compute_forward_mul_mat_f32( } } #if defined(GGML_USE_CUBLAS) + CUDA_CHECK(cudaStreamSynchronize(cudaStream)); CUDA_CHECK(cudaFree(d_X)); CUDA_CHECK(cudaFree(d_Y)); CUDA_CHECK(cudaFree(d_D)); @@ -7535,7 +7538,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( // copy data to host CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); - CUDA_CHECK(cudaStreamSynchronize(cudaStream)); #else const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -7553,6 +7555,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( } #if defined(GGML_USE_CUBLAS) + CUDA_CHECK(cudaStreamSynchronize(cudaStream)); CUDA_CHECK(cudaFree(d_X)); CUDA_CHECK(cudaFree(d_Y)); CUDA_CHECK(cudaFree(d_D)); @@ -7722,13 +7725,11 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - #if defined(GGML_USE_CUBLAS) float *d_X = NULL; float *d_Y = NULL; float *d_D = NULL; + float *d_Q = NULL; const float alpha = 1.0f; const float beta = 0.0f; const int x_ne = ne01 * ne10; @@ -7738,10 +7739,41 @@ static void ggml_compute_forward_mul_mat_q_f32( CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(float) * x_ne)); CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne)); CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne)); + CUDA_CHECK(cudaMalloc((void **)(&d_Q), GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type])); + + void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL; + if (type == GGML_TYPE_Q4_0) { + dequantize_row_q_cuda = dequantize_row_q4_0_cuda; + } + else if (type == GGML_TYPE_Q4_1) { + dequantize_row_q_cuda = dequantize_row_q4_1_cuda; + } + else if (type == GGML_TYPE_Q4_2) { + dequantize_row_q_cuda = dequantize_row_q4_2_cuda; + } + else { + GGML_ASSERT(false); + } +#else + float * const wdata = params->wdata; + dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; #endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + +#if defined(GGML_USE_CUBLAS) + // copy and dequantize on device + CUDA_CHECK( + cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02, + GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, cudaStream)); + + 
dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, cudaStream); + CUDA_CHECK(cudaGetLastError()); +#else { size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { @@ -7749,15 +7781,12 @@ static void ggml_compute_forward_mul_mat_q_f32( id += ne00; } } - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); +#endif - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); #if defined(GGML_USE_CUBLAS) // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, cudaStream)); CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream)); // compute @@ -7770,7 +7799,6 @@ static void ggml_compute_forward_mul_mat_q_f32( // copy data to host CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); - CUDA_CHECK(cudaStreamSynchronize(cudaStream)); #else // zT = y * xT cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, @@ -7783,9 +7811,11 @@ static void ggml_compute_forward_mul_mat_q_f32( } #if defined(GGML_USE_CUBLAS) + CUDA_CHECK(cudaStreamSynchronize(cudaStream)); CUDA_CHECK(cudaFree(d_X)); CUDA_CHECK(cudaFree(d_Y)); CUDA_CHECK(cudaFree(d_D)); + CUDA_CHECK(cudaFree(d_Q)); #endif //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); From c8c2c524827be8fd681a63f0e5a697b0bf4c587b Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Thu, 20 Apr 2023 06:45:41 +0000 Subject: [PATCH 078/773] AVX2 optimization for vec_dot_q4_2_q8_0 (#1068) --- ggml.c | 99 +++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 26 deletions(-) diff --git a/ggml.c b/ggml.c index 9a3430859f7e1b..35b15cc2ee7805 100644 --- a/ggml.c +++ b/ggml.c @@ -467,12 +467,30 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // quantization // -// AVX routines provided by GH user Const-me -// ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600 +#if __AVX__ || __AVX2__ || __AVX512F__ +// Unpack 16 4-bit fields into 16 bytes +// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval +static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) +{ + // Load 8 bytes from memory + __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi ); + + // Expand bytes into uint16_t values + __m128i bytes = _mm_cvtepu8_epi16( tmp ); + + // Unpack values into individual bytes + const __m128i lowMask = _mm_set1_epi8( 0xF ); + __m128i high = _mm_andnot_si128( lowMask, bytes ); + __m128i low = _mm_and_si128( lowMask, bytes ); + high = _mm_slli_epi16( high, 4 ); + bytes = _mm_or_si128( low, high ); + return bytes; +} + #if __AVX2__ || __AVX512F__ // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval -static inline __m256i bytesFromNibbles( const uint8_t* rsi ) +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { // Load 16 bytes from memory __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); @@ -503,24 +521,7 @@ static inline __m128i packNibbles( __m256i bytes ) __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); return _mm_packus_epi16( r0, r1 ); } -#elif __AVX__ -static inline __m128i bytesFromNibbles( const uint8_t* rsi ) -{ - // Load 8 bytes from memory - __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m128i bytes = _mm_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes - const __m128i lowMask = _mm_set1_epi8( 0xF ); - __m128i high = _mm_andnot_si128( lowMask, bytes ); - __m128i low = _mm_and_si128( lowMask, bytes ); - high = _mm_slli_epi16( high, 4 ); - bytes = _mm_or_si128( low, high ); - return bytes; -} - +#else static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh @@ -537,6 +538,7 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) return _mm_packus_epi16( bytes1, bytes2); } #endif +#endif // __AVX__ || __AVX2__ || __AVX512F__ #if __ARM_NEON @@ -1395,7 +1397,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in for (int l = 0; l < QK4_0; l += 32) { // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = bytesFromNibbles(pp+l/2); + __m256i vx8 = bytes_from_nibbles_32(pp+l/2); // Subtract 8 from the integers vx8 = _mm256_sub_epi8(vx8, _mm256_set1_epi8(8)); @@ -1513,7 +1515,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in for (int l = 0; l < QK4_1; l += 32) { // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = bytesFromNibbles(pp+l/2); + __m256i vx8 = bytes_from_nibbles_32(pp+l/2); // Convert to 16-bit int const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); @@ -2356,7 +2358,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * /* Compute combined scale for the block */ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - __m256i bx = bytesFromNibbles(x[i].qs); + __m256i bx = bytes_from_nibbles_32(x[i].qs); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m256i off = _mm256_set1_epi8( 8 ); @@ -2402,7 +2404,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * __m128i i32[2]; for (int j = 0; j < 2; ++j) { // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes - __m128i bx = bytesFromNibbles( x[i].qs + 8*j ); + __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j); __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
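For readers not fluent in SSE/AVX intrinsics, the renamed `bytes_from_nibbles_16` / `bytes_from_nibbles_32` helpers do nothing more than the scalar loop below, applied to a whole 128- or 256-bit register at once via a widen, mask, shift and OR. This is a sketch for clarity; `bytes_from_nibbles_scalar` is an illustrative name, not a function in the codebase.

```cpp
#include <cstdint>

// Scalar equivalent of bytes_from_nibbles_16 / bytes_from_nibbles_32:
// expand n packed 4-bit fields (two per source byte) into n bytes in [0..15],
// with the low nibble of each source byte landing at the even output index.
static void bytes_from_nibbles_scalar(const uint8_t * src, uint8_t * dst, int n) {
    for (int i = 0; i < n/2; ++i) {
        dst[2*i + 0] = src[i] & 0x0f;  // low nibble
        dst[2*i + 1] = src[i] >> 4;    // high nibble
    }
}
```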
@@ -2567,7 +2569,7 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * const __m256 d1m0 = _mm256_mul_ps( d1v, m0v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytesFromNibbles( x[i].qs ); + const __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); // Get absolute values of x vectors @@ -2721,6 +2723,51 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); + const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); + const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d)); + + __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); + __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs); + __m256i bx = _mm256_set_m128i(bx1, bx0); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = _mm256_set1_epi8(8); + bx = _mm256_sub_epi8(bx, off); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(bx, bx); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(by, bx); + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + + const __m256i ones = _mm256_set1_epi16(1); + __m256i xy_q = _mm256_madd_epi16(ones, dot); + + /* Convert to vectore of 8 int32_t to 8 floats */ + __m256 q = _mm256_cvtepi32_ps(xy_q); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm256_extractf128_ps(acc, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(acc)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + + sumf = _mm_cvtss_f32(res); #else // scalar for (int i = 0; i < nb; i++) { From 5addcb120cf2682c7ede0b1c520592700d74c87c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Thu, 20 Apr 2023 21:28:43 +0800 Subject: [PATCH 079/773] fix: LLAMA_CUBLAS=1 undefined reference 'shm_open' (#1080) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4bf481aa28d096..8483d66ce4a7d4 100644 --- a/Makefile +++ b/Makefile @@ -102,7 +102,7 @@ ifdef LLAMA_OPENBLAS endif ifdef LLAMA_CUBLAS CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include - LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64 + LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -lrt -L/usr/local/cuda/lib64 OBJS += ggml-cuda.o ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< From 6a9661ea5ad72166b700ae5e87976e4452499dda Mon Sep 17 00:00:00 2001 From: Ivan Komarov Date: Thu, 20 Apr 2023 17:15:18 +0200 Subject: [PATCH 080/773] ci : remove the LLAMA_ACCELERATE matrix dimension from Ubuntu builds in the CI (#1074) [Accelerate](https://developer.apple.com/documentation/accelerate) is an Apple framework which can only be used on macOS, and the CMake build 
[ignores](https://github.com/ggerganov/llama.cpp/blob/master/CMakeLists.txt#L102) the `LLAMA_ACCELERATE` variable when run on non-Apple platforms. This implies setting `LLAMA_ACCELERATE` is a no-op on Ubuntu and can be removed. This will reduce visual noise in CI check results (in addition to reducing the number of checks we have to run for every PR). Right now every sanitized build is duplicated twice for no good reason (e.g., we have `CI / ubuntu-latest-cmake-sanitizer (ADDRESS, Debug, ON)` and `CI / ubuntu-latest-cmake-sanitizer (ADDRESS, Debug, OFF)`). --- .github/workflows/build.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2208f42f77929e..7e8a29b1e5faef 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -81,7 +81,6 @@ jobs: matrix: sanitizer: [ADDRESS, THREAD, UNDEFINED] build_type: [Debug, Release] - accelerate: [ON, OFF] steps: - name: Clone @@ -99,7 +98,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }} + cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake --build . --config ${{ matrix.build_type }} - name: Test From e0305ead3a072db9c08b35c9600c49273b38a4b5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Apr 2023 20:35:53 +0300 Subject: [PATCH 081/773] ggml : add Q4_3 quantization (#1082) --- examples/quantize/quantize.cpp | 1 + ggml.c | 312 +++++++++++++++++++++++++++++---- ggml.h | 6 +- llama.cpp | 10 +- llama.h | 1 + 5 files changed, 293 insertions(+), 37 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 59cb6744016cb7..49a33a86f10a8c 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -15,6 +15,7 @@ int main(int argc, char ** argv) { fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0); fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1); fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2); + fprintf(stderr, " type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3); return 1; } diff --git a/ggml.c b/ggml.c index 35b15cc2ee7805..733ddc0dea0b21 100644 --- a/ggml.c +++ b/ggml.c @@ -637,7 +637,7 @@ typedef struct { float m; // min uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; -static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); +static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding"); #define QK4_2 16 typedef struct { @@ -646,6 +646,14 @@ typedef struct { } block_q4_2; static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); +#define QK4_3 16 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qs[QK4_3 / 2]; // nibbles / quants +} block_q4_3; +static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); + #define QK8_0 32 typedef struct { float d; // delta @@ -1203,7 +1211,6 @@ static void quantize_row_q4_2_rmse(const float * restrict x, block_q4_2 * restri const int nb = k / QK4_2; for (int i = 0; i < nb; i++) { - float scale = kquantize_q4_with_bounds(QK4_2, -8, 7, x, CANDIDATE_COUNT, candidates, L); y[i].d = GGML_FP32_TO_FP16(scale); @@ -1231,6 +1238,49 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int 
quantize_row_q4_2_rmse(x, y, k); } +static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) { + assert(k % QK4_3 == 0); + const int nb = k / QK4_3; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int l = 0; l < QK4_3; l++) { + const float v = x[i*QK4_3 + l]; + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); + + for (int l = 0; l < QK4_3; l += 2) { + const float v0 = (x[i*QK4_3 + l + 0] - min)*id; + const float v1 = (x[i*QK4_3 + l + 1] - min)*id; + + const uint8_t vi0 = (int) (v0 + 0.5f); + const uint8_t vi1 = (int) (v1 + 0.5f); + + assert(vi0 < 16); + assert(vi1 < 16); + + y[i].qs[l/2] = vi0 | (vi1 << 4); + } + } +} + +static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) { + assert(k % QK4_3 == 0); + + block_q4_3 * restrict y = vy; + + quantize_row_q4_3_reference(x, y, k); +} + // reference implementation for deterministic creation of model files static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { assert(k % QK8_0 == 0); @@ -1635,9 +1685,40 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in } } +static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) { + assert(k % QK4_3 == 0); + const int nb = k / QK4_3; + + const block_q4_3 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); + + const uint8_t * restrict pp = x[i].qs; + + for (int l = 0; l < QK4_3; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = vi0*d + m; + const float v1 = vi1*d + m; + + y[i*QK4_3 + l + 0] = v0; + y[i*QK4_3 + l + 1] = v1; + + assert(!isnan(y[i*QK4_3 + l + 0])); + assert(!isnan(y[i*QK4_3 + l + 1])); + } + } +} + static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { @@ -1661,6 +1742,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .quantize_row_q_dot = quantize_row_q8_0, .vec_dot_q = ggml_vec_dot_q4_2_q8_0, }, + [GGML_TYPE_Q4_3] = { + .dequantize_row_q = dequantize_row_q4_3, + .quantize_row_q = quantize_row_q4_3, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference, // TODO: RMSE optimization + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q4_3_q8_0, + }, [GGML_TYPE_Q8_0] = { .dequantize_row_q = NULL, // TODO .quantize_row_q = quantize_row_q8_0, @@ -2655,6 +2743,7 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1]; const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0]; const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; const block_q8_0 * restrict y1 = &y[i + 1]; @@ -2809,6 
+2898,154 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * *s = sumf; } +static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / QK8_0; + + assert(n % QK8_0 == 0); + assert(nb % 2 == 0); + assert(QK8_0 == 2*QK4_2); + + const block_q4_3 * restrict x = vx; + const block_q8_0 * restrict y = vy; + + float sumf = 0.0; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i += 2) { + const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0]; + const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1]; + const block_q4_3 * restrict x1_0 = &x[2*(i + 1) + 0]; + const block_q4_3 * restrict x1_1 = &x[2*(i + 1) + 1]; + + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + + const float x0_0d = GGML_FP16_TO_FP32(x0_0->d); + const float x0_1d = GGML_FP16_TO_FP32(x0_1->d); + const float x1_0d = GGML_FP16_TO_FP32(x1_0->d); + const float x1_1d = GGML_FP16_TO_FP32(x1_1->d); + + const float x0_0m = GGML_FP16_TO_FP32(x0_0->m); + const float x0_1m = GGML_FP16_TO_FP32(x0_1->m); + const float x1_0m = GGML_FP16_TO_FP32(x1_0->m); + const float x1_1m = GGML_FP16_TO_FP32(x1_1->m); + + const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); + const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs)); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // interleave + const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h); + const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h); + const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h); + const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + const int16x8_t sy0_0 = vaddq_s16(vmovl_s8(vget_low_s8(v1_0l)), vmovl_s8(vget_high_s8(v1_0l))); + const int16x8_t sy0_1 = vaddq_s16(vmovl_s8(vget_low_s8(v1_0h)), vmovl_s8(vget_high_s8(v1_0h))); + + const int16x8_t sy1_0 = vaddq_s16(vmovl_s8(vget_low_s8(v1_1l)), vmovl_s8(vget_high_s8(v1_1l))); + const int16x8_t sy1_1 = vaddq_s16(vmovl_s8(vget_low_s8(v1_1h)), vmovl_s8(vget_high_s8(v1_1h))); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy0_0), vget_high_s16(sy0_0))), x0_0m*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy0_1), vget_high_s16(sy0_1))), x0_1m*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy1_0), vget_high_s16(sy1_0))), x1_0m*y1->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy1_1), vget_high_s16(sy1_1))), x1_1m*y1->d); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), x1_0d*y1->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), x1_1d*y1->d); +#else + const int16x8_t pl0l = 
vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(ph0), x0_1d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(pl1), x1_0d*y1->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph1), x1_1d*y1->d); +#endif + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#else + // scalar + for (int i = 0; i < nb; i++) { + const uint8_t * restrict x0 = x[2*i + 0].qs; + const uint8_t * restrict x1 = x[2*i + 1].qs; + const int8_t * restrict y0 = y[i].qs; + + const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); + const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m); + const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); + const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m); + + int sy_0 = 0; + int sy_1 = 0; + + int sxy_0 = 0; + int sxy_1 = 0; + + for (int j = 0; j < QK8_0/4; j++) { + const uint8_t v0 = x0[j]; + const uint8_t v1 = x1[j]; + + const int x0_0 = v0 & 0xf; + const int x1_0 = v0 >> 4; + + const int x0_1 = v1 & 0xf; + const int x1_1 = v1 >> 4; + + const int y0_0 = y0[2*j + 0]; + const int y1_0 = y0[2*j + 1]; + + const int y0_1 = y0[2*(j + QK8_0/4) + 0]; + const int y1_1 = y0[2*(j + QK8_0/4) + 1]; + + sy_0 += y0_0 + y1_0; + sy_1 += y0_1 + y1_1; + + sxy_0 += x0_0*y0_0 + x1_0*y1_0; + sxy_1 += x0_1*y0_1 + x1_1*y1_1; + } + + sumf += (d0*sxy_0 + m0*sy_0)*y[i].d; + sumf += (d1*sxy_1 + m1*sy_1)*y[i].d; + } +#endif + + *s = sumf; +} + + // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { @@ -3056,12 +3293,13 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = QK4_0, [GGML_TYPE_Q4_1] = QK4_1, [GGML_TYPE_Q4_2] = QK4_2, + [GGML_TYPE_Q4_3] = QK4_3, [GGML_TYPE_Q8_0] = QK8_0, [GGML_TYPE_I8] = 1, [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, }; -static_assert(GGML_TYPE_COUNT == 9, "GGML_BLCK_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 10, "GGML_BLCK_SIZE is outdated"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = sizeof(float), @@ -3069,12 +3307,13 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = sizeof(block_q4_0), [GGML_TYPE_Q4_1] = sizeof(block_q4_1), [GGML_TYPE_Q4_2] = sizeof(block_q4_2), + [GGML_TYPE_Q4_3] = sizeof(block_q4_3), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), [GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), }; -static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_SIZE is outdated"); static const char * 
GGML_TYPE_NAME[GGML_TYPE_COUNT] = { @@ -3083,12 +3322,13 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = "q4_0", [GGML_TYPE_Q4_1] = "q4_1", [GGML_TYPE_Q4_2] = "q4_2", + [GGML_TYPE_Q4_3] = "q4_3", [GGML_TYPE_Q8_0] = "q8_0", [GGML_TYPE_I8] = "i8", [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", }; -static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_NAME is outdated"); +static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_NAME is outdated"); static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = false, @@ -3096,12 +3336,13 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = true, [GGML_TYPE_Q4_1] = true, [GGML_TYPE_Q4_2] = true, + [GGML_TYPE_Q4_3] = true, [GGML_TYPE_Q8_0] = true, [GGML_TYPE_I8] = false, [GGML_TYPE_I16] = false, [GGML_TYPE_I32] = false, }; -static_assert(GGML_TYPE_COUNT == 9, "GGML_IS_QUANTIZED is outdated"); +static_assert(GGML_TYPE_COUNT == 10, "GGML_IS_QUANTIZED is outdated"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", @@ -3363,7 +3604,7 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct (t0->ne[3] == t1->ne[3]); } -static inline bool ggml_is_quantized(enum ggml_type type) { +bool ggml_is_quantized(enum ggml_type type) { return GGML_IS_QUANTIZED[type]; } @@ -6313,6 +6554,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: { ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; @@ -7798,6 +8040,9 @@ static void ggml_compute_forward_mul_mat_q_f32( else if (type == GGML_TYPE_Q4_2) { dequantize_row_q_cuda = dequantize_row_q4_2_cuda; } + else if (type == GGML_TYPE_Q4_3) { + dequantize_row_q_cuda = dequantize_row_q4_3_cuda; + } else { GGML_ASSERT(false); } @@ -7952,6 +8197,7 @@ static void ggml_compute_forward_mul_mat( case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: case GGML_TYPE_Q8_0: { ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); @@ -7969,34 +8215,6 @@ static void ggml_compute_forward_mul_mat( GGML_ASSERT(false); } break; } - -#if 0 - if (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_Q4_1) { - static int first = 8; - printf("src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src0->ne[0], src0->ne[1], src0->ne[2]); - printf("src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src1->ne[0], src1->ne[1], src1->ne[2]); - printf("dst: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", dst->ne[0], dst->ne[1], dst->ne[2]); - if (first) { - --first; - } else { - for (int k = 0; k < dst->ne[1]; ++k) { - for (int j = 0; j < dst->ne[0]/16; ++j) { - for (int i = 0; i < 16; ++i) { - printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); - } - printf("\n"); - } - printf("\n"); - } - printf("\n"); - exit(0); - } - } else { - printf("aaaa src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src0->ne[0], src0->ne[1], src0->ne[2]); - printf("aaaa src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src1->ne[0], src1->ne[1], src1->ne[2]); - printf("aaaa dst: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", dst->ne[0], dst->ne[1], dst->ne[2]); - } -#endif } // ggml_compute_forward_scale @@ -8208,6 +8426,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: case GGML_TYPE_Q8_0: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); @@ -11947,6 +12166,29 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_2*sizeof(block_q4_2)); } +size_t ggml_quantize_q4_3(const float * 
src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_3 == 0); + const int nb = k / QK4_3; + + for (int j = 0; j < n; j += k) { + block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3; + + quantize_row_q4_3_reference(src + j, y, k); + + for (int i = 0; i < nb; i++) { + for (int l = 0; l < QK4_3; l += 2) { + const uint8_t vi0 = y[i].qs[l/2] & 0xF; + const uint8_t vi1 = y[i].qs[l/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_3*sizeof(block_q4_3)); +} + //////////////////////////////////////////////////////////////////////////////// int ggml_cpu_has_avx(void) { diff --git a/ggml.h b/ggml.h index 570147fc246587..6e81d8125ae6f1 100644 --- a/ggml.h +++ b/ggml.h @@ -205,7 +205,8 @@ enum ggml_type { GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, GGML_TYPE_Q4_2 = 4, - GGML_TYPE_Q8_0 = 5, + GGML_TYPE_Q4_3 = 5, + GGML_TYPE_Q8_0 = 6, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -360,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type); size_t ggml_element_size(const struct ggml_tensor * tensor); +bool ggml_is_quantized(enum ggml_type type); + struct ggml_context * ggml_init(struct ggml_init_params params); void ggml_free(struct ggml_context * ctx); @@ -808,6 +811,7 @@ enum ggml_opt_result ggml_opt( size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist); // // system info diff --git a/llama.cpp b/llama.cpp index 3ff5dc1e148009..99d29a1ef7b092 100644 --- a/llama.cpp +++ b/llama.cpp @@ -479,6 +479,7 @@ struct llama_file_loader { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: break; default: { throw format("unrecognized tensor type %u\n", shard.type); @@ -552,6 +553,7 @@ struct llama_file_saver { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: + case GGML_TYPE_Q4_3: break; default: LLAMA_ASSERT(false); } @@ -841,6 +843,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: return "mostly Q4_1, some F16"; case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; + case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3"; default: return "unknown, may not work"; } } @@ -1575,6 +1578,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; + case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break; default: throw format("invalid output file type %d\n", ftype); }; @@ -1652,6 +1656,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s { new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); } break; + case GGML_TYPE_Q4_3: + { + new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; default: LLAMA_ASSERT(false); } @@ -1963,7 +1971,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * base_t = dest_t; } - if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) { + if (ggml_is_quantized(base_t->type)) { if (!warned) { 
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " "use a f16 or f32 base model with --lora-base\n", __func__); diff --git a/llama.h b/llama.h index 208b03d18056cf..011e34c00fa3cf 100644 --- a/llama.h +++ b/llama.h @@ -73,6 +73,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors }; LLAMA_API struct llama_context_params llama_context_default_params(); From 38de86a7114c97ecf3644e3a60159f1ed893e1b0 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 20 Apr 2023 19:42:27 +0200 Subject: [PATCH 082/773] llama : multi-threaded quantization (#1075) * Multi-threading quantization. Not much gain for simple quantizations, bit it will be important for quantizations that require more CPU cycles. * Multi-threading for quantize-stats It now does the job in ~14 seconds on my Mac for Q4_0, Q4_1 and Q4_2. Single-threaded it was taking more than 2 minutes after adding the more elaborate version of Q4_2. * Reviewer comments * Avoiding compiler confusion After changing chunk_size to const int as suggested by @ggerganov, clang and GCC starting to warn me that I don't need to capture it in the lambda. So, I removed it from the capture list. But that makes the MSVC build fail. So, making it a constexpr to make every compiler happy. * Still fighting with lambda captures in MSVC --------- Co-authored-by: Iwan Kawrakow Co-authored-by: Georgi Gerganov --- examples/quantize-stats/quantize-stats.cpp | 135 +++++++++++++++------ examples/quantize/quantize.cpp | 7 +- ggml.c | 27 +++++ ggml.h | 2 + llama.cpp | 67 ++++++---- llama.h | 4 +- 6 files changed, 182 insertions(+), 60 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index cd973e8aca6a51..4e6c2c8314661e 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include struct quantize_stats_params { std::string model = "models/7B/ggml-model-f16.bin"; @@ -27,7 +29,6 @@ struct quantize_stats_params { std::vector include_types; }; -const int64_t SCRATCH_ELEMENTS = 32*32; const size_t HISTOGRAM_BUCKETS = 150; const double HISTOGRAM_RANGE = 0.03; @@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou stats.num_samples += nelements; } +void combine_error_stats(error_stats & into, const error_stats & from) { + into.num_samples += from.num_samples; + into.total_error += from.total_error; + if (from.max_error > into.max_error) into.max_error = from.max_error; + for (size_t i=0; inb[3] == tensor->nb[2]*tensor->ne[2]; } +void test_roundtrip_on_chunk( + const ggml_tensor * layer, + int64_t offset, + int64_t chunk_size, + const quantize_fns_t & qfns, + bool use_reference, + float * input_scratch, + char * quantized_scratch, + float * output_scratch, + error_stats & stats) { + + if (layer->type == GGML_TYPE_F16) { + for (int i = 0; i < chunk_size; i++) { + input_scratch[i] = ggml_get_f32_1d(layer, i + offset); + } + } else { + input_scratch = ggml_get_data_f32(layer) + offset; + } + + if (use_reference) { + qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size); + } else { + qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); + } + 
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size); + + update_error_stats(chunk_size, input_scratch, output_scratch, stats); +} + + // Run quantization function for a single layer and update error stats void test_roundtrip_on_layer( std::string & name, @@ -137,40 +175,61 @@ void test_roundtrip_on_layer( const quantize_fns_t & qfns, bool use_reference, const ggml_tensor * layer, - float * input_scratch, - char *quantized_scratch, - float * output_scratch, - error_stats & total_error) { + std::vector & input_scratch, + std::vector & quantized_scratch, + std::vector & output_scratch, + error_stats & total_error, + int max_thread = 0) { assert(tensor_is_contiguous(layer)); error_stats layer_error {}; - int64_t nelements = ggml_nelements(layer); - - for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) { - int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset); + uint64_t nelements = ggml_nelements(layer); - if (layer->type == GGML_TYPE_F16) { - for (int i = 0; i < chunk_size; i++) { - input_scratch[i] = ggml_get_f32_1d(layer, i + offset); + float* input_scratch_ptr = nullptr; + if (layer->type == GGML_TYPE_F16) { + if (input_scratch.size() < nelements) input_scratch.resize(nelements); + input_scratch_ptr = input_scratch.data(); + } + if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements); + if (output_scratch.size() < nelements) output_scratch.resize(nelements); + + if (max_thread < 1) max_thread = std::thread::hardware_concurrency(); + int chunk_size = 32*512; + int num_chunks = (nelements + chunk_size - 1)/chunk_size; + + if (num_chunks < 2 || max_thread < 2) { + test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(), + output_scratch.data(), print_layer_stats ? layer_error : total_error); + } else { + auto & stats = print_layer_stats ? layer_error : total_error; + std::mutex mutex; + uint64_t counter = 0; + auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr, + &quantized_scratch, &output_scratch, chunk_size] () { + error_stats local_stats {}; + while (true) { + std::unique_lock lock(mutex); + uint64_t offset = counter; counter += chunk_size; + if (offset >= nelements) { + combine_error_stats(stats, local_stats); + break; + } + lock.unlock(); + uint64_t chunk = offset + chunk_size < nelements ? 
chunk_size : nelements - offset; + test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset, + quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats); } - } else { - input_scratch = ggml_get_data_f32(layer) + offset; - } - - if (use_reference) { - qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size); - } else { - qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); - } - qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size); - - update_error_stats(chunk_size, input_scratch, output_scratch, total_error); - if (print_layer_stats) { - update_error_stats(chunk_size, input_scratch, output_scratch, layer_error); - } + }; + int nthread = std::min(num_chunks, max_thread); + std::vector workers(nthread-1); + for (auto& w : workers) w = std::thread(compute); + compute(); + for (auto& w : workers) w.join(); } + if (print_layer_stats) { print_error_stats(name, layer_error, false); + combine_error_stats(total_error, layer_error); } } @@ -181,6 +240,7 @@ int main(int argc, char ** argv) { // read command line + int max_thread = 0; bool invalid_param = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -230,6 +290,12 @@ int main(int argc, char ** argv) { fprintf(stderr, "error: %s not in list of types\n", argv[i]); invalid_param = true; } + } else if (arg == "-n" || arg == "--num-threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + max_thread = atoi(argv[i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); quantize_stats_print_usage(argc, argv); @@ -295,9 +361,9 @@ int main(int argc, char ** argv) { } printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements); // allocate scratch space - std::vector input_scratch(SCRATCH_ELEMENTS); - std::vector quantized_scratch(SCRATCH_ELEMENTS*4); - std::vector output_scratch(SCRATCH_ELEMENTS); + std::vector input_scratch; + std::vector quantized_scratch; + std::vector output_scratch; // loop throught quantization types for (int i = 0; i < GGML_TYPE_COUNT; i++) { @@ -328,10 +394,11 @@ int main(int argc, char ** argv) { qfns, params.reference, kv_tensor.second, - input_scratch.data(), - quantized_scratch.data(), - output_scratch.data(), - global_stats + input_scratch, + quantized_scratch, + output_scratch, + global_stats, + max_thread ); } diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 49a33a86f10a8c..5b4812c62ba9c2 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -10,8 +10,8 @@ int main(int argc, char ** argv) { ggml_time_init(); - if (argc != 4) { - fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + if (argc < 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]); fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0); fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1); fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2); @@ -30,6 +30,7 @@ int main(int argc, char ** argv) { const std::string fname_out = argv[2]; const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]); + int nthread = argc > 4 ? 
atoi(argv[4]) : 0; const int64_t t_main_start_us = ggml_time_us(); @@ -39,7 +40,7 @@ int main(int argc, char ** argv) { { const int64_t t_start_us = ggml_time_us(); - if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) { + if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) { fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); return 1; } diff --git a/ggml.c b/ggml.c index 733ddc0dea0b21..1aa8ee303b8dac 100644 --- a/ggml.c +++ b/ggml.c @@ -12189,6 +12189,33 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_3*sizeof(block_q4_3)); } +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { + size_t result = 0; + switch (type) { + case GGML_TYPE_Q4_0: + { + GGML_ASSERT(start % QK4_0 == 0); + block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; + result = ggml_quantize_q4_0(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_1: + { + GGML_ASSERT(start % QK4_1 == 0); + block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; + result = ggml_quantize_q4_1(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_2: + { + GGML_ASSERT(start % QK4_2 == 0); + block_q4_2 * block = (block_q4_2*)dst + start / QK4_2; + result = ggml_quantize_q4_2(src + start, block, n, n, hist); + } break; + default: + assert(false); + } + return result; +} + //////////////////////////////////////////////////////////////////////////////// int ggml_cpu_has_avx(void) { diff --git a/ggml.h b/ggml.h index 6e81d8125ae6f1..a8a7b6b4ff504c 100644 --- a/ggml.h +++ b/ggml.h @@ -813,6 +813,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + // // system info // diff --git a/llama.cpp b/llama.cpp index 99d29a1ef7b092..e4c414c2dde8e9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -24,6 +24,9 @@ #include #include #include +#include +#include +#include #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 @@ -1572,7 +1575,7 @@ static llama_vocab::id llama_sample_top_p_top_k( // quantization // -static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) { +static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) { ggml_type quantized_type; switch (ftype) { case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; @@ -1582,6 +1585,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s default: throw format("invalid output file type %d\n", ftype); }; + if (nthread <= 0) { + nthread = std::thread::hardware_concurrency(); + } + std::unique_ptr model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, /*vocab_only*/ false)); llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); @@ -1590,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s size_t total_size_new = 0; std::vector hist_all(1 << 4, 0); + std::vector workers; + std::mutex mutex; + size_t idx = 0; for (llama_load_tensor & tensor : 
model_loader->tensors_map.tensors) { llama_buffer read_data; @@ -1643,25 +1653,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_data = work.addr; std::vector hist_cur(1 << 4, 0); - switch (new_type) { - case GGML_TYPE_Q4_0: - { - new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); - } break; - case GGML_TYPE_Q4_2: - { - new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); - } break; - case GGML_TYPE_Q4_3: - { - new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); - } break; - default: - LLAMA_ASSERT(false); + int chunk_size = 32 * 512; + const int nchunk = (nelements + chunk_size - 1)/chunk_size; + const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; + if (nthread_use < 2) { + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); + } else { + size_t counter = 0; + new_size = 0; + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () { + std::vector local_hist; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + size_t first = counter; counter += chunk_size; + if (first >= nelements) { + if (!local_hist.empty()) { + for (int j=0; j %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); @@ -1783,9 +1805,10 @@ void llama_free(struct llama_context * ctx) { int llama_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype) { + enum llama_ftype ftype, + int nthread) { try { - llama_model_quantize_internal(fname_inp, fname_out, ftype); + llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread); return 0; } catch (const std::string & err) { fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); diff --git a/llama.h b/llama.h index 011e34c00fa3cf..e95ff73b8df1d0 100644 --- a/llama.h +++ b/llama.h @@ -93,10 +93,12 @@ extern "C" { // TODO: not great API - very likely to change // Returns 0 on success + // nthread - how many threads to use. 
If <=0, will use std::thread::hardware_concurrency(), else the number given LLAMA_API int llama_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype); + enum llama_ftype ftype, + int nthread); // Apply a LoRA adapter to a loaded model // path_base_model is the path to a higher quality model to use as a base for From 66aab46079609972ee1f7bd6f319d826205a2fbd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Apr 2023 20:44:05 +0300 Subject: [PATCH 083/773] ggml : fix Q4_3 quantization Broke it during conflict resolution in last PR --- ggml.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml.c b/ggml.c index 1aa8ee303b8dac..50f114d9b59f42 100644 --- a/ggml.c +++ b/ggml.c @@ -12210,6 +12210,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q4_2 * block = (block_q4_2*)dst + start / QK4_2; result = ggml_quantize_q4_2(src + start, block, n, n, hist); } break; + case GGML_TYPE_Q4_3: + { + GGML_ASSERT(start % QK4_3 == 0); + block_q4_3 * block = (block_q4_3*)dst + start / QK4_3; + result = ggml_quantize_q4_3(src + start, block, n, n, hist); + } break; default: assert(false); } From 8a1756abdf1f48cb4dcb898bc8fbe9102ef49dc6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Apr 2023 21:43:50 +0300 Subject: [PATCH 084/773] ggml : do not break cuBLAS build (Q4_3 is not yet implemented) --- ggml.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml.c b/ggml.c index 50f114d9b59f42..da0f5d1d549ab4 100644 --- a/ggml.c +++ b/ggml.c @@ -8040,9 +8040,6 @@ static void ggml_compute_forward_mul_mat_q_f32( else if (type == GGML_TYPE_Q4_2) { dequantize_row_q_cuda = dequantize_row_q4_2_cuda; } - else if (type == GGML_TYPE_Q4_3) { - dequantize_row_q_cuda = dequantize_row_q4_3_cuda; - } else { GGML_ASSERT(false); } From 2005469ea130cf920c50175d4f47a87bfd8aaf4d Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Thu, 20 Apr 2023 20:49:53 +0200 Subject: [PATCH 085/773] Add Q4_3 support to cuBLAS (#1086) --- Makefile | 2 +- ggml-cuda.cu | 40 +++++++++++++++++++++++++++++++++++++++- ggml-cuda.h | 1 + 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8483d66ce4a7d4..f267d086415eec 100644 --- a/Makefile +++ b/Makefile @@ -102,7 +102,7 @@ ifdef LLAMA_OPENBLAS endif ifdef LLAMA_CUBLAS CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include - LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -lrt -L/usr/local/cuda/lib64 + LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 OBJS += ggml-cuda.o ggml-cuda.o: ggml-cuda.cu ggml-cuda.h nvcc -arch=native -c -o $@ $< diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7cd116602b9b09..0baa989a36ca91 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -22,11 +22,20 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b #define QK4_2 16 typedef struct { - __half d; // delta + __half d; // delta uint8_t qs[QK4_2 / 2]; // nibbles / quants } block_q4_2; static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); +#define QK4_3 16 +typedef struct { + __half d; // delta + __half m; // min + uint8_t qs[QK4_3 / 2]; // nibbles / quants +} block_q4_3; +static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); + + static __global__ void dequantize_block_q4_0(const void * vx, float * y) { const block_q4_0 * x = (const block_q4_0 
*) vx; @@ -98,6 +107,30 @@ static __global__ void dequantize_block_q4_2(const void * vx, float * y) { } } +static __global__ void dequantize_block_q4_3(const void * vx, float * y) { + const block_q4_3 * x = (const block_q4_3 *) vx; + + const int i = blockIdx.x; + + const float d = x[i].d; + const float m = x[i].m; + + const uint8_t * pp = x[i].qs; + + for (int l = 0; l < QK4_3; l += 2) { + const uint8_t vi = pp[l/2]; + + const int8_t vi0 = vi & 0xf; + const int8_t vi1 = vi >> 4; + + const float v0 = vi0*d + m; + const float v1 = vi1*d + m; + + y[i*QK4_3 + l + 0] = v0; + y[i*QK4_3 + l + 1] = v1; + } +} + extern "C" { __host__ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_0; @@ -113,4 +146,9 @@ extern "C" { const int nb = k / QK4_2; dequantize_block_q4_2<<>>(vx, y); } + + __host__ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) { + const int nb = k / QK4_3; + dequantize_block_q4_3<<>>(vx, y); + } } diff --git a/ggml-cuda.h b/ggml-cuda.h index 646caafc6f8a04..be140606aa2d45 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -5,6 +5,7 @@ extern "C" { void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream); void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream); +void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream); #ifdef __cplusplus } From 9ff334f3c9b960a44c5e149b08c748a2914fb882 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Apr 2023 21:58:05 +0300 Subject: [PATCH 086/773] ggml : fix bug in ggml_compute_forward_dup_f32() --- ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml.c b/ggml.c index da0f5d1d549ab4..8109b36b244326 100644 --- a/ggml.c +++ b/ggml.c @@ -6140,7 +6140,6 @@ static void ggml_compute_forward_dup_f32( i10 += ne00 * ir0; while (i10 >= ne0) { i10 -= ne0; - i11++; if (++i11 == ne1) { i11 = 0; if (++i12 == ne2) { From 12b5900dbc9743dee3ce83513cf5c3a44523a1b6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Apr 2023 23:32:59 +0300 Subject: [PATCH 087/773] ggml : sync ggml (add GPT-NeoX RoPE implementation) --- ggml.c | 58 ++++++++++++++++++++++++++++++++++++++++--------------- ggml.h | 3 ++- llama.cpp | 5 +++++ 3 files changed, 49 insertions(+), 17 deletions(-) diff --git a/ggml.c b/ggml.c index 8109b36b244326..998602150fe559 100644 --- a/ggml.c +++ b/ggml.c @@ -8653,9 +8653,11 @@ static void ggml_compute_forward_rope_f32( const float theta_scale = powf(10000.0, -2.0f/n_dims); + const bool is_neox = mode & 2; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int p = (mode == 0 ? n_past + i2 : i2); + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int p = ((mode & 1) == 0 ? 
n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -8668,14 +8670,25 @@ static void ggml_compute_forward_rope_f32( theta *= theta_scale; - const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + if (!is_neox) { + const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[1]; - const float x0 = src[0]; - const float x1 = src[1]; + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; + } else { + const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[1] = x0*sin_theta + x1*cos_theta; + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } } } } @@ -8730,9 +8743,11 @@ static void ggml_compute_forward_rope_f16( const float theta_scale = powf(10000.0, -2.0f/n_dims); + const bool is_neox = mode & 2; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int p = (mode == 0 ? n_past + i2 : i2); + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int p = ((mode & 1) == 0 ? n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -8745,14 +8760,25 @@ static void ggml_compute_forward_rope_f16( theta *= theta_scale; - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + if (!is_neox) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[1]); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[1]); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } else { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } } } } diff --git a/ggml.h b/ggml.h index a8a7b6b4ff504c..460d4ffe03d85a 100644 --- a/ggml.h +++ b/ggml.h @@ -630,7 +630,8 @@ struct ggml_tensor * ggml_soft_max( // rotary position embedding // in-place, returns view(a) -// if mode == 1, skip n_past elements +// if mode & 1 == 1, skip n_past elements +// if mode & 2 == 1, GPT-NeoX style // TODO: avoid creating 
a new tensor every time struct ggml_tensor * ggml_rope( struct ggml_context * ctx, diff --git a/llama.cpp b/llama.cpp index e4c414c2dde8e9..4a646eb911621e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1618,6 +1618,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // quantize only 2D tensors quantize &= (tensor.ne.size() == 2); + // GG: uncomment this to keep the output layer in FP16 + //if (tensor.name.rfind("output")) { + // quantize = false; + //} + enum ggml_type new_type; void * new_data; size_t new_size; From 2510c1831fac874f32e272f6079f01b5461f3986 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Thu, 20 Apr 2023 21:56:44 +0000 Subject: [PATCH 088/773] Add ggml-model-*.bin checksums for 7B, 13B, 30B, 65B (#1088) * Add ggml-model-*.bin checksums for 7B, 13B, 30B * Add ggml-model-*.bin checksums for 65B --------- Co-authored-by: Pavol Rusnak --- SHA256SUMS | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/SHA256SUMS b/SHA256SUMS index 63fac21ae1bef3..1d034b371ad704 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,12 +1,27 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth +666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin +fcb7664c2e69776920b526362a243e912f73c36b1ec892eb354bab940f5edb5a models/7B/ggml-model-q4_0.bin +cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin +1bc7484c24a87612726d756f1761890e7acf5f412e23378577ce50fbe789b5b8 models/7B/ggml-model-q4_2.bin +3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b models/7B/ggml-model-q4_3.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth +2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin +4b69e4d6b6e3275230955997b90407fceca7e5ab3daf2e63a2c9e7270a8e1e3e models/13B/ggml-model-q4_0.bin +d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin +8d55a2077317ec9a928c7851d6a43e08e51f7e9e08360f2a7a7e1deefea3134f models/13B/ggml-model-q4_2.bin +4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3 models/13B/ggml-model-q4_3.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth +7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin +7a679908ce31c9d6ae2e38d6059bcd4d0ad3a870cd58cc1c8f7b36f2b2f51c73 models/30B/ggml-model-q4_0.bin +7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin +2c82b4954a94a6a284f452f6011c1e4f0d20362c194a0b1eb5737f5fd8a20fb3 models/30B/ggml-model-q4_2.bin +a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33 models/30B/ggml-model-q4_3.bin 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe 
models/65B/consolidated.00.pth 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth @@ -16,5 +31,10 @@ e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/con a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth +60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin +c671fe1bce71499ac732ec999770ebe53ac486623a7891e42c9dfdb6962d2c64 models/65B/ggml-model-q4_0.bin +4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin +4a145a210c56982389b1ed34387e0590c3e0d7325fa9be4f2284fe4d244a3633 models/65B/ggml-model-q4_2.bin +305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c models/65B/ggml-model-q4_3.bin 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model From d40fded93e1a533e969768e1e335c15c61c296ce Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Apr 2023 10:23:36 +0300 Subject: [PATCH 089/773] llama : fix comment for "output.weight" tensor --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4a646eb911621e..33ee4fbb594746 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1618,8 +1618,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // quantize only 2D tensors quantize &= (tensor.ne.size() == 2); - // GG: uncomment this to keep the output layer in FP16 - //if (tensor.name.rfind("output")) { + // uncomment this to keep the output layer in FP16 + //if (tensor.name == "output.weight") { // quantize = false; //} From 3d59769c3bb7e72c915646ddb1e239b1face19f5 Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Fri, 21 Apr 2023 14:57:57 +0200 Subject: [PATCH 090/773] Show perplexity ETA in hours and minutes (#1096) --- examples/perplexity/perplexity.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 80792ea0d95d0a..615157e7b68ece 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -53,7 +53,13 @@ void perplexity(llama_context * ctx, const gpt_params & params) { auto end_t = std::chrono::high_resolution_clock::now(); if (i == 0) { const float seconds = std::chrono::duration(end_t - start_t).count(); - printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); + printf("%.2f seconds per pass - ETA ", seconds); + int total_seconds = (int)(seconds * seq_count); + if (total_seconds >= 60*60) { + printf("%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + printf("%d minutes\n", total_seconds / 60); } // We get the logits for all the tokens in the context window (params.n_ctx) // from llama_eval above. 
Now, based on https://huggingface.co/docs/transformers/perplexity, From 1bfc153e2f35ddd9d64b084e8d1a5e6fa57ad1c9 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 21 Apr 2023 17:18:26 +0200 Subject: [PATCH 091/773] ggml : a faster version for Q4_1 x Q8_0 dot products (#1083) * A faster version for Q4_1 x Q8_0 dot products The idea behind this is that Q8_0 quantized values get used many times in the matrix multiplications where they are involved. In the current implementations, when we are evaluating the dot products, we need to compute the sum of the quants in the Q8_0 vector, so the same operation is repeated many times. Here we pre-compute the sum during Q8_0 quantization, store it in the now modified block_q8_0 struct, and then reuse this result in the subsequent dot products. In a synthetic benchmark (just compute a bunch of dot products), this change speeds up the Q4_1 * Q8_0 dot product by 80%, making the performance identical to Q4_0 * Q8_0. In practical application, I see a ~15% gain in speed for token prediction on M2, and ~5% gain on Ryzen 7950X. The speed gain in the prompt evaluation is much bigger (around 50%). I have only done the change for the scalar version, ARM_NEON, and AVX2, so we still need an AVX implementation. * Cleaning up --------- Co-authored-by: Iwan Kawrakow --- ggml.c | 120 ++++++++++++++++----------- pocs/vdot/CMakeLists.txt | 5 ++ pocs/vdot/q8dot.cpp | 172 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+), 46 deletions(-) create mode 100644 pocs/vdot/q8dot.cpp diff --git a/ggml.c b/ggml.c index 998602150fe559..6cea937c8d735b 100644 --- a/ggml.c +++ b/ggml.c @@ -657,9 +657,10 @@ static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong #define QK8_0 32 typedef struct { float d; // delta + float s; // d * sum(qs[i]) int8_t qs[QK8_0]; // quants } block_q8_0; -static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); +static_assert(sizeof(block_q8_0) == 2*sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); // reference implementation for deterministic creation of model files @@ -1299,12 +1300,38 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r y[i].d = d; + int sum = 0; for (int l = 0; l < QK8_0; ++l) { const float v = x[i*QK8_0 + l]*id; y[i].qs[l] = roundf(v); - } - } + sum += y[i].qs[l]; + } + y[i].s = d * sum; + } +} + +#ifdef __AVX2__ +// There is no better way of doing this? +// I guess not, AVX is not very good at horizontal sums. +// The commented solution for a horizontal sum was suggested by @pubby as being slightly +// faster than the solution below. As I don't have an AVX2 system handy right now to test, +// keeping the original. +// TODO: Please try and if it does make a difference, uncomment and remove the implementation below.
+//static inline float horizontal_sum(__m256i a) { +// __m256i b = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(a))); +// __m256i sum = _mm256_add_epi32(a, b); +// __m256i hi = _mm256_unpackhi_epi64(sum, sum); +// sum = _mm256_add_epi32(sum, hi); +// return _mm256_cvtsi256_si32(sum) + _mm256_extract_epi32(sum, 4); +//} +static inline float horizontal_sum(__m256i a) { + __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)); + __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); + __m128i sum64 = _mm_add_epi32(hi64, sum128); + __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } +#endif static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { assert(k % QK8_0 == 0); @@ -1332,6 +1359,8 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int y[i].d = d; + int32x4_t accv = vdupq_n_s32(0); + for (int l = 0; l < 8; l++) { const float32x4_t v = vmulq_n_f32(srcv[l], id); const int32x4_t vi = vcvtnq_s32_f32(v); @@ -1340,7 +1369,11 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); } + int32_t sum = vaddvq_s32(accv); + y[i].s = d * sum; } #elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { @@ -1388,6 +1421,10 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int __m256i i3 = _mm256_cvtps_epi32( v3 ); #if defined(__AVX2__) + + // Compute the sum of the quants and set y[i].s + y[i].s = d * horizontal_sum(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 @@ -1430,6 +1467,14 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int // scalar quantize_row_q8_0_reference(x, y, k); #endif +#if defined __AVX__ + // TODO: vectorize this + for (int i=0; id * y0->s + x1->d * y1->s; + const uint8x16_t m4b = vdupq_n_u8(0xf); - const int8x16_t s8b = vdupq_n_s8(0x8); const uint8x16_t v0_0 = vld1q_u8(x0->qs); const uint8x16_t v0_1 = vld1q_u8(x1->qs); @@ -2390,12 +2438,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - // sub 8 - const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - // load y const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); @@ -2410,21 +2452,21 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs); + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0ls), v0_0h, v1_0hs); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1ls), v0_1h, v1_1hs); sumv0 = vmlaq_n_f32(sumv0, 
vcvtq_f32_s32(p_0), x0->d*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0ls)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0ls)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0hs)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0hs)); - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1ls)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1ls)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1hs)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1hs)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); @@ -2436,7 +2478,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * #endif } - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) - 8 * sum8; #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -2569,12 +2611,16 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); + float summs = 0; + for (int i = 0; i < nb; i += 2) { const block_q4_1 * restrict x0 = &x[i + 0]; const block_q4_1 * restrict x1 = &x[i + 1]; const block_q8_0 * restrict y0 = &y[i + 0]; const block_q8_0 * restrict y1 = &y[i + 1]; + summs += x0->m * y0->s + x1->m * y1->s; + const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t v0_0 = vld1q_u8(x0->qs); @@ -2598,17 +2644,6 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h); const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h); - const int16x8_t s0i = vaddq_s16( - vaddq_s16(vmovl_s8(vget_low_s8(v1_0ls)), vmovl_s8(vget_high_s8(v1_0ls))), - vaddq_s16(vmovl_s8(vget_low_s8(v1_0hs)), vmovl_s8(vget_high_s8(v1_0hs)))); - - const int16x8_t s1i = vaddq_s16( - vaddq_s16(vmovl_s8(vget_low_s8(v1_1ls)), vmovl_s8(vget_high_s8(v1_1ls))), - vaddq_s16(vmovl_s8(vget_low_s8(v1_1hs)), vmovl_s8(vget_high_s8(v1_1hs)))); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddl_s16(vget_low_s16(s0i), vget_high_s16(s0i))), x0->m*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddl_s16(vget_low_s16(s1i), vget_high_s16(s1i))), x1->m*y1->d); - #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0ls), v0_0h, v1_0hs); @@ -2637,24 +2672,26 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * #endif } - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; #elif defined(__AVX2__) // 
Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); + float summs = 0; + // Main loop for (int i = 0; i < nb; ++i) { const float * d0 = &x[i].d; const float * d1 = &y[i].d; - const float * m0 = &x[i].m; + //const float * m0 = &x[i].m; + + summs += x[i].m * y[i].s; const __m256 d0v = _mm256_broadcast_ss( d0 ); const __m256 d1v = _mm256_broadcast_ss( d1 ); - const __m256 m0v = _mm256_broadcast_ss( m0 ); // Compute combined scales const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); - const __m256 d1m0 = _mm256_mul_ps( d1v, m0v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes const __m256i bx = bytes_from_nibbles_32(x[i].qs); @@ -2676,15 +2713,6 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * // Accumulate d0*d1*x*y acc = _mm256_fmadd_ps( d0d1, xy, acc ); - - // Compute sum of y values - const __m256i y16_l = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); - const __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); - const __m256i ysumi = _mm256_madd_epi16( _mm256_add_epi16(y16_l, y16_h), ones ); - const __m256 ysum = _mm256_cvtepi32_ps( ysumi ); - - // Accumulate d1*m0*y - acc = _mm256_fmadd_ps( d1m0, ysum, acc ); } // Return horizontal sum of the acc vector @@ -2693,7 +2721,7 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); - sumf = _mm_cvtss_f32( res ); + sumf = _mm_cvtss_f32( res ) + summs; #else // scalar for (int i = 0; i < nb; i++) { diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt index cbc85223650ea5..fb89a1cd4e8334 100644 --- a/pocs/vdot/CMakeLists.txt +++ b/pocs/vdot/CMakeLists.txt @@ -2,3 +2,8 @@ set(TARGET vdot) add_executable(${TARGET} vdot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET q8dot) +add_executable(${TARGET} q8dot.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp new file mode 100644 index 00000000000000..5748c8ac22193c --- /dev/null +++ b/pocs/vdot/q8dot.cpp @@ -0,0 +1,172 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +constexpr int kVecSize = 1 << 16; + +// Copy-pasted from ggml.c +#define QK4_0 32 +typedef struct { + float d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + float d; // delta + float m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); + +// Copy-pasted from ggml.c +#define QK8_0 32 +typedef struct { + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == 2*sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); + +static_assert(QK4_1 == QK8_0, "QK4_1 and QK8_0 must be the same"); +static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same"); + +template +void fillQ4blocks(std::vector& blocks, std::mt19937& rndm) { + for (auto& b : blocks) { + b.d = 1; + for (int i=0; i> 28; + uint8_t v2 = rndm() >> 28; + b.qs[i] = v1 | (v2 
<< 4); + } + } +} + +void fillQ80blocks(std::vector& blocks, std::mt19937& rndm) { + for (auto& b : blocks) { + b.d = 1; + int sum = 0; + for (int i=0; i> 24) - 128; + sum += b.qs[i]; + } + b.s = b.d * sum; + } +} + +float simpleDot(const block_q4_0& x, const block_q8_0& y) { + int s1 = 0; //, s2 = 0; + for (int i=0; i> 4; + int v3 = x.qs[i+1] & 0xf; + int v4 = x.qs[i+1] >> 4; + int j = 2*i; + s1 += v1*y.qs[j] + v2*y.qs[j+1] + v3*y.qs[j+2] + v4*y.qs[j+3]; + //s2 += y.qs[j] + y.qs[j+1] + y.qs[j+2] + y.qs[j+3]; + } + return y.d * x.d * s1 - 8 * x.d * y.s; + //return y.d * x.d * (s1 - 8 * s2); +} + +float simpleDot(const block_q4_1& x, const block_q8_0& y) { + int s1 = 0; //, s2 = 0; + for (int i=0; i> 4; + int v3 = x.qs[i+1] & 0xf; + int v4 = x.qs[i+1] >> 4; + int j = 2*i; + s1 += v1*y.qs[j] + v2*y.qs[j+1] + v3*y.qs[j+2] + v4*y.qs[j+3]; + //s2 += y.qs[j] + y.qs[j+1] + y.qs[j+2] + y.qs[j+3]; + } + return y.d * x.d * s1 + y.s * x.m; + //return y.d * (x.d * s1 + x.m * s2); +} + +struct Stat { + double sum = 0, sumt = 0, sumt2 = 0, maxt = 0; + int nloop = 0; + void addResult(double s, double t) { + sum += s; + sumt += t; sumt2 += t*t; maxt = std::max(maxt, t); + ++nloop; + } + void reportResult(const char* title) const { + if (nloop < 1) { + printf("%s(%s): no result\n",__func__,title); + return; + } + printf("============ %s\n",title); + printf(" = %g\n",sum/nloop); + auto t = sumt/nloop, dt = sumt2/nloop - t*t; + if (dt > 0) dt = sqrt(dt); + printf("