Commit b6b4f65

Merge pull request #73 from ggerganov/master

b1876

2 parents 76484fb + ddb008d
14 files changed (+1793, -159 lines)
CMakeLists.txt

Lines changed: 6 additions & 2 deletions

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13) # for add_link_options
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -76,6 +76,10 @@ if (NOT MSVC)
     option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
 endif()

+if (WIN32)
+    option(LLAMA_WIN_VER "llama: Windows Version" 0x602)
+endif()
+
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
@@ -686,7 +690,7 @@ endif()

 if (MINGW)
     # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=0x602)
+    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
 endif()

 #
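
Note: the new LLAMA_WIN_VER cache variable feeds the _WIN32_WINNT definition used for MinGW builds. As a usage sketch (build directory and generator are left unspecified), the default Windows 8 target (0x602) can be overridden at configure time, for example to target Windows 10:

cmake -B build -DLLAMA_WIN_VER=0x0A00
cmake --build build --config Release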

examples/benchmark/benchmark-matmult.cpp

Lines changed: 2 additions & 2 deletions

@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
     struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);

     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -207,7 +207,7 @@ int main(int argc, char ** argv) {
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
     struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);

     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
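
Note: judging from these two call sites, ggml_quantize_chunk now takes a row count and a row length (plus an optional importance-matrix pointer) in place of the old flat element count. A minimal sketch of the new call shape, assuming an existing ggml_context * ctx and a populated F32 tensor src (both hypothetical here); the exact signature should be checked against ggml.h at this revision:

// sketch: quantize a 2-D F32 tensor `src` to Q4_0 with the updated call shape
std::vector<int64_t> hist(1 << 4, 0);                      // histogram buffer, as in the benchmark
struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, src->ne[0], src->ne[1]);
const int64_t n_per_row = src->ne[0];                      // elements per row
const int64_t nrows     = ggml_nelements(src) / n_per_row; // number of rows
ggml_quantize_chunk(GGML_TYPE_Q4_0, (const float *) src->data, dst->data,
                    0,                                     // starting offset, as in the benchmark
                    nrows, n_per_row,                      // rows and row length replace the flat count
                    hist.data(),
                    nullptr);                              // optional importance matrix (none here)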

examples/quantize/quantize.cpp

Lines changed: 131 additions & 2 deletions

@@ -5,6 +5,10 @@
 #include <cstring>
 #include <vector>
 #include <string>
+#include <unordered_map>
+#include <fstream>
+#include <cmath>
+#include <algorithm>

 struct quant_option {
     std::string name;
@@ -17,6 +21,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
     { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
+    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
@@ -72,22 +78,108 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
             printf(" %2d or ", it.ftype);
         } else {
             printf(" ");
         }
-        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
+        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }

+static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
+        return;
+    }
+    int n_entries;
+    in.read((char*)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len; in.read((char *)&len, sizeof(len));
+        std::vector<char> name_as_vec(len+1);
+        in.read((char *)name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
+            return;
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        auto& e = imatrix_data[std::move(name)];
+        int ncall;
+        in.read((char*)&ncall, sizeof(ncall));
+        int nval;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            imatrix_data = {};
+            return;
+        }
+        e.resize(nval);
+        in.read((char*)e.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n",__func__,i);
+            imatrix_data = {};
+            return;
+        }
+        if (ncall > 0) {
+            for (auto& v : e) v /= ncall;
+        }
+    }
+    printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
+}
+
+static void prepare_imatrix(const std::string& imatrix_file,
+        const std::vector<std::string>& included_weights,
+        const std::vector<std::string>& excluded_weights,
+        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    if (!imatrix_file.empty()) {
+        load_imatrix(imatrix_file, imatrix_data);
+    }
+    if (imatrix_data.empty()) {
+        return;
+    }
+    if (!excluded_weights.empty()) {
+        for (auto& name : excluded_weights) {
+            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
+                auto pos = it->first.find(name);
+                if (pos != std::string::npos) it = imatrix_data.erase(it);
+                else ++it;
+            }
+        }
+    }
+    if (!included_weights.empty()) {
+        std::unordered_map<std::string, std::vector<float>> tmp;
+        for (auto& name : included_weights) {
+            for (auto& e : imatrix_data) {
+                auto pos = e.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp.emplace(std::move(e));
+                }
+            }
+        }
+        imatrix_data = std::move(tmp);
+    }
+    if (!imatrix_data.empty()) {
+        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    }
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -96,6 +188,8 @@ int main(int argc, char ** argv) {
     llama_model_quantize_params params = llama_model_quantize_default_params();

     int arg_idx = 1;
+    std::string imatrix_file;
+    std::vector<std::string> included_weights, excluded_weights;

     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -104,15 +198,43 @@ int main(int argc, char ** argv) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
             params.pure = true;
+        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
+            if (arg_idx < argc-1) {
+                imatrix_file = argv[++arg_idx];
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
+            if (arg_idx < argc-1) {
+                included_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
+            if (arg_idx < argc-1) {
+                excluded_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
         } else {
             usage(argv[0]);
         }
     }

     if (argc - arg_idx < 2) {
+        printf("%s: bad arguments\n", argv[0]);
+        usage(argv[0]);
+    }
+    if (!included_weights.empty() && !excluded_weights.empty()) {
         usage(argv[0]);
     }

+    std::unordered_map<std::string, std::vector<float>> imatrix_data;
+    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    if (!imatrix_data.empty()) {
+        params.imatrix = &imatrix_data;
+    }
+
     llama_backend_init(false);

     // parse command line arguments
@@ -163,6 +285,13 @@ int main(int argc, char ** argv) {
         }
     }

+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+        fprintf(stderr, "\n===============================================================================================\n");
+        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "===============================================================================================\n\n\n");
+        return 1;
+    }
+
     print_build_info();

     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());

ggml-cuda.cu

Lines changed: 73 additions & 4 deletions

@@ -1105,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

+template<typename dst_t>
+static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int i = blockIdx.x;
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int il = tid/8;
+    const int ir = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+    const float d = __half2float(x->d);
+    const float dm = -8*d;
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d * (q[l] & 0xF) + dm;
+        y[l+16] = d * (q[l] >> 4) + dm;
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int i = blockIdx.x;
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int il = tid/8;
+    const int ir = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+    const float2 d = __half22float2(x->dm);
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
+        y[l+16] = d.x * (q[l] >> 4) + d.y;
+    }
+}
+
 //================================== k-quants

 template<typename dst_t>
@@ -6253,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }

+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
 template<typename dst_t>
 static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -6301,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
     int id;
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_row_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
@@ -6338,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_row_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
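
Note: in the new kernels each CUDA block covers eight 32-weight quant blocks (256 output values) with 32 threads; ir = tid%8 selects the quant block and il = tid/8 selects a 4-byte slice of its packed nibbles. A scalar CPU sketch of the same Q4_0 mapping, assuming the usual ggml block layout of one fp16 scale plus 16 packed bytes (the helper name is hypothetical):

#include <cstdint>

// sketch: dequantize one Q4_0 block of 32 weights, mirroring the kernel's arithmetic;
// `qs` holds the block's 16 packed bytes and `d` its scale already converted from fp16
static void dequantize_q4_0_block_ref(const uint8_t * qs, float d, float * y) {
    const float dm = -8.0f * d;                 // same offset the kernel precomputes
    for (int k = 0; k < 16; ++k) {
        y[k +  0] = d * (qs[k] & 0xF) + dm;     // low nibbles fill outputs 0..15
        y[k + 16] = d * (qs[k] >>  4) + dm;     // high nibbles fill outputs 16..31
    }
}

Q4_1 follows the same pattern, except the block stores a scale and a minimum (d.x and d.y in the kernel), so the additive offset is the stored minimum rather than -8*d.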

ggml-metal.m

Lines changed: 1 addition & 1 deletion

@@ -330,7 +330,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
         }
     }

-#if TARGET_OS_OSX
     // print MTL GPU family:
     GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);

@@ -370,6 +369,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+#if TARGET_OS_OSX
     GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
     if (ctx->device.maxTransferRate != 0) {
         GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
