
Introduce Q8_0 and Q4_0 with Bf16 delta values #7497

Open · wants to merge 12 commits into master
2 changes: 2 additions & 0 deletions examples/quantize/quantize.cpp
@@ -17,6 +17,7 @@ struct quant_option {

static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_B16", LLAMA_FTYPE_MOSTLY_Q4_0_B16, " 3.56G, 5.9624 +/- 0.03348 ppl @ LLaMA-v2-7B", },
Collaborator (Author) commented:

The perplexity score given here was obtained by running perplexity.exe on the Q4_0_B16-quantized Meta LLaMA-2 7B model. I was unsure of the methodology used for the other ppl scores listed here, so kindly share feedback if this score needs to be adjusted. Thanks.

{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
@@ -46,6 +47,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q8_0_B16", LLAMA_FTYPE_MOSTLY_Q8_0_B16, " 6.70G, 5.8011 +/- 0.03239 ppl @ LLaMA-v1-7B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
4 changes: 4 additions & 0 deletions ggml/include/ggml.h
@@ -391,6 +391,8 @@ extern "C" {
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_Q4_0_B16 = 34,
GGML_TYPE_Q8_0_B16 = 35,
GGML_TYPE_COUNT,
};

@@ -435,6 +437,8 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_B16 = 28, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_0_B16 = 29, // except 1d tensors
};

// available tensor operations:
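
A note on how these two enums relate: each new file type should map one-to-one onto the corresponding tensor type, mirroring the existing Q4_0/Q8_0 pairs. A minimal sketch under that assumption (not part of this diff; the real mapping lives in ggml_ftype_to_ggml_type and the quantize path in llama.cpp):

/* Hypothetical helper; only the new cases are shown, all others elided. */
enum ggml_type ftype_to_type_sketch(enum ggml_ftype ftype) {
    switch (ftype) {
        case GGML_FTYPE_MOSTLY_Q4_0_B16: return GGML_TYPE_Q4_0_B16;
        case GGML_FTYPE_MOSTLY_Q8_0_B16: return GGML_TYPE_Q8_0_B16;
        default:                         return GGML_TYPE_F32;
    }
}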
11 changes: 11 additions & 0 deletions ggml/src/ggml-impl.h
@@ -20,11 +20,13 @@
#if defined(_MSC_VER)

// On MSVC the casts are omitted and the value is used as-is.
#define m512bh(p) p
#define m128bh(p) p
#define m512i(p) p

#else

// GCC/Clang require an explicit cast to the __m512bh/__m128bh/__m512i vector types.
#define m512bh(p) (__m512bh)(p)
#define m128bh(p) (__m128bh)(p)
#define m512i(p) (__m512i)(p)

#endif
@@ -100,6 +102,15 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
return h;
}

// Reinterpret a raw 16-bit pattern as a ggml_bf16_t (bit cast, no numeric conversion).
static inline ggml_bf16_t ggml_make_bf16(uint16_t h) {
    union {
        ggml_bf16_t f;
        uint16_t i;
    } u;
    u.i = h;
    return u.f;
}

#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)

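
Taken together, these helpers suggest the B16 variants keep the existing Q4_0/Q8_0 block layout but store the per-block delta (scale) as bf16 instead of fp16, with ggml_make_bf16 reinterpreting the stored 16-bit pattern. A self-contained sketch of the bf16 round trip, assuming the round-to-nearest-even encode of ggml_compute_fp32_to_bf16 (NaN handling elided) and the shift-based decode of ggml_compute_bf16_to_fp32:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bf16 is the upper 16 bits of an IEEE-754 fp32: decode is a 16-bit shift,
   encode rounds to nearest even. */
static uint16_t fp32_to_bf16(float f) {
    uint32_t u;
    memcpy(&u, &f, sizeof u);
    return (uint16_t)((u + (0x7fffu + ((u >> 16) & 1))) >> 16);
}

static float bf16_to_fp32(uint16_t h) {
    uint32_t u = (uint32_t)h << 16;
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

int main(void) {
    float d = 0.0123f;                       /* a typical block delta */
    float r = bf16_to_fp32(fp32_to_bf16(d)); /* round trip */
    printf("delta %.8f -> bf16 -> %.8f\n", d, r);
    return 0;
}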