Small matmul optimization

turboderp · turboderp · commit c73d921950f5 · 2023-06-10T20:02:29.000+02:00
diff --git a/exllama_ext/cuda_buffers.cu b/exllama_ext/cuda_buffers.cu
@@ -1,6 +1,10 @@
+#define _cuda_buffers_cu
 #include "cuda_buffers.cuh"
 
 CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL};
+// __constant__ half2 q4_table[16][256];
+// half2 q4_table_host[16][256];
+// bool q4_table_init = false;
 
 CudaBuffers::CudaBuffers
 (
@@ -64,4 +68,23 @@ void prepare_buffers_cuda
     );
 
     g_buffers[_device] = buffers;
+
+//     if (!q4_table_init)
+//     {
+//         for (uint v_zero = 0; v_zero < 16; v_zero++)
+//         {
+//             for (uint v_read = 0; v_read < 256; v_read++)
+//             {
+//                 half v_0 = __float2half((float)((int)((v_read      ) & 0x0f) - v_zero - 1));
+//                 half v_1 = __float2half((float)((int)((v_read >>  4) & 0x0f) - v_zero - 1));
+//                 half2 v_01 = {v_0, v_1};
+//                 q4_table_host[v_zero][v_read] = v_01;
+//             }
+//         }
+//         q4_table_init = true;
+//     }
+//
+//     cudaSetDevice(_device);
+//     cudaMemcpyToSymbol(q4_table, q4_table_host, 16 * 256 * sizeof(half2));
+//     cudaDeviceSynchronize();
 }
diff --git a/exllama_ext/cuda_buffers.cuh b/exllama_ext/cuda_buffers.cuh
@@ -8,6 +8,10 @@
 
 const int CUDA_MAX_DEVICES = 16;
 
+// #ifndef _cuda_buffers_cu
+// extern __constant__ half2 q4_table[16][256];
+// #endif
+
 class CudaBuffers
 {
 public:
diff --git a/exllama_ext/matrix.cuh b/exllama_ext/matrix.cuh
@@ -4,21 +4,23 @@
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 
+//#include "cuda_buffers.cuh"
+
 class MatrixView_half
 {
 public:
     const half* data;
     const int height;
     const int width;
 
-    __device__ inline MatrixView_half(const half* data, const int height, const int width)
+    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
         : data(data), height(height), width(width)
     { }
 
-    __device__ inline half item(int row, int column) const { return data[row * width + column]; }
-    __device__ inline half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ inline half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
-    __device__ inline const half* item_ptr(int row, int column) const { return &data[row * width + column]; }
+    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
+    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
+    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
+    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }
 };
 
 class MatrixView_half_rw
@@ -28,15 +30,15 @@ public:
     const int height;
     const int width;
 
-    __device__ inline MatrixView_half_rw(half* data, const int height, const int width)
+    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
         : data(data), height(height), width(width)
     { }
 
-    __device__ inline half item(int row, int column) const { return data[row * width + column]; }
-    __device__ inline half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ inline half* item_ptr(int row, int column) { return &data[row * width + column]; }
-    __device__ inline void set(int row, int column, half value) { data[row * width + column] = value; }
-    __device__ inline void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }
+    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
+    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
+    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
+    __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; }
+    __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }
 };
 
 class MatrixView_q4_row
@@ -46,11 +48,11 @@ public:
     const int height;
     const int width;
 
-    __device__ inline MatrixView_q4_row(const uint32_t* data, const int height, const int width)
+    __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
         : data(data), height(height), width(width)
     { }
 
-    __device__ inline int item(int row, int column) const
+    __device__ __forceinline__ int item(int row, int column) const
     {
         int shift = (column & 0x07) * 4;
         return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
@@ -64,25 +66,25 @@ public:
     const int height;
     const int width;
 
-    __device__ inline MatrixView_q4_column(const uint32_t* data, const int height, const int width)
+    __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width)
         : data(data), height(height), width(width)
     { }
 
-    __device__ inline int item(int row, int column) const
+    __device__ __forceinline__ int item(int row, int column) const
     {
         int shift = (row & 0x07) * 4;
         return (data[row / 8 * width + column] >> shift) & 0x0f;
     }
 
-    __device__ inline uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; }
-    __device__ inline const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; }
+    __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; }
+    __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; }
 };
 
 // TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu
 
 // Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale
 
-__device__ inline half2 dot_product_8
+__device__ __forceinline__ half2 dot_product_8
 (
     const half2 acc,
     MatrixView_half& h_,
@@ -118,21 +120,22 @@ __device__ inline half2 dot_product_8
         half2 v_45 = __halves2half2(v_4, v_5);
         half2 v_67 = __halves2half2(v_6, v_7);
 
-        v_01 = __hmul2(v_01, v_scale_2);
-        v_23 = __hmul2(v_23, v_scale_2);
-        v_45 = __hmul2(v_45, v_scale_2);
-        v_67 = __hmul2(v_67, v_scale_2);
+//         half2 v_01 = q4_table[v_zero - 1][(v_read      ) & 0xff]; // (constant memory is too slow apparently)
+//         half2 v_23 = q4_table[v_zero - 1][(v_read >>  8) & 0xff];
+//         half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff];
+//         half2 v_67 = q4_table[v_zero - 1][(v_read >> 24)       ];
 
-        result = __hfma2(*h_ptr++, v_01, result);
-        result = __hfma2(*h_ptr++, v_23, result);
-        result = __hfma2(*h_ptr++, v_45, result);
-        result = __hfma2(*h_ptr++, v_67, result);
+        half2 tmp = __hmul2(*h_ptr++, v_01);
+        tmp = __hfma2(*h_ptr++, v_23, tmp);
+        tmp = __hfma2(*h_ptr++, v_45, tmp);
+        tmp = __hfma2(*h_ptr++, v_67, tmp);
+        result = __hfma2(v_scale_2, tmp, result);
     }
 
     return result;
 }
 
-__device__ inline half dot_product_8_h
+__device__ __forceinline__ half dot_product_8_h
 (
     const half acc,
     MatrixView_half& h_,
@@ -163,31 +166,23 @@ __device__ inline half dot_product_8_h
         half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
         half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
 
-        v_0 = __hmul(v_0, v_scale);
-        v_1 = __hmul(v_1, v_scale);
-        v_2 = __hmul(v_2, v_scale);
-        v_3 = __hmul(v_3, v_scale);
-        v_4 = __hmul(v_4, v_scale);
-        v_5 = __hmul(v_5, v_scale);
-        v_6 = __hmul(v_6, v_scale);
-        v_7 = __hmul(v_7, v_scale);
-
-        result = __hfma(*h_ptr++, v_0, result);
-        result = __hfma(*h_ptr++, v_1, result);
-        result = __hfma(*h_ptr++, v_2, result);
-        result = __hfma(*h_ptr++, v_3, result);
-        result = __hfma(*h_ptr++, v_4, result);
-        result = __hfma(*h_ptr++, v_5, result);
-        result = __hfma(*h_ptr++, v_6, result);
-        result = __hfma(*h_ptr++, v_7, result);
+        half tmp = __hmul(*h_ptr++, v_0);
+        tmp = __hfma(*h_ptr++, v_1, tmp);
+        tmp = __hfma(*h_ptr++, v_2, tmp);
+        tmp = __hfma(*h_ptr++, v_3, tmp);
+        tmp = __hfma(*h_ptr++, v_4, tmp);
+        tmp = __hfma(*h_ptr++, v_5, tmp);
+        tmp = __hfma(*h_ptr++, v_6, tmp);
+        tmp = __hfma(*h_ptr++, v_7, tmp);
+        result = __hfma(v_scale, tmp, result);
     }
 
     return result;
 }
 
 // Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map
 
-__device__ inline half2 dot_product_8_x_map
+__device__ __forceinline__ half2 dot_product_8_x_map
 (
     const half2 acc,
     MatrixView_half& h_,
@@ -225,11 +220,6 @@ __device__ inline half2 dot_product_8_x_map
         half2 v_45 = __halves2half2(v_4, v_5);
         half2 v_67 = __halves2half2(v_6, v_7);
 
-        v_01 = __hmul2(v_01, v_scale_2);
-        v_23 = __hmul2(v_23, v_scale_2);
-        v_45 = __hmul2(v_45, v_scale_2);
-        v_67 = __hmul2(v_67, v_scale_2);
-
         half h_0 = h_ptr[*x_map_ptr++];
         half h_1 = h_ptr[*x_map_ptr++];
         half h_2 = h_ptr[*x_map_ptr++];
@@ -244,16 +234,17 @@ __device__ inline half2 dot_product_8_x_map
         half2 h_45 = __halves2half2(h_4, h_5);
         half2 h_67 = __halves2half2(h_6, h_7);
 
-        result = __hfma2(h_01, v_01, result);
-        result = __hfma2(h_23, v_23, result);
-        result = __hfma2(h_45, v_45, result);
-        result = __hfma2(h_67, v_67, result);
+        half2 tmp = __hmul2(h_01, v_01);
+        tmp = __hfma2(h_23, v_23, tmp);
+        tmp = __hfma2(h_45, v_45, tmp);
+        tmp = __hfma2(h_67, v_67, tmp);
+        result = __hfma2(v_scale_2, tmp, result);
     }
 
     return result;
 }
 
-__device__ inline half dot_product_8_x_map_h
+__device__ __forceinline__ half dot_product_8_x_map_h
 (
     const half acc,
     MatrixView_half& h_,
@@ -286,23 +277,15 @@ __device__ inline half dot_product_8_x_map_h
         half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
         half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
 
-        v_0 = __hmul(v_0, v_scale);
-        v_1 = __hmul(v_1, v_scale);
-        v_2 = __hmul(v_2, v_scale);
-        v_3 = __hmul(v_3, v_scale);
-        v_4 = __hmul(v_4, v_scale);
-        v_5 = __hmul(v_5, v_scale);
-        v_6 = __hmul(v_6, v_scale);
-        v_7 = __hmul(v_7, v_scale);
-
-        result = __hfma(h_ptr[*x_map_ptr++], v_0, result);
-        result = __hfma(h_ptr[*x_map_ptr++], v_1, result);
-        result = __hfma(h_ptr[*x_map_ptr++], v_2, result);
-        result = __hfma(h_ptr[*x_map_ptr++], v_3, result);
-        result = __hfma(h_ptr[*x_map_ptr++], v_4, result);
-        result = __hfma(h_ptr[*x_map_ptr++], v_5, result);
-        result = __hfma(h_ptr[*x_map_ptr++], v_6, result);
-        result = __hfma(h_ptr[*x_map_ptr++], v_7, result);
+        half tmp = __hmul(h_ptr[*x_map_ptr++], v_0);
+        tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp);
+        tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp);
+        tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp);
+        tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp);
+        tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp);
+        tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp);
+        tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp);
+        result = __hfma(v_scale, tmp, result);
     }
 
     return result;

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -8,6 +8,10 @@` |
| `8` | `8` |  |
| `9` | `9` | `const int CUDA_MAX_DEVICES = 16;` |
| `10` | `10` |  |
|  | `11` | `+// #ifndef _cuda_buffers_cu` |
|  | `12` | `+// extern __constant__ half2 q4_table[16][256];` |
|  | `13` | `+// #endif` |
|  | `14` | `+` |
| `11` | `15` | `class CudaBuffers` |
| `12` | `16` | `{` |
| `13` | `17` | `public:` |