vllm-project · WoosukKwon · May 3, 2023 · May 2, 2023 · May 2, 2023 · May 2, 2023
diff --git a/csrc/attention/attention_dtypes.cuh b/csrc/attention/attention_dtypes.cuh
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "attention_generic.cuh"
+#include "dtype_float16.cuh"
+#include "dtype_float32.cuh"
diff --git a/csrc/attention/attention_generic.cuh b/csrc/attention/attention_generic.cuh
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <stdint.h>
+
+namespace cacheflow {
+
+// A vector type to store Q, K, V elements.
+template<typename T, int VEC_SIZE>
+struct Vec {};
+
+// A vector type to store FP32 accumulators.
+template<typename T>
+struct FloatVec {};
+
+// Template vector operations.
+template<typename Acc, typename A, typename B>
+inline __device__ Acc mul(A a, B b);
+
+template<typename T>
+inline __device__ float sum(T v);
+
+template<typename T>
+inline __device__ float dot(T a, T b) {
+  return sum(mul<T, T, T>(a, b));
+}
+
+template<typename A, typename T>
+inline __device__ float dot(T a, T b) {
+  return sum(mul<A, T, T>(a, b));
+}
+
+template<typename T>
+inline __device__ void zero(T& dst) {
+  constexpr int WORDS = sizeof(T) / 4;
+  union {
+    T raw;
+    uint32_t words[WORDS];
+  } tmp;
+
+#pragma unroll
+  for (int ii = 0; ii < WORDS; ++ii) {
+    tmp.words[ii] = 0u;
+  }
+  dst = tmp.raw;
+}
+
+} // namespace cacheflow