Skip to content

Commit 4cfb6d9

Browse files
committed
Add KVCache for long sequence && tuned comm for faster Addreduce
1 parent b29259a commit 4cfb6d9

File tree

9 files changed

+228
-91
lines changed

9 files changed

+228
-91
lines changed

src/common/kvcache_tensor.h

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
#include "allocator.h"
2323

24+
extern bool kvTrans();
25+
2426
/**
2527
* Tensor specially designed for KV Cache
2628
* Naturaly, it could be represented in the shape of [seq_length][batch_size][head_num][head_size]
@@ -92,13 +94,26 @@ class KVCacheTensor {
9294

9395
// Get a vector for a specified sequence
9496
T *getSequence(int seqIdx, int batchIdx, int headIdx) {
95-
return data + (seqIdx * batchSize + batchIdx) * (headNum * headSize) + headIdx * headSize;
97+
if (kvTrans()) {
98+
// [batchSize, headNum, seq, headSize] but also need to modify expand and reorder function
99+
return data + (uint64_t)batchIdx * headNum * maxSeqLen * headSize + (uint64_t)headIdx * maxSeqLen * headSize + (uint64_t)seqIdx * headSize;
100+
} else {
101+
// [seqLen, batchSize, headNum, headSize] but also need to modify expand and reorder function
102+
return data + (uint64_t)seqIdx * batchSize * headNum * headSize + (uint64_t)batchIdx * headNum * headSize + (uint64_t)headIdx * headSize;
103+
}
96104
}
97105

98106
// Get a head matrix, return the start address and the stride
99107
std::pair<T *, int> getHead(int batchIdx, int headIdx) {
100-
T *addr = data + batchIdx * headNum * headSize + headIdx * headSize;
101-
return std::make_pair(addr, batchSize * headNum * headSize);
108+
if (kvTrans()) {
109+
// [batchSize, headNum, seq, headSize] but also need to modify expand and reorder function
110+
T *addr = data + batchIdx * headNum * maxSeqLen * headSize + headIdx * maxSeqLen * headSize;
111+
return std::make_pair(addr, headSize);
112+
} else {
113+
// [seqLen, batchSize, headNum, headSize] but also need to modify expand and reorder function
114+
T *addr = data + (uint64_t)batchIdx * headNum * headSize + (uint64_t)headIdx * headSize;
115+
return std::make_pair(addr, batchSize * headNum * headSize);
116+
}
102117
}
103118

104119
/**
@@ -120,37 +135,34 @@ class KVCacheTensor {
120135
return;
121136
}
122137

138+
if (!kvTrans()) {
123139
#pragma omp parallel for
124-
for (int seq = 0; seq < seqLen; ++seq) {
125-
for (int b = batchSize - 1; b > 0; --b) {
126-
T *dst = getSequence(seq, b, 0);
127-
T *src = getSequence(seq, b / beamSize, 0);
128-
memcpy(dst, src, headNum * headSize * sizeof(T));
140+
for (int seq = 0; seq < seqLen; ++seq) {
141+
for (int b = batchSize - 1; b > 0; --b) {
142+
T *dst = getSequence(seq, b, 0);
143+
T *src = getSequence(seq, b / beamSize, 0);
144+
memcpy(dst, src, sizeof(T) * headNum * headSize);
145+
}
129146
}
147+
} else {
148+
printf("Unsupported kv tensor optimization [ENABLE_KV_TRANS] in beam search for now.\n");
149+
exit(-1);
130150
}
131151
}
132152

133153
void expandOneSequence(int userSideBS, int beamSize, int seq) {
134-
for (int b = batchSize - 1; b > 0; --b) {
135-
T *dst = getSequence(seq, b, 0);
136-
T *src = getSequence(seq, b / beamSize, 0);
137-
memcpy(dst, src, headNum * headSize * sizeof(T));
154+
if (!kvTrans()) {
155+
for (int b = batchSize - 1; b > 0; --b) {
156+
T *dst = getSequence(seq, b, 0);
157+
T *src = getSequence(seq, b / beamSize, 0);
158+
memcpy(dst, src, sizeof(T) * headNum * headSize);
159+
}
160+
} else {
161+
printf("Unsupported kv tensor optimization [ENABLE_KV_TRANS] in beam search for now.\n");
162+
exit(-1);
138163
}
139164
}
140165

141-
// Below implementation could be a little faster (100.6 vs. 100.9), but also need to modify expand and reorder function
142-
143-
// // Get a vector for a specified sequence
144-
// T *getSequence(int seqIdx, int batchIdx, int headIdx) {
145-
// return data + batchIdx * headNum * maxSeqLen * headSize + headIdx * maxSeqLen * headSize + seqIdx * headSize;
146-
// }
147-
148-
// // Get a head matrix, return the start address and the stride
149-
// std::pair<T *, int> getHead(int batchIdx, int headIdx) {
150-
// T *addr = data + batchIdx * headNum * maxSeqLen * headSize + headIdx * maxSeqLen * headSize;
151-
// return std::make_pair(addr, headSize);
152-
// }
153-
154166
private:
155167
int maxSeqLen;
156168
int batchSize;
@@ -159,4 +171,4 @@ class KVCacheTensor {
159171

160172
T *data;
161173
uint64_t allocSize;
162-
};
174+
};

src/layers/attention.h

Lines changed: 14 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,7 @@ class Attention {
571571
int scoreStride = pastSeqLen > 0 ? (pastSeqLen + ctx->inputSeqLen + 15) / 16 * 16 : ctx->inputSeqLen;
572572
auto bufSizeRequired = ctx->numThreads * mBlockSize * scoreStride;
573573
if (bufSizeRequired > ctx->getScoreCapacity()) {
574-
scoreBuf = (float *)SimpleMemPool::instance().getBuffer("scoreBuf", bufSizeRequired * sizeof(float));
574+
scoreBuf = (float *)SimpleMemPool::instance().getBuffer("scoreBuf", sizeof(float) * bufSizeRequired);
575575
}
576576

577577
#pragma omp parallel for collapse(3)
@@ -680,7 +680,7 @@ class Attention {
680680
}
681681

682682
float *shardedOut = (float *)SimpleMemPool::instance().getBuffer(
683-
"shardedOutput", totalTasks * ctx->attHeadSize * sizeof(float));
683+
"shardedOutput", sizeof(float) * totalTasks * ctx->attHeadSize);
684684

685685
#pragma omp parallel for collapse(3)
686686
for (int b = 0; b < batchSize; ++b) {
@@ -835,6 +835,7 @@ class Attention {
835835
// TODO: kv dtype conversion for prefixSharing
836836
AttnT *k, *v;
837837
int kvStride;
838+
// forcibly convert to AttnT for acceleration purposes
838839
if constexpr (!std::is_same_v<AttnT, ImT>) {
839840
//Timer tmc(true, "convert KV matrix into bf16");
840841
kvStride = kvCols * 2;
@@ -866,28 +867,10 @@ class Attention {
866867

867868
// [batch, src, head, headsize]
868869
scaledDpAttention<AttnT>(query.Data(), k, v, attnMask, scale, batchSize, srcLen, tgtLen, respQHeads,
869-
respKVHeads, headSize, result.Data(), qkvCols, kvStride, result.Stride());
870+
respKVHeads, headSize, result.Data(), query.Stride(), kvStride, result.Stride());
870871

871-
// For group attention, as #kvHeads != #qHeads, need to copy current key/values to cache separately
872-
// When M dimension is split, also multiple tasks per copy, so do copy separately
873-
#pragma omp parallel for collapse(3)
874-
for (uint64_t b = 0; b < batchSize; ++b) {
875-
for (uint64_t i = 0; i < (this->endKVHead - this->startKVHead); ++i) {
876-
// Copy current key/value to cached keys/values
877-
// Re-layout is needed: (bs, seq=1, hidden_size) -> (seq=1, bs, hidden_size)
878-
// Be noted: for group attention, the key/value is less than query
879-
for (uint64_t seq = 0; seq < tgtLen; ++seq) {
880-
auto srcK = key.Data() + b * tgtLen * qkvCols + seq * qkvCols + i * headSize;
881-
auto dstK = presentKey.getSequence(pastSeqLen + seq, b, i);
882-
883-
auto srcV = value.Data() + b * tgtLen * qkvCols + seq * qkvCols + i * headSize;
884-
auto dstV = presentValue.getSequence(pastSeqLen + seq, b, i);
885-
886-
xft::copy(dstK, srcK, headSize);
887-
xft::copy(dstV, srcV, headSize);
888-
}
889-
}
890-
}
872+
// copy current key/values to cache
873+
copyKVCache(ctx, key, value, presentKey, presentValue, pastSeqLen);
891874
}
892875

893876
// scaled dot-product attention: bmm1 + softmax + bmm2
@@ -908,9 +891,9 @@ class Attention {
908891

909892
int numArr = 7;
910893
int arrStride = (4 + tgtBlk + 2 * headSize) * srcBlk;
911-
float *thrBuf = (float *)SimpleMemPool::instance().getBuffer("threadBuffers", nth * arrStride * sizeof(float));
894+
float *thrBuf = (float *)SimpleMemPool::instance().getBuffer("threadBuffers", sizeof(float) * nth * arrStride);
912895
float **thrPtrBuf
913-
= (float **)SimpleMemPool::instance().getBuffer("threadPtrBuffers", nth * numArr * sizeof(float *));
896+
= (float **)SimpleMemPool::instance().getBuffer("threadPtrBuffers", sizeof(float *) * nth * numArr);
914897

915898
float **preSum = thrPtrBuf;
916899
float **sum = thrPtrBuf + nth;
@@ -930,7 +913,7 @@ class Attention {
930913
qArr[i] = thrBuf + srcBlk * nth * (4 + tgtBlk + headSize) + srcBlk * headSize * i;
931914
}
932915

933-
#pragma omp parallel for collapse(3)
916+
#pragma omp parallel for collapse(3) schedule(dynamic)
934917
for (uint64_t i = 0; i < batchSize; ++i) {
935918
for (int j = 0; j < numQHead; ++j) {
936919
for (int m = 0; m < srcLen; m += srcBlk) {
@@ -968,6 +951,11 @@ class Attention {
968951
for (int b = 0; b < tgtLen; b += tgtBlk) {
969952
int kvRealBlk = std::min(tgtBlk, tgtLen - b);
970953
// TODO: mask out
954+
if (enableSkipMsk() && DecoderUtil::skipMskAttn(attnMsk + b, qRealBlk, kvRealBlk, tgtLen)) {
955+
// printf("Skip bs %d head %d src %d tgt %d\n", i, j, m, b);
956+
break;
957+
}
958+
971959
const AttnT *kBlk = k + b * kvStride;
972960
const AttnT *vBlk = v + b * kvStride;
973961

src/models/env_config.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,39 @@ bool enableCATMLP() {
2323
return catMlp == 1;
2424
}
2525

26+
bool tunedComm() {
27+
static int tunedComm = -1;
28+
if (tunedComm == -1) {
29+
tunedComm = (getenv("ENABLE_TUNED_COMM") ? atoi(getenv("ENABLE_TUNED_COMM")) : 1);
30+
if (tunedComm == 1)
31+
printf("ENABLE_TUNED_COMM is enabled for faster reduceAdd.\n");
32+
}
33+
return tunedComm == 1;
34+
}
35+
2636
int getFlashThresh() {
2737
static int envFlashThresh = -1;
2838
if (envFlashThresh == -1)
2939
envFlashThresh = (getenv("FLASH_ATTN_THRESHOLD") ? atoi(getenv("FLASH_ATTN_THRESHOLD")) : 1024);
3040
return envFlashThresh;
3141
}
42+
43+
bool enableSkipMsk() {
44+
static int skipMsk = -1;
45+
if (skipMsk == -1) {
46+
skipMsk = (getenv("ENABLE_SKIP_MASK") ? atoi(getenv("ENABLE_SKIP_MASK")) : 0);
47+
if (skipMsk == 1)
48+
printf("ENABLE_SKIP_MASK is enabled for ignoring mask Q*K.\n");
49+
}
50+
return skipMsk == 1;
51+
}
52+
53+
bool kvTrans() {
54+
static int kvTrans = -1;
55+
if (kvTrans == -1) {
56+
kvTrans = (getenv("ENABLE_KV_TRANS") ? atoi(getenv("ENABLE_KV_TRANS")) : 0);
57+
// if (kvTrans == 1)
58+
// printf("ENABLE_KV_TRANS is enabled for kv cache optimization.\n");
59+
}
60+
return kvTrans == 1;
61+
}

src/models/kvcache_manager.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -141,22 +141,31 @@ void KVCacheManager<KVCacheT>::expandPrefixCache(int layerId, int userSideBS, in
141141
int headNum = dstTensors[0]->getHeadNum();
142142
int headSize = dstTensors[0]->getHeadSize();
143143

144+
if (!kvTrans()) {
144145
#pragma omp parallel for collapse(2)
145-
for (int i = 0; i < 2; ++i) {
146-
for (int seq = 0; seq < seqLen; ++seq) {
147-
auto *src = srcTensors[i]->getSequence(seq, 0, 0);
148-
for (int b = userSideBS - 1; b >= 0; --b) {
149-
auto *dst = dstTensors[i]->getSequence(seq, b, 0);
150-
memcpy(dst, src, headNum * headSize * sizeof(KVCacheT));
146+
for (int i = 0; i < 2; ++i) {
147+
for (int seq = 0; seq < seqLen; ++seq) {
148+
auto *src = srcTensors[i]->getSequence(seq, 0, 0);
149+
for (int b = userSideBS - 1; b >= 0; --b) {
150+
auto *dst = dstTensors[i]->getSequence(seq, b, 0);
151+
memcpy(dst, src, sizeof(KVCacheT) * headNum * headSize);
152+
}
151153
}
152154
}
155+
} else {
156+
printf("Unsupported kv tensor optimization [ENABLE_KV_TRANS] in Prefix mode for now.\n");
157+
exit(-1);
153158
}
154159
}
155160

156161
// Reorder cached keys and values
157162
// TODO: move to KVCacheTensor is better
158163
template <typename KVCacheT>
159164
void KVCacheManager<KVCacheT>::reorderCache(int *idx, int size, int initSeqLen, int accSeqLen) {
165+
if (kvTrans()) {
166+
printf("Unsupported kv tensor optimization [ENABLE_KV_TRANS] in beam search for now.\n");
167+
exit(-1);
168+
}
160169
// Reorder for all the layers
161170
#pragma omp parallel for
162171
for (int layer = 0; layer < this->layers; ++layer) {
@@ -251,4 +260,4 @@ void KVCacheManager<KVCacheT>::reorderCache(int *idx, int size, int initSeqLen,
251260

252261
template class KVCacheManager<float16_t>;
253262
template class KVCacheManager<bfloat16_t>;
254-
template class KVCacheManager<float>;
263+
template class KVCacheManager<float>;

src/utils/decoder_util.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@
2828
#include "transformer_ctx.h"
2929
#include "xdnn.h"
3030

31-
int getFlashThresh();
32-
bool enableCATMLP();
31+
extern int getFlashThresh();
32+
extern bool enableCATMLP();
33+
extern bool enableSkipMsk();
3334

3435
class DecoderUtil {
3536
public:
@@ -580,4 +581,14 @@ class DecoderUtil {
580581
sgemm((T *)AB, C, expABC, m, n, k, k, vStride, n, false, false);
581582
updateOutTile(output, expABC, preSum, sum, preMax, max, m, n, stride);
582583
}
584+
585+
static bool skipMskAttn(const float *attnMask, int m, int n, int stride) {
586+
float lowest = std::numeric_limits<float>::lowest();
587+
// left bottom is lowest
588+
if (attnMask[(m - 1)* stride] == lowest)
589+
return true;
590+
else
591+
return false;
592+
}
593+
583594
};

src/utils/matmul_helper.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1067,7 +1067,7 @@ class MMHelper {
10671067
if constexpr (std::is_same_v<InT, bfloat16_t>) {
10681068
TimeLine t("onednn_amx_sgemm_f32bf16f32_compute_residential");
10691069
#pragma omp parallel for collapse(2)
1070-
for (int i = 0; i < M; ++i) {
1070+
for (uint64_t i = 0; i < M; ++i) {
10711071
for (int j = 0; j < N; ++j) {
10721072
auto remain = N - j;
10731073
__mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1);
@@ -1082,7 +1082,7 @@ class MMHelper {
10821082
if (M > AMXThresholdM) {
10831083
TimeLine t("onednn_amx_sgemm_f32bf16f32_compute_residential");
10841084
#pragma omp parallel for collapse(2)
1085-
for (int i = 0; i < M; ++i) {
1085+
for (uint64_t i = 0; i < M; ++i) {
10861086
for (int j = 0; j < N; ++j) {
10871087
res[i * ldres + j] = res[i * ldres + j] * gamma;
10881088
}
@@ -1624,7 +1624,7 @@ class MMHelper {
16241624
if (C == res) {
16251625
scale_mem = memory(scale_md, *engine);
16261626
#pragma omp parallel for
1627-
for (int i = 0; i < M; ++i) {
1627+
for (uint64_t i = 0; i < M; ++i) {
16281628
memcpy((Tin *)scale_mem.get_data_handle() + i * N, res + i * ldres, N * sizeof(Tin));
16291629
}
16301630
} else {

0 commit comments

Comments
 (0)