Commit ac3bd5c

abenmaopujiang2018 authored and committed
[layers] Add bf16-type input/output support for flash attention (#252)
1 parent 95de632 · commit ac3bd5c

6 files changed, +122 -174 lines

src/layers/attention.h

Lines changed: 46 additions & 33 deletions
@@ -306,17 +306,16 @@ class Attention {
         }
 
         // TODO: refine the logic (and support large inputSeqLen when pastSeqLen > 0)
-        if constexpr (std::is_same_v<InT, bfloat16_t> && std::is_same_v<OutT, bfloat16_t>) {
-            if (pastSeqLen == 0) {
+        if (pastSeqLen == 0) {
+            if (ctx->inputSeqLen >= getFlashThresh()) {
+                flashAttention(ctx, query, key, value, imBuffer, presentKey, presentValue, attnMask, pastSeqLen);
+            } else if constexpr (std::is_same_v<InT, bfloat16_t> && std::is_same_v<OutT, bfloat16_t>) {
                 selfAttentionBF16(ctx, query, key, value, imBuffer, presentKey, presentValue);
             } else {
                 fusedAttention(ctx, query, key, value, imBuffer, presentKey, presentValue, attnMask, pastSeqLen);
             }
         } else {
-            if (ctx->inputSeqLen >= 1024 && pastSeqLen == 0)
-                flashAttention(
-                        ctx, qkvGroupMatMul, outBuffer, imBuffer, presentKey, presentValue, attnMask, pastSeqLen);
-            else { fusedAttention(ctx, query, key, value, imBuffer, presentKey, presentValue, attnMask, pastSeqLen); }
+            fusedAttention(ctx, query, key, value, imBuffer, presentKey, presentValue, attnMask, pastSeqLen);
         }
         t4.release();
 
@@ -809,11 +808,15 @@ class Attention {
         } // end for b
     }
 
-    template <typename KVCacheT, typename AttnT = bfloat16_t>
-    void flashAttention(DecoderContext *ctx, hpj::Matrix<float> &qkvMatMul, hpj::Matrix<float> &tmpBuf,
-            hpj::Matrix<float> &result, KVCacheTensor<KVCacheT> &presentKey, KVCacheTensor<KVCacheT> &presentValue,
-            const float *attnMask, int pastSeqLen) {
-
+    template <typename KVCacheT>
+    void flashAttention(DecoderContext *ctx, hpj::Matrix<ImT> &query, hpj::Matrix<ImT> &key,
+            hpj::Matrix<ImT> &value, hpj::Matrix<ImT> &result, KVCacheTensor<KVCacheT> &presentKey,
+            KVCacheTensor<KVCacheT> &presentValue, const float *attnMask, int pastSeqLen) {
+#if defined(AVX512_BF16_WEIGHT_ONLY_BF16)
+        using AttnT = bfloat16_t;
+#else
+        using AttnT = float;
+#endif
         // How many heads this task should do
         int batchSize = ctx->batchSize;
         int respQHeads = this->endQHead - this->startQHead;
@@ -828,31 +831,41 @@ class Attention {
 
         // TODO: kv dtype conversion for prefixSharing
         AttnT *k, *v;
-        if constexpr (std::is_same_v<AttnT, bfloat16_t>) {
+        int kvStride;
+        if constexpr (!std::is_same_v<AttnT, ImT>) {
+            //Timer tmc(true, "convert KV matrix into bf16");
+            kvStride = kvCols * 2;
+            AttnT *kvBuf = (AttnT *)SimpleMemPool::instance().getBuffer(
+                    "flashKVBuf", batchSize * srcLen * kvStride * sizeof(AttnT));
 #pragma omp parallel for collapse(3)
             for (uint64_t b = 0; b < batchSize; ++b)
                 for (uint64_t seq = 0; seq < srcLen; ++seq)
-                    for (uint64_t i = qCols; i < qkvCols; i += headSize) {
-                        const float *srcPtr = qkvMatMul.Data() + b * srcLen * qkvCols + seq * qkvCols + i;
-                        bfloat16_t *dstPtr
-                                = (bfloat16_t *)tmpBuf.Data() + b * srcLen * kvCols * 2 + seq * kvCols * 2 + i - qCols;
-                        bfloat16_t::cvt_float_to_bfloat16(srcPtr, dstPtr, headSize);
+                    for (uint64_t i = 0; i < kvCols * 2; i += headSize) {
+                        const ImT *srcPtr = key.Data() + b * srcLen * qkvCols + seq * qkvCols + i;
+                        AttnT *dstPtr
+                                = kvBuf + b * srcLen * kvStride + seq * kvStride + i;
+                        if constexpr (std::is_same_v<AttnT, bfloat16_t> && std::is_same_v<ImT, float>) {
+                            bfloat16_t::cvt_float_to_bfloat16(srcPtr, dstPtr, headSize);
+                        } else if constexpr (std::is_same_v<AttnT, float> && std::is_same_v<ImT, bfloat16_t>) {
+                            bfloat16_t::cvt_bfloat16_to_float(srcPtr, dstPtr, headSize);
+                        } else {
+                            printf("Not supported Type in Flash Attention yet\n");
+                            exit(-1);
+                        }
                     }
 
-            k = (AttnT *)tmpBuf.Data();
-            v = (AttnT *)tmpBuf.Data() + kvCols;
+            k = kvBuf;
+            v = kvBuf + kvCols;
         } else {
-            k = qkvMatMul.Data() + respQHeads * headSize;
-            v = qkvMatMul.Data() + (respQHeads + respKVHeads) * headSize;
+            kvStride = qkvCols;
+            k = key.Data();
+            v = value.Data();
         }
 
-        float *query = qkvMatMul.Data();
         // [batch, src, head, headsize]
-        scaledDpAttention<AttnT>(query, k, v, attnMask, scale, batchSize, srcLen, tgtLen, respQHeads, respKVHeads,
-                headSize, result.Data(), qkvCols, kvCols * 2, ctx->hiddenSize);
+        scaledDpAttention<AttnT>(query.Data(), k, v, attnMask, scale, batchSize, srcLen, tgtLen, respQHeads, respKVHeads,
+                headSize, result.Data(), qkvCols, kvStride, result.Stride());
 
-        float *key = qkvMatMul.Data() + respQHeads * headSize;
-        float *value = qkvMatMul.Data() + (respQHeads + respKVHeads) * headSize;
         // For group attention, as #kvHeads != #qHeads, need to copy current key/values to cache seperately
         // When M dimension is split, also multiple tasks per copy, so do copy seperately
 #pragma omp parallel for collapse(3)
@@ -862,10 +875,10 @@
             // Re-layout is needed: (bs, seq=1, hidden_size) -> (seq=1, bs, hidden_size)
            // Be noted: for group attention, the key/value is less than query
            for (uint64_t seq = 0; seq < tgtLen; ++seq) {
-                auto srcK = key + b * tgtLen * qkvCols + seq * qkvCols + i * headSize;
+                auto srcK = key.Data() + b * tgtLen * qkvCols + seq * qkvCols + i * headSize;
                auto dstK = presentKey.getSequence(pastSeqLen + seq, b, i);
 
-                auto srcV = value + b * tgtLen * qkvCols + seq * qkvCols + i * headSize;
+                auto srcV = value.Data() + b * tgtLen * qkvCols + seq * qkvCols + i * headSize;
                auto dstV = presentValue.getSequence(pastSeqLen + seq, b, i);
 
                xft::copy(dstK, srcK, headSize);
@@ -877,8 +890,8 @@
 
     // scaled dot-product attention: bmm1 + softmax + bmm2
     template <typename AttnT>
-    void scaledDpAttention(const float *query, const AttnT *key, const AttnT *value, const float *attnMask, float scale,
-            int batchSize, int srcLen, int tgtLen, int numQHead, int numKVHead, int headSize, float *output,
+    void scaledDpAttention(const ImT *query, const AttnT *key, const AttnT *value, const float *attnMask, float scale,
+            int batchSize, int srcLen, int tgtLen, int numQHead, int numKVHead, int headSize, ImT *output,
            int qStride, int kvStride, int stride) {
         // output = trans(softmax(query * trans(key)) * value)
         int nth = omp_get_max_threads();
@@ -916,17 +929,17 @@
         }
 
 #pragma omp parallel for collapse(3)
-        for (int i = 0; i < batchSize; ++i) {
+        for (uint64_t i = 0; i < batchSize; ++i) {
             for (int j = 0; j < numQHead; ++j) {
                 for (int m = 0; m < srcLen; m += srcBlk) {
                     int tid = omp_get_thread_num();
 
                     int qRealBlk = std::min(srcBlk, srcLen - m);
                     uint64_t srcOff = i * srcLen * qStride + j * headSize;
                     uint64_t outOff = i * srcLen * stride + j * headSize;
-                    const float *qbuf = query + srcOff + m * qStride;
+                    const ImT *qbuf = query + srcOff + m * qStride;
                     AttnT *q = (AttnT *)qArr[tid];
-                    float *out = output + outOff + m * stride;
+                    ImT *out = output + outOff + m * stride;
 
                     // reset out
                     for (int ii = 0; ii < qRealBlk; ++ii) {
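
The heart of these hunks is that flashAttention now takes activations in the intermediate type ImT (float or bfloat16_t) and converts the KV block to a build-selected compute type AttnT only when the two differ. The snippet below is a minimal, self-contained sketch of that if-constexpr conversion dispatch; the bf16 struct, toBf16/toFloat, and convertKV are illustrative stand-ins written for this note, not the project's bfloat16_t API.

```cpp
// Illustrative sketch only: a stand-in bf16 type and the if-constexpr dispatch
// used to convert KV data from the intermediate type ImT to the attention
// compute type AttnT (the real code calls bfloat16_t::cvt_* helpers instead).
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <type_traits>

struct bf16 { uint16_t bits; };  // hypothetical stand-in for bfloat16_t

static inline bf16 toBf16(float f) {          // keep the top 16 bits (truncation)
    uint32_t u; std::memcpy(&u, &f, sizeof(u));
    return bf16{static_cast<uint16_t>(u >> 16)};
}
static inline float toFloat(bf16 b) {
    uint32_t u = static_cast<uint32_t>(b.bits) << 16;
    float f; std::memcpy(&f, &u, sizeof(f));
    return f;
}

// Convert n values from ImT to AttnT; unsupported combinations fail loudly,
// mirroring the new error path in flashAttention.
template <typename AttnT, typename ImT>
void convertKV(const ImT *src, AttnT *dst, int n) {
    if constexpr (std::is_same_v<AttnT, ImT>) {
        std::memcpy(dst, src, n * sizeof(AttnT));          // no conversion needed
    } else if constexpr (std::is_same_v<AttnT, bf16> && std::is_same_v<ImT, float>) {
        for (int i = 0; i < n; ++i) dst[i] = toBf16(src[i]);
    } else if constexpr (std::is_same_v<AttnT, float> && std::is_same_v<ImT, bf16>) {
        for (int i = 0; i < n; ++i) dst[i] = toFloat(src[i]);
    } else {
        std::printf("Not supported Type in Flash Attention yet\n");
        std::exit(-1);
    }
}

int main() {
    float src[4] = {1.0f, 0.5f, -2.25f, 3.140625f};
    bf16 dst[4];
    convertKV(src, dst, 4);                // float -> bf16, as in the bf16 build
    std::printf("%f\n", toFloat(dst[0]));  // prints 1.000000
    return 0;
}
```

In the real code the conversion is vectorized through bfloat16_t::cvt_float_to_bfloat16 / cvt_bfloat16_to_float and runs inside the OpenMP collapse(3) loop over batch, sequence, and head blocks shown in the diff.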

src/layers/mlp_chatglm2.h

Lines changed: 4 additions & 10 deletions
@@ -38,8 +38,7 @@ class ChatGLM2MLP : public LlamaMLP<WeiT> {
         auto range = SplitUtil::getTaskRange(intermediateSize, ctx->numSplit, ctx->splitIdx);
         int colSplit = range.second - range.first;
 
-        setMLPOPTConfig();
-        if (!enableCATMLP) {
+        if (!enableCATMLP()) {
             OriWeiT *gateW = (OriWeiT *)malloc(hiddenSize * colSplit * sizeof(OriWeiT));
             OriWeiT *upW = (OriWeiT *)malloc(hiddenSize * colSplit * sizeof(OriWeiT));
             if (trans) {
@@ -93,14 +92,9 @@ class ChatGLM2MLP : public LlamaMLP<WeiT> {
             }
         }
         // Horizontally split the down weight
-        if (enableCBLASMLP && std::is_same_v<WeiT, bfloat16_t>) {
-            ctx->mmHelper->convertWeight(ctx, trans, intermediateSize, hiddenSize, downW, nullptr, nullptr, false,
-                    this->downWeight, this->downWeightScale, this->downWeightZero, this->gateWeightSum);
-        } else {
-            ctx->mmHelper->convertWeight(ctx, trans, intermediateSize, hiddenSize, downW, nullptr, nullptr, false,
-                    convertedDownWeight, this->downWeightScale, this->downWeightZero, this->downWeightSum);
-            ctx->mmHelper->packWeight(trans, convertedDownWeight, this->downWeight);
-        }
+        ctx->mmHelper->convertWeight(ctx, trans, intermediateSize, hiddenSize, downW, nullptr, nullptr, false,
+                convertedDownWeight, this->downWeightScale, this->downWeightZero, this->downWeightSum);
+        ctx->mmHelper->packWeight(trans, convertedDownWeight, this->downWeight);
 #ifdef DEBUG
         this->dbg.debugPrint("convertedGateWeight [%d, %d](%d):\n", convertedGateWeight.Rows(),
                 convertedGateWeight.Cols(), convertedGateWeight.Stride());

src/layers/mlp_llama.cpp

Lines changed: 0 additions & 8 deletions
@@ -17,14 +17,6 @@
 
 #include <unordered_map>
 
-bool enableCATMLP;
-bool enableCBLASMLP;
-
-void setMLPOPTConfig() {
-    enableCATMLP = (getenv("ENABLE_CAT_MLP") ? atoi(getenv("ENABLE_CAT_MLP")) : 1);
-    enableCBLASMLP = (getenv("ENABLE_CBLAS_MLP") ? atoi(getenv("ENABLE_CBLAS_MLP")) : 0);
-}
-
 namespace xft {
 
 void invokeMLPLLaMA(DataType dt, int numTokens, int hiddenSize, int intermediateSize, void *output, int outputStride,

src/layers/mlp_llama.h

Lines changed: 19 additions & 92 deletions
@@ -23,9 +23,6 @@
 #include "singleton.h"
 #include "timeline.h"
 
-extern bool enableCATMLP;
-extern bool enableCBLASMLP;
-void setMLPOPTConfig();
 // C++ implementation for the python code in modeling_llama.py:
 // residual = hidden_states
 // hidden_states = self.post_attention_layernorm(hidden_states)
@@ -65,8 +62,7 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
         ctx->mmHelper->convertWeight(ctx, trans, hiddenSize, imSize, upW, upS, upZ, true, quantizedUpWeight,
                 upWeightScale, upWeightZero, upWeightSum);
 
-        setMLPOPTConfig();
-        if (!enableCATMLP) {
+        if (!enableCATMLP()) {
             gateWeight.Resize(hiddenSize, it.second - it.first);
             upWeight.Resize(hiddenSize, it.second - it.first);
             ctx->mmHelper->packWeight(trans, quantizedGateWeight, gateWeight);
@@ -82,14 +78,9 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
             ctx->mmHelper->packWeight(trans, quantizedCatWeights, catWeights);
         }
         // Horizontally split the down weight
-        if (enableCBLASMLP && std::is_same_v<WeiT, bfloat16_t>) {
-            ctx->mmHelper->convertWeight(ctx, trans, imSize, hiddenSize, downW, downS, downZ, false, downWeight,
-                    downWeightScale, downWeightZero, downWeightSum);
-        } else {
-            ctx->mmHelper->convertWeight(ctx, trans, imSize, hiddenSize, downW, downS, downZ, false,
-                    quantizedDownWeight, downWeightScale, downWeightZero, downWeightSum);
-            ctx->mmHelper->packWeight(trans, quantizedDownWeight, downWeight);
-        }
+        ctx->mmHelper->convertWeight(ctx, trans, imSize, hiddenSize, downW, downS, downZ, false,
+                quantizedDownWeight, downWeightScale, downWeightZero, downWeightSum);
+        ctx->mmHelper->packWeight(trans, quantizedDownWeight, downWeight);
 
 #ifdef DEBUG
         dbg.debugPrint("quantizedGateWeight:\n");
@@ -137,7 +128,7 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
         dbg.dumpMatrix(normBuffer);
 #endif
 
-        if (!enableCATMLP) {
+        if (!enableCATMLP()) {
             hpj::Matrix<ImT> imBuffer(
                     (ImT *)ctx->imOut.Data(), ctx->imOut.Rows(), ctx->imOut.Cols(), ctx->imOut.Stride());
             gateProj(ctx, doLnBefore ? normBuffer : inBuffer, imBuffer);
@@ -165,31 +156,19 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
         hpj::Matrix<ImT> imBuffer((ImT *)ctx->imOut.Data(), M, N, N);
 
         // Need to allocate extra buffer as oneDNN does not support the case of stride > cols
-        if constexpr (std::is_same_v<ImT, bfloat16_t>) {
-            const int cols = N / 2;
-            auto bufSize = M * cols * sizeof(ImT);
-            ImT *t = (ImT *)SimpleMemPool::instance().getBuffer("mlp_silu", bufSize);
-            hpj::Matrix<ImT> siluBuf(t, M, cols, cols);
-
-            catGateUpProj(ctx, doLnBefore ? normBuffer : inBuffer, imBuffer, siluBuf);
-#ifdef DEBUG
-            dbg.debugPrint("gateUp output:\n");
-            dbg.dumpMatrix(siluBuf);
-#endif
-            downProj(ctx, siluBuf, outBuffer, inBuffer, ctx->splitIdx == 0);
-        }
+        const int cols = N / 2;
+        auto bufSize = M * cols * sizeof(ImT);
+        ImT *t = (ImT *)SimpleMemPool::instance().getBuffer("mlp_silu", bufSize);
+        hpj::Matrix<ImT> siluBuf(t, M, cols, cols);
 
-        // Use imBuffer as silu buffer
-        else {
-            catGateUpProj(ctx, doLnBefore ? normBuffer : inBuffer, imBuffer, imBuffer);
+        catGateUpProj(ctx, doLnBefore ? normBuffer : inBuffer, imBuffer, siluBuf);
 #ifdef DEBUG
-            dbg.debugPrint("catWeights:\n");
-            dbg.dumpMatrix(catWeights);
-            dbg.debugPrint("gateUp output:\n");
-            dbg.dumpMatrix(imBuffer);
+        dbg.debugPrint("catWeights:\n");
+        dbg.dumpMatrix(catWeights);
+        dbg.debugPrint("gateUp output:\n");
+        dbg.dumpMatrix(siluBuf);
 #endif
-            downProj(ctx, imBuffer, outBuffer, inBuffer, ctx->splitIdx == 0);
-        }
+        downProj(ctx, siluBuf, outBuffer, inBuffer, ctx->splitIdx == 0);
     }
 
 #ifdef DEBUG
@@ -248,7 +227,7 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
         TimeLine t("DownProj");
 
         assert(input.Rows() == output.Rows());
-        if (!enableCATMLP)
+        if (!enableCATMLP())
             assert(input.Cols() == downWeight.Rows());
         else
             assert(input.Cols() == 2 * downWeight.Rows());
@@ -266,62 +245,10 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
         const InT *R = residential.Data();
 
         if (isMaster) {
-            // TODO: enable below code (currently disabled as hard to get tmpBuf from pre-alloced memory)
-            // if (enableCBLASMLP && std::is_same_v<WeiT, bfloat16_t>) {
-            //     computeProjBF16(A, B, C, M, N, K, lda, ldc, ldc, R, ldr, tmpBuf, ldt);
-            // }
-            {
-                ctx->mmHelper->compute_residential(
-                        false, M, N, K, 1.0f, A, lda, B, scaleB, zeroB, sumB, 0.0f, C, ldc, NULL, R, ldr);
-            }
+            ctx->mmHelper->compute_residential(
+                    false, M, N, K, 1.0f, A, lda, B, scaleB, zeroB, sumB, 0.0f, C, ldc, NULL, R, ldr);
         } else {
-            // if (enableCBLASMLP && std::is_same_v<WeiT, bfloat16_t>) {
-            //     computeProjBF16(A, B, C, M, N, K, lda, ldc, ldc, nullptr, 0, tmpBuf, ldt);
-            // }
-            {
-                ctx->mmHelper->compute(false, M, N, K, 1.0f, A, lda, B, scaleB, zeroB, sumB, 0.0f, C, ldc);
-            }
-        }
-    }
-
-    // C = (R == nullptr ? A * B : A * B + R)
-    // T: temporary buffer if C is not in float
-    void computeProjBF16(const ImT *A, const WeiT *B, OutT *C, int M, int N, int K, int lda, int ldb, int ldc,
-            const InT *R, int ldr, float *T, int ldt) {
-        int alpha = 1.0;
-        int beta = 0.0;
-
-        // MKL needs float as output, use T (temporary buffer) as output if C is not in float
-        float *D = std::is_same_v<OutT, float> ? (float *)C : T;
-        int ldd = std::is_same_v<OutT, float> ? ldc : ldt;
-
-        REQUIRES(D != nullptr, "Incorrect parameter in computeProjBF16.");
-
-        if (R != nullptr) {
-#pragma omp parallel for
-            for (uint64_t i = 0; i < M; ++i) {
-                xft::copy(D + i * ldd, R + i * ldr, N);
-            }
-            beta = 1.0;
-        }
-
-        int ldaH = lda * sizeof(ImT) / sizeof(bfloat16_t); // stride in bf16
-        if constexpr (std::is_same_v<ImT, float>) {
-#pragma omp parallel for
-            for (uint64_t i = 0; i < M; ++i) {
-                bfloat16_t::cvt_float_to_bfloat16(A + i * lda, (bfloat16_t *)A + i * ldaH, K);
-            }
-        }
-
-        cblas_gemm_bf16bf16f32(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, (const MKL_BF16 *)(A), ldaH,
-                (const MKL_BF16 *)(B), ldb, beta, D, ldd);
-
-        // Convert result from float to OutT
-        if constexpr (!std::is_same_v<OutT, float>) {
-#pragma omp parallel for
-            for (uint64_t i = 0; i < M; ++i) {
-                xft::copy(C + i * ldc, D + i * ldd, N);
-            }
+            ctx->mmHelper->compute(false, M, N, K, 1.0f, A, lda, B, scaleB, zeroB, sumB, 0.0f, C, ldc);
         }
     }
 
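
A note on the comment kept above ("oneDNN does not support the case of stride > cols"): after the concatenated gate/up projection, the SiLU(gate) * up result occupies only N/2 columns of the M x N imBuffer, so a view of it would have stride N but only N/2 columns. The unified path therefore always hands catGateUpProj a dense M x (N/2) "mlp_silu" buffer whose stride equals its column count. The sketch below only illustrates the layout constraint; the View type and the copy loop are hypothetical and are not how the project fills the buffer (catGateUpProj writes into it directly).

```cpp
// Illustrative sketch (hypothetical types, not project code): why a half-width
// strided view of an M x N buffer is replaced by a dense M x N/2 buffer whose
// stride equals its column count before being fed to the next matmul.
#include <cstdio>
#include <vector>

struct View {                // hypothetical row-major matrix view
    float *data;
    int rows, cols, stride;  // stride = elements between consecutive rows
};

int main() {
    const int M = 2, N = 8;
    std::vector<float> gateUpOut(M * N, 1.0f);  // result area of the concatenated gate/up projection

    // Taking only the first N/2 columns gives stride (8) > cols (4),
    // which the oneDNN-backed matmul path does not accept.
    View half{gateUpOut.data(), M, N / 2, N};

    // A dense buffer with stride == cols, analogous to the "mlp_silu" buffer
    // obtained from SimpleMemPool; the copy here is purely for illustration.
    std::vector<float> siluBuf(M * (N / 2));
    for (int i = 0; i < half.rows; ++i)
        for (int j = 0; j < half.cols; ++j)
            siluBuf[i * half.cols + j] = half.data[i * half.stride + j];

    std::printf("packed %zu elements, stride == cols == %d\n", siluBuf.size(), N / 2);
    return 0;
}
```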

src/models/env_config.cpp

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+// Copyright (c) 2024 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// ============================================================================
+#include <cstdlib>
+#include <iostream>
+#include <stdlib.h>
+
+bool enableCATMLP() {
+    static int catMlp = -1;
+    if (catMlp == -1)
+        catMlp = (getenv("ENABLE_CAT_MLP") ? atoi(getenv("ENABLE_CAT_MLP")) : 1);
+    return catMlp == 1;
+}
+
+int getFlashThresh() {
+    static int envFlashThresh = -1;
+    if (envFlashThresh == -1)
+        envFlashThresh = (getenv("FLASH_ATTN_THRESHOLD") ? atoi(getenv("FLASH_ATTN_THRESHOLD")) : 1024);
+    return envFlashThresh;
+}
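
Both accessors cache the environment variable in a function-local static on the first call, so overrides must be in place before the first attention/MLP forward pass. A minimal usage sketch, assuming these declarations are exposed through a project header and the program links against env_config.cpp:

```cpp
// Minimal sketch: overriding the new knobs before the model runs.
// ENABLE_CAT_MLP (default 1) and FLASH_ATTN_THRESHOLD (default 1024) are read
// once and cached, so they must be set before first use.
#include <cstdio>
#include <stdlib.h>  // setenv (POSIX)

// Declarations as defined in src/models/env_config.cpp (assumed to be exposed
// through a project header).
bool enableCATMLP();
int getFlashThresh();

int main() {
    setenv("FLASH_ATTN_THRESHOLD", "512", /*overwrite=*/1);  // use flash attention for prompts >= 512 tokens
    setenv("ENABLE_CAT_MLP", "0", /*overwrite=*/1);          // fall back to separate gate/up projections

    std::printf("catMLP=%d flashThresh=%d\n", enableCATMLP(), getFlashThresh());
    // ... build the model and run generation; later setenv calls have no effect
    // because the values above are now cached in function-local statics.
    return 0;
}
```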
