Skip to content

Commit 1479d3d

Browse files
authored
Enabled flash attn varlen for chunked prefill (#3148)
1 parent 5e7bb51 commit 1479d3d

File tree

9 files changed

+822
-99
lines changed

9 files changed

+822
-99
lines changed

csrc/cpu/aten/PagedAttention.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ namespace cpu {
77

88
IPEX_DEFINE_DISPATCH(single_query_cached_kv_attention_kernel_stub);
99
IPEX_DEFINE_DISPATCH(reshape_and_cache_kernel_stub);
10+
IPEX_DEFINE_DISPATCH(flash_attn_var_len_kernel_stub);
1011

1112
/*
1213
*Calculate the masked multihead attention for decoder layer in decoder only
@@ -48,6 +49,35 @@ void reshape_and_cache_cpu(
4849
kCPU, key, value, key_cache, value_cache, slot_mapping);
4950
}
5051

52+
/*
 * CPU entry point for variable-length flash attention (chunked prefill over a
 * paged KV cache). Forwards every argument unchanged to the CPU kernel
 * registered on flash_attn_var_len_kernel_stub; the result is written into
 * `out` in place.
 *
 * @param out            output tensor, written in place by the kernel
 * @param query          query tensor
 * @param key            key tensor (paged cache layout — see block_table)
 * @param value          value tensor (paged cache layout — see block_table)
 * @param cu_seqlens_q   per-sequence offsets for queries
 *                       (presumably cumulative lengths — TODO confirm)
 * @param cu_seqlens_kv  per-sequence offsets for keys/values
 *                       (presumably cumulative lengths — TODO confirm)
 * @param max_seqlen_q   maximum query sequence length in the batch
 * @param max_seqlen_kv  maximum key/value sequence length in the batch
 * @param softmax_scale  scale applied to attention scores before softmax
 * @param is_causal      apply causal masking when true
 * @param block_table    maps sequences to blocks of the paged KV cache
 * @param alibi_slopes   optional per-head ALiBi slopes; pass nullopt to skip
 */
void flash_attn_varlen_cpu(
    at::Tensor& out,
    at::Tensor& query,
    at::Tensor& key,
    at::Tensor& value,
    at::Tensor& cu_seqlens_q,
    at::Tensor& cu_seqlens_kv,
    int64_t max_seqlen_q,
    int64_t max_seqlen_kv,
    const double softmax_scale,
    bool is_causal,
    at::Tensor& block_table,
    const c10::optional<at::Tensor>& alibi_slopes) {
  // Dispatch to the ISA-specific kernel implementation. The stub returns
  // void, so no `return` is needed here.
  flash_attn_var_len_kernel_stub(
      kCPU,
      out,
      query,
      key,
      value,
      cu_seqlens_q,
      cu_seqlens_kv,
      max_seqlen_q,
      max_seqlen_kv,
      softmax_scale,
      is_causal,
      block_table,
      alibi_slopes);
}
80+
5181
} // namespace cpu
5282
} // namespace torch_ipex
5383

@@ -68,5 +98,14 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) {
6898
"reshape_and_cache",
6999
c10::DispatchKey::CPU,
70100
torch_ipex::cpu::reshape_and_cache_cpu);
101+
m.def(
102+
"flash_attn_varlen_func(Tensor (a!)out, Tensor (a!)query, Tensor (a!)key, Tensor (a!)value, Tensor(a!) cu_seqlens_q,\
103+
Tensor(a!) cu_seqlens_kv, int max_seqlen_q, int max_seqlen_kv, float softmax_scale, bool is_causal, Tensor(a!) block_table, \
104+
Tensor? alibi_slopes)-> ()");
105+
106+
m.impl(
107+
"flash_attn_varlen_func",
108+
c10::DispatchKey::CPU,
109+
torch_ipex::cpu::flash_attn_varlen_cpu);
71110
}
72111
} // namespace

csrc/cpu/aten/PagedAttention.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,20 @@ void reshape_and_cache(
2929
at::Tensor& value_cache,
3030
at::Tensor& slot_mapping);
3131

32+
// Variable-length flash attention for chunked prefill on CPU; writes the
// result into `out` in place. `cu_seqlens_q`/`cu_seqlens_kv` carry
// per-sequence offsets (presumably cumulative lengths — TODO confirm), and
// `block_table` maps sequences to blocks of the paged KV cache.
// NOTE(review): the definition in PagedAttention.cpp is named
// flash_attn_varlen_cpu — confirm this declaration actually has a matching
// definition, otherwise it is dead and should agree with the .cpp name.
void flash_attn_varlen(
    at::Tensor& out,
    at::Tensor& query,
    at::Tensor& key,
    at::Tensor& value,
    at::Tensor& cu_seqlens_q,
    at::Tensor& cu_seqlens_kv,
    int64_t max_seqlen_q,
    int64_t max_seqlen_kv,
    const double softmax_scale,
    bool is_causal,
    at::Tensor& block_table,
    const c10::optional<at::Tensor>& alibi_slopes);
45+
3246
using single_query_cached_kv_attention_fn = void (*)(
3347
at::Tensor& out, // [num_seqs, num_heads, head_size]
3448
at::Tensor& query, // [num_seqs, num_heads, head_size]
@@ -49,10 +63,25 @@ using reshape_and_cache_fn = void (*)(
4963
at::Tensor& value_cache,
5064
at::Tensor& slot_mapping);
5165

66+
// Signature of a variable-length flash-attention CPU kernel. ISA-specific
// implementations with this signature are registered on
// flash_attn_var_len_kernel_stub. Parameters mirror flash_attn_varlen:
// `out` is written in place, `cu_seqlens_q`/`cu_seqlens_kv` carry
// per-sequence offsets, and `block_table` maps sequences to paged KV-cache
// blocks.
using flash_attn_var_len_fn = void (*)(
    at::Tensor& out,
    at::Tensor& query,
    at::Tensor& key,
    at::Tensor& value,
    at::Tensor& cu_seqlens_q,
    at::Tensor& cu_seqlens_kv,
    int64_t max_seqlen_q,
    int64_t max_seqlen_kv,
    const double softmax_scale,
    bool is_causal,
    at::Tensor& block_table,
    const c10::optional<at::Tensor>& alibi_slopes);
79+
5280
IPEX_DECLARE_DISPATCH(
5381
single_query_cached_kv_attention_fn,
5482
single_query_cached_kv_attention_kernel_stub);
5583
IPEX_DECLARE_DISPATCH(reshape_and_cache_fn, reshape_and_cache_kernel_stub);
84+
IPEX_DECLARE_DISPATCH(flash_attn_var_len_fn, flash_attn_var_len_kernel_stub);
5685

5786
} // namespace cpu
5887
} // namespace torch_ipex

csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp

Lines changed: 1 addition & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -8,102 +8,14 @@
88

99
#include <ATen/Tensor.h>
1010
#include <aten/FlashAttention.h>
11+
#include <aten/utils/mkl_gemm.h>
1112
#include <torch/all.h>
1213
#include <torch/csrc/autograd/function.h>
1314
#include <limits>
1415
#include "../cpu/utils/isa_utils.h"
1516
#include "csrc/cpu/tpp/woq/tla.h"
16-
#include "mkl.h"
1717
#include "vec/vec.h"
1818

19-
inline void _mkl_gemm(
20-
const CBLAS_LAYOUT layout,
21-
const CBLAS_TRANSPOSE transa,
22-
const CBLAS_TRANSPOSE transb,
23-
const int& m,
24-
const int& n,
25-
const int& k,
26-
const float& alpha,
27-
const float* a,
28-
const int& lda,
29-
const float* b,
30-
const int& ldb,
31-
const float& beta,
32-
float* c,
33-
const int& ldc) {
34-
cblas_sgemm(
35-
layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
36-
}
37-
38-
inline void _mkl_gemm(
39-
const CBLAS_LAYOUT layout,
40-
const CBLAS_TRANSPOSE transa,
41-
const CBLAS_TRANSPOSE transb,
42-
const int& m,
43-
const int& n,
44-
const int& k,
45-
const double& alpha,
46-
const double* a,
47-
const int& lda,
48-
const double* b,
49-
const int& ldb,
50-
const double& beta,
51-
double* c,
52-
const int& ldc) {
53-
cblas_dgemm(
54-
layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
55-
}
56-
57-
inline void _mkl_gemm(
58-
const CBLAS_LAYOUT layout,
59-
const CBLAS_TRANSPOSE transa,
60-
const CBLAS_TRANSPOSE transb,
61-
const int& m,
62-
const int& n,
63-
const int& k,
64-
const float& alpha,
65-
const at::BFloat16* a,
66-
const int& lda,
67-
const at::BFloat16* b,
68-
const int& ldb,
69-
const float& beta,
70-
float* c,
71-
const int& ldc) {
72-
cblas_gemm_bf16bf16f32(
73-
layout,
74-
transa,
75-
transb,
76-
m,
77-
n,
78-
k,
79-
alpha,
80-
(const MKL_BF16*)(a),
81-
lda,
82-
(const MKL_BF16*)(b),
83-
ldb,
84-
beta,
85-
c,
86-
ldc);
87-
}
88-
89-
inline void _mkl_gemm(
90-
const CBLAS_LAYOUT layout,
91-
const CBLAS_TRANSPOSE transa,
92-
const CBLAS_TRANSPOSE transb,
93-
const int& m,
94-
const int& n,
95-
const int& k,
96-
const float& alpha,
97-
const at::Half* a,
98-
const int& lda,
99-
const at::Half* b,
100-
const int& ldb,
101-
const float& beta,
102-
float* c,
103-
const int& ldc) {
104-
TORCH_CHECK(false, "_mkl_gemm does not support FP16 yet");
105-
}
106-
10719
namespace torch_ipex {
10820
using namespace tpp;
10921
namespace cpu {

0 commit comments

Comments
 (0)