Skip to content

Commit b4a4193

Browse files
committed
update xdnn and pack to support fp8 gemm in prefill
1 parent 603bb72 commit b4a4193

File tree

4 files changed

+13
-9
lines changed

4 files changed

+13
-9
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,9 @@ Please install libnuma package:
153153
git checkout <latest-tag>
154154
# Please make sure torch is installed when run python example
155155
mkdir build && cd build
156+
# Notice: use gcc-13 or higher
156157
cmake ..
158+
# If you see the error "numa.h: No such file or directory", install libnuma first, then build with "CPATH=$CONDA_PATH/include/:$CPATH make -j".
157159
make -j
158160
```
159161
- Using `python setup.py`

README_CN.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,9 @@ docker run -it \
154154
git checkout <latest-tag>
155155
# 如果使用python示例,请确保已经安装torch。
156156
mkdir build && cd build
157+
# 注意使用gcc-13及以上版本
157158
cmake ..
159+
# 若遇到错误 "numa.h: No such file or directory",需要先安装numa包,然后使用 "CPATH=$CONDA_PATH/include/:$CPATH make -j"完成编译
158160
make -j
159161
```
160162
- 使用 `python setup.py`

cmake/xdnn.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ include(ExternalProject)
2626

2727
# cmake-format: off
2828
ExternalProject_Add(xdnn_lib
29-
URL https://github.com/intel/xFasterTransformer/releases/download/IntrinsicGemm/xdnn_v1.5.7.tar.gz
30-
URL_HASH MD5=6cad71df05ef120e058bce28a0a478a8
29+
URL https://github.com/intel/xFasterTransformer/releases/download/IntrinsicGemm/xdnn_v1.5.9.tar.gz
30+
URL_HASH MD5=3aa9cd15df3eb2a7a1c178f3edcf9d37
3131
TIMEOUT 120
3232
SOURCE_DIR ${CMAKE_SOURCE_DIR}/3rdparty/xdnn
3333
CONFIGURE_COMMAND ""

src/utils/matmul_helper.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -524,12 +524,12 @@ class MMHelper {
524524

525525
// E4M3
526526
else if constexpr (std::is_same_v<WeiT, e4m3_t>) {
527-
int amx_rows = (int)((K + 15) / 16) * 16;
528-
int amx_cols = (int)((N + 63) / 64) * 64;
529-
if (!weight.isShadow()) weight.Resize(amx_rows, amx_cols);
530-
memset(weight.Data(), 0, sizeof(e4m3_t) * amx_rows * amx_cols);
527+
int blockSize = 32;
528+
size_t pack_size = xdnn_small_amx_sgemm_bf16f8bf16_packb_size(K, N, blockSize);
529+
if (!weight.isShadow()) weight.Resize((pack_size + N - 1) / N, N);
530+
memset(weight.Data(), 0, sizeof(e4m3_t) * pack_size);
531531
xdnn_small_amx_sgemm_bf16f8bf16_packb(trans, N, K, (const XDNN_E4M3 *)src.Data(), src.Stride(),
532-
(XDNN_E4M3 *)weight.Data(), 64);
532+
(XDNN_E4M3 *)weight.Data(), blockSize);
533533
}
534534
}
535535

@@ -691,7 +691,7 @@ class MMHelper {
691691

692692
// E4M3
693693
else if constexpr (std::is_same_v<WeiT, e4m3_t>) {
694-
if (M <= 16) {
694+
if (true) {
695695
assert(blockSize == 128);
696696
if (lds == -1) lds = (K + 127) / 128;
697697
GEMMVERBOSE("xdnn_gemm_bf16f8bf16_compute",
@@ -1509,7 +1509,7 @@ class MMHelper {
15091509

15101510
// E4M3
15111511
else if constexpr (std::is_same_v<WeiT, e4m3_t>) {
1512-
if (M <= 16) {
1512+
if (true) {
15131513
assert(blockSize == 128);
15141514
if (lds == -1) lds = (K + 127) / 128;
15151515
GEMMVERBOSE("xdnn_gemm_bf16f8bf16_compute_residential",

0 commit comments

Comments
 (0)