intel
diff --git a/‎include/dtype.h‎
Lines changed: 1 addition & 0 deletions b/‎include/dtype.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎src/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/common/float16.h‎
Lines changed: 53 additions & 0 deletions b/‎src/common/float16.h‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎src/layers/attention.h‎
Lines changed: 2 additions & 7 deletions b/‎src/layers/attention.h‎
Lines changed: 2 additions & 7 deletions
diff --git a/‎src/layers/dist_linear.h‎
Lines changed: 2 additions & 2 deletions b/‎src/layers/dist_linear.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/layers/layer_norm.cpp‎
Lines changed: 17 additions & 8 deletions b/‎src/layers/layer_norm.cpp‎
Lines changed: 17 additions & 8 deletions
diff --git a/‎src/layers/layer_norm.h‎
Lines changed: 6 additions & 2 deletions b/‎src/layers/layer_norm.h‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎src/layers/rms_norm.cpp‎
Lines changed: 5 additions & 0 deletions b/‎src/layers/rms_norm.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/layers/rms_norm.h‎
Lines changed: 3 additions & 1 deletion b/‎src/layers/rms_norm.h‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/layers/token_embedding.h‎
Lines changed: 5 additions & 1 deletion b/‎src/layers/token_embedding.h‎
Lines changed: 5 additions & 1 deletion
@@ -31,6 +31,7 @@ enum DataType {
     w8a8_int8,
     w8a8_int4,
     w8a8_nf4,
+    unknown,
 };
 
 enum DeviceKind {
 
@@ -24,7 +24,7 @@ add_subdirectory(comm_helper)
 
 add_library(xfastertransformer_static STATIC)
 
-set(SRC_LIB_LIST "utils" "layers" "kernels" "models" "searchers")
+set(SRC_LIB_LIST "utils" "layers" "kernels" "models" "searchers" "stdc++fs")
 
 target_link_libraries(xfastertransformer_static
                       ${SRC_LIB_LIST}
 
@@ -42,7 +42,9 @@ class float16_t {
     operator float() const;
 
     static void cvt_float_to_float16(const float *src, float16_t *dst, int size);
+    static void cvt_float_to_float16_MT(const float *src, float16_t *dst, int size);
     static void cvt_float16_to_float(const float16_t *src, float *dst, int size);
+    static void cvt_float16_to_float_MT(const float16_t *src, float *dst, int size);
     static void float_add_float16(const float *src1, const float16_t *src2, float *dst, int size);
 
 private:
@@ -150,6 +152,36 @@ inline void float16_t::cvt_float_to_float16(const float *src, float16_t *dst, in
     }
 }
 
+inline void float16_t::cvt_float_to_float16_MT(const float *src, float16_t *dst, int size) { // Converts `size` floats from src into float16_t values in dst; full 16-element blocks are spread over OpenMP threads
+    // Rounding mode for the conversion: round to nearest, with floating-point exceptions suppressed
+    constexpr int rounding_mode = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+
+    // 16 floats fill one 512-bit AVX512 register, so the input is handled 16 elements at a time
+    constexpr int kStep = 16;
+    int blockSize = size / kStep; // number of full 16-element blocks
+    int remainder = size % kStep; // leftover elements, converted after the loop
+
+    // Full blocks: iteration i reads and writes only elements [i * 16, i * 16 + 16), so iterations are independent
+#pragma omp parallel for
+    for (int i = 0; i < blockSize; ++i) {
+        // Load the input floats into an AVX512 register (unaligned load)
+        __m512 input_vector = _mm512_loadu_ps(src + i * kStep);
+
+        // Convert the 16 floats to 16 half-precision values using AVX512 intrinsics
+        __m256i output_vector = _mm512_cvtps_ph(input_vector, rounding_mode);
+
+        // Store the converted values in the output array
+        _mm256_mask_storeu_epi16(dst + i * kStep, 0xffff, output_vector); // mask 0xffff enables all 16 lanes
+    }
+
+    if (remainder != 0) { // tail shorter than one block, handled outside the parallel loop
+        __mmask16 mask = 0xFFFF >> (kStep - remainder); // lowest `remainder` bits set
+        __m512 input_vector = _mm512_maskz_loadu_ps(mask, src + size - remainder); // lanes outside the mask are zeroed
+        __m256i output_vector = _mm512_cvtps_ph(input_vector, rounding_mode);
+        _mm256_mask_storeu_epi16(dst + size - remainder, mask, output_vector); // writes only the lanes selected by mask
+    }
+}
+
 inline void float16_t::cvt_float16_to_float(const float16_t *src, float *dst, int size) {
     // Process 16 floats (AVX512 is a 512-bit SIMD register)
     constexpr int kStep = 16;
@@ -170,6 +202,27 @@ inline void float16_t::cvt_float16_to_float(const float16_t *src, float *dst, in
     }
 }
 
+inline void float16_t::cvt_float16_to_float_MT(const float16_t *src, float *dst, int size) { // Converts `size` float16_t values from src into floats in dst; full 16-element blocks are spread over OpenMP threads
+    // 16 floats fill one 512-bit AVX512 register, so the data is handled 16 elements at a time
+    constexpr int kStep = 16;
+    int blockSize = size / kStep; // number of full 16-element blocks
+    int remainder = size % kStep; // leftover elements, converted after the loop
+
+#pragma omp parallel for
+    for (int i = 0; i < blockSize; ++i) { // iteration i touches only elements [i * 16, i * 16 + 16)
+        __m256i input_vector = _mm256_maskz_loadu_epi16(0xffff, src + i * kStep); // mask 0xffff loads all 16 half values
+        __m512 output_vector = _mm512_cvtph_ps(input_vector); // widen 16 halves to 16 floats
+        _mm512_storeu_ps(dst + i * kStep, output_vector);
+    }
+
+    if (remainder != 0) { // tail shorter than one block, handled outside the parallel loop
+        __mmask16 mask = 0xFFFF >> (kStep - remainder); // lowest `remainder` bits set
+        __m256i input_vector = _mm256_maskz_loadu_epi16(mask, src + size - remainder); // lanes outside the mask are zeroed
+        __m512 output_vector = _mm512_cvtph_ps(input_vector);
+        _mm512_mask_storeu_ps(dst + size - remainder, mask, output_vector); // writes only the lanes selected by mask
+    }
+}
+
 inline void float16_t::float_add_float16(const float *src1, const float16_t *src2, float *dst, int size) {
     constexpr int kStep = 16;
     int blockSize = size / kStep;
 
@@ -812,13 +812,8 @@ class Attention {
                     auto srcV = value + b * tgtLen * qkvCols + seq * qkvCols + i * headSize;
                     auto dstV = presentValue.getSequence(pastSeqLen + seq, b, i);
 
-                    if constexpr (std::is_same_v<KVCacheT, float>) {
-                        memcpy(dstK, srcK, headSize * sizeof(float));
-                        memcpy(dstV, srcV, headSize * sizeof(float));
-                    } else if constexpr (std::is_same_v<KVCacheT, float16_t>) {
-                        float16_t::cvt_float_to_float16(srcK, dstK, headSize);
-                        float16_t::cvt_float_to_float16(srcV, dstV, headSize);
-                    }
+                    xft::copy(dstK, srcK, headSize); // destination first: fill the key cache slot from the input
+                    xft::copy(dstV, srcV, headSize); // same (dst, src) order for the value cache slot
                 }
             }
         }
 
@@ -45,7 +45,7 @@ class DistLinear {
     // |                                         |
     // |                                         | splitSize(N)
     // |_________________________________________|
-    void setWeight(DecoderContext *ctx, const float *w, const float *b) {
+    void setWeight(DecoderContext *ctx, const float *w, const float *b = nullptr) {
         this->splitSize = outputSize / splits;
         this->splitOffset = this->splitSize * splitIdx;
 
@@ -111,5 +111,5 @@ class DistLinear {
     hpj::Vector<float> scaleWeight; // if weight is int8
     hpj::Vector<float> zeroWeight; // if weight is int8
     hpj::Vector<float> sumWeight; // if weight is int8
-    float *bias;
+    float *bias = nullptr;
 };
@@ -17,35 +17,44 @@
 #include <cstdlib>
 #include <cstring>
 
-#include "layernorm_kernels.h"
 #include "layer_norm.h"
+#include "layernorm_kernels.h"
 #include "timeline.h"
 
 namespace xft {
 
 // Layer normalization: only support the norm along last dimension
 LayerNorm::LayerNorm() {
-    weights = nullptr;
+    gamma = nullptr; // no buffer yet; assigned later by one of the setWeight overloads
+    beta = nullptr; // no buffer yet; may stay null if the path-based setWeight gets an empty betaPath
     normSize = 0;
 }
 
 LayerNorm::~LayerNorm() {
-    if (weights) { free(weights); }
+    if (gamma) { free(gamma); } // releases the buffer held in gamma, if any
+    if (beta) { free(beta); } // NOTE(review): buffers set via the path-based setWeight come from loadWeight - confirm they are free()-compatible
 }
 
 void LayerNorm::setWeight(const float *gamma, const float *beta, int cols) {
     this->normSize = cols;
-    this->weights = (float *)aligned_alloc(64, 2 * cols * sizeof(float));
-    memcpy(weights, gamma, cols * sizeof(float));
-    memcpy(weights + cols, beta, cols * sizeof(float));
+    this->gamma = (float *)aligned_alloc(64, cols * sizeof(float)); // 64-byte aligned private copy; NOTE(review): a previously held buffer is not released here
+    this->beta = (float *)aligned_alloc(64, cols * sizeof(float)); // separate allocation, no longer packed behind gamma
+    memcpy(this->gamma, gamma, cols * sizeof(float)); // `cols` floats are copied from the caller's array
+    memcpy(this->beta, beta, cols * sizeof(float)); // NOTE(review): beta is copied unconditionally - a null beta is not handled in this overload
+}
+
+void LayerNorm::setWeight(const std::string &gammaPath, const std::string &betaPath, int cols) { // Path-based overload: delegates filling gamma/beta to loadWeight
+    this->normSize = cols;
+    loadWeight(gammaPath, this->gamma, cols); // presumably reads `cols` values from gammaPath into gamma - confirm against weight_util.h
+    if (betaPath != "") loadWeight(betaPath, this->beta, cols); // an empty betaPath skips loading, so beta keeps its current value
 }
 
 // input and output are in shape of (rows, normSize)
 // TODO: column-wise parallel
 void LayerNorm::forward(const float *input, float *output, int rows, int iStride, int oStride, float epsilon) {
     TimeLine t("LayerNorm.forward");
-    const float *pgamma = weights;
-    const float *pbeta = weights + normSize;
+    const float *pgamma = gamma; // gamma and beta are now two independent buffers
+    const float *pbeta = beta; // NOTE(review): can be nullptr if beta was never set - confirm invokeLayerNorm accepts that
     invokeLayerNorm(output, input, pgamma, pbeta, rows, normSize, iStride, oStride); // note: the epsilon parameter is not passed on in this call
 }
 
 
@@ -14,6 +14,9 @@
 // ============================================================================
 #pragma once
 
+#include <string>
+#include "weight_util.h"
+
 namespace xft {
 
 // Layer normalization: only support the norm along last dimension
@@ -23,6 +26,7 @@ class LayerNorm {
     ~LayerNorm();
 
     void setWeight(const float *gamma, const float *beta, int cols);
+    void setWeight(const std::string &gammaPath, const std::string &betaPath, int cols);
 
     // input and output are in shape of (rows, normSize)
     // TODO: column-wise parallel
@@ -31,8 +35,8 @@ class LayerNorm {
 private:
     int normSize;
 
-    // the weights contains gamma and beta concated together
-    float *weights;
+    float *gamma = nullptr;
+    float *beta = nullptr;
 };
 
 } // namespace xft
@@ -38,6 +38,11 @@ void RmsNorm::setWeight(const float *w, const float *, int cols) {
     memcpy(weight, w, cols * sizeof(float));
 }
 
+void RmsNorm::setWeight(const std::string &modelPath, const std::string &, int cols) { // Path-based overload; the second string parameter is unnamed and unused
+    this->normSize = cols;
+    loadWeight(modelPath, weight, cols); // presumably reads `cols` values from modelPath into weight - confirm against weight_util.h
+}
+
 // input and output are in shape of (rows, normSize)
 void RmsNorm::forward(const float *input, float *output, int rows, int iStride, int oStride, float epsilon) {
     TimeLine t("RmsNorm.forward");
 
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "bfloat16.h"
+#include "weight_util.h"
 
 namespace xft {
 
@@ -25,6 +26,7 @@ class RmsNorm {
     ~RmsNorm();
 
     void setWeight(const float *w, const float *, int cols);
+    void setWeight(const std::string &modelPath, const std::string &, int cols);
 
     // Input and output are in shape of (rows, normSize)
     void forward(const float *input, float *output, int rows, int iStride = -1, int oStride = -1, float epsilon = 1e-6);
@@ -41,7 +43,7 @@ class RmsNorm {
     int normSize;
 
     // the scale weight
-    float *weight;
+    float *weight = nullptr;
 };
 
 } // namespace xft
@@ -38,6 +38,10 @@ class TokenEmbedding {
         }
     }
 
+    void setWeights(const std::string &weightPath) { // Path-based loader: delegates filling embTable to loadWeight
+        loadWeight(weightPath, embTable, vocabSize * hiddenSize); // element count is vocabSize * hiddenSize, computed in int (both members are int)
+    }
+
     // tokenIds ia a 2-dimension array with batchSize rows, and seqLen cols
     template <typename OutT>
     void forward(int *tokenIds, OutT *output, int batchSize, int seqLen) {
@@ -57,5 +61,5 @@ class TokenEmbedding {
     int vocabSize;
     int hiddenSize;
 
-    T *embTable;
+    T *embTable = nullptr;
 };
Original file line number	Diff line number	Diff line change
`@@ -812,13 +812,8 @@ class Attention {`
`812`	`812`	`auto srcV = value + b * tgtLen * qkvCols + seq * qkvCols + i * headSize;`
`813`	`813`	`auto dstV = presentValue.getSequence(pastSeqLen + seq, b, i);`
`814`	`814`
`815`		`- if constexpr (std::is_same_v<KVCacheT, float>) {`
`816`		`- memcpy(dstK, srcK, headSize * sizeof(float));`
`817`		`- memcpy(dstV, srcV, headSize * sizeof(float));`
`818`		`- } else if constexpr (std::is_same_v<KVCacheT, float16_t>) {`
`819`		`- float16_t::cvt_float_to_float16(srcK, dstK, headSize);`
`820`		`- float16_t::cvt_float_to_float16(srcV, dstV, headSize);`
`821`		`- }`
	`815`	`+ xft::copy(dstK, srcK, headSize);`
	`816`	`+ xft::copy(dstV, srcV, headSize);`
`822`	`817`	`}`
`823`	`818`	`}`
`824`	`819`	`}`