
Commit e5ee14b

[Tools] Accelerate model loading.
1 parent fdcbd02 commit e5ee14b

19 files changed, +137 −237 lines changed

include/dtype.h

Lines changed: 1 addition & 0 deletions
@@ -31,5 +31,6 @@ enum class DataType {
     w8a8_int8,
     w8a8_int4,
     w8a8_nf4,
+    unknown,
 };
 } // namespace xft
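
The new unknown enumerator gives callers a sentinel for unrecognized quantization settings. A minimal hypothetical sketch of such a mapping (the helper name and string keys are illustrative, not part of this commit):

#include <string>
#include "dtype.h"

// Hypothetical helper: fall back to DataType::unknown instead of aborting
// or silently picking a default when the string does not match.
inline xft::DataType parseDataTypeDemo(const std::string &s) {
    if (s == "w8a8_int8") return xft::DataType::w8a8_int8;
    if (s == "w8a8_int4") return xft::DataType::w8a8_int4;
    if (s == "w8a8_nf4") return xft::DataType::w8a8_nf4;
    return xft::DataType::unknown;
}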

src/common/float16.h

Lines changed: 3 additions & 0 deletions
@@ -131,6 +131,7 @@ inline void float16_t::cvt_float_to_float16(const float *src, float16_t *dst, in
     int remainder = size % kStep;

     // Process blocks of 16 floats at a time
+#pragma omp parallel for
     for (int i = 0; i < blockSize; ++i) {
         // Load the input floats into a AVX512 register
         __m512 input_vector = _mm512_loadu_ps(src + i * kStep);
@@ -156,6 +157,7 @@ inline void float16_t::cvt_float16_to_float(const float16_t *src, float *dst, in
     int blockSize = size / kStep;
     int remainder = size % kStep;

+#pragma omp parallel for
     for (int i = 0; i < blockSize; ++i) {
         __m256i input_vector = _mm256_maskz_loadu_epi16(0xffff, src + i * kStep);
         __m512 output_vector = _mm512_cvtph_ps(input_vector);
@@ -175,6 +177,7 @@ inline void float16_t::float_add_float16(const float *src1, const float16_t *src
     int blockSize = size / kStep;
     int remainder = size % kStep;

+#pragma omp parallel for
     for (int i = 0; i < blockSize; ++i) {
         __m512 vec1 = _mm512_loadu_ps(src1 + i * kStep);
         __m256i _t = _mm256_maskz_loadu_epi16(0xffff, src2 + i * kStep);
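
The speedup comes from splitting the independent 16-float blocks across OpenMP threads; the tail remainder stays serial. A self-contained sketch of the same pattern on raw buffers (the demo function name and the plain uint16_t output are assumptions for illustration; the repo's versions operate on its float16_t type), built with -fopenmp and AVX-512 F/BW/VL enabled:

#include <immintrin.h>
#include <cstdint>

// Convert 'size' floats to IEEE half precision, 16 elements per AVX-512 block.
// Blocks are independent, so the block loop is parallelized with OpenMP.
void cvt_float_to_float16_demo(const float *src, uint16_t *dst, int size) {
    constexpr int kStep = 16;
    const int blockSize = size / kStep;
    const int remainder = size % kStep;

#pragma omp parallel for
    for (int i = 0; i < blockSize; ++i) {
        __m512 v = _mm512_loadu_ps(src + i * kStep);
        __m256i h = _mm512_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i * kStep), h);
    }

    // Serial masked tail for the last (size % 16) elements.
    if (remainder > 0) {
        const __mmask16 mask = (1u << remainder) - 1;
        __m512 v = _mm512_maskz_loadu_ps(mask, src + blockSize * kStep);
        __m256i h = _mm512_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm256_mask_storeu_epi16(dst + blockSize * kStep, mask, h);
    }
}

Each iteration touches a disjoint 16-element slice of src and dst, so no synchronization is needed beyond the implicit barrier at the end of the parallel loop.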

src/layers/dist_linear.h

Lines changed: 2 additions & 2 deletions
@@ -45,7 +45,7 @@ class DistLinear {
     //      |                                         |
     //      |                                         | splitSize(N)
     //      |_________________________________________|
-    void setWeight(const float *w, const float *b) {
+    void setWeight(const float *w, const float *b = nullptr) {
         this->splitSize = outputSize / splits;
         this->splitOffset = this->splitSize * splitIdx;

@@ -111,5 +111,5 @@ class DistLinear {
     hpj::Vector<float> scaleWeight; // if weight is int8
     hpj::Vector<float> zeroWeight; // if weight is int8
     hpj::Vector<float> sumWeight; // if weight is int8
-    float *bias;
+    float *bias = nullptr;
 };
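
With the defaulted argument and the nullptr member initializer, a bias-free layer no longer has to pass an explicit null. Hypothetical call site, assuming a constructed DistLinear named lmHead and weight/bias pointers w and bias:

lmHead.setWeight(w);        // bias defaults to nullptr
lmHead.setWeight(w, bias);  // unchanged when a bias buffer exists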

src/layers/layer_norm.cpp

Lines changed: 17 additions & 8 deletions
@@ -17,35 +17,44 @@
 #include <cstdlib>
 #include <cstring>

-#include "layernorm_kernels.h"
 #include "layer_norm.h"
+#include "layernorm_kernels.h"
 #include "timeline.h"

 namespace xft {

 // Layer normalization: only support the norm along last dimension
 LayerNorm::LayerNorm() {
-    weights = nullptr;
+    gamma = nullptr;
+    beta = nullptr;
     normSize = 0;
 }

 LayerNorm::~LayerNorm() {
-    if (weights) { free(weights); }
+    if (gamma) { free(gamma); }
+    if (beta) { free(beta); }
 }

 void LayerNorm::setWeight(const float *gamma, const float *beta, int cols) {
     this->normSize = cols;
-    this->weights = (float *)aligned_alloc(64, 2 * cols * sizeof(float));
-    memcpy(weights, gamma, cols * sizeof(float));
-    memcpy(weights + cols, beta, cols * sizeof(float));
+    this->gamma = (float *)aligned_alloc(64, cols * sizeof(float));
+    this->beta = (float *)aligned_alloc(64, cols * sizeof(float));
+    memcpy(this->gamma, gamma, cols * sizeof(float));
+    memcpy(this->beta, beta, cols * sizeof(float));
+}
+
+void LayerNorm::setWeight(const std::string &gammaPath, const std::string &betaPath, int cols) {
+    this->normSize = cols;
+    loadWeight(gammaPath, this->gamma, cols);
+    if (betaPath != "") loadWeight(betaPath, this->beta, cols);
 }

 // input and output are in shape of (rows, normSize)
 // TODO: column-wise parallel
 void LayerNorm::forward(const float *input, float *output, int rows, int iStride, int oStride, float epsilon) {
     TimeLine t("LayerNorm.forward");
-    const float *pgamma = weights;
-    const float *pbeta = weights + normSize;
+    const float *pgamma = gamma;
+    const float *pbeta = beta;
     invokeLayerNorm(output, input, pgamma, pbeta, rows, normSize, iStride, oStride);
 }
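
The new overload relies on loadWeight (declared in weight_util.h) to both allocate the destination and read the file, which is why the aligned_alloc/memcpy pair disappears from the call site. A rough, hypothetical stand-in for that assumed behavior — allocate if the pointer is still null, then read cols raw floats; the real helper may also convert dtypes and validate sizes:

#include <cstdio>
#include <cstdlib>
#include <string>

// Hypothetical sketch only; not the actual weight_util.h implementation.
inline bool loadWeightDemo(const std::string &path, float *&dst, int count) {
    if (dst == nullptr) {
        // Round the byte count up to a multiple of 64 for aligned_alloc.
        size_t bytes = (count * sizeof(float) + 63) / 64 * 64;
        dst = static_cast<float *>(aligned_alloc(64, bytes));
    }
    FILE *fp = std::fopen(path.c_str(), "rb");
    if (fp == nullptr) return false;
    size_t n = std::fread(dst, sizeof(float), count, fp);
    std::fclose(fp);
    return n == static_cast<size_t>(count);
}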

src/layers/layer_norm.h

Lines changed: 6 additions & 2 deletions
@@ -14,6 +14,9 @@
 // ============================================================================
 #pragma once

+#include <string>
+#include "weight_util.h"
+
 namespace xft {

 // Layer normalization: only support the norm along last dimension
@@ -23,6 +26,7 @@ class LayerNorm {
     ~LayerNorm();

     void setWeight(const float *gamma, const float *beta, int cols);
+    void setWeight(const std::string &gammaPath, const std::string &betaPath, int cols);

     // input and output are in shape of (rows, normSize)
     // TODO: column-wise parallel
@@ -31,8 +35,8 @@ class LayerNorm {
 private:
     int normSize;

-    // the weights contains gamma and beta concated together
-    float *weights;
+    float *gamma = nullptr;
+    float *beta = nullptr;
 };

 } // namespace xft
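
Callers can now hand LayerNorm either in-memory buffers or the weight files themselves. A hedged usage sketch mirroring the model-code changes later in this commit (modelPath, hiddenSize, and the buffer names are illustrative):

xft::LayerNorm finalLN;

// Existing overload: gamma/beta already in memory.
finalLN.setWeight(gammaBuf, betaBuf, hiddenSize);

// New overload: load directly from disk; pass "" when there is no bias file.
finalLN.setWeight(modelPath + "/model.final_layernorm.weight.bin",
        modelPath + "/model.final_layernorm.bias.bin", hiddenSize);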

src/layers/rms_norm.cpp

Lines changed: 4 additions & 0 deletions
@@ -38,6 +38,10 @@ void RmsNorm::setWeight(const float *w, const float *, int cols) {
     memcpy(weight, w, cols * sizeof(float));
 }

+void RmsNorm::setWeight(const std::string &modelPath, const std::string &, int cols) {
+    loadWeight(modelPath, weight, cols);
+}
+
 // input and output are in shape of (rows, normSize)
 void RmsNorm::forward(const float *input, float *output, int rows, int iStride, int oStride, float epsilon) {
     TimeLine t("RmsNorm.forward");

src/layers/rms_norm.h

Lines changed: 3 additions & 1 deletion
@@ -15,6 +15,7 @@
 #pragma once

 #include "bfloat16.h"
+#include "weight_util.h"

 namespace xft {

@@ -25,6 +26,7 @@ class RmsNorm {
     ~RmsNorm();

     void setWeight(const float *w, const float *, int cols);
+    void setWeight(const std::string &modelPath, const std::string &, int cols);

     // Input and output are in shape of (rows, normSize)
     void forward(const float *input, float *output, int rows, int iStride = -1, int oStride = -1, float epsilon = 1e-6);
@@ -41,7 +43,7 @@ class RmsNorm {
     int normSize;

     // the scale weight
-    float *weight;
+    float *weight = nullptr;
 };

 } // namespace xft
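
RmsNorm gets the matching overload; the unnamed second string parameter keeps the signature parallel to LayerNorm but is ignored, since RMSNorm carries no bias. A hypothetical call (the file name and hiddenSize are illustrative):

xft::RmsNorm norm;
norm.setWeight(modelPath + "/model.final_layernorm.weight.bin", "", hiddenSize);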

src/layers/token_embedding.h

Lines changed: 5 additions & 1 deletion
@@ -38,6 +38,10 @@ class TokenEmbedding {
         }
     }

+    void setWeights(const std::string &weightPath) {
+        loadWeight(weightPath, embTable, vocabSize * hiddenSize);
+    }
+
     // tokenIds ia a 2-dimension array with batchSize rows, and seqLen cols
     template <typename OutT>
     void forward(int *tokenIds, OutT *output, int batchSize, int seqLen) {
@@ -57,5 +61,5 @@ class TokenEmbedding {
     int vocabSize;
     int hiddenSize;

-    T *embTable;
+    T *embTable = nullptr;
 };
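
With the new setWeights overload the table is read straight into embTable, so call sites no longer need the temporary float buffer removed in the model diffs below. Usage sketch, assuming an already-constructed embedding object (the pointer name and path come from the Baichuan/ChatGLM call sites in this commit):

// Before: malloc a float buffer, loadWeight into it, setWeights(buffer), free.
// Now: one call; loadWeight fills embTable directly.
embedding->setWeights(modelPath + "/model.wte.bin");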

src/models/baichuan.cpp

Lines changed: 2 additions & 19 deletions
@@ -38,29 +38,12 @@ Baichuan<WeiT>::~Baichuan() {

 template <typename WeiT>
 void Baichuan<WeiT>::setEmbeddingWeights(const std::string &modelPath) {
-    int vocabSize = embedding->getVocabSize();
-    int hiddenSize = embedding->getHiddenSize();
-
-    float *tokenEmb = (float *)malloc(vocabSize * hiddenSize * sizeof(float));
-
-    loadWeight(modelPath + "/model.wte.bin", tokenEmb, vocabSize * hiddenSize, this->getDataType());
-
-    embedding->setWeights(tokenEmb);
-
-    free(tokenEmb);
+    embedding->setWeights(modelPath + "/model.wte.bin");
 }

 template <typename WeiT>
 void Baichuan<WeiT>::setFinalLnWeight(const std::string &modelPath) {
-    int hiddenSize = embedding->getHiddenSize();
-
-    float *gamma = (float *)malloc(hiddenSize * sizeof(float));
-
-    loadWeight(modelPath + "/model.final_layernorm.weight.bin", gamma, hiddenSize, this->getDataType());
-
-    finalLN.setWeight(gamma, nullptr, hiddenSize);
-
-    free(gamma);
+    finalLN.setWeight(modelPath + "/model.final_layernorm.weight.bin", "", embedding->getHiddenSize());
 }

 // Prepare attention_mask which is like:

src/models/chatglm.cpp

Lines changed: 3 additions & 22 deletions
@@ -50,32 +50,13 @@ ChatGLM<WeiT>::~ChatGLM() {

 template <typename WeiT>
 void ChatGLM<WeiT>::setEmbeddingWeights(const std::string &modelPath) {
-    int vocabSize = embedding->getVocabSize();
-    int hiddenSize = embedding->getHiddenSize();
-
-    float *tokenEmb = (float *)malloc(vocabSize * hiddenSize * sizeof(float));
-
-    loadWeight(modelPath + "/model.wte.bin", tokenEmb, vocabSize * hiddenSize, this->getDataType());
-
-    embedding->setWeights(tokenEmb);
-
-    free(tokenEmb);
+    embedding->setWeights(modelPath + "/model.wte.bin");
 }

 template <typename WeiT>
 void ChatGLM<WeiT>::setFinalLnWeight(const std::string &modelPath) {
-    int hiddenSize = embedding->getHiddenSize();
-
-    float *gamma = (float *)malloc(hiddenSize * sizeof(float));
-    float *beta = (float *)malloc(hiddenSize * sizeof(float));
-
-    loadWeight(modelPath + "/model.final_layernorm.weight.bin", gamma, hiddenSize, this->getDataType());
-    loadWeight(modelPath + "/model.final_layernorm.bias.bin", beta, hiddenSize, this->getDataType());
-
-    finalLN.setWeight(gamma, beta, hiddenSize);
-
-    free(gamma);
-    free(beta);
+    finalLN.setWeight(modelPath + "/model.final_layernorm.weight.bin", modelPath + "/model.final_layernorm.bias.bin",
+            embedding->getHiddenSize());
 }

 // Prepare attention_mask
