Merge pull request #14 from feng-intel/main
Support Intel dGPU/iGPU platforms.
Showing 9 changed files with 2,386 additions and 0 deletions.
@@ -0,0 +1,52 @@
## llama2_q4.sycl.cpp

Simple and fast pure C++ inference on **Intel dGPU/iGPU** platforms for 4-bit [AWQ](https://github.com/mit-han-lab/llm-awq)-quantized models.
Based on [llama2.c](https://github.com/karpathy/llama2.c).

Verified platforms: **PVC, ARC770, iGPU** (Meteor Lake CPU)
oneAPI version: 2024.0

## Build

```
# for PVC
$ icpx -fsycl -O2 -ffast-math -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc -q" llama2_q4.sycl.cpp -o llama2_awq
# for ARC770
$ icpx -fsycl -O2 -ffast-math -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg2-g10 -q" llama2_q4.sycl.cpp -o llama2_awq
# for iGPU
$ icpx -fsycl -O2 -ffast-math llama2_q4.sycl.cpp -o llama2_awq
```
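
Before running, it can help to confirm that the target GPU is actually visible to the SYCL runtime; `sycl-ls` ships with the oneAPI toolchain (this check is an addition to the original build steps, not part of them):

```
$ sycl-ls
```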

## Get model
```
$ cd ..
```
Refer to README.md to use convert_awq_to_bin.py and weight_packer to get llama2-7b-awq-q4.bin.
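
The exact converter arguments are documented in the top-level README.md; purely as an illustration (the argument names below are assumed, not taken from the scripts), the flow looks roughly like:

```
# hypothetical invocation -- see README.md for the real arguments
$ python convert_awq_to_bin.py <awq_checkpoint_dir> <output_dir>
$ ./weight_packer <config.json> <output_dir> llama2-7b-awq-q4.bin
```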

## Run
```
$ cd ./sycl
$ ./llama2_awq llama2-7b-awq-q4.bin -n 256 -i "write an essay about dogs"
Model params:-
dim: 4096
hidden_dim: 11008
n_heads: 32
n_kv_heads: 32
n_layers: 32
seq_len: 4096
vocab_size: 32000
rope_theta: 10000
Loading Weights... done!
Encoding Prompt... Done!
write an essay about dogs and their importance in our lives.
Dogs are considered to be man's best friend for a reason. They have been a part of human life for thousands of years, providing companionship, protection, and unconditional love. From the moment they are born, dogs are trained to be loyal and obedient, and they quickly become an integral part of our families.
One of the most important aspects of dogs is their ability to provide emotional support. They are known to have a calming effect on people, and can help to reduce stress and anxiety. This is especially true for people who suffer from mental health conditions such as depression and PTSD. Studies have shown that interacting with dogs can lower levels of cortisol, a hormone associated with stress, and increase levels of oxytocin, a hormone associated with feelings of happiness and well-being.
In addition to providing emotional support, dogs are also important in our lives because of their ability to protect us. They have a strong instinct to defend their pack, and will often put themselves in harm's way to protect their family. This makes them excellent guard dogs, and they are often used
```
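
The printed parameters also give a quick sanity check on memory: with n_kv_heads equal to n_heads, the fp16 KV cache for a full 4096-token context works out to about 2 GiB. A minimal back-of-the-envelope calculation (plain arithmetic on the values above, not output from the tool):

```
#include <cstdint>
#include <cstdio>

int main() {
    // values printed by ./llama2_awq above
    int64_t n_layers = 32, seq_len = 4096, dim = 4096, n_heads = 32, n_kv_heads = 32;
    int64_t kv_dim = dim * n_kv_heads / n_heads;                        // 4096
    int64_t bytes  = n_layers * seq_len * kv_dim * 2 /*K and V*/ * 2 /*fp16*/;
    std::printf("KV cache: %.1f GiB\n", bytes / double(1ll << 30));     // ~2.0 GiB
    return 0;
}
```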

@@ -0,0 +1,84 @@
#pragma once

#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <stdint.h>

constexpr int MAX_SEQ_LEN_SMEM_KERNEL = 8192; // 8k is the max sequence length supported by the kernel that uses shared memory
constexpr int MAX_SEQ_LEN = 128 * 1024;       // can be arbitrarily large, but we need to allocate memory for the whole sequence

typedef struct dpct_type_328188 {
    int dim;          // transformer dimension
    int hidden_dim;   // for ffn layers
    int n_layers;     // number of layers
    int n_heads;      // number of query heads
    int n_kv_heads;   // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size;   // vocabulary size, usually 32000 for llama2 models
    int seq_len;      // max sequence length
    float rope_theta; // theta for the rope rotational embedding
} Config;

struct QWeight {
    uint32_t* weight;    // packed 4-bit quantized weights
    uint32_t* zeros;     // packed zero points
    sycl::half* scales;  // scale factors
};

struct PerLayerWeight {
    sycl::half* rms_att_weight; // (layer, dim) rmsnorm weights
    sycl::half* rms_ffn_weight; // (layer, dim)
    QWeight wq_q;
    QWeight wq_k;
    QWeight wq_v;
    QWeight wq_o;
    QWeight wq_gate;
    QWeight wq_up;
    QWeight wq_down;
};

typedef struct dpct_type_245411 {
    // token embedding table
    sycl::half* token_embedding_table; // (vocab_size, dim)
    // classifier weights for the logits, on the last layer
    sycl::half* wcls;
    // final rmsnorm
    sycl::half* rms_final_weight; // (dim,)
    // per-layer weights
    PerLayerWeight* layers;
    int num_layers;
} TransformerWeights;

// data shared between CPU and GPU (allocated in host memory)
struct SharedData {
    volatile int pos;        // current token index
    int tokens[MAX_SEQ_LEN]; // tokens processed/generated so far; allocated in host memory so that the CPU can read this
};

typedef struct dpct_type_557719 {
    // current wave of activations
    sycl::half* x;      // activation at current time stamp (dim,)
    sycl::half* xb;     // same, but inside a residual branch (dim,)
    sycl::half* hb;     // buffer for hidden dimension in the ffn (hidden_dim,)
    sycl::half* hb2;    // buffer for hidden dimension in the ffn (hidden_dim,)
    sycl::half* q;      // query (dim,)
    sycl::half* att;    // buffer for scores/attention values (n_heads, seq_len)
    sycl::half* logits; // output logits
    // kv cache
    sycl::half* key_cache;   // (layer, seq_len, kv_dim)
    sycl::half* value_cache; // (layer, seq_len, kv_dim)

    int* pos;                // GPU copy of the current position (just 1 element)
    SharedData* shared_data;

    float* logits_array;     // array of output logits used to compute perplexity (seq_len, vocab_size)
} RunState;

typedef struct dpct_type_146585 {
    Config config;              // the hyperparameters of the architecture (the blueprint)
    TransformerWeights weights; // the weights of the model
    RunState state;             // buffers for the "wave" of activations in the forward pass
} Transformer;

// integer division rounded up (e.g. number of work-groups needed to cover a elements with b per group)
int divUp(int a, int b) {
    return (a - 1) / b + 1;
}