Commit d5d5e6c

[model] Add deepseek model.

1 parent 0d549e0 commit d5d5e6c
17 files changed, +64287 -74 lines


README.md (2 additions, 0 deletions)
@@ -49,6 +49,7 @@ xFasterTransformer provides a series of APIs, both of C++ and Python, for end us
 | ChatGLM3           | ✔ | ✔ | ✔ |
 | Llama              | ✔ | ✔ | ✔ |
 | Llama2             | ✔ | ✔ | ✔ |
+| Deepseek-coder     | ✔ | ✔ | ✔ |
 | Baichuan           | ✔ | ✔ | ✔ |
 | QWen               | ✔ | ✔ | ✔ |
 | SecLLM(YaRN-Llama) | ✔ | ✔ | ✔ |

@@ -141,6 +142,7 @@ xFasterTransformer supports a different model format from Huggingface, but it's

 Supported model convert list:
 - LlamaConvert
+- DeepseekConvert
 - ChatGLMConvert
 - ChatGLM2Convert
 - ChatGLM3Convert
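With the new DeepseekConvert entry, a Huggingface checkpoint can be converted to the xFasterTransformer format the same way as the other supported models. A minimal sketch, assuming DeepseekConvert exposes the same convert(input_dir, output_dir) call as the existing *Convert classes; both paths are placeholders:

```python
# Sketch: convert a Huggingface Deepseek checkpoint to xFasterTransformer format.
# Assumes DeepseekConvert follows the convert(input_dir, output_dir) pattern of
# the other converters; the paths are placeholders.
import xfastertransformer

HF_MODEL_DIR = "/data/models/deepseek-coder-33b-instruct"       # Huggingface checkpoint
XFT_MODEL_DIR = "/data/models/deepseek-coder-33b-instruct-xft"  # converted output

xfastertransformer.DeepseekConvert().convert(HF_MODEL_DIR, XFT_MODEL_DIR)
```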

benchmark/benchmark.py (4 additions, 2 deletions)
@@ -117,7 +117,7 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup
         model_prompt = prompt_pool["chatglm2"]
     if "chatglm3" in args.model_name.lower():
         model_prompt = prompt_pool["chatglm3"]
-    if "llama" in args.model_name.lower():
+    if "llama" in args.model_name.lower() or "deepseek" in args.model_name.lower():
         model_prompt = prompt_pool["llama"]
     if "baichuan" in args.model_name.lower():
         model_prompt = prompt_pool["baichuan"]

@@ -208,7 +208,9 @@ def build_inputs_chatglm(tokenizer, query: List[str], padding, history: List[Tup
         print(f"Next token P90 Latency:\t{np.percentile(next_token_times, 90):.2f} ms")
         print(f"Next token Avg Latency:\t{np.mean(next_token_times):.2f} ms")
         print(f"Next token Latency:\t{np.percentile(next_token_times, 90):.2f} ms")
-        print(f"Throughput without 1st token:\t{1000 / np.percentile(next_token_times, 90) * args.batch_size:.2f} tokens/s")
+        print(
+            f"Throughput without 1st token:\t{1000 / np.percentile(next_token_times, 90) * args.batch_size:.2f} tokens/s"
+        )
         print("=" * 120, "\n" * 3)
     else:
         for i in range(args.warmup + args.iteration):
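The reformatted print computes decode throughput from the P90 next-token latency. A worked sketch of that formula, with made-up latencies and batch size:

```python
# Worked example of the benchmark's throughput formula; the latencies and
# batch size below are made up for illustration.
import numpy as np

next_token_times = [31.8, 32.1, 32.5, 33.0, 40.2]  # per-token decode latency, ms
batch_size = 4

p90_ms = np.percentile(next_token_times, 90)  # 90th-percentile next-token latency
throughput = 1000 / p90_ms * batch_size       # tokens/s across the whole batch
print(f"Throughput without 1st token:\t{throughput:.2f} tokens/s")
```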
New file: model config for deepseek-coder-33b-instruct (19 additions, 0 deletions)
@@ -0,0 +1,19 @@
+[llama]
+model_name = /data/models/deepseek-coder-33b-instruct
+head_num = 56
+kv_head_num = 8
+size_per_head = 128
+inter_size = 19200
+max_pos_seq_len = 16384
+num_layer = 62
+layernorm_eps = 1e-06
+layernorm_type = pre_layernorm
+activation_type = silu
+has_post_decoder_layernorm = 1
+vocab_size = 32256
+start_id = 32013
+end_id = 32021
+rope_type = linear
+rope_theta = 100000
+scaling_factor = 4.0
+weight_data_type = fp16
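The config captures the model geometry: 56 query heads of size 128 (hidden size 7168), 8 KV heads (grouped-query attention, 7 query heads per KV head), 62 layers, and linear RoPE scaling with factor 4.0. A minimal sketch of reading it with Python's stdlib configparser; the file path is a placeholder:

```python
# Sketch: parse the INI config above and derive the model geometry it implies.
# "config.ini" is a placeholder path for the file added in this commit.
import configparser

cfg = configparser.ConfigParser()
cfg.read("config.ini")
m = cfg["llama"]  # Deepseek-coder reuses the llama-style section

hidden_size = int(m["head_num"]) * int(m["size_per_head"])  # 56 * 128 = 7168
gqa_groups = int(m["head_num"]) // int(m["kv_head_num"])    # 56 // 8 = 7 query heads per KV head
print(f"hidden={hidden_size}, gqa_groups={gqa_groups}, layers={m['num_layer']}")
```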
New file: generation config (6 additions, 0 deletions)
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 32013,
+  "eos_token_id": 32021,
+  "transformers_version": "4.34.1"
+}
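This generation config pins the special token ids: sequences start from bos_token_id 32013 and decoding stops when the model emits eos_token_id 32021, matching start_id and end_id in the config above. A minimal sketch of how standard Huggingface transformers picks these up, assuming the checkpoint lives at the path used in the config:

```python
# Sketch: transformers' generate() reads generation_config.json from the model
# directory, so decoding halts automatically on eos_token_id 32021.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "/data/models/deepseek-coder-33b-instruct"  # path from the config above
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

inputs = tokenizer("def quicksort(arr):", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=64)  # stops at eos id 32021
print(tokenizer.decode(out[0], skip_special_tokens=True))
```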
