
Commit 3ff4373

integrate Mixtral-8x7B-Instruct-v0.1 inference
Signed-off-by: minmingzhu <minming.zhu@intel.com>
1 parent 3710b4c commit 3ff4373

3 files changed: +25 -2 lines
.github/workflows/workflow_finetune.yml

Lines changed: 1 addition & 1 deletion

@@ -145,7 +145,7 @@ jobs:
 
       - name: Run Deltatuner Test on DENAS-LoRA Model
        run: |
-          if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf|mistralai\/Mistral-7B-v0.1)$ ]]; then
+          if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf|mistralai\/Mistral-7B-v0.1|google\/gemma-2b)$ ]]; then
            echo ${{ matrix.model }} is not supported!
          else
            docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
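
The guard above is a plain extended-regex match on the matrix model name; google/gemma-2b now lands in the "is not supported" branch of the DENAS-LoRA step. A minimal Python sketch of the same check, with the pattern copied from the new line (re stands in for bash's =~ operator, and the extra example ids are arbitrary):

import re

# Mirrors the updated bash test in workflow_finetune.yml; the workflow's
# escaped "\/" is just "/" here.
UNSUPPORTED = re.compile(
    r"^(mosaicml/mpt-7b-chat|huggyllama/llama-7b|"
    r"meta-llama/Llama-2-7b-chat-hf|mistralai/Mistral-7B-v0.1|"
    r"google/gemma-2b)$"
)

for model in ["google/gemma-2b", "mosaicml/mpt-7b-chat", "EleutherAI/gpt-j-6b"]:
    if UNSUPPORTED.match(model):
        print(f"{model} is not supported!")  # DENAS-LoRA deltatuner test is skipped
    else:
        print(f"{model} would run the DENAS-LoRA deltatuner test")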

.github/workflows/workflow_inference.yml

Lines changed: 2 additions & 1 deletion

@@ -34,7 +34,7 @@ jobs:
    name: inference
    strategy:
      matrix:
-        model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm ]
+        model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, mixtral-8x7B-Instruct-v0.1 ]
        isPR:
          - ${{inputs.ci_type == 'pr'}}
 
@@ -46,6 +46,7 @@ jobs:
          - { model: "mistral-7b-v0.1"}
          - { model: "mpt-7b-bigdl"}
          - { model: "llama-2-7b-chat-hf-vllm"}
+          - { model: "mixtral-8x7B-Instruct-v0.1"}
          - dtuner_model: nathan0/mpt-7b-deltatuner-model
            model: mpt-7b
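
The new matrix entry is what makes CI schedule an inference run for mixtral-8x7B-Instruct-v0.1, and the added `- { model: "mixtral-8x7B-Instruct-v0.1"}` line mirrors the existing per-model entries in the same list. A small sketch for verifying the matrix after checkout, assuming PyYAML is installed and that the job key matches its display name "inference" (the key itself is not visible in this hunk):

import yaml  # PyYAML

with open(".github/workflows/workflow_inference.yml") as f:
    workflow = yaml.safe_load(f)

# Using "inference" as the job key is an assumption; only `name: inference` is shown.
matrix = workflow["jobs"]["inference"]["strategy"]["matrix"]
assert "mixtral-8x7B-Instruct-v0.1" in matrix["model"]
print(f"{len(matrix['model'])} models in the inference matrix")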

Lines changed: 22 additions & 0 deletions (new file)

@@ -0,0 +1,22 @@
+port: 8000
+name: Mixtral-8x7B-Instruct-v0.1
+route_prefix: /Mixtral-8x7B-Instruct-v0.1
+num_replicas: 1
+cpus_per_worker: 24
+gpus_per_worker: 0
+deepspeed: false
+workers_per_group: 2
+device: CPU
+ipex:
+  enabled: true
+  precision: bf16
+model_description:
+  model_id_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1
+  bigdl: false
+  tokenizer_name_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1
+  chat_processor: ChatModelLLama
+  prompt:
+    intro: ''
+    human_id: '<s>[INST] {msg} [/INST]'
+    bot_id: ''
+    stop_words: []
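
This config binds the serve application to port 8000 under the /Mixtral-8x7B-Instruct-v0.1 route prefix, runs on CPU with bf16 via IPEX, and wraps each user turn in the '<s>[INST] {msg} [/INST]' template. A minimal client sketch built from those values; the JSON payload shape is an assumption, since the endpoint's request schema is not part of this commit:

import requests

PORT = 8000
ROUTE_PREFIX = "/Mixtral-8x7B-Instruct-v0.1"
HUMAN_ID = "<s>[INST] {msg} [/INST]"  # prompt.human_id; intro and bot_id are empty


def build_prompt(msg: str) -> str:
    """Wrap a user message in the Mixtral-Instruct template from the config."""
    return HUMAN_ID.format(msg=msg)


resp = requests.post(
    f"http://localhost:{PORT}{ROUTE_PREFIX}",
    json={"text": build_prompt("What is Ray Serve?")},  # payload shape is assumed
    timeout=300,
)
resp.raise_for_status()
print(resp.text)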
