import json

from typing import Dict, Any
from mlserver import MLModel, types
from mlserver.codecs import StringCodec


class Orion14BChatInt4(MLModel):
    MODEL_NAME = "OrionStarAI/Orion-14B-Chat-Int4"

    async def load(self) -> bool:
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers.generation.utils import GenerationConfig

        # Load the tokenizer and the int4-quantised chat model from the Hugging Face Hub.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.MODEL_NAME, use_fast=False, trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_NAME,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )
        # Use the generation defaults shipped with the model.
        self.model.generation_config = GenerationConfig.from_pretrained(self.MODEL_NAME)
        return await super().load()

    async def predict(self, payload: types.InferenceRequest) -> types.InferenceResponse:
        # The request carries a JSON-encoded "messages" input (see _parse_request below).
        messages = self._parse_request(payload)["messages"]
        response = {
            "assistant": self.model.chat(self.tokenizer, messages, streaming=False)
        }
        response_bytes = json.dumps(response, ensure_ascii=False).encode("UTF-8")
        return types.InferenceResponse(
            id=payload.id,
            model_name=self.name,
            model_version=self.version,
            outputs=[
                types.ResponseOutput(
                    name="generated_text",
                    shape=[len(response_bytes)],
                    datatype="BYTES",
                    data=[response_bytes],
                    parameters=types.Parameters(content_type="str"),
                )
            ],
        )

    def _parse_request(self, payload: types.InferenceRequest) -> Dict[str, Any]:
        # Decode every input as a string and parse it as JSON, keyed by input name.
        inputs = {}
        for inp in payload.inputs:
            inputs[inp.name] = json.loads(
                "".join(self.decode(inp, default_codec=StringCodec))
            )
        return inputs
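
A minimal client-side sketch of how this runtime could be called once it is served by MLServer. The host, port, and model name (`orion-14b-chat-int4`) are assumptions that depend on your deployment settings; the payload simply mirrors what `_parse_request` expects, i.e. a string input named `messages` holding the JSON-encoded chat history.

import json

import requests  # any HTTP client works; requests is assumed here for brevity

# Chat history in the format consumed by model.chat().
messages = [{"role": "user", "content": "Hello, what is your name?"}]

inference_request = {
    "inputs": [
        {
            "name": "messages",
            "shape": [1],
            "datatype": "BYTES",
            "parameters": {"content_type": "str"},
            "data": [json.dumps(messages)],
        }
    ]
}

# Standard V2 inference endpoint exposed by MLServer (model name is hypothetical).
resp = requests.post(
    "http://localhost:8080/v2/models/orion-14b-chat-int4/infer",
    json=inference_request,
)

# The single BYTES output holds a JSON object with the reply under "assistant".
print(resp.json()["outputs"][0]["data"][0])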