Add StreamingLLM support to studio2 chat #2060

Merged: 23 commits, Jan 19, 2024
Changes from 1 commit
Update CPU path and llm api test.
monorimet committed Jan 16, 2024
commit c195dd6cf1edfe799a8f07c0f9980d2fd3392d0e
20 changes: 9 additions & 11 deletions apps/shark_studio/api/llm.py
```diff
@@ -62,31 +62,30 @@ def __init__(
     ):
         print(llm_model_map[model_name])
         self.hf_model_name = llm_model_map[model_name]["hf_model_name"]
-        self.device = device.split("=>")[-1].strip()
-        self.driver = self.device.split("://")[0]
-        print(f"Selected {self.driver} as device driver")
+        self.device = device.split("=>")[-1].strip() if "cpu" not in device else "local-task"
+        self.driver = self.device.split("://")[0] if not any(x in self.device for x in ["cpu", "local-task"]) else "llvm-cpu"
+        print(f"Selected {self.driver} as IREE target backend.")
+        self.precision = "f32" if "cpu" in self.driver else "f16"
         self.quantization = quantization
         self.safe_name = self.hf_model_name.replace("/","_").replace("-", "_")
+        #TODO: find a programmatic solution for model arch spec instead of hardcoding llama2
         self.file_spec = "_".join([
             "llama2",
+            "streaming" if streaming_llm else "chat",
             self.safe_name,
             self.precision,
             self.quantization,
         ])
-        if streaming_llm:
-            self.file_spec += "_streaming"
         self.tempfile_name = get_resource_path(f"{self.file_spec}.tempfile")
+        #TODO: Tag vmfb with target triple of device instead of HAL backend
         self.vmfb_name = get_resource_path(f"{self.file_spec}_{self.driver}.vmfb.tempfile")
-        self.safe_name = self.hf_model_name.split("/")[-1].replace("-", "_")
         self.max_tokens = llm_model_map[model_name]["max_tokens"]
         self.iree_module_dict = None
         self.external_weight_file = None
         self.streaming_llm = streaming_llm
         if external_weights is not None:
             self.external_weight_file = get_resource_path(
-                self.safe_name
-                + "_" + self.precision
-                + "_" + self.quantization
+                self.file_spec
                 + "." + external_weights
             )
         self.use_system_prompt = use_system_prompt
```
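The naming and device-selection logic in this hunk is easiest to follow with concrete inputs. The sketch below re-implements that parsing and the file_spec join as a standalone function so the resulting artifact names can be inspected; the GPU device string ("AMD Radeon RX 7900 => vulkan://0") and the model name are illustrative examples only, not values mandated by the project.

```python
def artifact_names(device, hf_model_name, quantization="int4", streaming_llm=False):
    """Illustrative mirror of the device parsing and file_spec naming in the hunk above."""
    # A device string containing "cpu" collapses to the local-task HAL device
    # and the llvm-cpu compile backend; otherwise the URI after "=>" is used.
    device = device.split("=>")[-1].strip() if "cpu" not in device else "local-task"
    driver = (
        device.split("://")[0]
        if not any(x in device for x in ["cpu", "local-task"])
        else "llvm-cpu"
    )
    precision = "f32" if "cpu" in driver else "f16"
    safe_name = hf_model_name.replace("/", "_").replace("-", "_")
    file_spec = "_".join([
        "llama2",
        "streaming" if streaming_llm else "chat",
        safe_name,
        precision,
        quantization,
    ])
    # The vmfb name is additionally tagged with the compile backend.
    return file_spec, f"{file_spec}_{driver}.vmfb.tempfile"

# CPU path: any "cpu" device string selects local-task / llvm-cpu / f32.
print(artifact_names("cpu-task", "Trelis/Llama-2-7b-chat-hf-function-calling-v2"))
# GPU path: assumes a studio-style device string such as "AMD Radeon RX 7900 => vulkan://0".
print(artifact_names(
    "AMD Radeon RX 7900 => vulkan://0",
    "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
    streaming_llm=True,
))
```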
```diff
@@ -113,7 +112,7 @@ def __init__(
             external_weights is None or os.path.exists(str(self.external_weight_file))
         ):
             self.runner = vmfbRunner(
-                device = self.driver,
+                device = self.device,
                 vmfb_path=self.vmfb_name,
                 external_weight_path=self.external_weight_file,
             )
```
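For context on the one-line change above: self.driver is the compile-time backend name ("llvm-cpu", "vulkan", ...), while self.device is the runtime HAL device URI that the runner actually needs ("local-task", "vulkan://0", ...). Below is a minimal sketch of the resulting call on the CPU path; the file paths are illustrative (they follow the file_spec scheme shown earlier), and the import path is an assumption based on how llm.py uses vmfbRunner, not something stated in this diff.

```python
# Import path assumed; adjust to match the project's actual module layout.
from turbine_models.model_runner import vmfbRunner

runner = vmfbRunner(
    device="local-task",  # runtime HAL device URI, not the "llvm-cpu" compile backend
    vmfb_path="llama2_chat_Trelis_Llama_2_7b_chat_hf_function_calling_v2_f32_int4_llvm-cpu.vmfb.tempfile",
    external_weight_path="llama2_chat_Trelis_Llama_2_7b_chat_hf_function_calling_v2_f32_int4.safetensors",
)
```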
```diff
@@ -132,7 +131,6 @@ def __init__(
                 hf_auth_token,
                 compile_to="torch",
                 external_weights=external_weights,
-                external_weight_file=self.external_weight_file,
                 precision=self.precision,
                 quantization=self.quantization,
                 streaming_llm=self.streaming_llm,
```
8 changes: 5 additions & 3 deletions apps/shark_studio/tests/api_test.py
```diff
@@ -14,8 +14,10 @@ def testLLMSimple(self):
         lm = LanguageModel(
             "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
             hf_auth_token=None,
-            device="cpu-task",
+            device="local-task",
             external_weights="safetensors",
+            precision="fp32",
+            quantization="int4"
         )
         count = 0
         for msg, _ in lm.chat("hi, what are you?"):
```
```diff
@@ -24,8 +26,8 @@ def testLLMSimple(self):
                 count += 1
                 continue
             assert (
-                msg.strip(" ") == "Hello"
-            ), f"LLM API failed to return correct response, expected 'Hello', received {msg}"
+                msg.strip(" ") == "Hello!"
+            ), f"LLM API failed to return correct response, expected 'Hello!', received {msg}"
             break
```
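The updated test doubles as a usage example for the CPU path. The snippet below mirrors the arguments the test now passes; the import path is an assumption (the test lives under apps/shark_studio/tests), and running it end to end requires the compiled vmfb and external weights to be available locally.

```python
# Import path assumed from the test file's location; adjust if the package layout differs.
from apps.shark_studio.api.llm import LanguageModel

lm = LanguageModel(
    "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
    hf_auth_token=None,
    device="local-task",            # CPU path: IREE local-task HAL device
    external_weights="safetensors",
    precision="fp32",
    quantization="int4",
)

# chat() yields partial responses as (message, metrics) tuples; the test skips the
# first few iterations and then checks that the reply starts with "Hello!".
for msg, _ in lm.chat("hi, what are you?"):
    print(msg)
```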