Commit 23c45ae: fix type check
wangshuai09 committed Oct 8, 2024 (1 parent: a3cb972)
Showing 16 changed files with 119 additions and 409 deletions.

14 changes: 5 additions & 9 deletions examples/offline_inference_npu.py

@@ -11,22 +11,18 @@ def clean_up():

 # Sample prompts.
 prompts = [
-    # "Hello, my name is",
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
-
-
-    # "美国的首都是"
-    # "The capital of France is",
-
 ]

 # Create a sampling params object.
 sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

 # Create an LLM.
-llm = LLM(model="facebook/opt-125m", tensor_parallel_size=2, distributed_executor_backend="mp")
-# llm = LLM(model="Qwen/Qwen2-7B-Instruct")
-# llm = LLM(model="/workspace/cmq/models/LLM-Research/Meta-Llama-3-8B-Instruct")
+llm = LLM(model="facebook/opt-125m")

 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
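
With the debugging leftovers removed, the example is back to vLLM's stock offline-inference flow. For context, a minimal sketch of the script as it reads after this commit; the file's clean_up() helper is omitted, and the import and print loop are assumed to match upstream vLLM's offline_inference.py:

    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Greedy decoding, up to 100 new tokens per prompt.
    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

    # Single-process engine; the tensor_parallel_size=2 / "mp" backend
    # combination used during debugging is dropped by this commit.
    llm = LLM(model="facebook/opt-125m")

    # Generate texts from the prompts and print each completion.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        print(f"Prompt: {output.prompt!r}, "
              f"Generated: {output.outputs[0].text!r}")
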
3 changes: 2 additions & 1 deletion setup.py

@@ -294,7 +294,8 @@ def _build_custom_ops() -> bool:


 def _build_core_ext() -> bool:
-    return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu() or _is_npu())
+    return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu() or
+                _is_npu())


 def get_hipcc_rocm_version():
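
The wrapped return is purely a line-length fix; the set of excluded platforms is unchanged. The _is_npu() check it calls is defined elsewhere in this fork's setup.py and does not appear in the diff; a hypothetical sketch, assuming it mirrors the VLLM_TARGET_DEVICE pattern used by upstream checks such as _is_cuda():

    # Hypothetical sketch: the real _is_npu() lives earlier in this fork's
    # setup.py and may differ. VLLM_TARGET_DEVICE is the env-derived constant
    # that the other platform checks in this file compare against.
    def _is_npu() -> bool:
        return VLLM_TARGET_DEVICE == "npu"
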
5 changes: 2 additions & 3 deletions tests/basic_correctness/test_basic_correctness.py

@@ -18,9 +18,8 @@
 from ..utils import multi_gpu_test

 MODELS = [
-    # "facebook/opt-125m",
-    # "meta-llama/Llama-2-7b-hf",
-    "/workspace/cmq/models/LLM-Research/Meta-Llama-3-8B-Instruct",
+    "facebook/opt-125m",
+    "meta-llama/Llama-2-7b-hf",
 ]

 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")

54 changes: 20 additions & 34 deletions tests/engine/test_custom_executor.py

@@ -7,7 +7,6 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
 from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
-from vllm.executor.npu_executor import NPUExecutor, NPUExecutorAsync
 from vllm.sampling_params import SamplingParams


@@ -24,15 +23,6 @@ def execute_model(self, *args, **kwargs):
         return super().execute_model(*args, **kwargs)


-class CustomNPUExecutor(NPUExecutor):
-
-    def execute_model(self, *args, **kwargs):
-        # Drop marker to show that this was ran
-        with open(".marker", "w"):
-            ...
-        return super().execute_model(*args, **kwargs)
-
-
 class CustomGPUExecutorAsync(GPUExecutorAsync):

     async def execute_model_async(self, *args, **kwargs):
@@ -51,13 +41,9 @@ def test_custom_executor_type_checking(model):
     engine_args = AsyncEngineArgs(model=model,
                                   distributed_executor_backend=Mock)
     AsyncLLMEngine.from_engine_args(engine_args)
-    # with pytest.raises(TypeError):
-    #     engine_args = AsyncEngineArgs(
-    #         model=model, distributed_executor_backend=CustomGPUExecutor)
-    #     AsyncLLMEngine.from_engine_args(engine_args)
     with pytest.raises(TypeError):
         engine_args = AsyncEngineArgs(
-            model=model, distributed_executor_backend=CustomNPUExecutor)
+            model=model, distributed_executor_backend=CustomGPUExecutor)
         AsyncLLMEngine.from_engine_args(engine_args)


@@ -69,7 +55,7 @@ def test_custom_executor(model, tmp_path):
     assert not os.path.exists(".marker")

     engine_args = EngineArgs(
-        model=model, distributed_executor_backend=CustomNPUExecutor)
+        model=model, distributed_executor_backend=CustomGPUExecutor)
     engine = LLMEngine.from_engine_args(engine_args)
     sampling_params = SamplingParams(max_tokens=1)

@@ -81,25 +67,25 @@
     os.chdir(cwd)


-# @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-# def test_custom_executor_async(model, tmpdir):
-#     cwd = os.path.abspath(".")
-#     os.chdir(tmpdir)
-#     try:
-#         assert not os.path.exists(".marker")
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+def test_custom_executor_async(model, tmp_path):
+    cwd = os.path.abspath(".")
+    os.chdir(tmp_path)
+    try:
+        assert not os.path.exists(".marker")

-#         engine_args = AsyncEngineArgs(
-#             model=model, distributed_executor_backend=CustomGPUExecutorAsync)
-#         engine = AsyncLLMEngine.from_engine_args(engine_args)
-#         sampling_params = SamplingParams(max_tokens=1)
+        engine_args = AsyncEngineArgs(
+            model=model, distributed_executor_backend=CustomGPUExecutorAsync)
+        engine = AsyncLLMEngine.from_engine_args(engine_args)
+        sampling_params = SamplingParams(max_tokens=1)

-#         async def t():
-#             stream = await engine.add_request("0", "foo", sampling_params)
-#             async for x in stream:
-#                 ...
+        async def t():
+            stream = await engine.add_request("0", "foo", sampling_params)
+            async for x in stream:
+                ...

-#         asyncio.run(t())
+        asyncio.run(t())

-#         assert os.path.exists(".marker")
-#     finally:
-#         os.chdir(cwd)
+        assert os.path.exists(".marker")
+    finally:
+        os.chdir(cwd)
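
The restored tests exercise vLLM's pluggable-executor hook: distributed_executor_backend accepts an ExecutorBase subclass as well as the usual string values, and AsyncLLMEngine raises TypeError when handed a non-async executor class (the pytest.raises case above). A minimal standalone sketch of the same hook outside the test suite; the LoggingGPUExecutor name is illustrative:

    from vllm.engine.arg_utils import EngineArgs
    from vllm.engine.llm_engine import LLMEngine
    from vllm.executor.gpu_executor import GPUExecutor
    from vllm.sampling_params import SamplingParams


    class LoggingGPUExecutor(GPUExecutor):
        # Illustrative subclass: log each scheduler step before delegating.

        def execute_model(self, *args, **kwargs):
            print("executing one model step")
            return super().execute_model(*args, **kwargs)


    engine_args = EngineArgs(model="facebook/opt-125m",
                             distributed_executor_backend=LoggingGPUExecutor)
    engine = LLMEngine.from_engine_args(engine_args)
    engine.add_request("req-0", "Hello, my name is",
                       SamplingParams(max_tokens=8))
    while engine.has_unfinished_requests():
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.outputs[0].text)
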
248 changes: 0 additions & 248 deletions tests/kernels/test_cache_npu.py

This file was deleted.

