init npu_support
Co-authored-by: MengqingCao <cmq0113@163.com>
wangshuai09 and MengqingCao committed Sep 9, 2024
Parent: 4ef41b8 · Commit: 6ae737e
Showing 22 changed files with 3,372 additions and 149 deletions.
22 changes: 22 additions & 0 deletions examples/offline_inference_npu.py
@@ -0,0 +1,22 @@
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
# "Hello, my name is",
"The president of the United States is",
# "The capital of France is",
# "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=100, temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ requires = [
"ninja",
"packaging",
"setuptools >= 49.4.0",
"torch == 2.4.0",
"torch == 2.1.0",
"wheel",
"jinja2",
]
11 changes: 11 additions & 0 deletions requirements-npu.txt
@@ -0,0 +1,11 @@
# Common dependencies
-r requirements-common.txt

decorator
pyyaml
scipy
setuptools
torch == 2.1.0
torch_npu == 2.1.0.post6
# torch == 2.4.0
# torch_npu == 2.4.0.rc1
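This pin is what drives the pyproject.toml change above from torch 2.4.0 back to 2.1.0: torch_npu 2.1.0.post6 is built against torch 2.1.0, and the commented 2.4.0 entries are presumably placeholders for a later plugin release. A hypothetical consistency check using only the names pinned in this file:

# Sketch: confirm the installed wheels match the pins in requirements-npu.txt.
import importlib.metadata as metadata

pins = {"torch": "2.1.0", "torch_npu": "2.1.0.post6"}
for package, expected in pins.items():
    try:
        installed = metadata.version(package)
    except metadata.PackageNotFoundError:
        installed = "not installed"
    status = "OK" if installed.startswith(expected) else "MISMATCH"
    print(f"{package}: pinned {expected}, installed {installed} [{status}]")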
8 changes: 8 additions & 0 deletions setup.py
@@ -277,6 +277,9 @@ def _is_openvino() -> bool:
def _is_xpu() -> bool:
return VLLM_TARGET_DEVICE == "xpu"

+ def _is_npu() -> bool:
+ return VLLM_TARGET_DEVICE == "npu"


def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu()
@@ -389,6 +392,8 @@ def get_vllm_version() -> str:
version += "+cpu"
elif _is_xpu():
version += "+xpu"
+ elif _is_npu():
+ version += "+npu"
else:
raise RuntimeError("Unknown runtime environment")

@@ -444,10 +449,13 @@ def _read_requirements(filename: str) -> List[str]:
requirements = _read_requirements("requirements-cpu.txt")
elif _is_xpu():
requirements = _read_requirements("requirements-xpu.txt")
+ elif _is_npu():
+ requirements = _read_requirements("requirements-npu.txt")
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, "
"OpenVINO, or CPU.")
print("requirements", requirements)
return requirements


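The commit adds the target but no build documentation, so the invocation below is an assumption: exporting VLLM_TARGET_DEVICE=npu is what routes setup.py through _is_npu(), the "+npu" version suffix, and requirements-npu.txt.

# Hypothetical build driver; the grounded part is that setup.py branches on
# VLLM_TARGET_DEVICE == "npu" as shown above.
import os
import subprocess
import sys

env = dict(os.environ, VLLM_TARGET_DEVICE="npu")
subprocess.run([sys.executable, "-m", "pip", "install", "-e", "."],
               env=env, check=True)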
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_basic_correctness.py
@@ -29,7 +29,7 @@ def test_vllm_gc_ed():


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("backend", ["ASCEND_TORCH"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
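Upstream vLLM lets these tests force an attention backend through the VLLM_ATTENTION_BACKEND environment variable; assuming this fork registers the new ASCEND_TORCH name the same way, selecting it outside the test suite could look like this sketch:

# Assumption: "ASCEND_TORCH" is a valid value for VLLM_ATTENTION_BACKEND in
# this fork, mirroring how FLASH_ATTN / XFORMERS are selected upstream.
import os

os.environ["VLLM_ATTENTION_BACKEND"] = "ASCEND_TORCH"

from vllm import LLM, SamplingParams  # import after setting the variable

llm = LLM(model="facebook/opt-125m")
out = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)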
6 changes: 5 additions & 1 deletion tests/conftest.py
@@ -33,7 +33,7 @@
from vllm.outputs import RequestOutput
from vllm.sequence import SampleLogprobs
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
- identity, is_cpu)
+ identity, is_cpu, is_npu)

logger = init_logger(__name__)

@@ -213,6 +213,10 @@ def wrap_device(self, input: _T) -> _T:
if hasattr(input, 'device') and input.device.type == "cuda":
return input # Already on GPU, no need to move
return input.to("cuda")
+ elif is_npu():
+ if hasattr(input, 'device') and input.device.type == "npu":
+ return input # Already on NPU, no need to move
+ return input.to("npu")
else:
# Check if the input is already on the CPU
if hasattr(input, 'device') and input.device.type == "cpu":
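The is_npu() helper imported here lives in vllm/utils.py elsewhere in this commit and is not shown in this view. A plausible shape, by analogy with the existing platform probes, would be:

# Assumption: the real helper may differ; this mirrors the import-probe style
# of similar checks and treats the presence of torch_npu as "NPU available".
from functools import lru_cache
import importlib.util


@lru_cache(maxsize=None)
def is_npu() -> bool:
    return importlib.util.find_spec("torch_npu") is not None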
60 changes: 40 additions & 20 deletions tests/engine/test_custom_executor.py
@@ -7,7 +7,14 @@
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
+ from vllm.executor.npu_executor import NPUExecutor, NPUExecutorAsync
from vllm.sampling_params import SamplingParams
+ from vllm.attention.backends.ascend import AscendPagedAttention
+ from vllm.attention.ops.paged_attn import PagedAttention


+ # NOTE (cmq): monkey-patch PagedAttention with the Ascend implementation
+ PagedAttention = AscendPagedAttention


class Mock:
@@ -23,6 +30,15 @@ def execute_model(self, *args, **kwargs):
return super().execute_model(*args, **kwargs)


+ class CustomNPUExecutor(NPUExecutor):

+ def execute_model(self, *args, **kwargs):
+ # Drop marker to show that this was run
+ with open(".marker", "w"):
+ ...
+ return super().execute_model(*args, **kwargs)


class CustomGPUExecutorAsync(GPUExecutorAsync):

async def execute_model_async(self, *args, **kwargs):
@@ -41,9 +57,13 @@ def test_custom_executor_type_checking(model):
engine_args = AsyncEngineArgs(model=model,
distributed_executor_backend=Mock)
AsyncLLMEngine.from_engine_args(engine_args)
+ # with pytest.raises(TypeError):
+ # engine_args = AsyncEngineArgs(
+ # model=model, distributed_executor_backend=CustomGPUExecutor)
+ # AsyncLLMEngine.from_engine_args(engine_args)
with pytest.raises(TypeError):
engine_args = AsyncEngineArgs(
- model=model, distributed_executor_backend=CustomGPUExecutor)
+ model=model, distributed_executor_backend=CustomNPUExecutor)
AsyncLLMEngine.from_engine_args(engine_args)


@@ -55,7 +75,7 @@ def test_custom_executor(model, tmpdir):
assert not os.path.exists(".marker")

engine_args = EngineArgs(
- model=model, distributed_executor_backend=CustomGPUExecutor)
+ model=model, distributed_executor_backend=CustomNPUExecutor)
engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)

@@ -67,25 +87,25 @@ def test_custom_executor(model, tmpdir):
os.chdir(cwd)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_async(model, tmpdir):
cwd = os.path.abspath(".")
os.chdir(tmpdir)
try:
assert not os.path.exists(".marker")
# @pytest.mark.parametrize("model", ["facebook/opt-125m"])
# def test_custom_executor_async(model, tmpdir):
# cwd = os.path.abspath(".")
# os.chdir(tmpdir)
# try:
# assert not os.path.exists(".marker")

engine_args = AsyncEngineArgs(
model=model, distributed_executor_backend=CustomGPUExecutorAsync)
engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
# engine_args = AsyncEngineArgs(
# model=model, distributed_executor_backend=CustomGPUExecutorAsync)
# engine = AsyncLLMEngine.from_engine_args(engine_args)
# sampling_params = SamplingParams(max_tokens=1)

async def t():
stream = await engine.add_request("0", "foo", sampling_params)
async for x in stream:
...
# async def t():
# stream = await engine.add_request("0", "foo", sampling_params)
# async for x in stream:
# ...

asyncio.run(t())
# asyncio.run(t())

assert os.path.exists(".marker")
finally:
os.chdir(cwd)
# assert os.path.exists(".marker")
# finally:
# os.chdir(cwd)
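One caveat on the PagedAttention rebind near the top of this file: assigning AscendPagedAttention to a name that was imported into the test module only shadows the local binding; vLLM's own modules never see it. If a real monkey patch is intended, the usual pattern is sketched below (an assumption about intent, not part of this commit):

# Patch the attribute on the defining module so code that resolves
# vllm.attention.ops.paged_attn.PagedAttention at call time gets the Ascend
# implementation. Modules that already did "from ... import PagedAttention"
# keep their old reference.
import vllm.attention.ops.paged_attn as paged_attn
from vllm.attention.backends.ascend import AscendPagedAttention

paged_attn.PagedAttention = AscendPagedAttention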
