
Commit

revert change on test
MengqingCao committed Oct 9, 2024
1 parent 1df3978 commit 4abc281
Showing 5 changed files with 15 additions and 19 deletions.
1 change: 0 additions & 1 deletion Dockerfile.npu
@@ -20,7 +20,6 @@ COPY examples/ /workspace/vllm/examples
 COPY tests/ /workspace/vllm/tests
 
 # install build requirements
-RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
 # build vLLM with NPU backend
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="npu" python3 -m pip install /workspace/vllm/
8 changes: 4 additions & 4 deletions benchmarks/benchmark_throughput.py
@@ -241,14 +241,14 @@ async def run_vllm_async(
 
 def run_hf(requests: List[Tuple[str, int, int]], model: str,
            tokenizer: PreTrainedTokenizerBase, n: int, use_beam_search: bool,
-           max_batch_size: int, trust_remote_code: bool, device: str) -> float:
+           max_batch_size: int, trust_remote_code: bool) -> float:
     assert not use_beam_search
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
         tokenizer.pad_token = tokenizer.eos_token
-    llm = llm.to(device)
+    llm = llm.cuda()
 
     pbar = tqdm(total=len(requests))
     start = time.perf_counter()
@@ -273,7 +273,7 @@ def run_hf(requests: List[Tuple[str, int, int]], model: str,
         input_ids = tokenizer(batch, return_tensors="pt",
                               padding=True).input_ids
         llm_outputs = llm.generate(
-            input_ids=input_ids.to(device),
+            input_ids=input_ids.cuda(),
             do_sample=not use_beam_search,
             num_return_sequences=n,
             temperature=1.0,
@@ -350,7 +350,7 @@ def main(args: argparse.Namespace):
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                               args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code, args.device)
+                              args.trust_remote_code)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                                args.output_len)
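Note: the hunks above drop the device parameter from run_hf and restore hard-coded CUDA placement, so the model and the tokenized batch are both moved to the GPU explicitly. A minimal, self-contained sketch of that restored pattern (the model name and prompt here are illustrative, not taken from the benchmark):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model and prompt; the benchmark receives these as arguments.
model_name = "facebook/opt-125m"

tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
llm = llm.cuda()  # hard-coded CUDA placement, as restored by this commit

# Inputs must live on the same device as the model before generate().
input_ids = tokenizer(["The capital of France is"], return_tensors="pt",
                      padding=True).input_ids
outputs = llm.generate(input_ids=input_ids.cuda(), max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))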
4 changes: 2 additions & 2 deletions tests/basic_correctness/test_basic_correctness.py
@@ -18,8 +18,8 @@
 from ..utils import multi_gpu_test
 
 MODELS = [
-    "facebook/opt-125m", "/home/models/llama-2-7b/"
-    # "meta-llama/Llama-2-7b-hf",
+    "facebook/opt-125m",
+    "meta-llama/Llama-2-7b-hf",
 ]
 
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
11 changes: 1 addition & 10 deletions tests/conftest.py
@@ -35,7 +35,6 @@
                          to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
-from vllm.platforms import current_platform
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
                         identity, is_cpu)
 
@@ -249,15 +248,7 @@ class HfRunner:
 
     def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
         if device is None:
-            if current_platform.is_cpu():
-                current_device = "cpu"
-            elif current_platform.is_npu():
-                current_device = "npu"
-            elif current_platform.is_xpu():
-                current_device = "xpu"
-            else:
-                current_device = "cuda"
-            return self.wrap_device(input, current_device)
+            return self.wrap_device(input, "cpu" if is_cpu() else "cuda")
 
         if hasattr(input, "device") and input.device.type == device:
             return input
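Note: this hunk replaces the per-platform dispatch (cpu/npu/xpu/cuda) with the original two-way default. A rough standalone sketch of the restored behavior, assuming is_cpu() from vllm.utils reports whether the CPU backend is in use (the real method also recurses over nested inputs, which is omitted here):

from typing import Optional, TypeVar

from vllm.utils import is_cpu

_T = TypeVar("_T")

def wrap_device(input: _T, device: Optional[str] = None) -> _T:
    """Move `input` to `device`, defaulting to CPU or CUDA."""
    if device is None:
        # Two-way default restored by this commit.
        device = "cpu" if is_cpu() else "cuda"
    # Leave the input alone if it already lives on the target device.
    if hasattr(input, "device") and input.device.type == device:
        return input
    return input.to(device)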
10 changes: 8 additions & 2 deletions tests/prompts/example.txt
@@ -1,2 +1,8 @@
-The president of the United States is
-The future of AI is
+vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
+Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
+Compare and contrast artificial intelligence with human intelligence in terms of processing information.
+Describe the basic components of a neural network and how it can be trained.
+Write a short story about a robot that dreams for the first time.
+Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
+Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
+Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
