
Commit

revert change on test
MengqingCao committed Oct 9, 2024
1 parent 1df3978 commit 4abc281
Showing 5 changed files with 15 additions and 19 deletions.
1 change: 0 additions & 1 deletion Dockerfile.npu
@@ -20,7 +20,6 @@ COPY examples/ /workspace/vllm/examples
 COPY tests/ /workspace/vllm/tests
 
 # install build requirements
-RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
 # build vLLM with NPU backend
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="npu" python3 -m pip install /workspace/vllm/
8 changes: 4 additions & 4 deletions benchmarks/benchmark_throughput.py
@@ -241,14 +241,14 @@ async def run_vllm_async(
 
 def run_hf(requests: List[Tuple[str, int, int]], model: str,
            tokenizer: PreTrainedTokenizerBase, n: int, use_beam_search: bool,
-           max_batch_size: int, trust_remote_code: bool, device: str) -> float:
+           max_batch_size: int, trust_remote_code: bool) -> float:
     assert not use_beam_search
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
         tokenizer.pad_token = tokenizer.eos_token
-    llm = llm.to(device)
+    llm = llm.cuda()
 
     pbar = tqdm(total=len(requests))
     start = time.perf_counter()
@@ -273,7 +273,7 @@ def run_hf(requests: List[Tuple[str, int, int]], model: str,
         input_ids = tokenizer(batch, return_tensors="pt",
                               padding=True).input_ids
         llm_outputs = llm.generate(
-            input_ids=input_ids.to(device),
+            input_ids=input_ids.cuda(),
             do_sample=not use_beam_search,
             num_return_sequences=n,
             temperature=1.0,
@@ -350,7 +350,7 @@ def main(args: argparse.Namespace):
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                               args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code, args.device)
+                              args.trust_remote_code)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                                args.output_len)
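Note: the hunks above drop the device parameter from run_hf and restore hard-coded CUDA placement, so the model and the tokenized batch are both moved to the GPU explicitly. A minimal, self-contained sketch of that restored pattern (the model name and prompt here are illustrative, not taken from the benchmark):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model and prompt; the benchmark receives these as arguments.
model_name = "facebook/opt-125m"

tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
llm = llm.cuda()  # hard-coded CUDA placement, as restored by this commit

# Inputs must live on the same device as the model before generate().
input_ids = tokenizer(["The capital of France is"], return_tensors="pt",
                      padding=True).input_ids
outputs = llm.generate(input_ids=input_ids.cuda(), max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))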
4 changes: 2 additions & 2 deletions tests/basic_correctness/test_basic_correctness.py
@@ -18,8 +18,8 @@
 from ..utils import multi_gpu_test
 
 MODELS = [
-    "facebook/opt-125m", "/home/models/llama-2-7b/"
-    # "meta-llama/Llama-2-7b-hf",
+    "facebook/opt-125m",
+    "meta-llama/Llama-2-7b-hf",
 ]
 
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
11 changes: 1 addition & 10 deletions tests/conftest.py
@@ -35,7 +35,6 @@
                          to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
-from vllm.platforms import current_platform
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
                         identity, is_cpu)
 
@@ -249,15 +248,7 @@ class HfRunner:
 
     def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
         if device is None:
-            if current_platform.is_cpu():
-                current_device = "cpu"
-            elif current_platform.is_npu():
-                current_device = "npu"
-            elif current_platform.is_xpu():
-                current_device = "xpu"
-            else:
-                current_device = "cuda"
-            return self.wrap_device(input, current_device)
+            return self.wrap_device(input, "cpu" if is_cpu() else "cuda")
 
         if hasattr(input, "device") and input.device.type == device:
             return input
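Note: this hunk replaces the per-platform dispatch (cpu/npu/xpu/cuda) with the original two-way default. A rough standalone sketch of the restored behavior, assuming is_cpu() from vllm.utils reports whether the CPU backend is in use (the real method also recurses over nested inputs, which is omitted here):

from typing import Optional, TypeVar

from vllm.utils import is_cpu

_T = TypeVar("_T")

def wrap_device(input: _T, device: Optional[str] = None) -> _T:
    """Move `input` to `device`, defaulting to CPU or CUDA."""
    if device is None:
        # Two-way default restored by this commit.
        device = "cpu" if is_cpu() else "cuda"
    # Leave the input alone if it already lives on the target device.
    if hasattr(input, "device") and input.device.type == device:
        return input
    return input.to(device)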
10 changes: 8 additions & 2 deletions tests/prompts/example.txt
@@ -1,2 +1,8 @@
-The president of the United States is
-The future of AI is
+vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
+Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
+Compare and contrast artificial intelligence with human intelligence in terms of processing information.
+Describe the basic components of a neural network and how it can be trained.
+Write a short story about a robot that dreams for the first time.
+Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
+Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
+Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
