Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
raywanb committed Aug 15, 2024
1 parent 66d617e commit 34e5db1
Show file tree
Hide file tree
Showing 3 changed files with 389 additions and 41 deletions.
125 changes: 125 additions & 0 deletions test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Choose the attention backend through the environment; this runs in the first\n",
"# cell, ahead of the engine construction in the next cell.\n",
"os.environ.update({\"VLLM_ATTENTION_BACKEND\": \"FLASHINFER\"})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 08-13 11:24:30 llm_engine.py:176] Initializing an LLM engine (v0.5.4) with config: model='meta-llama/Meta-Llama-3-8B', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B, use_v2_block_manager=False, enable_prefix_caching=False)\n",
"INFO 08-13 11:24:31 selector.py:143] Using Flashinfer backend.\n",
"INFO 08-13 11:24:31 model_runner.py:721] Starting to load model meta-llama/Meta-Llama-3-8B...\n",
"INFO 08-13 11:24:31 selector.py:143] Using Flashinfer backend.\n",
"INFO 08-13 11:24:32 weight_utils.py:231] Using model weights format ['*.safetensors']\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9055277a9730496daf79b93e5c4b06b9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 08-13 11:24:36 model_runner.py:733] Loading model weights took 14.9595 GB\n",
"INFO 08-13 11:24:37 gpu_executor.py:102] # GPU blocks: 27699, # CPU blocks: 2048\n",
"INFO 08-13 11:24:43 model_runner.py:1025] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
"INFO 08-13 11:24:43 model_runner.py:1029] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
"INFO 08-13 11:25:01 model_runner.py:1226] Graph capturing finished in 19 secs.\n"
]
}
],
"source": [
"from vllm import LLM\n",
"\n",
"# Model identifier handed to the engine; kept as a named constant so it is easy to swap.\n",
"MODEL_NAME = \"meta-llama/Meta-Llama-3-8B\"\n",
"\n",
"# Building the engine loads the weights and captures CUDA graphs (see this cell's log output).\n",
"llm = LLM(model=MODEL_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 4.69it/s, est. speed input: 14.12 toks/s, output: 75.29 toks/s]\n"
]
},
{
"data": {
"text/plain": [
"[RequestOutput(request_id=1, prompt='apples', prompt_token_ids=[128000, 680, 645], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='common clegCommonGenERIC:wikipedia:cccccccc${Satellite Image}', token_ids=(5581, 272, 1978, 11076, 10172, 37016, 53982, 15288, 25, 56697, 56697, 2420, 35982, 18652, 4758, 92), cumulative_logprob=None, logprobs=None, finish_reason=length, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1723548797.5150597, last_token_time=1723548797.5150597, first_scheduled_time=1723548797.5180113, first_token_time=1723548797.5427005, time_in_queue=0.0029516220092773438, finished_time=1723548797.7302678), lora_request=None)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Single-prompt smoke test; the call's return value is the cell's displayed result.\n",
"prompt = \"apples\"\n",
"llm.generate(prompt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 34e5db1

Please sign in to comment.