wip

vllm-project · Aug 15, 2024 · 34e5db1 · 34e5db1
1 parent 66d617e
commit 34e5db1
Show file tree

Hide file tree

Showing 3 changed files with 389 additions and 41 deletions.
diff --git a/test.ipynb b/test.ipynb
@@ -0,0 +1,125 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ['VLLM_ATTENTION_BACKEND'] = \"FLASHINFER\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 08-13 11:24:30 llm_engine.py:176] Initializing an LLM engine (v0.5.4) with config: model='meta-llama/Meta-Llama-3-8B', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B, use_v2_block_manager=False, enable_prefix_caching=False)\n",
+      "INFO 08-13 11:24:31 selector.py:143] Using Flashinfer backend.\n",
+      "INFO 08-13 11:24:31 model_runner.py:721] Starting to load model meta-llama/Meta-Llama-3-8B...\n",
+      "INFO 08-13 11:24:31 selector.py:143] Using Flashinfer backend.\n",
+      "INFO 08-13 11:24:32 weight_utils.py:231] Using model weights format ['*.safetensors']\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9055277a9730496daf79b93e5c4b06b9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 08-13 11:24:36 model_runner.py:733] Loading model weights took 14.9595 GB\n",
+      "INFO 08-13 11:24:37 gpu_executor.py:102] # GPU blocks: 27699, # CPU blocks: 2048\n",
+      "INFO 08-13 11:24:43 model_runner.py:1025] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
+      "INFO 08-13 11:24:43 model_runner.py:1029] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
+      "INFO 08-13 11:25:01 model_runner.py:1226] Graph capturing finished in 19 secs.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from vllm.entrypoints.llm import LLM\n",
+    "\n",
+    "llm = LLM(model=\"meta-llama/Meta-Llama-3-8B\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.69it/s, est. speed input: 14.12 toks/s, output: 75.29 toks/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[RequestOutput(request_id=1, prompt='apples', prompt_token_ids=[128000, 680, 645], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='common clegCommonGenERIC:wikipedia:cccccccc${Satellite Image}', token_ids=(5581, 272, 1978, 11076, 10172, 37016, 53982, 15288, 25, 56697, 56697, 2420, 35982, 18652, 4758, 92), cumulative_logprob=None, logprobs=None, finish_reason=length, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1723548797.5150597, last_token_time=1723548797.5150597, first_scheduled_time=1723548797.5180113, first_token_time=1723548797.5427005, time_in_queue=0.0029516220092773438, finished_time=1723548797.7302678), lora_request=None)]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm.generate(\"apples\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}