format

vllm-project · zhuohan123 · Feb 16, 2024 · Jan 20, 2024 · Jan 20, 2024 · Jan 20, 2024
commit 2f25efa1130197b1f67fef736a6d4c258e84cfae
diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py
@@ -22,7 +22,7 @@
 sampling_params = SamplingParams(temperature=0.0)
 
 # Create an LLM.
-llm = llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat", tensor_parallel_size=4, enforce_eager=True, dtype="half", trust_remote_code=True)
+llm = llm = LLM(model="facebook/opt-125m")
 
 generating_prompts = [prefix + prompt for prompt in prompts]
 

diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
@@ -4,7 +4,6 @@
 import torch
 import triton
 import triton.language as tl
-import time
 
 TESLA = 'Tesla' in torch.cuda.get_device_name(0)
 
@@ -621,7 +620,7 @@ def context_attention_fwd(q,
                               b_ctx_len,
                               max_input_len,
                               alibi_slopes=None):
-        
+
         BLOCK = 128 if not TESLA else 64
         # shape constraints
         Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
@@ -729,4 +728,4 @@ def context_attention_fwd(q,
             num_warps=num_warps,
             num_stages=1,
         )
-        return
+        return