Open
Description
This is a follow-up to #2096
Exported AO/Gemlite models work correctly with Transformers but produce incorrect tokens when run with vLLM. I suspect that the QKV merging, which involves a call to the .narrow()
method, is not handled properly. However, we have already double-checked the slicing operation, and it appears to be correct.
# Reproduction script: exported AO/Gemlite model decodes incorrectly under vLLM.
#
# Setup:
#   pip install git+https://github.com/mobiusml/gemlite --upgrade
#   VLLM_USE_V1=0 TRITON_PRINT_AUTOTUNING=1 ipython3 ...  # Make sure to disable V1!
import time  # kept for ad-hoc timing while debugging

import torch
from vllm import LLM
from vllm.sampling_params import SamplingParams

# Alternative checkpoints that reproduce the same issue:
# model_id = "mobicham/llama3.1_8b_instruct_torchao_gemlite_4bitgs64"
# model_id = "mobicham/Llama-3.2-3B-Instruct_torchao_gemlite_4bitgs64"
model_id = "mobicham/Phi-4-mini-instruct_torchao_gemlite_4bitgs64"

# Fix: batch_size was referenced below but never defined, so the repro crashed
# with a NameError before exercising the reported decoding bug.
batch_size = 1

llm = LLM(
    model=model_id,
    gpu_memory_utilization=0.9,
    dtype=torch.float16,
    max_model_len=2048,
)

# Greedy decoding (temperature=0, top_k=1) so runs are deterministic and the
# vLLM output can be compared token-for-token against Transformers.
sampling_params = SamplingParams(temperature=0.0, top_k=1, max_tokens=1024)

prompts = ["Describe the impact of artificial intelligence on society."]
outputs = llm.generate(prompts * batch_size, sampling_params)
print(outputs[0].outputs[0].text)