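Fix Neuron support: set tensor_parallel_size=2 in the offline inference example and map MistralForCausalLM to MistralForSampling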

vllm-project · zhuohan123 · Mar 22, 2024 · Mar 14, 2024 · Mar 14, 2024 · Mar 14, 2024
commit 364d6203c9926654bb206aa05cf17ede8a33dc4c
diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py
@@ -24,7 +24,8 @@
     # The device can be automatically detected when AWS Neuron SDK is installed.
     # The device argument can be either unspecified for automated detection,
     # or explicitly assigned.
-    device="neuron")
+    device="neuron",
+    tensor_parallel_size=2)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)

diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py
@@ -30,7 +30,7 @@
     "LlamaForCausalLM": ("transformers_neuronx.llama.model",
                          "LlamaForSampling", "LlamaForCausalLM"),
     "MistralForCausalLM": ("transformers_neuronx.mistral.model",
-                           "LlamaForSampling", "MistralForCausalLM")
+                           "MistralForSampling", "MistralForCausalLM")
 }