1 parent b30425b commit f9936d3
vllm/attention/layer.py
@@ -281,6 +281,7 @@ def forward(
         if attn_metadata.enable_kv_scales_calculation:
             self.calc_kv_scales(query, key, value)

+        output_dtype = query.dtype
         if self.query_quant is not None:
             # quantizing with a simple torch operation enables
             # torch.compile to fuse this into previous ops
@@ -293,7 +294,7 @@ def forward(

         output_shape = (output_shape
                         if output_shape is not None else query.shape)
         output = torch.zeros(output_shape,
-                             dtype=query.dtype,
+                             dtype=output_dtype,
                              device=query.device)
         hidden_size = output_shape[-1]
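The change records query.dtype in output_dtype before the query-quantization step, so the output buffer is allocated in the caller-visible dtype even after query has been quantized in place. A minimal sketch of the pattern (not vLLM's actual forward; the forward_sketch name and the float16 quantization stand-in are hypothetical):

import torch

def forward_sketch(query: torch.Tensor) -> torch.Tensor:
    # Capture the dtype before quantization; afterwards query.dtype
    # would be the quantized dtype, which is wrong for the output.
    output_dtype = query.dtype

    # Hypothetical stand-in for self.query_quant: a plain torch op,
    # which (per the diff's comment) lets torch.compile fuse it into
    # previous ops. float16 stands in for a real quantized dtype.
    query = query.to(torch.float16)

    # Allocate the output in the original dtype. Using query.dtype here
    # (the pre-commit behavior) would yield a float16 buffer instead.
    output = torch.zeros(query.shape,
                         dtype=output_dtype,
                         device=query.device)
    return output

out = forward_sketch(torch.randn(2, 8, dtype=torch.bfloat16))
assert out.dtype == torch.bfloat16  # output keeps the original dtype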