Skip to content

Commit cfce256

Browse files
divakar-amd and sumitd2
authored and committed
[Misc] add process_weights_after_loading for DummyLoader (vllm-project#8969)
Signed-off-by: Sumit Dubey <sumit.dubey2@ibm.com>
1 parent fb63827 commit cfce256

File tree

1 file changed

+12
-0
lines changed

1 file changed

+12
-0
lines changed

vllm/model_executor/model_loader/loader.py

+12
Original file line number | Diff line number | Diff line change
@@ -441,6 +441,18 @@ def load_model(self, *, model_config: ModelConfig,
441441
# NOTE(woosuk): For accurate performance evaluation, we assign
442442
# random values to the weights.
443443
initialize_dummy_weights(model)
444+
445+
for _, module in model.named_modules():
446+
quant_method = getattr(module, "quant_method", None)
447+
if quant_method is not None:
448+
# When quant methods need to process weights after loading
449+
# (for repacking, quantizing, etc), they expect parameters
450+
# to be on the global target device. This scope is for the
451+
# case where cpu offloading is used, where we will move the
452+
# parameters onto device for processing and back off after.
453+
with device_loading_context(
454+
module, torch.device(device_config.device)):
455+
quant_method.process_weights_after_loading(module)
444456
return model.eval()
445457

446458

0 commit comments

Comments (0)