Description
int4 weight-only quantization works as expected on GPU, with only very small errors, but on CPU the quantized model's output appears to be wrong. Here is the code to reproduce the problem.
import torch
import torch.nn as nn
from torch.nn import functional as F

from torchao.quantization.quant_api import (
    quantize_,
    int4_weight_only,
)


class TestModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(10, 20)
        self.linear2 = nn.Linear(20, 30)
        self.relu = nn.ReLU()
        self.seq = nn.Sequential(nn.Linear(30, 40), nn.ReLU())

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.seq(x)
        return x


model = TestModel()
cpu_quant_model = TestModel()

device = "cuda:0"
model.to(device)
cpu_quant_model.cpu()

test_input = torch.randn((10, 10), device=device)
original_output = model(test_input)  # float32 reference output from the GPU model

quantize_(model, int4_weight_only())  # Quantize the model on GPU
quanted_output = model(test_input)
print(F.mse_loss(original_output, quanted_output))  # Only a very small difference of 6.8689e-08

quantize_(cpu_quant_model, int4_weight_only())  # Quantize the second model on CPU
cpu_quanted_output = cpu_quant_model(test_input.cpu())
print(F.mse_loss(original_output, cpu_quanted_output.to(device)))  # Large difference of 0.0281, several orders of magnitude above the GPU error
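
One caveat when reading the numbers above: model and cpu_quant_model are initialized independently, so the 0.0281 figure mixes int4 quantization error with the difference between two sets of random weights. A minimal sketch to isolate the CPU path, reusing TestModel, test_input, quantize_, and int4_weight_only from the script above, is to quantize a deep copy of a single CPU model and compare it against its own float output:

    import copy

    # Same-weights, same-device check: the only difference between the two
    # outputs below is the int4 weight-only quantization applied on CPU.
    cpu_model = TestModel().cpu()
    cpu_ref_output = cpu_model(test_input.cpu())   # float32 reference on CPU

    cpu_q_model = copy.deepcopy(cpu_model)         # identical weights
    quantize_(cpu_q_model, int4_weight_only())     # quantize the copy on CPU
    cpu_q_output = cpu_q_model(test_input.cpu())

    print(F.mse_loss(cpu_ref_output, cpu_q_output))

If this same-weights comparison still shows an error far above the ~1e-7 level seen on GPU, that points at the CPU int4 weight-only path itself rather than at the differing initializations.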