
Commit 16f8c5d

Merge remote-tracking branch 'origin/main' into fix_h100

2 parents: 82acc5b + 446f07d

32 files changed: +626 -441 lines

.github/workflows/regression_test.yml

Lines changed: 2 additions & 2 deletions
@@ -64,7 +64,7 @@ jobs:
            torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.6"
-           dev-requirements-overrides: "s/^pytest$/pytest==7.4.0/"
+           dev-requirements-overrides: "s/^pytest.*$/pytest==7.4.0/"
          - name: CUDA 2.6
            runs-on: linux.g5.12xlarge.nvidia.gpu
            torch-spec: 'torch==2.6.0'
@@ -83,7 +83,7 @@ jobs:
            torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu'
            gpu-arch-type: "cpu"
            gpu-arch-version: ""
-           dev-requirements-overrides: "s/^pytest$/pytest==7.4.0/"
+           dev-requirements-overrides: "s/^pytest.*$/pytest==7.4.0/"
          - name: CPU 2.6
            runs-on: linux.4xlarge
            torch-spec: 'torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu'
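
Note: dev-requirements-overrides appears to be a sed substitution applied to dev-requirements.txt for older torch builds. Since dev-requirements.txt now pins pytest==8.3.4 (see below), the anchored pattern ^pytest$ no longer matches, so it is loosened to ^pytest.*$. A minimal sketch of the difference in plain Python (re.sub standing in for the CI's sed):

import re

line = "pytest==8.3.4"
print(re.sub(r"^pytest$", "pytest==7.4.0", line))    # no match: prints pytest==8.3.4
print(re.sub(r"^pytest.*$", "pytest==7.4.0", line))  # match: prints pytest==7.4.0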

README.md

Lines changed: 98 additions & 97 deletions
Large diffs are not rendered by default.

dev-requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # Test utilities
-pytest
+pytest==8.3.4
 unittest-xml-reporting
 parameterized
 packaging

e2e_fp8_sparse.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

rowwise_scaled_linear_sparse_cutlass_time_results.csv

Lines changed: 0 additions & 4 deletions
This file was deleted.

setup.py

Lines changed: 1 addition & 0 deletions
@@ -433,6 +433,7 @@ def get_extensions():
                 "to_sparse_semi_structured_cutlass_sm9x_f8.cu",
             ),
             os.path.join(extensions_cuda_dir, "activation24", "sparsify24.cu"),
+            os.path.join(extensions_cuda_dir, "activation24", "sparse_gemm.cu"),
         ]
         for dtypes in ["e4m3e4m3", "e4m3e5m2", "e5m2e4m3", "e5m2e5m2"]:
             cutlass_90a_sources.append(
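
Note: the new sparse_gemm.cu is appended to cutlass_90a_sources so it gets compiled with the SM90a kernels. A minimal, hypothetical sketch (names are illustrative, not the repo's actual get_extensions() logic) of how such a source list is typically handed to the PyTorch build machinery:

from torch.utils.cpp_extension import CUDAExtension

ext = CUDAExtension(
    name="torchao._C_cutlass_90a",  # hypothetical extension name
    sources=["torchao/csrc/cuda/activation24/sparse_gemm.cu"],  # from this diff
    extra_compile_args={"nvcc": ["-gencode=arch=compute_90a,code=sm_90a"]},
)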

test/dtypes/test_affine_quantized.py

Lines changed: 18 additions & 0 deletions
@@ -424,6 +424,24 @@ def test_slice_and_copy_int4wo(self, device, dtype):
         # making sure param.data is updated
         assert param.data.dequantize()[0][0] != 0

+    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("dtype", [torch.bfloat16])
+    @skip_if_no_cuda()
+    @skip_if_rocm("ROCm enablement in progress")
+    def test_mm_int4wo(self, device, dtype):
+        weight = torch.randn(512, 1024).to(device).to(dtype)
+        weight = weight.t()
+
+        l = torch.nn.Linear(512, 1024).to(device).to(dtype)
+        l.weight = torch.nn.Parameter(weight)
+        quantize_(l, Int4WeightOnlyConfig())
+        # weight shape: 1024 x 512
+        weight = l.weight
+
+        input = torch.randn(1, 512, device=device, dtype=dtype)
+        # make sure it runs
+        torch.nn.functional.linear(input, weight)
+

 common_utils.instantiate_parametrized_tests(TestAffineQuantized)
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedBasic)
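
Note: the new test hands nn.Linear an already-transposed weight, presumably so the quantized tensor subclass is exercised through the plain mm/addmm dispatch rather than the contiguous fast path. A minimal sketch (plain tensors, no quantization) of the identity it relies on:

import torch

x = torch.randn(1, 512)
W = torch.randn(1024, 512)
# F.linear(x, W) computes x @ W.T
assert torch.allclose(torch.nn.functional.linear(x, W), x @ W.t())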

test/dtypes/test_affine_quantized_float.py

Lines changed: 21 additions & 0 deletions
@@ -27,6 +27,7 @@

 from torchao.float8.float8_utils import compute_error
 from torchao.quantization import (
+    Float8DynamicActivationFloat8WeightConfig,
     float8_dynamic_activation_float8_weight,
     float8_weight_only,
     quantize_,
@@ -308,6 +309,26 @@ def test_fp8_weight_dimension_warning(self):
             f"Expected warning message containing: {expected}",
         )

+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(
+        not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
+    )
+    def test_mm_float8dq(self):
+        device = "cuda"
+        dtype = torch.bfloat16
+        weight = torch.randn(512, 1024).to(device).to(dtype)
+        weight = weight.t()
+
+        l = torch.nn.Linear(512, 1024).to(device).to(dtype)
+        l.weight = torch.nn.Parameter(weight)
+        quantize_(l, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
+        # weight shape: 1024 x 512
+        weight = l.weight
+
+        input = torch.randn(1, 512, device=device, dtype=dtype)
+        # make sure it runs
+        torch.nn.functional.linear(input, weight)
+

 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)
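
Note: Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()) quantizes with one scale per weight row. A minimal sketch of per-row float8 scaling in that spirit (not torchao's actual implementation):

import torch

w = torch.randn(1024, 512, dtype=torch.bfloat16)
scale = w.abs().amax(dim=1, keepdim=True).float() / torch.finfo(torch.float8_e4m3fn).max
w_fp8 = (w / scale).to(torch.float8_e4m3fn)  # quantize: one scale per row
w_approx = w_fp8.to(torch.float32) * scale   # dequantize back to high precision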

test/quantization/test_config_serialization.py

Lines changed: 0 additions & 2 deletions
@@ -63,8 +63,6 @@
     GemliteUIntXWeightOnlyConfig(
         group_size=128,  # Optional, has default of 64
         bit_width=8,  # Optional, has default of 4
-        packing_bitwidth=8,  # Optional, has default of 32
-        contiguous=True,  # Optional, has default of None
     ),
     FPXWeightOnlyConfig(ebits=4, mbits=8),
     # Sparsity configs

test/sparsity/test_activation24.py

Lines changed: 66 additions & 0 deletions
@@ -8,6 +8,7 @@
     PerRow,
     quantize_,
 )
+from torchao.quantization.quant_api import _float8_cutlass_quant

 torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = True

@@ -141,3 +142,68 @@ def srelu_linear(x):
     custom_output = reference_linear_copy(input_tensor)

     torch.testing.assert_close(reference_output, custom_output, rtol=0.1, atol=0.01)
+
+
+@unittest.skipIf(not is_sm_at_least_90(), "Need CUDA arch at least SM90")
+def test_sparse24_fp8_sm90_cutlass_gemm_eye(
+    M=512, K=256, dtype=torch.float8_e4m3fn
+) -> None:
+    torch.manual_seed(0)
+
+    A_dense = create_semi_structured_tensor(M, K, dtype=torch.bfloat16).cuda()
+    A_aqt = _float8_cutlass_quant(A_dense, dtype)
+    A = A_aqt.tensor_impl.float8_data
+
+    # NOTE: the CUTLASS compression kernel expects the input to be *exactly*
+    # 2:4 sparse already (e.g. it does not select the largest values)
+    A_packed, A_mdata = to_sparse_semi_structured_cutlass_sm9x_f8(A)
+    assert torch.allclose(
+        A_packed.float().sum(), A.float().sum()
+    )  # Check all values are there
+
+    # Check MM without scale
+    eye = torch.eye(A.shape[1], device=A.device, dtype=A.dtype).T
+    A_reconstructed = torch.ops.torchao.sparse24_fp8_sm90_cutlass_gemm(
+        A_packed, A_mdata, eye
+    )
+    assert torch.allclose(A.float(), A_reconstructed.float())
+
+    # Check MM with scale
+    b_scale = torch.randn([1, A.shape[1]], device=eye.device, dtype=torch.float32)
+    a_scale = torch.randn([A.shape[0], 1], device=eye.device, dtype=torch.float32)
+    A_reconstructed = torch.ops.torchao.sparse24_fp8_sm90_cutlass_gemm(
+        A_packed, A_mdata, eye, a_scale=a_scale, b_scale=b_scale
+    )
+    assert torch.allclose(
+        A.float() * b_scale * a_scale, A_reconstructed.float(), rtol=0.01
+    )
+
+
+@unittest.skipIf(not is_sm_at_least_90(), "Need CUDA arch at least SM90")
+def test_sparse24_fp8_sm90_cutlass_gemm_random_tensor(
+    M=512, N=1024, K=256, dtype=torch.float8_e4m3fn
+) -> None:
+    def _to_fp8_rowwise(x: torch.Tensor, dtype):
+        max_v = torch.finfo(dtype).max
+        x_scale = (x.abs().max(1, keepdim=True)[0] / max_v).float()
+        x = (x / x_scale).to(dtype)
+        return x, x_scale
+
+    torch.manual_seed(0)
+    A_dense = create_semi_structured_tensor(M, K, dtype=torch.bfloat16).cuda()
+    A, a_scale = _to_fp8_rowwise(A_dense, dtype)
+
+    B_dense = torch.randn([N, K], device="cuda", dtype=torch.bfloat16)
+    B, b_scale = _to_fp8_rowwise(B_dense, dtype)
+
+    B = B.T
+    b_scale = b_scale.T
+
+    A_packed, A_mdata = to_sparse_semi_structured_cutlass_sm9x_f8(A)
+    out_sparse = torch.ops.torchao.sparse24_fp8_sm90_cutlass_gemm(
+        A_packed, A_mdata, B, a_scale=a_scale, b_scale=b_scale
+    )
+    out_ref = torch._scaled_mm(
+        A, B, scale_a=a_scale, scale_b=b_scale, out_dtype=out_sparse.dtype
+    )
+    assert torch.allclose(out_sparse, out_ref, rtol=0.01, atol=0.01)
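
Note: the eye test validates the packed 2:4 layout by multiplying with an identity matrix, since A @ I = A round-trips the compression. A minimal pure-torch sketch of the two ingredients (assumes nothing from torchao):

import torch

x = torch.randn(512, 256)
groups = x.view(-1, 4)
_, idx = groups.abs().topk(2, dim=1)                      # keep 2 largest of every 4
mask = torch.zeros_like(groups, dtype=torch.bool).scatter(1, idx, True)
a = (groups * mask).view_as(x)                            # exactly 2:4 sparse now

eye = torch.eye(a.shape[1])
assert torch.allclose(a @ eye, a)                         # identity MM reconstructs a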
