Commit 38b1f45

enable tests on mx_formats + Blackwell (#1905)

Update [ghstack-poisoned]

1 parent: b1ecf65

File tree

5 files changed (+13, -43 lines):
- test/prototype/mx_formats/test_custom_cast.py
- test/prototype/mx_formats/test_mx_linear.py
- test/prototype/mx_formats/test_mx_mm.py
- test/prototype/mx_formats/test_mx_tensor.py
- torchao/utils.py


test/prototype/mx_formats/test_custom_cast.py (6 additions & 17 deletions)

@@ -42,10 +42,13 @@
     sem_vals_to_f32,
 )
 from torchao.prototype.mx_formats.mx_tensor import MXTensor
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_4, is_sm_at_least_100
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_8, is_sm_at_least_100

 torch.manual_seed(0)

+if not TORCH_VERSION_AT_LEAST_2_8:
+    pytest.skip("Unsupported PyTorch version", allow_module_level=True)
+

 @pytest.mark.skip(
     reason="TODO debug CI failure, low pri since this is not used in the MX code"  # noqa: E501
@@ -311,10 +314,7 @@ def test_fp4_pack_unpack():

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_4, reason="requires PyTorch >= 2.4")
-@pytest.mark.skipif(
-    is_sm_at_least_100(), reason="triton does not work yet on CUDA capability 10.0"
-)
+@pytest.mark.skipif(is_sm_at_least_100(), reason="broken on CUDA capability 10.0")
 def test_fp4_triton_unscaled_cast():
     packed_vals = torch.arange(0, 255, dtype=torch.uint8, device="cuda")
     f32_ref = f4_unpacked_to_f32(unpack_uint4(packed_vals))
@@ -324,10 +324,7 @@ def test_fp4_triton_unscaled_cast():

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_4, reason="requires PyTorch >= 2.4")
-@pytest.mark.skipif(
-    is_sm_at_least_100(), reason="triton does not work yet on CUDA capability 10.0"
-)
+@pytest.mark.skipif(is_sm_at_least_100(), reason="broken on CUDA capability 10.0")
 def test_fp4_triton_scaled_cast():
     size = (256,)
     orig_vals = torch.randn(size, dtype=torch.float, device="cuda") * 100
@@ -421,10 +418,6 @@ def test_fp6_e3m2_rounding(f32_val, f6_e3m2_enc, device):

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_4, reason="requires PyTorch >= 2.4")
-@pytest.mark.skipif(
-    is_sm_at_least_100(), reason="triton does not work yet on CUDA capability 10.0"
-)
 def test_fp6_e2m3_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 0.5, 7.5, -0.0], [-0.875, 1.0, -6.0, 0.125]]).to(
         "cuda"
@@ -440,10 +433,6 @@ def test_fp6_e2m3_pack_unpack():

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_4, reason="requires PyTorch >= 2.4")
-@pytest.mark.skipif(
-    is_sm_at_least_100(), reason="triton does not work yet on CUDA capability 10.0"
-)
 def test_fp6_e3m2_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 5.0, 28.0, -0.0], [-0.25, 0.1875, 0.0625, 8.0]]).to(
         "cuda"

test/prototype/mx_formats/test_mx_linear.py (2 additions & 13 deletions)

@@ -25,14 +25,14 @@
 )
 from torchao.quantization.utils import compute_error
 from torchao.utils import (
-    TORCH_VERSION_AT_LEAST_2_5,
+    TORCH_VERSION_AT_LEAST_2_8,
     is_sm_at_least_89,
     is_sm_at_least_100,
 )

 torch.manual_seed(2)

-if not TORCH_VERSION_AT_LEAST_2_5:
+if not TORCH_VERSION_AT_LEAST_2_8:
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)


@@ -169,10 +169,6 @@ def test_activation_checkpointing():


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.skipif(
-    is_sm_at_least_100(),
-    reason="triton does not work yet on CUDA capability 10.0",
-)
 @pytest.mark.parametrize(
     "recipe_name",
     [
@@ -265,9 +261,6 @@ def test_inference_linear(elem_dtype, bias, input_shape):


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.skipif(
-    is_sm_at_least_100(), reason="triton does not work yet on CUDA capability 10.0"
-)
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 def test_inference_compile_simple(elem_dtype):
     """
@@ -294,10 +287,6 @@ def test_inference_compile_simple(elem_dtype):


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.skipif(
-    is_sm_at_least_100(),
-    reason="triton does not work yet on CUDA capability 10.0",
-)
 @pytest.mark.skipif(
     not is_sm_at_least_100(),
     reason="MX gemms require CUDA capability 10.0",

test/prototype/mx_formats/test_mx_mm.py (2 additions & 2 deletions)

@@ -10,9 +10,9 @@
 from torchao.ops import mx_fp4_bf16, mx_fp8_bf16
 from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP4, MXTensor
 from torchao.prototype.mx_formats.utils import to_blocked
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_4, is_sm_at_least_100
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_8, is_sm_at_least_100

-if not TORCH_VERSION_AT_LEAST_2_4:
+if not TORCH_VERSION_AT_LEAST_2_8:
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)

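The allow_module_level=True flag used in this gate is standard pytest behavior: calling pytest.skip(...) at import time normally raises a usage error, but with the flag it skips every test in the module. A minimal self-contained sketch of the pattern, with a hypothetical test function:

    import pytest
    import torch

    from torchao.utils import TORCH_VERSION_AT_LEAST_2_8

    # Module-level skip: without allow_module_level=True, calling pytest.skip()
    # outside of a test would raise an error during collection.
    if not TORCH_VERSION_AT_LEAST_2_8:
        pytest.skip("Unsupported PyTorch version", allow_module_level=True)

    def test_example():
        # hypothetical test; never collected when torch < 2.8
        assert torch.ones(2).sum().item() == 2.0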

test/prototype/mx_formats/test_mx_tensor.py (2 additions & 11 deletions)

@@ -25,14 +25,13 @@
 )
 from torchao.quantization.utils import compute_error
 from torchao.utils import (
-    TORCH_VERSION_AT_LEAST_2_4,
+    TORCH_VERSION_AT_LEAST_2_8,
     is_sm_at_least_89,
-    is_sm_at_least_100,
 )

 torch.manual_seed(2)

-if not TORCH_VERSION_AT_LEAST_2_4:
+if not TORCH_VERSION_AT_LEAST_2_8:
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)


@@ -207,8 +206,6 @@ def test_transpose(elem_dtype, fp4_triton):
     """
     if elem_dtype != DTYPE_FP4 and fp4_triton:
         pytest.skip("unsupported configuration")
-    elif fp4_triton and is_sm_at_least_100():
-        pytest.skip("triton does not work yet on CUDA capability 10.0")

     M, K = 128, 256
     block_size = 32
@@ -265,9 +262,6 @@ def test_fp6_packing(elem_dtype, pack_fp6):


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.skipif(
-    is_sm_at_least_100(), reason="triton does not work yet on CUDA capability 10.0"
-)
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 @pytest.mark.parametrize("hp_dtype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("all_zeros", [False, True])
@@ -324,9 +318,6 @@ def test_to_mx_from_mx_compile_numerics(elem_dtype, hp_dtype, all_zeros):


 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.skipif(
-    is_sm_at_least_100(), reason="triton does not work yet on CUDA capability 10.0"
-)
 @pytest.mark.skipif(
     not is_sm_at_least_89(),
     reason="float8 in triton requires CUDA capability 8.9 or greater",

torchao/utils.py (1 addition & 0 deletions)

@@ -356,6 +356,7 @@ def torch_version_at_least(min_version):
     return is_fbcode() or compare_versions(torch.__version__, min_version) >= 0


+TORCH_VERSION_AT_LEAST_2_8 = torch_version_at_least("2.8.0")
 TORCH_VERSION_AT_LEAST_2_7 = torch_version_at_least("2.7.0")
 TORCH_VERSION_AT_LEAST_2_6 = torch_version_at_least("2.6.0")
 TORCH_VERSION_AT_LEAST_2_5 = torch_version_at_least("2.5.0")
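For context, torch_version_at_least() returns True for fbcode builds or when torch.__version__ is at least min_version. A simplified sketch of the comparison, assuming the packaging library for parsing; torchao's actual compare_versions helper may treat pre-release and local-build suffixes differently:

    import torch
    from packaging import version

    def torch_version_at_least(min_version: str) -> bool:
        # Drop local build metadata such as "+cu128" so nightly and source
        # builds compare on the numeric portion of the version string.
        base = torch.__version__.split("+")[0]
        return version.parse(base) >= version.parse(min_version)

    TORCH_VERSION_AT_LEAST_2_8 = torch_version_at_least("2.8.0")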
