pytorch
diff --git a/.github/workflows/build_wheels_linux.yml b/.github/workflows/build_wheels_linux.yml
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/mx_formats/cast_bench.py b/benchmarks/mx_formats/cast_bench.py
Lines changed: 7 additions & 1 deletion
diff --git a/setup.py b/setup.py
Lines changed: 11 additions & 10 deletions
diff --git a/test/prototype/mx_formats/test_custom_cast.py b/test/prototype/mx_formats/test_kernels.py
File renamed: test/prototype/mx_formats/test_custom_cast.py renamed to test/prototype/mx_formats/test_kernels.py
Lines changed: 33 additions & 11 deletions
diff --git a/test/prototype/mx_formats/test_mx_tensor.py b/test/prototype/mx_formats/test_mx_tensor.py
Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ jobs:
       os: linux
       with-cpu: enable
       with-cuda: enable
-      with-rocm: disable
+      with-rocm: enable
       with-xpu: enable
       # Note: if free-threaded python is required add py3.13t here
       python-versions: '["3.9"]'
 
@@ -1,11 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
 from typing import Callable, Tuple
 
 import fire
 import torch
 import triton
 from torch._inductor.utils import do_bench_using_profiling
 
-from torchao.prototype.mx_formats.custom_cast import (
+from torchao.prototype.mx_formats.kernels import (
     triton_to_mxfp8_dim1,
 )
 from torchao.prototype.mx_formats.mx_tensor import to_mx
 
@@ -311,16 +311,17 @@ def get_extensions():
         glob.glob(os.path.join(extensions_cuda_dir, "**/*.cu"), recursive=True)
     )
 
-    extensions_hip_dir = os.path.join(
-        extensions_dir, "cuda", "tensor_core_tiled_layout"
-    )
-    hip_sources = list(
-        glob.glob(os.path.join(extensions_hip_dir, "*.cu"), recursive=True)
-    )
-    extensions_hip_dir = os.path.join(extensions_dir, "cuda", "sparse_marlin")
-    hip_sources += list(
-        glob.glob(os.path.join(extensions_hip_dir, "*.cu"), recursive=True)
-    )
+    # Define HIP source directories
+    hip_source_dirs = [
+        os.path.join(extensions_dir, "cuda", "tensor_core_tiled_layout"),
+        # TODO: Add sparse_marlin back in once we have a ROCm build for it
+        # os.path.join(extensions_dir, "cuda", "sparse_marlin")
+    ]
+
+    # Collect all HIP sources from the defined directories
+    hip_sources = []
+    for hip_dir in hip_source_dirs:
+        hip_sources.extend(glob.glob(os.path.join(hip_dir, "*.cu"), recursive=True))
 
     # Collect CUDA source files if needed
     if not IS_ROCM and use_cuda:
 
@@ -16,7 +16,17 @@
     F6_E2M3_EXP_BIAS,
     F6_E3M2_EXP_BIAS,
 )
-from torchao.prototype.mx_formats.custom_cast import (
+from torchao.prototype.mx_formats.fp_format_spec import (
+    _assert_equals,
+    dtype_to_interesting_values,
+    float4_e2m1_interesting_values,
+    float6_e2m3_interesting_values,
+    float6_e3m2_interesting_values,
+    get_sem_bits,
+    sem_bits_to_sem_vals,
+    sem_vals_to_f32,
+)
+from torchao.prototype.mx_formats.kernels import (
     f4_unpacked_to_f32,
     f6_e2m3_unpacked_to_f32,
     f6_e3m2_unpacked_to_f32,
@@ -33,17 +43,8 @@
     triton_to_mxfp8_dim1_reference,
     unpack_uint4,
 )
-from torchao.prototype.mx_formats.fp_format_spec import (
-    _assert_equals,
-    dtype_to_interesting_values,
-    float4_e2m1_interesting_values,
-    float6_e2m3_interesting_values,
-    float6_e3m2_interesting_values,
-    get_sem_bits,
-    sem_bits_to_sem_vals,
-    sem_vals_to_f32,
-)
 from torchao.prototype.mx_formats.mx_tensor import MXTensor
+from torchao.prototype.mx_formats.utils import to_blocked
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
     is_sm_at_least_89,
@@ -465,3 +466,24 @@ def test_triton_mxfp8_dim1_randn(M, K):
     x_mx_t, x_s_t = triton_to_mxfp8_dim1(x, inner_block_size=32)
     torch.testing.assert_close(x_mx_t, x_mx_ref, rtol=0, atol=0)
     torch.testing.assert_close(x_s_t, x_s_ref, rtol=0, atol=0)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.parametrize(
+    "shape",  # each tuple is passed as size= to torch.randint below
+    [
+        (63, 1023),
+        (128, 4),
+        (128, 8),
+        (256, 8),
+        (300, 9),
+        (133, 512),
+        (528, 512),
+        (128, 1),
+    ],
+)
+def test_rearrange(shape):  # both to_blocked call modes must produce identical output
+    scales = torch.randint(256, size=shape, device="cuda", dtype=torch.uint8)  # random uint8 values in [0, 256) on the GPU
+    eager = to_blocked(scales, False)  # presumably the eager (non-Triton) path -- TODO confirm flag meaning in to_blocked
+    triton = to_blocked(scales, True)  # presumably the Triton path -- TODO confirm; NOTE(review): local name would shadow a module-level `triton` import if this file has one
+    torch.testing.assert_close(eager, triton, atol=0, rtol=0)  # zero tolerances: results must match exactly
@@ -17,7 +17,7 @@
     DTYPE_FP6_E3M2,
     SUPPORTED_ELEM_DTYPES,
 )
-from torchao.prototype.mx_formats.custom_cast import pack_uint4, pack_uint6
+from torchao.prototype.mx_formats.kernels import pack_uint4, pack_uint6
 from torchao.prototype.mx_formats.mx_tensor import (
     MXTensor,
     ScaleCalculationMode,
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@`
`17`	`17`	`DTYPE_FP6_E3M2,`
`18`	`18`	`SUPPORTED_ELEM_DTYPES,`
`19`	`19`	`)`
`20`		`-from torchao.prototype.mx_formats.custom_cast import pack_uint4, pack_uint6`
	`20`	`+from torchao.prototype.mx_formats.kernels import pack_uint4, pack_uint6`
`21`	`21`	`from torchao.prototype.mx_formats.mx_tensor import (`
`22`	`22`	`MXTensor,`
`23`	`23`	`ScaleCalculationMode,`