From 90d3b0fb1803fed7edab91e63c013d2fb83a525a Mon Sep 17 00:00:00 2001
From: ANIKET SHIVAM <3268307+ANIKET-SHIVAM@users.noreply.github.com>
Date: Tue, 26 Sep 2023 14:24:26 -0700
Subject: [PATCH] CUTLASS 3.2.1 (#1113)

* Updates for 3.2.1 release.
* Minor fix in gemm op profiler for raster order.
* Add scheduler mapping for raster order in the kernels.
---
 CHANGELOG.md | 12 +- CMakeLists.txt | 59 +- CUDA.cmake | 4 +- README.md | 15 +- cmake/NvidiaCutlassConfig.cmake | 9 +- .../08_turing_tensorop_gemm/CMakeLists.txt | 1 - .../turing_tensorop_gemm.cu | 5 +- .../turing_tensorop_conv2dfprop.cu | 6 - examples/12_gemm_bias_relu/CMakeLists.txt | 1 - .../fused_two_convs_s8_sm75_rf.cu | 5 - .../fused_two_convs_s8_sm75_shmem.cu | 8 - .../fused_two_gemms_s8_sm75_rf.cu | 4 - .../fused_two_gemms_s8_sm75_shmem.cu | 6 - .../threadblock/grouped_threadblock_swizzle.h | 28 - examples/24_gemm_grouped/CMakeLists.txt | 3 +- examples/40_cutlass_py/README.md | 27 +- examples/40_cutlass_py/conv2d.py | 8 +- examples/40_cutlass_py/customizable/README.md | 25 - examples/40_cutlass_py/customizable/conv2d.py | 7 +- examples/40_cutlass_py/customizable/gemm.py | 135 +-- .../customizable/gemm_grouped.py | 7 +- examples/40_cutlass_py/gemm.py | 6 +- examples/40_cutlass_py/gemm_grouped.py | 6 +- .../ir_gen/gen_device.py | 8 - ...ampere_gemm_universal_streamk_broadcast.cu | 140 ++- .../52_hopper_gather_scatter_fusion.cu | 3 +- .../gather_gemm.hpp | 2 +- .../54_hopper_fp8_warp_specialized_gemm.cu | 2 +- examples/python/00_basic_gemm.ipynb | 4 +- examples/python/01_epilogue.ipynb | 4 +- .../02_pytorch_extension_grouped_gemm.ipynb | 2 - examples/python/04_epilogue_visitor.ipynb | 221 ++++ examples/python/README.md | 4 + include/cute/algorithm/axpby.hpp | 9 +- include/cute/algorithm/gemm.hpp | 6 +- include/cute/algorithm/tuple_algorithms.hpp | 75 +- include/cute/arch/copy_sm90_desc.hpp | 2 +- include/cute/arch/mma_sm80.hpp | 17 +- include/cute/arch/mma_sm90.hpp | 7 +- include/cute/arch/mma_sm90_desc.hpp | 14 +- include/cute/atom/copy_traits_sm90_tma.hpp | 474 +++++--- include/cute/atom/mma_atom.hpp | 2 +- include/cute/atom/mma_traits_sm75.hpp | 4 +- include/cute/config.hpp | 2 +- include/cute/container/bit_field.hpp | 10 +- include/cute/container/tuple.hpp | 38 +- include/cute/int_tuple.hpp | 113 +- include/cute/layout.hpp | 65 +- include/cute/numeric/arithmetic_tuple.hpp | 6 +- include/cute/numeric/complex.hpp | 144 +-- include/cute/numeric/integral_constant.hpp | 59 +- include/cute/numeric/integral_ratio.hpp | 175 +++ include/cute/numeric/math.hpp | 36 +- include/cute/pointer.hpp | 36 +- include/cute/stride.hpp | 3 + include/cute/swizzle.hpp | 171 ++- include/cute/swizzle_layout.hpp | 2 + include/cute/swizzle_ptr.hpp | 9 +- include/cute/util/print.hpp | 66 +- include/cutlass/arch/mma_sm75.h | 104 +- include/cutlass/arch/mma_sm80.h | 17 +- include/cutlass/array.h | 16 +- include/cutlass/array_subbyte.h | 9 + include/cutlass/barrier.h | 2 +- include/cutlass/bfloat16.h | 11 + include/cutlass/complex.h | 56 +- include/cutlass/conv/conv2d_problem_size.h | 20 +- include/cutlass/conv/conv3d_problem_size.h | 52 +- include/cutlass/conv/convolution.h | 12 +- .../cutlass/conv/kernel/direct_convolution.h | 2 +- .../conv/kernel/implicit_gemm_convolution.h | 2 +- .../conv/threadblock/threadblock_swizzle.h | 12 +- include/cutlass/coord.h | 10 + include/cutlass/core_io.h | 11 +- include/cutlass/cutlass.h | 10 + include/cutlass/detail/helper_macros.hpp | 14 + include/cutlass/detail/layout.hpp | 17 + .../collective/builders/sm90_builder.inl | 92 
+- .../collective/collective_builder.hpp | 1 + .../cutlass/epilogue/collective/detail.hpp | 4 +- .../sm90_epilogue_tma_warpspecialized.hpp | 129 ++- include/cutlass/epilogue/fusion/callbacks.hpp | 2 + .../cutlass/epilogue/fusion/operations.hpp | 8 +- .../sm90_callbacks_tma_warpspecialized.hpp | 88 +- ...90_visitor_compute_tma_warpspecialized.hpp | 121 ++ .../sm90_visitor_load_tma_warpspecialized.hpp | 60 +- ...sm90_visitor_store_tma_warpspecialized.hpp | 44 +- .../sm90_visitor_tma_warpspecialized.hpp | 104 +- include/cutlass/epilogue/thread/activation.h | 367 ++---- .../thread/linear_combination_generic.h | 63 +- .../epilogue_with_visitor_callbacks.h | 495 ++++++++ .../threadblock/fusion/visitor_2x.hpp | 433 +++++++ .../threadblock/fusion/visitor_compute.hpp | 109 ++ .../threadblock/fusion/visitor_load.hpp | 559 +++++++++ .../threadblock/fusion/visitor_store.hpp | 781 +++++++++++++ .../epilogue/threadblock/fusion/visitors.hpp | 25 +- .../predicated_tile_iterator_params.h | 10 + include/cutlass/fast_math.h | 60 +- include/cutlass/float8.h | 16 + include/cutlass/functional.h | 11 +- .../collective/builders/sm90_gmma_builder.inl | 2 +- .../sm90_mma_tma_gmma_rs_warpspecialized.hpp | 96 +- .../cutlass/gemm/device/gemm_universal_base.h | 14 - include/cutlass/gemm/gemm.h | 31 +- .../cutlass/gemm/gemm_enumerated_types.h | 76 +- .../default_gemm_universal_with_visitor.h | 157 +++ .../kernel/gemm_grouped_problem_visitor.h | 1 - include/cutlass/gemm/kernel/gemm_universal.h | 16 +- .../gemm/kernel/gemm_universal_with_visitor.h | 321 ++++++ .../gemm_universal_with_visitor_streamk.h | 892 +++++++++++++++ .../gemm/kernel/params_universal_base.h | 73 +- include/cutlass/gemm/kernel/sm70_gemm.hpp | 2 +- include/cutlass/gemm/kernel/sm90_gemm_tma.hpp | 4 +- .../kernel/sm90_gemm_tma_warpspecialized.hpp | 4 +- ...0_gemm_tma_warpspecialized_cooperative.hpp | 19 +- ...sm90_gemm_tma_warpspecialized_pingpong.hpp | 5 +- .../gemm/kernel/sm90_tile_scheduler.hpp | 227 +--- .../kernel/sm90_tile_scheduler_stream_k.hpp | 525 ++------- .../gemm/kernel/tile_scheduler_params.h | 1005 +++++++++++++++++ .../gemm/threadblock/threadblock_swizzle.h | 72 +- .../threadblock/threadblock_swizzle_streamk.h | 12 +- .../cutlass/gemm_coord.hpp | 59 +- include/cutlass/half.h | 12 +- include/cutlass/integer_subbyte.h | 14 + .../cutlass/kernel_hardware_info.h | 77 +- include/cutlass/kernel_hardware_info.hpp | 47 +- include/cutlass/layout/matrix.h | 14 + include/cutlass/layout/pitch_linear.h | 11 + include/cutlass/layout/vector.h | 1 + include/cutlass/numeric_size.h | 93 ++ include/cutlass/numeric_types.h | 45 +- include/cutlass/pipeline/sm90_pipeline.hpp | 55 +- include/cutlass/platform/platform.h | 25 +- include/cutlass/subbyte_reference.h | 7 +- .../collective/sm90_wgmma_transpose.hpp | 2 +- .../predicated_tile_access_iterator_params.h | 12 + include/cutlass/uint128.h | 9 +- .../cutlass/{workspace.hpp => workspace.h} | 11 + .../building_in_windows_with_visual_studio.md | 93 ++ .../building_with_clang_as_host_compiler.md | 53 + media/docs/code_organization.md | 9 +- media/docs/profiler.md | 2 + python/README.md | 71 +- python/cutlass/__init__.py | 37 +- python/cutlass/backend/__init__.py | 5 - python/cutlass/backend/arguments.py | 74 +- python/cutlass/backend/c_types.py | 252 ++++- python/cutlass/backend/compiler.py | 30 +- python/cutlass/backend/conv2d_operation.py | 355 +++--- python/cutlass/backend/epilogue.py | 890 +++------------ .../cutlass/backend/{test => evt}/__init__.py | 8 +- .../cutlass/backend/evt/backend/__init__.py | 36 + 
.../backend/evt/backend/emitter_base.py | 158 +++ .../backend/evt/backend/sm80_emitter.py | 47 + .../cutlass/backend/evt/backend/sm80_nodes.py | 258 +++++ .../backend/evt/backend/sm90_emitter.py | 98 ++ .../cutlass/backend/evt/backend/sm90_nodes.py | 351 ++++++ python/cutlass/backend/evt/epilogue.py | 165 +++ .../cutlass/backend/evt/frontend/__init__.py | 33 + .../backend/evt/frontend/frontend_base.py | 262 +++++ .../backend/evt/frontend/python_ast.py | 184 +++ python/cutlass/backend/evt/ir/__init__.py | 53 + .../cutlass/backend/evt/ir/compute_nodes.py | 91 ++ python/cutlass/backend/evt/ir/dag_ir.py | 235 ++++ .../backend/evt/ir/layout_algorithm.py | 324 ++++++ python/cutlass/backend/evt/ir/layout_nodes.py | 336 ++++++ python/cutlass/backend/evt/ir/load_nodes.py | 294 +++++ python/cutlass/backend/evt/ir/node.py | 292 +++++ python/cutlass/backend/evt/ir/store_nodes.py | 276 +++++ python/cutlass/backend/evt/ir/tensor.py | 130 +++ python/cutlass/backend/evt/passes/__init__.py | 42 + .../backend/evt/passes/graph_drawer.py | 158 +++ .../backend/evt/passes/pass_argument_type.py | 116 ++ .../backend/evt/passes/pass_dag_2_tree.py | 147 +++ .../backend/evt/passes/pass_fix_element_d.py | 64 ++ .../backend/evt/passes/pass_get_impl.py | 89 ++ .../evt/passes/pass_layout_elimination.py | 217 ++++ .../backend/evt/passes/pass_manager.py | 163 +++ .../evt/passes/pass_no_op_elimination.py | 53 + .../backend/evt/passes/pass_preprocess_red.py | 98 ++ .../evt/passes/pass_shape_type_propagation.py | 59 + .../evt/passes/smem_size_calculator.py | 200 ++++ python/cutlass/backend/frontend.py | 22 +- python/cutlass/backend/gemm_operation.py | 597 +++++----- python/cutlass/backend/library.py | 529 ++------- python/cutlass/backend/parser.py | 877 -------------- python/cutlass/backend/reduction_operation.py | 97 +- python/cutlass/backend/test/conv2d_testbed.py | 807 ------------- .../backend/test/gemm_grouped_testbed.py | 276 ----- python/cutlass/backend/test/gemm_testbed.py | 765 ------------- python/cutlass/backend/test/profiler.py | 69 -- python/cutlass/backend/utils/__init__.py | 1 - python/cutlass/backend/utils/datatypes.py | 71 +- .../cutlass/backend/utils/reference_model.py | 317 ------ python/cutlass/backend/utils/software.py | 6 +- python/cutlass/cpp/cutlass_bindings.cpp | 182 --- .../cpp/include/conv/conv_problem_size.h | 102 -- python/cutlass/cpp/include/conv/convolution.h | 91 -- python/cutlass/cpp/include/conv/host.h | 54 - .../epilogue/epilogue_visitor_generic.h | 222 ---- .../epilogue/epilogue_visitor_op/unary_ops.h | 233 ---- .../visitor_op_accumulator.h | 148 --- .../epilogue_visitor_op/visitor_op_binary.h | 245 ---- .../visitor_op_column_broadcast.h | 250 ---- .../visitor_op_column_reduction.h | 341 ------ .../visitor_op_linear_combination.h | 266 ----- .../visitor_op_row_broadcast.h | 258 ----- .../visitor_op_row_reduction.h | 319 ------ .../visitor_op_tensor_input.h | 188 --- .../visitor_op_tensor_output.h | 240 ---- .../epilogue_visitor_op/visitor_op_unary.h | 226 ---- .../epilogue_visitor_with_layernorm.h | 480 -------- python/cutlass/cpp/include/gemm/gemm.h | 77 -- .../gemm/gemm_universal_with_visitor.h | 638 ----------- python/cutlass/cpp/include/layout/matrix.h | 87 -- python/cutlass/cpp/include/layout/tensor.h | 74 -- python/cutlass/cpp/include/swizzling.h | 169 --- python/cutlass/cpp/include/tensor_coord.h | 78 -- python/cutlass/cpp/include/tensor_ref_view.h | 102 -- python/cutlass/cpp/include/types.h | 146 --- python/cutlass/cpp/library.h | 32 - python/cutlass/cpp/test/conv/conv_problems.h | 
54 - python/cutlass/cpp/test/conv/convolution.h | 49 - python/cutlass/cpp/test/conv/host.h | 181 --- python/cutlass/cpp/test/gemm/gemm.h | 45 - python/cutlass/cpp/test/gemm/host.h | 431 ------- python/cutlass/emit/common.py | 18 +- python/cutlass/emit/pytorch.py | 96 +- python/cutlass/epilogue/__init__.py | 53 + python/cutlass/{ => epilogue}/epilogue.py | 53 +- .../tensor_ref.py => epilogue/evt_ops.py} | 68 +- python/cutlass/library_defaults.py | 50 +- python/cutlass/op/__init__.py | 2 +- python/cutlass/op/conv.py | 421 ++++--- python/cutlass/op/gemm.py | 137 ++- python/cutlass/op/gemm_grouped.py | 25 +- python/cutlass/op/op.py | 96 +- python/cutlass/profiler/__init__.py | 37 + python/cutlass/profiler/event_profiler.py | 185 +++ python/cutlass/shape.py | 184 +++ python/cutlass/swizzle.py | 23 +- python/cutlass/utils/__init__.py | 2 +- python/cutlass/utils/check.py | 10 +- python/cutlass/utils/datatypes.py | 122 +- python/cutlass_library/__init__.py | 49 + .../cutlass_library}/conv2d_operation.py | 59 +- .../cutlass_library}/conv3d_operation.py | 49 +- .../cutlass_library}/gemm_operation.py | 36 +- .../cutlass_library}/generator.py | 132 ++- .../cutlass_library}/library.py | 87 +- python/cutlass_library/manifest.py | 683 +++++++++++ .../cutlass_library}/rank_2k_operation.py | 65 +- .../cutlass_library}/rank_k_operation.py | 61 +- .../cutlass_library}/symm_operation.py | 65 +- .../cutlass_library}/trmm_operation.py | 71 +- python/pycute/__init__.py | 36 + python/pycute/int_tuple.py | 230 ++++ python/pycute/layout.py | 358 ++++++ python/pycute/swizzle.py | 129 +++ python/pycute/typing.py | 42 + python/setup.py | 110 +- .../setup_library.py | 23 +- python/setup_pycute.py | 46 + test/CMakeLists.txt | 5 +- test/python/backend/conv/__init__.py | 0 ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py | 233 ---- ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py | 209 ---- ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py | 130 --- ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py | 127 --- ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py | 196 ---- ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py | 220 ---- ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py | 341 ------ ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py | 86 -- ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py | 128 --- ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py | 139 --- ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py | 285 ----- ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py | 129 --- ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py | 274 ----- ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py | 128 --- ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py | 139 --- test/python/backend/gemm/__init__.py | 0 test/python/backend/gemm/gemm_bf16_sm80.py | 128 --- test/python/backend/gemm/gemm_bf16_sm90.py | 138 --- test/python/backend/gemm/gemm_f16_sm80.py | 479 -------- test/python/backend/gemm/gemm_f16_sm90.py | 182 --- test/python/backend/gemm/gemm_f32_sm80.py | 178 --- test/python/backend/gemm/gemm_f64_sm80.py | 134 --- test/python/backend/gemm/gemm_f64_sm90.py | 124 -- test/python/backend/gemm/gemm_grouped_sm80.py | 235 ---- test/python/backend/gemm/gemm_s8_sm80.py | 261 ----- test/python/backend/gemm/gemm_s8_sm90.py | 154 --- test/python/conv2d/conv2d_test_utils.py | 508 --------- .../cutlass/conv2d/conv2d_problem_sizes.py | 660 +++++++++++ .../{ => cutlass}/conv2d/conv2d_sm80.py | 62 +- .../cutlass/conv2d/conv2d_test_utils.py | 425 +++++++ .../conv => cutlass/conv2d}/run_all_tests.py | 14 +- test/python/{ => cutlass}/emit/pytorch.py | 43 +- 
.../python/cutlass/evt/evt_compute_sm80_90.py | 100 ++ test/python/cutlass/evt/evt_layout_sm80_90.py | 173 +++ test/python/cutlass/evt/evt_load_sm80_90.py | 142 +++ test/python/cutlass/evt/evt_mixed_sm80_90.py | 274 +++++ test/python/cutlass/evt/evt_store_sm80_90.py | 155 +++ .../{conv2d => cutlass/evt}/run_all_tests.py | 4 +- test/python/cutlass/evt/utils/evt_testbed.py | 230 ++++ .../python/{ => cutlass}/gemm/gemm_batched.py | 13 +- .../{ => cutlass}/gemm/gemm_f16_sm80.py | 6 +- .../{ => cutlass}/gemm/gemm_f16_sm90.py | 6 +- .../{ => cutlass}/gemm/gemm_f32_sm80.py | 6 +- .../{ => cutlass}/gemm/gemm_f64_sm80.py | 6 +- .../{ => cutlass}/gemm/gemm_f64_sm90.py | 6 +- .../python/{ => cutlass}/gemm/gemm_s8_sm80.py | 6 +- .../python/{ => cutlass}/gemm/gemm_s8_sm90.py | 6 +- test/python/cutlass/gemm/gemm_testbed.py | 387 +++++++ .../{ => cutlass}/gemm/run_all_tests.py | 4 +- .../python/cutlass/gemm}/utils.py | 120 +- .../interface/conv2d_interface.py | 1 - .../python/cutlass/interface/evt_interface.py | 245 ++++ .../{ => cutlass}/interface/gemm_interface.py | 5 +- test/python/{ => cutlass}/interface/utils.py | 6 +- test/python/pycute/run_all_tests.py | 75 ++ test/python/pycute/test_coalesce.py | 95 ++ test/python/pycute/test_complement.py | 92 ++ test/python/pycute/test_composition.py | 204 ++++ test/python/pycute/test_int_tuple.py | 80 ++ test/python/pycute/test_left_inverse.py | 87 ++ test/python/pycute/test_right_inverse.py | 96 ++ test/python/pycute/test_typing.py | 59 + ...wx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu | 2 - ...4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu | 1 - ...wx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu | 1 - ...8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu | 1 - test/unit/core/CMakeLists.txt | 20 + test/unit/core/cpp11.cu | 86 ++ test/unit/cute/core/CMakeLists.txt | 2 +- test/unit/cute/core/constant_arithmetic.cpp | 106 -- .../unit/cute/core/constants.cpp | 37 +- test/unit/cute/core/mixedbits.cpp | 90 +- test/unit/cute/hopper/tma_load.cu | 323 +++--- test/unit/cute/hopper/tma_load_testbed.hpp | 199 ++++ test/unit/cute/hopper/tma_store.cu | 153 +-- test/unit/cute/hopper/tma_store_testbed.hpp | 184 +++ .../gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu | 11 +- .../gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu | 5 +- .../gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu | 2 - .../gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu | 2 - ...mm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu | 2 - .../gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu | 2 - ...mm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu | 1 - .../gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu | 1 - .../gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu | 3 +- .../gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu | 2 - .../gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu | 2 - .../gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu | 2 - .../gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu | 2 - .../gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu | 2 - test/unit/gemm/device/gemm_testbed_3x.hpp | 18 +- test/unit/gemm/device/gemm_testbed_3x_evt.hpp | 2 +- test/unit/gemm/device/sm90_evt_operations.hpp | 53 - ...er_warpspecialized_cooperative_aux_load.cu | 12 +- ...cluster_warpspecialized_cooperative_dag.cu | 8 +- ...rpspecialized_cooperative_row_broadcast.cu | 4 +- ...uster_warpspecialized_pingpong_aux_load.cu | 12 +- ...32_cluster_warpspecialized_pingpong_dag.cu | 8 +- ..._warpspecialized_pingpong_row_broadcast.cu | 4 +- ...sm90_gemm_f8_f8_bf16_tensor_op_fp32_evt.cu | 4 +- ...cluster_warpspecialized_cooperative_evt.cu | 4 +- .../sm90_gemm_f8_f8_f8_tensor_op_fp32_evt.cu | 4 +- .../gemm/threadblock/mma_pipelined_sm75.cu | 2 +- 
.../threadblock/mma_pipelined_wmma_sm75.cu | 2 +- .../threadblock/mma_singlestage_wmma_sm75.cu | 2 +- test/unit/gemm/warp/gemm_sm75.cu | 2 +- test/unit/pipeline/pipeline_async.cu | 6 - ...e_tma_async_warp_specialized_persistent.cu | 1 - tools/library/CMakeLists.txt | 247 ++-- .../library/include/cutlass/library/library.h | 1 + tools/library/include/cutlass/library/types.h | 7 + tools/library/include/cutlass/library/util.h | 7 + tools/library/scripts/__init__.py | 0 tools/library/scripts/manifest.py | 476 -------- tools/library/scripts/rt.py | 796 ------------- tools/library/src/gemm_operation_3x.hpp | 14 + tools/library/src/util.cu | 44 + tools/profiler/CMakeLists.txt | 18 +- .../profiler}/conv2d_operation_profiler.h | 2 + .../profiler}/conv3d_operation_profiler.h | 2 + .../cutlass/profiler}/cublas_helpers.h | 0 .../cutlass/profiler}/cudnn_helpers.h | 0 .../cutlass/profiler}/cutlass_profiler.h | 0 .../{src => include/cutlass/profiler}/debug.h | 0 .../cutlass/profiler}/device_allocation.h | 0 .../cutlass/profiler}/device_context.h | 0 .../cutlass/profiler}/enumerated_types.h | 0 .../profiler}/gemm_operation_profiler.h | 6 +- .../cutlass/profiler}/gpu_timer.h | 0 .../cutlass/profiler}/operation_profiler.h | 0 .../cutlass/profiler}/options.h | 4 + .../cutlass/profiler}/performance_report.h | 0 .../cutlass/profiler}/performance_result.h | 0 .../cutlass/profiler}/problem_space.h | 9 + .../profiler}/rank_2k_operation_profiler.h | 0 .../profiler}/rank_k_operation_profiler.h | 0 .../profiler}/reduction_operation_profiler.h | 0 .../sparse_gemm_operation_profiler.h | 0 .../profiler}/symm_operation_profiler.h | 0 .../profiler}/trmm_operation_profiler.h | 0 .../profiler/src/conv2d_operation_profiler.cu | 5 +- .../profiler/src/conv3d_operation_profiler.cu | 5 +- tools/profiler/src/cublas_helpers.cu | 2 +- tools/profiler/src/cudnn_helpers.cpp | 2 +- tools/profiler/src/cutlass_profiler.cu | 18 +- tools/profiler/src/device_allocation.cu | 2 +- tools/profiler/src/device_context.cu | 2 +- tools/profiler/src/enumerated_types.cpp | 2 +- tools/profiler/src/gemm_operation_profiler.cu | 22 +- tools/profiler/src/gpu_timer.cpp | 2 +- tools/profiler/src/main.cpp | 4 +- tools/profiler/src/operation_profiler.cu | 18 +- tools/profiler/src/options.cu | 15 +- tools/profiler/src/performance_report.cpp | 5 +- tools/profiler/src/performance_result.cu | 4 +- tools/profiler/src/problem_space.cpp | 42 +- .../src/rank_2k_operation_profiler.cu | 6 +- .../profiler/src/rank_k_operation_profiler.cu | 6 +- .../src/sparse_gemm_operation_profiler.cu | 6 +- tools/profiler/src/symm_operation_profiler.cu | 6 +- tools/profiler/src/trmm_operation_profiler.cu | 6 +- .../util/include/cutlass/util/print_error.hpp | 7 +- .../cutlass/util/reference/host/gett.hpp | 4 +- 428 files changed, 22241 insertions(+), 21750 deletions(-) create mode 100644 examples/python/04_epilogue_visitor.ipynb create mode 100644 include/cute/numeric/integral_ratio.hpp create mode 100644 include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h create mode 100644 include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp create mode 100644 include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp create mode 100644 include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp create mode 100644 include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp rename python/cutlass/cpp/include/layout/layout.h => include/cutlass/epilogue/threadblock/fusion/visitors.hpp (81%) rename python/cutlass/cpp/include/arch.h => 
include/cutlass/gemm/gemm_enumerated_types.h (52%) create mode 100644 include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h create mode 100644 include/cutlass/gemm/kernel/gemm_universal_with_visitor.h create mode 100644 include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h create mode 100644 include/cutlass/gemm/kernel/tile_scheduler_params.h rename python/cutlass/cpp/include/epilogue/epilogue_visitor_op/binary_ops.h => include/cutlass/gemm_coord.hpp (69%) rename python/cutlass/cpp/compiler.h => include/cutlass/kernel_hardware_info.h (57%) create mode 100644 include/cutlass/numeric_size.h rename include/cutlass/{workspace.hpp => workspace.h} (87%) create mode 100644 media/docs/build/building_in_windows_with_visual_studio.md create mode 100644 media/docs/build/building_with_clang_as_host_compiler.md rename python/cutlass/backend/{test => evt}/__init__.py (85%) create mode 100644 python/cutlass/backend/evt/backend/__init__.py create mode 100644 python/cutlass/backend/evt/backend/emitter_base.py create mode 100644 python/cutlass/backend/evt/backend/sm80_emitter.py create mode 100644 python/cutlass/backend/evt/backend/sm80_nodes.py create mode 100644 python/cutlass/backend/evt/backend/sm90_emitter.py create mode 100644 python/cutlass/backend/evt/backend/sm90_nodes.py create mode 100644 python/cutlass/backend/evt/epilogue.py create mode 100644 python/cutlass/backend/evt/frontend/__init__.py create mode 100644 python/cutlass/backend/evt/frontend/frontend_base.py create mode 100644 python/cutlass/backend/evt/frontend/python_ast.py create mode 100644 python/cutlass/backend/evt/ir/__init__.py create mode 100644 python/cutlass/backend/evt/ir/compute_nodes.py create mode 100644 python/cutlass/backend/evt/ir/dag_ir.py create mode 100644 python/cutlass/backend/evt/ir/layout_algorithm.py create mode 100644 python/cutlass/backend/evt/ir/layout_nodes.py create mode 100644 python/cutlass/backend/evt/ir/load_nodes.py create mode 100644 python/cutlass/backend/evt/ir/node.py create mode 100644 python/cutlass/backend/evt/ir/store_nodes.py create mode 100644 python/cutlass/backend/evt/ir/tensor.py create mode 100644 python/cutlass/backend/evt/passes/__init__.py create mode 100644 python/cutlass/backend/evt/passes/graph_drawer.py create mode 100644 python/cutlass/backend/evt/passes/pass_argument_type.py create mode 100644 python/cutlass/backend/evt/passes/pass_dag_2_tree.py create mode 100644 python/cutlass/backend/evt/passes/pass_fix_element_d.py create mode 100644 python/cutlass/backend/evt/passes/pass_get_impl.py create mode 100644 python/cutlass/backend/evt/passes/pass_layout_elimination.py create mode 100644 python/cutlass/backend/evt/passes/pass_manager.py create mode 100644 python/cutlass/backend/evt/passes/pass_no_op_elimination.py create mode 100644 python/cutlass/backend/evt/passes/pass_preprocess_red.py create mode 100644 python/cutlass/backend/evt/passes/pass_shape_type_propagation.py create mode 100644 python/cutlass/backend/evt/passes/smem_size_calculator.py delete mode 100644 python/cutlass/backend/parser.py delete mode 100644 python/cutlass/backend/test/conv2d_testbed.py delete mode 100644 python/cutlass/backend/test/gemm_grouped_testbed.py delete mode 100644 python/cutlass/backend/test/gemm_testbed.py delete mode 100644 python/cutlass/backend/test/profiler.py delete mode 100644 python/cutlass/backend/utils/reference_model.py delete mode 100644 python/cutlass/cpp/cutlass_bindings.cpp delete mode 100644 python/cutlass/cpp/include/conv/conv_problem_size.h delete mode 100644 
python/cutlass/cpp/include/conv/convolution.h delete mode 100644 python/cutlass/cpp/include/conv/host.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_generic.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/unary_ops.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_accumulator.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_binary.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_column_broadcast.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_column_reduction.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_linear_combination.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_row_broadcast.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_row_reduction.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_tensor_input.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_tensor_output.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_op/visitor_op_unary.h delete mode 100644 python/cutlass/cpp/include/epilogue/epilogue_visitor_with_layernorm.h delete mode 100644 python/cutlass/cpp/include/gemm/gemm.h delete mode 100644 python/cutlass/cpp/include/gemm/gemm_universal_with_visitor.h delete mode 100644 python/cutlass/cpp/include/layout/matrix.h delete mode 100644 python/cutlass/cpp/include/layout/tensor.h delete mode 100644 python/cutlass/cpp/include/swizzling.h delete mode 100644 python/cutlass/cpp/include/tensor_coord.h delete mode 100644 python/cutlass/cpp/include/tensor_ref_view.h delete mode 100644 python/cutlass/cpp/include/types.h delete mode 100644 python/cutlass/cpp/library.h delete mode 100644 python/cutlass/cpp/test/conv/conv_problems.h delete mode 100644 python/cutlass/cpp/test/conv/convolution.h delete mode 100644 python/cutlass/cpp/test/conv/host.h delete mode 100644 python/cutlass/cpp/test/gemm/gemm.h delete mode 100644 python/cutlass/cpp/test/gemm/host.h create mode 100644 python/cutlass/epilogue/__init__.py rename python/cutlass/{ => epilogue}/epilogue.py (69%) rename python/cutlass/{backend/tensor_ref.py => epilogue/evt_ops.py} (60%) create mode 100644 python/cutlass/profiler/__init__.py create mode 100644 python/cutlass/profiler/event_profiler.py create mode 100644 python/cutlass/shape.py create mode 100644 python/cutlass_library/__init__.py rename {tools/library/scripts => python/cutlass_library}/conv2d_operation.py (89%) rename {tools/library/scripts => python/cutlass_library}/conv3d_operation.py (85%) rename {tools/library/scripts => python/cutlass_library}/gemm_operation.py (96%) rename {tools/library/scripts => python/cutlass_library}/generator.py (98%) rename {tools/library/scripts => python/cutlass_library}/library.py (92%) create mode 100644 python/cutlass_library/manifest.py rename {tools/library/scripts => python/cutlass_library}/rank_2k_operation.py (85%) rename {tools/library/scripts => python/cutlass_library}/rank_k_operation.py (85%) rename {tools/library/scripts => python/cutlass_library}/symm_operation.py (85%) rename {tools/library/scripts => python/cutlass_library}/trmm_operation.py (85%) create mode 100644 python/pycute/__init__.py create mode 100644 python/pycute/int_tuple.py create mode 100644 python/pycute/layout.py create mode 100644 
python/pycute/swizzle.py create mode 100644 python/pycute/typing.py rename test/python/backend/gemm/run_all_tests.py => python/setup_library.py (84%) create mode 100644 python/setup_pycute.py delete mode 100644 test/python/backend/conv/__init__.py delete mode 100644 test/python/backend/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py delete mode 100644 test/python/backend/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py delete mode 100644 test/python/backend/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py delete mode 100644 test/python/backend/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py delete mode 100644 test/python/backend/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py delete mode 100644 test/python/backend/gemm/__init__.py delete mode 100644 test/python/backend/gemm/gemm_bf16_sm80.py delete mode 100644 test/python/backend/gemm/gemm_bf16_sm90.py delete mode 100644 test/python/backend/gemm/gemm_f16_sm80.py delete mode 100644 test/python/backend/gemm/gemm_f16_sm90.py delete mode 100644 test/python/backend/gemm/gemm_f32_sm80.py delete mode 100644 test/python/backend/gemm/gemm_f64_sm80.py delete mode 100644 test/python/backend/gemm/gemm_f64_sm90.py delete mode 100644 test/python/backend/gemm/gemm_grouped_sm80.py delete mode 100644 test/python/backend/gemm/gemm_s8_sm80.py delete mode 100644 test/python/backend/gemm/gemm_s8_sm90.py delete mode 100644 test/python/conv2d/conv2d_test_utils.py create mode 100644 test/python/cutlass/conv2d/conv2d_problem_sizes.py rename test/python/{ => cutlass}/conv2d/conv2d_sm80.py (79%) create mode 100644 test/python/cutlass/conv2d/conv2d_test_utils.py rename test/python/{backend/conv => cutlass/conv2d}/run_all_tests.py (85%) rename test/python/{ => cutlass}/emit/pytorch.py (89%) create mode 100644 test/python/cutlass/evt/evt_compute_sm80_90.py create mode 100644 test/python/cutlass/evt/evt_layout_sm80_90.py create mode 100644 test/python/cutlass/evt/evt_load_sm80_90.py create mode 100644 test/python/cutlass/evt/evt_mixed_sm80_90.py create mode 100644 test/python/cutlass/evt/evt_store_sm80_90.py rename test/python/{conv2d => cutlass/evt}/run_all_tests.py (93%) create mode 100644 test/python/cutlass/evt/utils/evt_testbed.py rename test/python/{ => 
cutlass}/gemm/gemm_batched.py (95%) rename test/python/{ => cutlass}/gemm/gemm_f16_sm80.py (99%) rename test/python/{ => cutlass}/gemm/gemm_f16_sm90.py (99%) rename test/python/{ => cutlass}/gemm/gemm_f32_sm80.py (98%) rename test/python/{ => cutlass}/gemm/gemm_f64_sm80.py (98%) rename test/python/{ => cutlass}/gemm/gemm_f64_sm90.py (97%) rename test/python/{ => cutlass}/gemm/gemm_s8_sm80.py (98%) rename test/python/{ => cutlass}/gemm/gemm_s8_sm90.py (98%) create mode 100644 test/python/cutlass/gemm/gemm_testbed.py rename test/python/{ => cutlass}/gemm/run_all_tests.py (93%) rename {python/cutlass/backend/test => test/python/cutlass/gemm}/utils.py (70%) rename test/python/{ => cutlass}/interface/conv2d_interface.py (99%) create mode 100644 test/python/cutlass/interface/evt_interface.py rename test/python/{ => cutlass}/interface/gemm_interface.py (98%) rename test/python/{ => cutlass}/interface/utils.py (91%) create mode 100644 test/python/pycute/run_all_tests.py create mode 100644 test/python/pycute/test_coalesce.py create mode 100644 test/python/pycute/test_complement.py create mode 100644 test/python/pycute/test_composition.py create mode 100644 test/python/pycute/test_int_tuple.py create mode 100644 test/python/pycute/test_left_inverse.py create mode 100644 test/python/pycute/test_right_inverse.py create mode 100644 test/python/pycute/test_typing.py create mode 100644 test/unit/core/cpp11.cu delete mode 100644 test/unit/cute/core/constant_arithmetic.cpp rename python/cutlass/cpp/include/gemm/host.h => test/unit/cute/core/constants.cpp (57%) create mode 100644 test/unit/cute/hopper/tma_load_testbed.hpp create mode 100644 test/unit/cute/hopper/tma_store_testbed.hpp delete mode 100644 tools/library/scripts/__init__.py delete mode 100644 tools/library/scripts/manifest.py delete mode 100644 tools/library/scripts/rt.py rename tools/profiler/{src => include/cutlass/profiler}/conv2d_operation_profiler.h (99%) rename tools/profiler/{src => include/cutlass/profiler}/conv3d_operation_profiler.h (99%) rename tools/profiler/{src => include/cutlass/profiler}/cublas_helpers.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/cudnn_helpers.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/cutlass_profiler.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/debug.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/device_allocation.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/device_context.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/enumerated_types.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/gemm_operation_profiler.h (97%) rename tools/profiler/{src => include/cutlass/profiler}/gpu_timer.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/operation_profiler.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/options.h (97%) rename tools/profiler/{src => include/cutlass/profiler}/performance_report.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/performance_result.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/problem_space.h (98%) rename tools/profiler/{src => include/cutlass/profiler}/rank_2k_operation_profiler.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/rank_k_operation_profiler.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/reduction_operation_profiler.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/sparse_gemm_operation_profiler.h (100%) rename tools/profiler/{src => 
include/cutlass/profiler}/symm_operation_profiler.h (100%) rename tools/profiler/{src => include/cutlass/profiler}/trmm_operation_profiler.h (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 039fc805da..7bb701ed3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # NVIDIA CUTLASS Changelog +## [3.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.1) (2023-09-22) +* Python support for the SM90 Epilogue Visitor Tree (EVT), on top of the C++ support released in 3.2.0. +* SM80 EVT support in C++ and Python. +* Other SM90 epilogue improvements. +* Splitting the CUTLASS library into smaller units based on operation, arch, and data types. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details. +* Making `tools/library/scripts` packageable: `tools/library/scripts` has moved to `python/cutlass_library`. See the Python [README](/python/README.md) for details. +* SM90 TF32 kernel improvements for all layouts. +* SM90 rasterization direction support in the CUTLASS profiler. +* Improvements to CUTLASS profiler build times. +* Removal of the Python-C++ bindings. ## [3.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.0) (2023-08-03) @@ -91,7 +101,7 @@ * [Few channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities * [Fixed channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size * [Unit tests](/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) - * [Python-based instance emitter](/tools/library/scripts/generator.py) in the CUTLASS Library and support in the Profiler + * [Python-based instance emitter](/python/cutlass_library/generator.py) in the CUTLASS Library and support in the Profiler * [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators accelerated by Tensor Cores * Supported types: f32, cf32, f64, cf64, tf32x3, complex tf32x3 * [HERK](/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d4f9cc3a3..b880de0a52 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,7 +40,7 @@ endif() message(STATUS "CMake Version: ${CMAKE_VERSION}") set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++11 if set") -project(CUTLASS VERSION 3.2.0 LANGUAGES CXX) +project(CUTLASS VERSION 3.2.1 LANGUAGES CXX) include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake) if (CUDA_VERSION VERSION_LESS 11.3) @@ -85,6 +85,21 @@ message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}") set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.") # 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive +find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED) + +# Install cutlass_library Python package +execute_process( + WORKING_DIRECTORY ${CUTLASS_DIR}/python + COMMAND ${Python3_EXECUTABLE} ${CUTLASS_DIR}/python/setup_library.py develop --user + RESULT_VARIABLE cutlass_lib_GENERATOR_INSTALL_RESULT + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log + ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log +) + +if(NOT cutlass_lib_GENERATOR_INSTALL_RESULT EQUAL 0) + message(FATAL_ERROR "Error installing cutlass_library package. 
See ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log") +endif() + ################################################################################ set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library") @@ -92,10 +107,16 @@ if(CUTLASS_ENABLE_HEADERS_ONLY) set(CUTLASS_ENABLE_EXAMPLES_INIT OFF) set(CUTLASS_ENABLE_TOOLS_INIT ON) set(CUTLASS_ENABLE_LIBRARY_INIT OFF) + set(CUTLASS_ENABLE_TESTS_INIT OFF) else() set(CUTLASS_ENABLE_EXAMPLES_INIT ON) set(CUTLASS_ENABLE_TOOLS_INIT ON) set(CUTLASS_ENABLE_LIBRARY_INIT ON) + if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) + set(CUTLASS_ENABLE_TESTS_INIT ON) + else() + set(CUTLASS_ENABLE_TESTS_INIT OFF) + endif() endif() set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.") @@ -104,20 +125,10 @@ set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable C set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools") set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable CUTLASS Library") set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable CUTLASS Profiler") -set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Proformance") - -if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) - set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_LIBRARY}) -else() - set(CUTLASS_ENABLE_TESTS_INIT OFF) -endif() +set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Performance") set(CUTLASS_ENABLE_TESTS ${CUTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable CUTLASS Tests") - -if (CUTLASS_ENABLE_TESTS) - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake) -endif() - +set(CUTLASS_ENABLE_GTEST_UNIT_TESTS ${CUTLASS_ENABLE_TESTS} CACHE BOOL "Enable CUTLASS GTest-based Unit Tests") ################################################################################ set(CUTLASS_NVCC_ARCHS_SUPPORTED "") @@ -285,6 +296,8 @@ if (CUTLASS_ENABLE_TENSOR_CORE_MMA) endif() + + if (NOT MSVC AND CUTLASS_NVCC_KEEP) # MSVC flow handles caching already, but for other generators we handle it here. set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files") @@ -395,6 +408,7 @@ endif() # Some tests require this build option in order to link. 
if (MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj") endif() function(cutlass_apply_cuda_gencode_flags TARGET) @@ -572,11 +586,17 @@ target_include_directories( $ $ $ - $ $ $ ) +# Mark CTK headers as system to supress warnings from them +target_include_directories( + CUTLASS + SYSTEM INTERFACE + $ + ) + install( DIRECTORY ${CUTLASS_INCLUDE_DIR}/ @@ -633,6 +653,11 @@ endif() include(CTest) enable_testing() + +if (CUTLASS_ENABLE_GTEST_UNIT_TESTS) + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake) +endif() + if (NOT TARGET test_all) add_custom_target(test_all) endif() @@ -818,7 +843,7 @@ function(cutlass_add_executable_tests NAME TARGET) set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/${TEST_NAME}/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "") - if (CUTLASS_INSTALL_TESTS) + if (CUTLASS_INSTALL_TESTS) file(GENERATE OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake" @@ -831,7 +856,7 @@ function(cutlass_add_executable_tests NAME TARGET) RENAME CTestTestfile.${TEST_NAME}.cmake ) - endif() + endif() endfunction() @@ -849,7 +874,9 @@ endif() if (CUTLASS_ENABLE_TESTS) add_subdirectory(test) + if (CUTLASS_ENABLE_GTEST_UNIT_TESTS) add_dependencies(test_all test_unit) + endif() endif() if (CUTLASS_INSTALL_TESTS) diff --git a/CUDA.cmake b/CUDA.cmake index 32bd8a58b4..b9c60bcd0b 100644 --- a/CUDA.cmake +++ b/CUDA.cmake @@ -305,10 +305,10 @@ function(cutlass_add_library NAME) if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang") cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS}) - add_library(${NAME} ${TARGET_SOURCE_ARGS}) + add_library(${NAME} ${TARGET_SOURCE_ARGS} "") else() set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE) - cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS}) + cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS} "") endif() cutlass_apply_standard_compile_options(${NAME}) diff --git a/README.md b/README.md index 7ed86c117f..2d09925798 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ In addition to GEMMs, CUTLASS implements high-performance convolution via the im # What's New in CUTLASS 3.2 -CUTLASS 3.2 is an update to CUTLASS adding: +CUTLASS 3.2.0 is an update to CUTLASS adding: - New warp-specialized persistent FP8 GEMM kernel [kernel schedules](/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) and [mainloops](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. An example showcasing [Hopper warp-specialized FP8 GEMMs](/examples/54_hopper_fp8_warp_specialized_gemm). - New [Epilogue Visitor Tree (EVT)](/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu) support for Hopper TMA epilogues. EVTs allows for user-defined customized epilogue fusion patterns without having to write a new epilogue. - [Stream-K](/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp) feature for Hopper. Note that this is only a functional implementation of stream-K, and should not be used for performance comparison. Optimizations are expected in a future release. @@ -53,6 +53,14 @@ CUTLASS 3.2 is an update to CUTLASS adding: - New CUTLASS 2D Convolution Python interface. New [example](/examples/python/03_basic_conv2d.ipynb) here. - Support for Windows (MSVC) builds. 
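The 3.2.1 notes that follow announce Python support for Epilogue Visitor Trees (EVT). As a rough sketch of that workflow, modeled loosely on the `examples/python/04_epilogue_visitor.ipynb` notebook added by this patch (tensor shapes, dtypes, and the exact argument names here are assumptions rather than part of the patch):

```python
import torch
import cutlass

# GEMM plan whose default epilogue will be replaced by a traced visitor tree.
plan = cutlass.op.Gemm(element=torch.float16,
                       layout=cutlass.LayoutType.RowMajor,
                       element_accumulator=torch.float32)

# The epilogue is written as an ordinary Python function over named operands;
# the EVT frontend traces it into a DAG of load/compute/store nodes.
def example_epilogue(accum, alpha, C, beta, bias):
    D = alpha * accum + beta * C + bias  # linear combination fused with a bias add
    return D

# Representative tensors/scalars (shapes and dtypes illustrative) so the tracer
# can propagate shape and type information through the graph.
example_tensors = {
    "accum": torch.empty(512, 256, dtype=torch.float32, device="cuda"),
    "alpha": 1.0,
    "C":     torch.empty(512, 256, dtype=torch.float16, device="cuda"),
    "beta":  0.5,
    "bias":  torch.empty(512, 1, dtype=torch.float16, device="cuda"),
    "D":     torch.empty(512, 256, dtype=torch.float16, device="cuda"),
}

plan.epilogue_visitor = cutlass.epilogue.trace(example_epilogue, example_tensors)
# plan.run(A, B, C, D, visitor_args={...}) would then launch the fused kernel.
```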
+CUTLASS 3.2.1 is an update to CUTLASS adding: +- Python support for the SM90 Epilogue Visitor Tree (EVT), on top of the C++ support released in 3.2.0. +- SM80 EVT support in C++ and Python. +- Splitting the CUTLASS library into smaller units based on operation, arch, and data types. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details. +- Making `tools/library/scripts` packageable: `tools/library/scripts` has moved to `python/cutlass_library`. See the Python [README](/python/README.md) for details. +- SM90 TF32 kernel improvements for all layouts. +- SM90 rasterization direction support in the CUTLASS profiler. +- Improvements to CUTLASS profiler build times. Minimum requirements: @@ -176,7 +184,8 @@ CUTLASS is a header-only template library and does not need to be built to be used by other projects. Client applications should target CUTLASS's `include/` directory in their include paths. -CUTLASS unit tests, examples, and utilities can be build with CMake starting version 3.12. +CUTLASS unit tests, examples, and utilities can be built with CMake. +The minimum version of CMake is given in the [Quickstart guide](media/docs/quickstart.md). Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed on your system. @@ -512,7 +521,7 @@ reference_device: Passed ## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler - Please follow the links for more CMake examples on selectively compiling CUTLASS kernels: - [GEMM CMake Examples](media/docs/quickstart.md#gemm-cmake-examples) - - [Implicit GEMM conovlution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples) + - [Implicit GEMM convolution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples) - [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md) diff --git a/cmake/NvidiaCutlassConfig.cmake b/cmake/NvidiaCutlassConfig.cmake index 701ecb4af4..56d1c45076 100644 --- a/cmake/NvidiaCutlassConfig.cmake +++ b/cmake/NvidiaCutlassConfig.cmake @@ -2,6 +2,11 @@ get_filename_component(NvidiaCutlass_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH include(CMakeFindDependencyMacro) -if(NOT TARGET nvidia::cutlass::CUTLASS) - include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake") +if(TARGET nvidia::cutlass::CUTLASS) + return() endif() + +include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake") + +# For backward compatibility with the old name +add_library(cutlass_lib ALIAS cutlass_library) diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt index e9d659e192..a240bcc97f 100644 --- a/examples/08_turing_tensorop_gemm/CMakeLists.txt +++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt @@ -31,6 +31,5 @@ cutlass_example_add_executable( 08_turing_tensorop_gemm turing_tensorop_gemm.cu - DISABLE_TESTS ON ) diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index f627b842a5..c5498adf33 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -291,8 +291,8 @@ int run() { LayoutInputB, ElementOutput, LayoutOutput, - ElementComputeEpilogue, - ElementComputeEpilogue> + int32_t, + int32_t> gemm_device; // Launch device reference gemm kernel @@ -355,4 +355,3 @@ int main() { return run(); } - diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu 
index ade0b97947..6f234410c2 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -143,7 +143,6 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM #include "cutlass/util/tensor_view_io.h" #include "helper.h" - // The code section below describes datatype for input, output tensors and computation between // elements using ElementAccumulator = int32_t; // Data type of accumulator @@ -675,7 +674,6 @@ Result profile_convolution(Options const &options) { return result; } - ///////////////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char const **args) { @@ -762,11 +760,7 @@ int main(int argc, char const **args) { Result::print_header(std::cout, options) << std::endl; result.print(std::cout, 1, options) << std::endl; } - return 0; } ///////////////////////////////////////////////////////////////////////////////////////////////// - - - diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt index abe61be1ce..5d4dac6cf0 100644 --- a/examples/12_gemm_bias_relu/CMakeLists.txt +++ b/examples/12_gemm_bias_relu/CMakeLists.txt @@ -31,6 +31,5 @@ cutlass_example_add_executable( 12_gemm_bias_relu gemm_bias_relu.cu - DISABLE_TESTS ON ) diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu index 64955f8f83..07b583469e 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu @@ -220,7 +220,6 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_rf_res() { return pass; } - int main() { std::vectorfuncs = { @@ -229,10 +228,6 @@ int main() { }; return testRun(75, funcs, "conv int8 RF residency"); - } - - //////////////////////////////////////////////////////////////////////////////// - diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu index 7f82518123..9886be0d0f 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu @@ -39,7 +39,6 @@ #include "device/b2b_implicit_gemm_convolution.h" #include "b2b_interleaved_conv2d_run.h" #include "test_run.h" - //////////////////////////////////////////////////////////////////////////////// cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 ( @@ -219,20 +218,13 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_shmem() { return pass; } - - int main() { - std::vectorfuncs = { &run_nonfused_conv2d_fprop_optimized_s8_sm75, &run_fused_conv2d_fprop_optimized_s8_sm75_shmem }; return testRun(75, funcs, "conv int8 shmem staging"); - } - - //////////////////////////////////////////////////////////////////////////////// - diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu index 565cca7e5c..3872caa22f 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu @@ -195,7 +195,6 @@ bool run_fused_gemm_s8_rf_res() { return passed; } - int main() { std::vectorfuncs = { @@ -204,9 +203,6 @@ int main() { }; return testRun(75, funcs, "gemm int8 RF residency"); - - } - 
//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu index 8719d74839..d1ab01945d 100644 --- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu +++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu @@ -43,7 +43,6 @@ #include "device/b2b_gemm.h" #include "b2b_interleaved_gemm_run.h" #include "test_run.h" - //////////////////////////////////////////////////////////////////////////////// cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*640, 64, 576); @@ -197,18 +196,13 @@ bool run_fused_gemm_s8_shmem() { return passed; } - int main() { std::vectorfuncs = { &run_nonfused_gemm_s8, &run_fused_gemm_s8_shmem }; - return testRun(75, funcs, "gemm int8 shmem staing"); - - } - //////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h b/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h index 42ef4110a8..62efba26a5 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h +++ b/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h @@ -90,34 +90,6 @@ struct GroupedThreadblockSwizzle : detail::GroupedThreadblockSwizzleBase { } }; -template < - typename ThreadblockShape, - typename LayoutC, - cutlass::gemm::kernel::GroupScheduleMode GroupScheduleMode_ = cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly, - int PrefetchTileCount = 128, - int ThreadCount = PrefetchTileCount> -struct GemmGroupedThreadblockSwizzle : GroupedThreadblockSwizzle< - cutlass::gemm::kernel::GemmGroupedProblemVisitor< - ThreadblockShape, - GroupScheduleMode_, - PrefetchTileCount, - ThreadCount, - platform::is_same::value - > - > { - using Base = GroupedThreadblockSwizzle::value>>; - - CUTLASS_HOST_DEVICE - GemmGroupedThreadblockSwizzle(typename Base::ProblemVisitor::Params& params, - typename Base::ProblemVisitor::SharedStorage& shared_storage, - int block_idx) : Base(params, shared_storage, block_idx) {} -}; - template < typename ThreadblockShape, typename LayoutC, diff --git a/examples/24_gemm_grouped/CMakeLists.txt b/examples/24_gemm_grouped/CMakeLists.txt index db3479f4d2..054b96d1ed 100644 --- a/examples/24_gemm_grouped/CMakeLists.txt +++ b/examples/24_gemm_grouped/CMakeLists.txt @@ -31,6 +31,7 @@ cutlass_example_add_executable( 24_gemm_grouped - gemm_grouped.cu + gemm_grouped.cu ) + diff --git a/examples/40_cutlass_py/README.md b/examples/40_cutlass_py/README.md index d33a6d5371..c670e34072 100644 --- a/examples/40_cutlass_py/README.md +++ b/examples/40_cutlass_py/README.md @@ -1,27 +1,4 @@ # PyCUTLASS Examples -**NOTE:** This directory contains examples for PyCUTLASS, a Python library providing low-level -building blocks for emitting CUTLASS C++ kernels. For examples using CUTLASS's Pythonic interface, -see the [examples/python](/examples/python) directory. - -Two types of examples are provided: -* _Basic examples_: minimal examples that illustrate how to set up GEMMs, convolutions, and grouped GEMM operations -* [_Customizable examples_](customizable): examples that allow one to specify a variety of template parameters for the given kernel - -## Setting up the Python interface -Please follow the instructions [here](/python/README.md#installation) to set up the PyCUTLASS. 
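The replacement README text added a few lines below redirects readers to `examples/python`; for orientation, the interface that supersedes these PyCUTLASS examples reduces a basic GEMM to a handful of lines. A minimal sketch in the spirit of `examples/python/00_basic_gemm.ipynb` (problem size, dtypes, and defaults here are illustrative assumptions):

```python
import numpy as np
import cutlass

M, N, K = 256, 256, 64
A = np.random.rand(M, K).astype(np.float32)
B = np.random.rand(K, N).astype(np.float32)
C = np.zeros((M, N), dtype=np.float32)
D = np.zeros((M, N), dtype=np.float32)

# One plan object replaces PyCUTLASS's manual operation/argument plumbing;
# kernel selection and JIT compilation happen behind the scenes.
plan = cutlass.op.Gemm(element=np.float32, layout=cutlass.LayoutType.RowMajor)
plan.run(A, B, C, D)  # D = alpha * (A @ B) + beta * C, with default alpha/beta
```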
- -## Running examples -Each of the basic examples can be run as follows: -```shell -# Run the GEMM example -python gemm.py - -# Run the Conv2d example -python conv2d.py - -# Run the grouped GEMM example -python gemm_grouped.py -``` - -To run the customizable examples, refer to the README in the [customizable](customizable) directory. +This directory contains deprecated examples for PyCUTLASS, a precursor to the CUTLASS Python interface. +For examples of using CUTLASS's actively-maintained Pythonic interface, see the [examples/python](/examples/python) directory. diff --git a/examples/40_cutlass_py/conv2d.py b/examples/40_cutlass_py/conv2d.py index a21f97690c..5e7b8e24e5 100644 --- a/examples/40_cutlass_py/conv2d.py +++ b/examples/40_cutlass_py/conv2d.py @@ -33,10 +33,14 @@ Basic example of using the CUTLASS Python interface to run a 2d convolution """ +import sys +print("This example is deprecated. Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + import argparse -import torch import numpy as np -import sys +import torch import cutlass_bindings import cutlass.backend as pycutlass diff --git a/examples/40_cutlass_py/customizable/README.md b/examples/40_cutlass_py/customizable/README.md index cd25c69f3f..e8aeee9e71 100644 --- a/examples/40_cutlass_py/customizable/README.md +++ b/examples/40_cutlass_py/customizable/README.md @@ -165,28 +165,3 @@ Example 7: GELU ```python python gemm.py -i 16 8 16 -ta bfloat16 -tb bfloat16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 64 128 64 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -sw IdentitySwizzle2 -p 512 256 128 -alpha 0.0 -beta 0.5 -gm GemmSplitKParallel -k 5 -bias -activ gelu ``` -### Epilogue Visitor Tree -Example 1: -```python -python gemm.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add_fast_bf16 -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la RowMajor -aa 4 -lb ColumnMajor -ab 4 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -epv RowBroadcast -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 1 -``` -Example 2: -```python -python gemm.py -i 8 8 4 -ta float64 -tb float64 -tc float64 -tacc float64 -m multiply_add -op TensorOp -b 32 32 16 -s 4 -w 2 2 1 -cc 80 -la ColumnMajor -aa 1 -lb RowMajor -ab 1 -lc RowMajor -ac 1 -te float64 -ep LinearCombination -epv ColumnBroadcast -sw IdentitySwizzle1 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 1 -``` -Example 3: -```python -python gemm.py -i 16 8 16 -ta float16 -tb float16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb RowMajor -ab 8 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -epv RowReduction -sw IdentitySwizzle4 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 1 -``` -Example 4: -```python -python gemm.py -i 16 8 16 -ta bfloat16 -tb bfloat16 -tc float32 -tacc float32 -m multiply_add -op TensorOp -b 64 128 64 -s 3 -w 2 2 1 -cc 80 -la ColumnMajor -aa 8 -lb ColumnMajor -ab 8 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -epv ColumnReduction -sw IdentitySwizzle2 -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Gemm -k 1 -``` -Example 5: -```python -python gemm.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add_fast_bf16 -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la RowMajor -aa 4 -lb ColumnMajor -ab 4 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -epv RowReduction -sw BatchedIdentitySwizzle -p 
512 256 128 -alpha 1.0 -beta 0.5 -gm Batched -k 1 -batch 3 -``` -Example 6: -```python -python gemm.py -i 16 8 8 -ta float32 -tb float32 -tc float32 -tacc float32 -m multiply_add_fast_bf16 -op TensorOp -b 128 128 32 -s 3 -w 2 2 1 -cc 80 -la RowMajor -aa 4 -lb ColumnMajor -ab 4 -lc RowMajor -ac 4 -te float32 -ep LinearCombination -epv ColumnBroadcast -sw BatchedIdentitySwizzle -p 512 256 128 -alpha 1.0 -beta 0.5 -gm Array -k 1 -batch 3 -``` diff --git a/examples/40_cutlass_py/customizable/conv2d.py b/examples/40_cutlass_py/customizable/conv2d.py index 6fb2494473..01e4133e7c 100644 --- a/examples/40_cutlass_py/customizable/conv2d.py +++ b/examples/40_cutlass_py/customizable/conv2d.py @@ -29,13 +29,18 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ + +import sys +print("This example is deprecated. Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + import numpy as np import cutlass.backend as pycutlass from cutlass.backend import * from cutlass.backend.utils.device import device_cc from cutlass.backend.conv2d_operation import * from cutlass.backend.utils.reference_model import Conv2dReferenceModule -import sys import torch.nn.functional as F import argparse diff --git a/examples/40_cutlass_py/customizable/gemm.py b/examples/40_cutlass_py/customizable/gemm.py index 745f6aac2b..d98ffe884e 100644 --- a/examples/40_cutlass_py/customizable/gemm.py +++ b/examples/40_cutlass_py/customizable/gemm.py @@ -29,13 +29,18 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ + +import sys +print("This example is deprecated. 
Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + import numpy as np import cutlass.backend as pycutlass from cutlass.backend import * from cutlass.backend.utils.device import device_cc import cutlass_bindings from bfloat16 import bfloat16 -import sys import argparse @@ -100,8 +105,6 @@ parser.add_argument("-ep", "--epilogue_functor", default="LinearCombination", type=str, choices=['LinearCombination', 'FastLinearCombinationClamp', 'LinearCombinationClamp'], help="This option describes the epilogue part of the kernel") -parser.add_argument("-epv", "--epilogue_visitor", default=None, - type=str, choices=['RowReduction', 'ColumnReduction', 'RowBroadcast', 'ColumnBroadcast'], help="epilogue visitor for more complex epilogues") # swizzling parser.add_argument("-sw", "--swizzling_functor", default="IdentitySwizzle1", type=str, choices=[ "IdentitySwizzle1", "IdentitySwizzle2", "IdentitySwizzle4", "IdentitySwizzle8", "HorizontalSwizzle", "BatchedIdentitySwizzle"], @@ -193,71 +196,10 @@ swizzling_functor = getattr(cutlass_bindings, args.swizzling_functor) -visitor = args.epilogue_visitor is not None - -if args.epilogue_visitor == "ColumnReduction": - class ColumnReduction_(EpilogueVisitTree): - def __call__( - self, accum: 'tensor', c: 'tensor', - alpha: 'scalar', beta: 'scalar'): - # - D = alpha * accum + beta * c - reduction = reduction_op(D, "column", "Add", args.threadblock_shape[0]) - return D, reduction - epilogue_functor = ColumnReduction_( - epilogue_functor, tile_description, math_inst.element_accumulator, - C.alignment, element_epilogue, C.element) - epilogue_functor.initialize() -elif args.epilogue_visitor == "RowReduction": - class RowReduction_(EpilogueVisitTree): - def __call__( - self, accum: 'tensor', c: 'tensor', - alpha: 'scalar', beta: 'scalar'): - # - D = alpha * accum + tanh.numpy(beta * c) - reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1]) - return D, reduction - epilogue_functor = RowReduction_( - epilogue_functor, tile_description, math_inst.element_accumulator, - C.alignment, element_epilogue, C.element) - epilogue_functor.initialize() - -elif args.epilogue_visitor == "RowBroadcast": - class RowBroadcast_(EpilogueVisitTree): - def __call__( - self, accum: 'tensor', c: 'tensor', - vector: 'row', alpha: 'scalar', beta: 'scalar'): - # - T = accum + vector - scale_T = alpha * T - Z = relu.numpy(scale_T + beta * c) - return Z, T - epilogue_functor = RowBroadcast_( - epilogue_functor, tile_description, math_inst.element_accumulator, - C.alignment, element_epilogue, C.element) - epilogue_functor.initialize() -elif args.epilogue_visitor == "ColumnBroadcast": - class ColumnBroadcast_(EpilogueVisitTree): - def __call__( - self, accum: 'tensor', c: 'tensor', - vector: 'column', alpha: 'scalar', beta: 'scalar'): - # - T = accum + vector - scale_T = leaky_relu.numpy(alpha * T, 0.2) - Z = scale_T + beta * c - return Z, T - epilogue_functor = ColumnBroadcast_( - epilogue_functor, tile_description, math_inst.element_accumulator, - C.alignment, element_epilogue, C.element) - epilogue_functor.initialize() -else: - epilogue_functor = epilogue_functor - operation = GemmOperationUniversal( arch=args.compute_capability, tile_description=tile_description, A=A, B=B, C=C, - epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor, - visitor=visitor + epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor ) if args.print_cuda: @@ -347,38 +289,7 @@ def __call__( shape=(args.batch * problem_size.m() * 
problem_size.n(),) ).astype(getattr(np, args.element_c)) -if args.epilogue_visitor == "RowReduction": - cta_n = args.threadblock_shape[1] - num_cta_n = (problem_size.n() + cta_n - 1) // cta_n - reduction = np.zeros(shape=(args.batch * problem_size.m() * num_cta_n,), dtype=getattr(np, args.element_c)) - output_op = operation.epilogue_type( - D=tensor_D, alpha=args.alpha, beta=args.beta, c=tensor_C, reduction=reduction, problem_size=[problem_size.m(), problem_size.n()] - ) -elif args.epilogue_visitor == "ColumnReduction": - cta_m = args.threadblock_shape[0] - num_cta_m = (problem_size.m() + cta_m - 1) // cta_m - reduction = np.zeros(shape=(args.batch * problem_size.n() * num_cta_m,), dtype=getattr(np, args.element_c)) - output_op = operation.epilogue_type( - D=tensor_D, alpha=args.alpha, beta=args.beta, c=tensor_C, reduction=reduction, problem_size=[problem_size.m(), problem_size.n()] - ) -elif args.epilogue_visitor == "RowBroadcast": - vector = np.ceil( - np.random.uniform(low=-8.5, high=7.5, size=(args.batch, 1, problem_size.n())) - ).astype(getattr(np, args.element_c)) - tensor_t = np.empty_like(tensor_D) - output_op = operation.epilogue_type( - c=tensor_C, vector=vector, alpha=args.alpha, beta=args.beta, Z=tensor_D, T=tensor_t, problem_size=[problem_size.m(), problem_size.n()] - ) -elif args.epilogue_visitor == "ColumnBroadcast": - vector = np.ceil( - np.random.uniform(low=-8.5, high=7.5, size=(args.batch, problem_size.m(), 1)) - ).astype(getattr(np, args.element_c)) - tensor_t = np.empty_like(tensor_D) - output_op = operation.epilogue_type( - c=tensor_C, vector=vector, alpha=args.alpha, beta=args.beta, Z=tensor_D, T=tensor_t, problem_size=[problem_size.m(), problem_size.n()] - ) -else: - output_op = operation.epilogue_type(*([args.alpha, args.beta] + args.activation_args)) +output_op = operation.epilogue_type(*([args.alpha, args.beta] + args.activation_args)) arguments = GemmArguments( operation=operation, problem_size=problem_size, @@ -411,38 +322,8 @@ def __call__( tensor_D_ref = reference.run( tensor_A, tensor_B, tensor_C, problem_size, args.alpha, args.beta, args.bias, args.batch) -if args.epilogue_visitor in ["RowBroadcast", "ColumnBroadcast"]: - tensor_D_ref = (tensor_D_ref.reshape((args.batch, problem_size.m(), problem_size.n())) + vector).flatten() tensor_D_ref = getattr(pycutlass, args.activation_function).numpy(*([tensor_D_ref,] + args.activation_args)) -if args.epilogue_visitor in ["RowReduction", "ColumnReduction"]: - output_op.sync() - accum_ref = reference.run( - tensor_A, tensor_B, tensor_C, problem_size, 1.0, 0.0, args.bias, args.batch) - tensor_D_ref, reduction_ref = epilogue_functor( - accum_ref.reshape((args.batch, problem_size.m(), problem_size.n())), - tensor_C.reshape((args.batch, problem_size.m(), problem_size.n())), - args.alpha, args.beta - ) - tensor_D_ref = tensor_D_ref.flatten() - reduction_ref = reduction_ref.flatten() - assert np.allclose(reduction_ref, reduction, atol=1e-2) - -elif args.epilogue_visitor in ["RowBroadcast", "ColumnBroadcast"]: - output_op.sync() - accum_ref = reference.run( - tensor_A, tensor_B, tensor_C, problem_size, 1.0, 0.0, args.bias, args.batch) - - tensor_D_ref, tensor_T_ref = epilogue_functor( - accum_ref.reshape((args.batch, problem_size.m(), problem_size.n())), - tensor_C.reshape((args.batch, problem_size.m(), problem_size.n())), - vector, args.alpha, args.beta) - - tensor_D_ref = tensor_D_ref.flatten() - tensor_T_ref = tensor_T_ref.flatten() - - assert np.array_equal(tensor_t, tensor_T_ref) - try: assert np.array_equal(tensor_D, 
tensor_D_ref) except: diff --git a/examples/40_cutlass_py/customizable/gemm_grouped.py b/examples/40_cutlass_py/customizable/gemm_grouped.py index 0cecb328d0..06638b5fed 100644 --- a/examples/40_cutlass_py/customizable/gemm_grouped.py +++ b/examples/40_cutlass_py/customizable/gemm_grouped.py @@ -29,12 +29,17 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ + +import sys +print("This example is deprecated. Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + import numpy as np import cutlass.backend as pycutlass from cutlass.backend import * from cutlass.backend.utils.device import device_cc import csv -import sys import argparse diff --git a/examples/40_cutlass_py/gemm.py b/examples/40_cutlass_py/gemm.py index 17b5d389bc..88fbd79b22 100644 --- a/examples/40_cutlass_py/gemm.py +++ b/examples/40_cutlass_py/gemm.py @@ -33,9 +33,13 @@ Basic example of using the CUTLASS Python interface to run a GEMM """ +import sys +print("This example is deprecated. Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + import argparse import numpy as np -import sys import cutlass_bindings import cutlass.backend as pycutlass diff --git a/examples/40_cutlass_py/gemm_grouped.py b/examples/40_cutlass_py/gemm_grouped.py index 16e25d0c9c..e461ba9db5 100644 --- a/examples/40_cutlass_py/gemm_grouped.py +++ b/examples/40_cutlass_py/gemm_grouped.py @@ -33,9 +33,13 @@ Basic example of using the CUTLASS Python interface to run a grouped GEMM """ +import sys +print("This example is deprecated. Please see examples/python for examples of using " + "the CUTLASS Python interface.") +sys.exit(0) + import argparse import numpy as np -import sys import cutlass_bindings import cutlass.backend as pycutlass diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py index d47b886363..c6df88cce4 100644 --- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py @@ -434,14 +434,6 @@ def gen_func_run(self): " if (result != cudaSuccess) {\n" + \ " return Status::kErrorInternal;\n" + \ " }\n" + \ - "\n" + \ - " result = cudaFuncSetAttribute(\n" + \ - " Kernel,\n" + \ - " cudaFuncAttributePreferredSharedMemoryCarveout, 100);\n" + \ - "\n" + \ - " if (result != cudaSuccess) {\n" + \ - " return Status::kErrorInternal;\n" + \ - " }\n" + \ " }\n" + \ " cutlass::Kernel<<>>(params_);\n" + \ " result = cudaGetLastError();\n" + \ diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu index db2eff51f3..4e7751f740 100644 --- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu +++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu @@ -83,6 +83,10 @@ #include "cutlass/util/reference/host/tensor_fill.h" #include "cutlass/util/tensor_view_io.h" +#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" +#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + #include "helper.h" @@ -120,6 +124,7 @@ using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; // Threadb using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; // Warp-level 
tile size (concept: GemmShape) using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // Instruction-level tile size (concept: GemmShape) constexpr int NumStages = 4; // Number of global->shared pipeline stages used in the GEMM mainloop +constexpr int EVTEpilogueStages = 1; // Number of epilogue stages in EVT // Residual block configuration @@ -166,23 +171,93 @@ using DeviceGemmBasic = cutlass::gemm::device::GemmUniversalWithBroadcast< AlignmentA, AlignmentB>; -// StreamK device GEMM implementation type -using DeviceGemmStreamK = cutlass::gemm::device::GemmUniversalStreamkWithBroadcast< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, +// StreamK device GEMM implementation type with EVT +using namespace cute; + +using OutputTileThreadMap = cutlass::epilogue::threadblock::OutputTileThreadLayout< + ThreadblockShape, + WarpShape, + ElementC, + AlignmentC, + EVTEpilogueStages +>; + +using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; + +using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, ElementC, + cute::Stride<_0, _1, int32_t> // StrideMNL +>; + +using C1 = cutlass::epilogue::threadblock::VisitorAuxLoad< + OutputTileThreadMap, ElementC, + cute::Stride // StrideMNL +>; + +using C2 = cutlass::epilogue::threadblock::VisitorAuxLoad< + OutputTileThreadMap, ElementC, + cute::Stride // StrideMNL +>; + +using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::plus, ElementCompute, ElementCompute, + cutlass::FloatRoundStyle::round_to_nearest +>; + +using EVTCompute0 = cutlass::epilogue::threadblock::Sm80EVT< + Compute0, + Accum, + Bias>; + +using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::plus, ElementCompute, ElementCompute, + cutlass::FloatRoundStyle::round_to_nearest +>; + +using EVTCompute1 = cutlass::epilogue::threadblock::Sm80EVT< + Compute1, + EVTCompute0, + C1>; + +using Compute2 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::plus, ElementOutput, ElementCompute, + cutlass::FloatRoundStyle::round_to_nearest +>; + +using EVTCompute2 = cutlass::epilogue::threadblock::Sm80EVT< + Compute2, + EVTCompute1, + C2>; + +using D = cutlass::epilogue::threadblock::VisitorAuxStore< + OutputTileThreadMap, ElementOutput, cutlass::FloatRoundStyle::round_to_nearest, + cute::Stride // StrideMNL +>; + +using EVTD = cutlass::epilogue::threadblock::Sm80EVT< + D, + EVTCompute2>; + +using EVTKernelStreamK = + typename cutlass::gemm::kernel::DefaultGemmWithVisitor< + ElementA, LayoutA, cutlass::ComplexTransform::kNone, AlignmentA, + ElementB, LayoutB, cutlass::ComplexTransform::kNone, AlignmentB, + ElementC, LayoutC, AlignmentC, ElementAccumulator, - OperatorClass, - ArchTag, + ElementCompute, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, - EpilogueOp, + EVTD, cutlass::gemm::threadblock::ThreadblockSwizzleStreamK, NumStages, - AlignmentA, - AlignmentB>; + cutlass::arch::OpMultiplyAdd, + EVTEpilogueStages +>::GemmKernel; +using DeviceGemmStreamK = cutlass::gemm::device::GemmUniversalAdapter; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Testbed utility types @@ -360,36 +435,41 @@ typename DeviceGemmStreamK::Arguments args_from_options( cutlass::HostTensor &tensor_Vector/*, cutlass::HostTensor &tensor_Tensor*/ ) -{ +{ + typename EVTD::Arguments callback_args{ + { + { + { + {}, // Accum + {tensor_Vector.device_data(), ElementC(0), {_0{}, _1{}, int32_t(options.problem_size.n())}}, // 
Bias + {} // Compute0 + }, // EVTCompute0 + {tensor_c1.device_data(), ElementC(0), {options.problem_size.n(), _1{}, options.problem_size.mn().product()}}, // C1 + {} // Compute1 + }, // EVTCompute1 + {tensor_c2.device_data(), ElementC(0), {options.problem_size.n(), _1{}, options.problem_size.mn().product()}}, // C2 + {} // Compute2 + }, // EVTCompute2 + {tensor_d.device_data(), {options.problem_size.n(), _1{}, options.problem_size.mn().product()}}, // D + }; // EVTD + return typename DeviceGemmStreamK::Arguments( cutlass::gemm::GemmUniversalMode::kGemm, // universal mode options.problem_size, // problem_size options.split_k_factor, // batch count / splitk slices - { // epilogue parameters - ElementAccumulator(options.alpha), - ElementAccumulator(options.beta) - }, + callback_args, // argument of EVT callbacks tensor_a.device_data(), // ptr_A tensor_b.device_data(), // ptr_B - tensor_c1.device_data(), // ptr_C1 - tensor_c2.device_data(), // ptr_C2 - tensor_d.device_data(), // ptr_D - tensor_Vector.device_data(), // ptr_Vector - /* tensor_Tensor.device_data(), */nullptr,// ptr_Tensor // We're not storing Tensor + nullptr, // ptr_C (unused) + nullptr, // ptr_D (unused) options.problem_size.mk().product(), // batch_stride_A options.problem_size.nk().product(), // batch_stride_B - options.problem_size.mn().product(), // batch_stride_C1 - options.problem_size.mn().product(), // batch_stride_C2 - options.problem_size.mn().product(), // batch_stride_D - options.problem_size.mn().product(), // batch_stride_Vector - options.problem_size.mn().product(), // batch_stride_Tensor + 0, // batch_stride_C (unused) + 0, // batch_stride_D (unused) tensor_a.layout().stride(0), // stride_a tensor_b.layout().stride(0), // stride_b - tensor_c1.layout().stride(0), // stride_c1 - tensor_c2.layout().stride(0), // stride_c2 - tensor_d.layout().stride(0), // stride_d - /*tensor_Vector.layout().stride(0)*/0, // stride_Vector // Vector stride is always 0 - /*tensor_Tensor.layout().stride(0)*/0, // stride_Tensor // We're not storing Tensor + 0, // stride_c (unused) + 0, // stride_d (unused) options.avail_sms); // avail_sms } diff --git a/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu index 110c6e44b1..c99afc05e6 100644 --- a/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu +++ b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu @@ -526,7 +526,8 @@ struct ExampleRunner // Forward calls via lambda to avoid specifying template arguments auto gather_call = [](auto&&... args){ gather(static_cast(args)...); }; - auto scatter_call = [](auto&&... args){ scatter(static_cast(args)...); }; + // MSVC doesn't count use inside a false "if constexpr" branch. + [[maybe_unused]] auto scatter_call = [](auto&&... 
args){ scatter(static_cast(args)...); }; if constexpr (DoGatherA) { run_gather(gather_call, tensor_a, tensor_a_gathered, arguments.gather_A, problem_size.batch(), stride_A); diff --git a/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp b/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp index 458cb19554..579122210a 100644 --- a/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp +++ b/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp @@ -58,7 +58,7 @@ class GemmGather // Type Aliases // using ProblemShape = ProblemShape_; - using TileScheduleTag = TileScheduler_; + using TileSchedulerTag = TileScheduler_; using TileScheduler = TileScheduler_; static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4, "ProblemShape{} should be or "); diff --git a/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu index f6291c6e7f..080d703454 100644 --- a/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu +++ b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu @@ -161,7 +161,7 @@ using Gemm = cutlass::gemm::device::GemmUniversalAdapter; using EpilogueOutputOp = typename Gemm::EpilogueOutputOp; using ElementScalar = typename EpilogueOutputOp::ElementScalar; using ElementAmax = typename EpilogueOutputOp::ElementAmax; -using ActivationFunctor = typename EpilogueOutputOp::ActivationFn; +using ActivationFunctor = typename EpilogueOutputOp::ActivationFn; using StrideA = typename Gemm::GemmKernel::StrideA; using StrideB = typename Gemm::GemmKernel::StrideB; diff --git a/examples/python/00_basic_gemm.ipynb b/examples/python/00_basic_gemm.ipynb index 65c1107fe6..6c8222e0de 100644 --- a/examples/python/00_basic_gemm.ipynb +++ b/examples/python/00_basic_gemm.ipynb @@ -7,9 +7,7 @@ "metadata": {}, "source": [ "# Basic example of using the CUTLASS Python interface\n", - "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs.\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/00_basic_gemm.ipynb)\n" + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs.\n" ] }, { diff --git a/examples/python/01_epilogue.ipynb b/examples/python/01_epilogue.ipynb index f7abddd886..13acbffdac 100644 --- a/examples/python/01_epilogue.ipynb +++ b/examples/python/01_epilogue.ipynb @@ -7,9 +7,7 @@ "metadata": {}, "source": [ "# Example of using elementwise activation functions in the CUTLASS Python interface\n", - "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues.\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/00_basic_gemm.ipynb)" + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues.\n" ] }, { diff --git a/examples/python/02_pytorch_extension_grouped_gemm.ipynb b/examples/python/02_pytorch_extension_grouped_gemm.ipynb index b0cdb0edfd..ecd7828044 100644 --- a/examples/python/02_pytorch_extension_grouped_gemm.ipynb +++ b/examples/python/02_pytorch_extension_grouped_gemm.ipynb @@ 
-10,8 +10,6 @@ "This notebook walks through a basic example of using the CUTLASS Python interface to declare\n", "a grouped GEMM kernel and export it as a PyTorch CUDA extension. Note that GEMM and Conv2d can also be exported as PyTorch CUDA extensions. \n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/00_basic_gemm.ipynb)\n", - "\n", "## Background on grouped GEMM\n", "Grouped GEMM enables one to execute a set of GEMMs (each with potentially different sizes and strides)\n", "in a single CUDA kernel. It can be thought of as a generalized version of a pointer-array GEMM,\n", diff --git a/examples/python/04_epilogue_visitor.ipynb b/examples/python/04_epilogue_visitor.ipynb new file mode 100644 index 0000000000..72547d1999 --- /dev/null +++ b/examples/python/04_epilogue_visitor.ipynb @@ -0,0 +1,221 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "5d24a692", + "metadata": {}, + "source": [ + "# Example of using epilogue visitor in the CUTLASS Python interface\n", + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues through CUTLASS Epilogue Visitor." + ] + }, + { + "cell_type": "markdown", + "id": "3ca993fe", + "metadata": {}, + "source": [ + "We first import various packages needed for the example, construct the input and output tensors that will be used in our example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a70a3c", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import cutlass\n", + "from cutlass.epilogue import relu\n", + "from cutlass import Tensor as FakeTensor\n", + "from cutlass.profiler import CUDAEventProfiler\n", + "\n", + "# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n", + "# omit this information.\n", + "print_module = True\n", + "\n", + "# The Epilogue Visitor feature currently only works for SM80 and 90\n", + "from cutlass.backend.utils.device import device_cc\n", + "if device_cc() not in [80, 90]:\n", + " import sys\n", + " sys.exit()\n", + "\n", + "m = 16384\n", + "n = m\n", + "k = 512\n", + "\n", + "type_A = torch.float16\n", + "type_B = torch.float16\n", + "type_C = torch.float16\n", + "type_D = torch.float16\n", + "\n", + "torch.manual_seed(2023)\n", + "scope_min = -4\n", + "scope_max = 4\n", + "tensor_A = torch.ceil(torch.empty(size=(m, k), dtype=type_A, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "tensor_B = torch.ceil(torch.empty(size=(k, n), dtype=type_B, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "tensor_C = torch.ceil(torch.empty(size=(m, n), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "tensor_D = torch.zeros_like(tensor_C)\n", + "\n", + "plan = cutlass.op.Gemm(element=torch.float16, layout=cutlass.LayoutType.RowMajor, element_accumulator=torch.float32)" + ] + }, + { + "cell_type": "markdown", + "id": "1eb0d95b", + "metadata": {}, + "source": [ + "## Define the epilogue visitor functor\n", + "The epilogue functor can be defined as a simple Python function and a set of example tensors for inputs and outputs. The example below illustrates a complex epilogue under the directed acyclic graph structure (`F` is used twice). The epilogue takes source tensors in different ranks: `alpha`, `beta` are scalars, `bias` is a column vector to broadcast, and `C`, `aux` are matrices. 
It contains various math operations from basic arithmetic operations and built-in callable functions like `relu`. It also accommodates multiple outputs `D` and `F`. Note that there are some restrictions on syntax.\n", + "* Each named variable must be assigned exactly once and defined before it is used.\n", + "* Reserved names: `accum`, `C`, and `D` are reserved for accumulator, tensor_C, and tensor_D.\n", + "* Return values must be a named variable.\n", + "\n", + "The example tensors are provided as a dictionary with tensor names as keys and reference tensors as values. The reference tensors can be `float`, `torch.Tensor`, `numpy.ndarray`, or our `FakeTensor`. They provide the shape and data type information of the inputs and outputs of the epilogue.\n", + "\n", + "The epilogue can be generated simply through `cutlass.evt.trace(<epilogue function>, <example tensors>)`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d257833", + "metadata": {}, + "outputs": [], + "source": [ + "# Define epilogue visitor\n", + "def example_epilogue(accum, alpha, C, beta, aux, bias):\n", + " F = alpha * accum + (beta * C + aux)\n", + " E = relu(F + 1) + bias\n", + " D = E + F\n", + " return D, F\n", + "\n", + "# Construct inputs and outputs\n", + "alpha = 0.5\n", + "beta = 0.5\n", + "aux = torch.ceil(torch.empty(size=(m, n), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "bias = torch.ceil(torch.empty(size=(m, 1), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n", + "tensor_F = torch.zeros_like(tensor_D)\n", + "examples_tensors = {\n", + " \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass.LayoutType.RowMajor),\n", + " \"alpha\": alpha,\n", + " \"C\": tensor_C,\n", + " \"beta\": beta,\n", + " \"aux\": aux,\n", + " \"bias\": bias,\n", + " \"D\": tensor_D,\n", + " \"F\": tensor_F\n", + "}\n", + "\n", + "# Trace the epilogue visitor\n", + "epilogue_visitor = cutlass.epilogue.trace(example_epilogue, examples_tensors)" + ] + }, + { + "cell_type": "markdown", + "id": "54961694", + "metadata": {}, + "source": [ + "## Run a GEMM with the epilogue visitor functor\n", + "The `epilogue_visitor` can be used by setting the plan's `epilogue_visitor` field. The arguments for the epilogue visitor are provided as a `dict` through the `visitor_args` keyword argument." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fe49443", + "metadata": {}, + "outputs": [], + "source": [ + "visitor_args = {\n", + " \"alpha\": alpha, \"C\": tensor_C, \"beta\": beta, \n", + " \"aux\": aux, \"bias\": bias, \"D\": tensor_D, \"F\": tensor_F\n", + "}\n", + "\n", + "plan.epilogue_visitor = epilogue_visitor\n", + "plan.run(\n", + " tensor_A, tensor_B, tensor_C, tensor_D, \n", + " visitor_args=visitor_args, print_module=print_module)" + ] + }, + { + "cell_type": "markdown", + "id": "455d0a37", + "metadata": {}, + "source": [ + "The epilogue function `example_epilogue` can be used as a reference function. 
We can now verify the results simply with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e32e7798", + "metadata": {}, + "outputs": [], + "source": [ + "class TorchReference(torch.nn.Module):\n", + " def forward(self, A, B, alpha, C, beta, aux, bias):\n", + " accum = torch.matmul(A, B)\n", + " return example_epilogue(accum, alpha, C, beta, aux, bias)\n", + "\n", + "torch_reference = TorchReference()\n", + "if hasattr(torch, \"compile\"):\n", + " # If the torch.compile feature is available\n", + " torch_reference = torch.compile(torch_reference)\n", + "\n", + "tensor_D_ref, tensor_F_ref = torch_reference(tensor_A, tensor_B, alpha, tensor_C, beta, aux, bias)\n", + "\n", + "assert torch.equal(tensor_D, tensor_D_ref)\n", + "assert torch.equal(tensor_F, tensor_F_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "b69e441f", + "metadata": {}, + "source": [ + "The performance of CUTLASS fused kernel can be profiled with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8db92150", + "metadata": {}, + "outputs": [], + "source": [ + "warmup_iterations = 10\n", + "profile_iterations = 50\n", + "# Profile CUTLASS fused kernel\n", + "duration = CUDAEventProfiler(\n", + " plan, warmup_iterations, profile_iterations,\n", + " tensor_A, tensor_B, tensor_C, tensor_D, \n", + " visitor_args=visitor_args)()\n", + "\n", + "print(f\"CUTLASS duration: {duration:.2f} ms\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/python/README.md b/examples/python/README.md index 2ed80e1939..590f2e24e4 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -16,3 +16,7 @@ * [03_basic_conv2d](/examples/python/03_basic_conv2d.ipynb) Shows how to declare, configure, compile, and run a CUTLASS Conv2d using the Python interface + +* [04_epilogue_visitor](/examples/python/04_epilogue_visitor.ipynb) + + Shows how to fuse elementwise activation functions to GEMMs via the Python Epilogue Visitor interface diff --git a/include/cute/algorithm/axpby.hpp b/include/cute/algorithm/axpby.hpp index a613417d39..a01fb1df14 100644 --- a/include/cute/algorithm/axpby.hpp +++ b/include/cute/algorithm/axpby.hpp @@ -68,7 +68,14 @@ axpby(Alpha const& alpha, Beta const& beta, Tensor & y) { - auto isBetaZero = (beta == Int<0>{}); + auto isBetaZero = [&] () { + if constexpr (is_complex::value) { + return beta.real() == Int<0>{} && beta.imag() == Int<0>{}; + } + else { + return beta == Int<0>{}; + } + } (); CUTE_UNROLL for (int i = 0; i < size(x); ++i) { diff --git a/include/cute/algorithm/gemm.hpp b/include/cute/algorithm/gemm.hpp index 44a0f7d487..4a8e6fdd17 100644 --- a/include/cute/algorithm/gemm.hpp +++ b/include/cute/algorithm/gemm.hpp @@ -218,7 +218,6 @@ gemm(MMA_Atom const& mma, CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C)); // AM == CM CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C)); // BN == CN CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D)); - gemm(mma, D, // (M,N) make_tensor(A.data(), append<2>(A.layout())), // (M,1) @@ -253,7 +252,7 @@ gemm(MMA_Atom const& mma, CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutC_TV{}) == Int<1>{}); 
CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutA_TV{}) == Int<1>{}); CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutB_TV{}) == Int<1>{}); - + gemm(mma, make_tensor(D.data(), prepend<3>(D.layout())), // (1,M,N) make_tensor(A.data(), prepend<3>(A.layout())), // (1,M,K) @@ -282,7 +281,6 @@ gemm(MMA_Atom const& mma, CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C)); // AM == CM CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C)); // BN == CN CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D)); - auto M = size<1>(A); auto N = size<1>(B); // REGISTER .reuse OPTIMIZATIONS @@ -409,7 +407,6 @@ gemm(MMA_Atom const& mma, CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C)); // BN == CN CUTE_STATIC_ASSERT_V(size<2>(A) == size<2>(B)); // AK == BK CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D)); - auto K = size<2>(A); CUTE_UNROLL @@ -454,7 +451,6 @@ gemm(MMA_Atom const& mma, CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutC_TV{}) == Int<1>{}); CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutA_TV{}) == Int<1>{}); CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom::LayoutB_TV{}) == Int<1>{}); - gemm(mma, make_tensor(D.data(), prepend<3>(D.layout())), // (1,M,N) make_tensor(A.data(), prepend<3>(A.layout())), // (1,M,K) diff --git a/include/cute/algorithm/tuple_algorithms.hpp b/include/cute/algorithm/tuple_algorithms.hpp index d9ae200338..4eeebf8b7a 100644 --- a/include/cute/algorithm/tuple_algorithms.hpp +++ b/include/cute/algorithm/tuple_algorithms.hpp @@ -140,7 +140,11 @@ CUTE_HOST_DEVICE constexpr auto transform_apply(T&& t, F&& f, G&& g) { - return detail::tapply(static_cast(t), f, g, tuple_seq{}); + if constexpr (is_tuple>::value) { + return detail::tapply(static_cast(t), f, g, tuple_seq{}); + } else { + return g(f(static_cast(t))); + } } template @@ -148,7 +152,11 @@ CUTE_HOST_DEVICE constexpr auto transform_apply(T0&& t0, T1&& t1, F&& f, G&& g) { - return detail::tapply(static_cast(t0), static_cast(t1), f, g, tuple_seq{}); + if constexpr (is_tuple>::value) { + return detail::tapply(static_cast(t0), static_cast(t1), f, g, tuple_seq{}); + } else { + return g(f(static_cast(t0), static_cast(t1))); + } } template @@ -156,7 +164,11 @@ CUTE_HOST_DEVICE constexpr auto transform_apply(T0&& t0, T1&& t1, T2&& t2, F&& f, G&& g) { - return detail::tapply(static_cast(t0), static_cast(t1), static_cast(t2), f, g, tuple_seq{}); + if constexpr (is_tuple>::value) { + return detail::tapply(static_cast(t0), static_cast(t1), static_cast(t2), f, g, tuple_seq{}); + } else { + return g(f(static_cast(t0), static_cast(t1), static_cast(t2))); + } } // @@ -306,21 +318,16 @@ transform_leaf(T0 const& t0, T1 const& t1, F&& f) namespace detail { -template -CUTE_HOST_DEVICE constexpr -auto -find_if(T const& t, F&& f, seq<>) -{ - return cute::integral_constant::value>{}; -} - template CUTE_HOST_DEVICE constexpr auto find_if(T const& t, F&& f, seq) { if constexpr (decltype(f(get(t)))::value) { - return cute::integral_constant{}; + return cute::C{}; + } else + if constexpr (sizeof...(Is) == 0) { + return cute::C{}; } else { return find_if(t, f, seq{}); } @@ -338,7 +345,7 @@ find_if(T const& t, F&& f) if constexpr (is_tuple::value) { return detail::find_if(t, f, tuple_seq{}); } else { - return cute::integral_constant{}; + return cute::C{}; } CUTE_GCC_UNREACHABLE; @@ -355,12 +362,12 @@ find(T const& t, X const& x) template CUTE_HOST_DEVICE constexpr auto -none_of(T const& t, F&& f) +any_of(T const& t, F&& f) { if constexpr 
(is_tuple::value) { - return cute::integral_constant::value>{}; + return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (false_type{} || ... || a); }, tuple_seq{}); } else { - return not f(t); + return f(t); } CUTE_GCC_UNREACHABLE; @@ -372,8 +379,7 @@ auto all_of(T const& t, F&& f) { if constexpr (is_tuple::value) { - auto not_f = [&](auto const& a) { return not f(a); }; - return cute::integral_constant::value>{}; + return detail::apply(t, [&] (auto const&... a) { return (true_type{} && ... && f(a)); }, tuple_seq{}); } else { return f(t); } @@ -384,9 +390,9 @@ all_of(T const& t, F&& f) template CUTE_HOST_DEVICE constexpr auto -any_of(T const& t, F&& f) +none_of(T const& t, F&& f) { - return not none_of(t, f); + return not any_of(t, f); } // @@ -410,6 +416,14 @@ filter_tuple(T0 const& t0, T1 const& t1, F&& f) return transform_apply(t0, t1, f, [](auto const&... a) { return cute::tuple_cat(a...); }); } +template +CUTE_HOST_DEVICE constexpr +auto +filter_tuple(T0 const& t0, T1 const& t1, T2 const& t2, F&& f) +{ + return transform_apply(t0, t1, t2, f, [](auto const&... a) { return cute::tuple_cat(a...); }); +} + // // Fold (Reduce, Accumulate) // (t, v, f) => f(...f(f(v,t_0),t_1),...,t_n) @@ -595,6 +609,13 @@ unwrap(T const& t) // // Flatten a hierarchical tuple to a tuple of depth one. // +// + +template +struct is_flat : true_type {}; + +template +struct is_flat> : bool_constant<(true && ... && (not is_tuple::value))> {}; template CUTE_HOST_DEVICE constexpr @@ -602,7 +623,12 @@ auto flatten_to_tuple(T const& t) { if constexpr (is_tuple::value) { - return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); }); + if constexpr (is_flat::value) { + return t; + } else + { + return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); }); + } } else { return cute::make_tuple(t); } @@ -616,7 +642,12 @@ auto flatten(T const& t) { if constexpr (is_tuple::value) { - return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); }); + if constexpr (is_flat::value) { + return t; + } else + { + return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); }); + } } else { return t; } diff --git a/include/cute/arch/copy_sm90_desc.hpp b/include/cute/arch/copy_sm90_desc.hpp index aaef8b4161..d33ed305be 100644 --- a/include/cute/arch/copy_sm90_desc.hpp +++ b/include/cute/arch/copy_sm90_desc.hpp @@ -177,7 +177,7 @@ to_CUtensorMapSwizzle(SmemSwizzleBits const& t) { #if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__) using TmaDescriptor = CUtensorMap; #else - using TmaDescriptor = struct { char bytes[128]; }; + using TmaDescriptor = struct alignas(64) { char bytes[128]; }; #endif //////////////////////////////////////////////////////////////////////////////////////////////////// /// Initiates a TensorMap Prefetch diff --git a/include/cute/arch/mma_sm80.hpp b/include/cute/arch/mma_sm80.hpp index 6050500a47..8dc5fdcb2c 100644 --- a/include/cute/arch/mma_sm80.hpp +++ b/include/cute/arch/mma_sm80.hpp @@ -37,8 +37,19 @@ // Config #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) # define CUTE_ARCH_MMA_SM80_ENABLED + +#if (__CUDA_ARCH__ <= 900) +#define CUTE_ARCH_MMA_B1_AND_SM80_ENABLED +#endif + +#if (__CUDA_ARCH__ <= 890) +#define CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED +#endif + #endif + + namespace cute { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -2044,7 +2055,7 @@ struct SM80_8x8x128_S32U1U1S32_TN_XORPOPC uint32_t const& b0, uint32_t const& c0, uint32_t const& c1) { -#if 
defined(CUTE_ARCH_MMA_SM80_ENABLED) +#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED) asm volatile( "mma.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32.xor.popc " "{%0, %1}," @@ -2077,7 +2088,7 @@ struct SM80_16x8x128_S32U1U1S32_TN_XORPOPC uint32_t const& b0, uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) { -#if defined(CUTE_ARCH_MMA_SM80_ENABLED) +#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED) asm volatile( "mma.sync.aligned.m16n8k128.row.col.s32.b1.b1.s32.xor.popc " "{%0, %1, %2, %3}," @@ -2110,7 +2121,7 @@ struct SM80_16x8x256_S32U1U1S32_TN_XORPOPC uint32_t const& b0, uint32_t const& b1, uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3) { -#if defined(CUTE_ARCH_MMA_SM80_ENABLED) +#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED) asm volatile( "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc " "{%0, %1, %2, %3}," diff --git a/include/cute/arch/mma_sm90.hpp b/include/cute/arch/mma_sm90.hpp index 25a98e6cb0..64561fa1f6 100644 --- a/include/cute/arch/mma_sm90.hpp +++ b/include/cute/arch/mma_sm90.hpp @@ -38,6 +38,7 @@ // Config #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) # define CUTE_ARCH_MMA_SM90_ENABLED +# define CUTE_ARCH_MMA_F64_SM90_ENABLED #endif //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -60,7 +61,7 @@ struct SM90_16x8x4_F64F64F64F64_TN double const& b0, double const& c0, double const& c1, double const& c2, double const& c3) { -#if defined(CUTE_ARCH_MMA_SM90_ENABLED) +#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED) asm volatile( "mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64" "{%0, %1, %2, %3}," @@ -93,7 +94,7 @@ struct SM90_16x8x8_F64F64F64F64_TN double const& b0, double const& b1, double const& c0, double const& c1, double const& c2, double const& c3) { -#if defined(CUTE_ARCH_MMA_SM90_ENABLED) +#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED) asm volatile( "mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64" "{%0, %1, %2, %3}," @@ -127,7 +128,7 @@ struct SM90_16x8x16_F64F64F64F64_TN double const& b0, double const& b1, double const& b2, double const& b3, double const& c0, double const& c1, double const& c2, double const& c3) { -#if defined(CUTE_ARCH_MMA_SM90_ENABLED) +#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED) asm volatile( "mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64" "{%0, %1, %2, %3}," diff --git a/include/cute/arch/mma_sm90_desc.hpp b/include/cute/arch/mma_sm90_desc.hpp index ae647eb9ed..4c99b9ef7c 100644 --- a/include/cute/arch/mma_sm90_desc.hpp +++ b/include/cute/arch/mma_sm90_desc.hpp @@ -86,22 +86,22 @@ CUTE_HOST std::ostream& operator<<(std::ostream& os, LayoutType const& t) { union GmmaDescriptor { - CUTE_HOST_DEVICE constexpr + CUTE_HOST_DEVICE constexpr GmmaDescriptor() noexcept : desc_(0) {} - CUTE_HOST_DEVICE constexpr + CUTE_HOST_DEVICE constexpr GmmaDescriptor(uint64_t desc) noexcept : desc_(desc) {} - CUTE_HOST_DEVICE constexpr + CUTE_HOST_DEVICE constexpr GmmaDescriptor(GmmaDescriptor const& t) noexcept : desc_(t.desc_) {} - CUTE_HOST_DEVICE constexpr + CUTE_HOST_DEVICE constexpr GmmaDescriptor(GmmaDescriptor && t) noexcept : desc_(t.desc_) {} - - CUTE_HOST_DEVICE constexpr + + CUTE_HOST_DEVICE constexpr GmmaDescriptor& operator=(GmmaDescriptor const& t) noexcept { desc_ = t.desc_; return *this; } - CUTE_HOST_DEVICE constexpr + CUTE_HOST_DEVICE constexpr GmmaDescriptor& operator=(GmmaDescriptor && t) noexcept { desc_ = t.desc_; return *this; diff --git a/include/cute/atom/copy_traits_sm90_tma.hpp 
b/include/cute/atom/copy_traits_sm90_tma.hpp index 9b91f87ef4..d2617abdd8 100644 --- a/include/cute/atom/copy_traits_sm90_tma.hpp +++ b/include/cute/atom/copy_traits_sm90_tma.hpp @@ -38,9 +38,17 @@ #include #include +#include + namespace cute { +template +struct AuxTmaParams { + using GmemStrides = GmemStrides_; + GmemStrides g_stride_; +}; + ////////////////////////////////////////////////////////////////////////////// ///////////////////////////// TMA_LOAD /////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// @@ -88,14 +96,14 @@ struct Copy_Traits { static_assert(is_smem::value, "Expected smem dst for SM90_TMA_LOAD"); - traits.copy_unpack_(raw_pointer_cast(dst.data()), src.data().coord_, tuple_seq{}); + traits.copy_unpack_(cute::raw_pointer_cast(dst.data()), src.data().coord_, tuple_seq{}); } }; // The non-executable SM90_TMA_LOAD with tma_desc and no tma_mbar // Use .with(tma_mbar) to construct an executable version -template -struct Copy_Traits +template +struct Copy_Traits { using ThrID = Layout<_1>; @@ -109,7 +117,8 @@ struct Copy_Traits // SM90_TMA_LOAD arguments TmaDescriptor tma_desc_; - GmemStrides g_stride_; + using AuxParams = AuxParams_; + AuxParams aux_params_; // Return TmaDescriptor/TensorMap CUTE_HOST_DEVICE constexpr @@ -133,8 +142,8 @@ struct Copy_Traits CUTE_HOST_DEVICE constexpr auto get_tma_tensor(GShape const& g_shape) const { - static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, g_stride_)); + static_assert(is_congruent::value); + return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM90_TMA_LOAD before calling .with() @@ -190,12 +199,12 @@ struct Copy_Traits { static_assert(is_smem::value, "Expected smem dst for SM90_TMA_LOAD_MULTICAST"); - traits.copy_unpack_(raw_pointer_cast(dst.data()), src.data().coord_, tuple_seq{}); + traits.copy_unpack_(cute::raw_pointer_cast(dst.data()), src.data().coord_, tuple_seq{}); } }; -template -struct Copy_Traits +template +struct Copy_Traits { using ThrID = Layout<_1>; @@ -209,7 +218,8 @@ struct Copy_Traits // SM90_TMA_LOAD_MULTICAST arguments TmaDescriptor tma_desc_; - GmemStrides g_stride_; + using AuxParams = AuxParams_; + AuxParams aux_params_; // Return TmaDescriptor/TensorMap CUTE_HOST_DEVICE constexpr @@ -230,8 +240,8 @@ struct Copy_Traits CUTE_HOST_DEVICE constexpr auto get_tma_tensor(GShape const& g_shape) const { - static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, g_stride_)); + static_assert(is_congruent::value); + return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM90_TMA_LOAD_MULTICAST before calling .with() @@ -248,8 +258,8 @@ struct Copy_Traits ////////////////////////////////////////////////////////////////////////////// // The executable SM90_TMA_STORE with tma_desc -template -struct Copy_Traits +template +struct Copy_Traits { using ThrID = Layout<_1>; @@ -263,7 +273,8 @@ struct Copy_Traits // SM90_TMA_STORE arguments TmaDescriptor tma_desc_; - GmemStrides g_stride_; + using AuxParams = AuxParams_; + AuxParams aux_params_; // Return TmaDescriptor/TensorMap CUTE_HOST_DEVICE constexpr @@ -277,8 +288,8 @@ struct Copy_Traits CUTE_HOST_DEVICE constexpr auto get_tma_tensor(GShape const& g_shape) const { - static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, g_stride_)); + static_assert(is_congruent::value); + return 
make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); } template @@ -305,7 +316,7 @@ struct Copy_Traits static_assert(is_smem::value, "Expected smem src for SM90_TMA_STORE"); //static_assert(is_gmem::value, "Expected gmem dst for SM90_TMA_STORE"); // TMA spoofed src tensor - traits.copy_unpack_(raw_pointer_cast(src.data()), dst.data().coord_, tuple_seq{}); + traits.copy_unpack_(cute::raw_pointer_cast(src.data()), dst.data().coord_, tuple_seq{}); } }; @@ -417,9 +428,78 @@ struct Copy_Traits namespace detail { -// Use a smem2gmode map to read through the GMEM tensor -// and construct a TMA Descriptor for the resulting instruction -template (OldLayout) +// s0:d0 _1:d1 => continue +// _1:d0 s1:d1 => replace_back s1:d1 +// s0:d0 s1:s0*d0 => replace_back s0*s1:d0 if s0*s1 <= 256 +// s0:d0 s1:d1 => append s1:d1 +// +// @pre OldShape and OldStride are flat +template +CUTE_HOST_DEVICE constexpr +auto +coalesce_256_impl(OldShape const& old_shape, OldStride const& old_stride, + NewShape const& new_shape, NewStride const& new_stride) +{ + if constexpr (I == rank_v) { + // Base case, we're done + if constexpr (is_constant<1, NewShape>::value) { + return Layout<_1,_0>{}; + } else { + return Layout{new_shape,new_stride}; + } + } else if constexpr (is_constant<1, decltype(get(old_shape))>::value) { + // shape(layout) == _1, skip it and continue + return coalesce_256_impl(old_shape, old_stride, new_shape, new_stride); + } else if constexpr (is_constant<1, NewShape>::value) { + // Replace our shape-1 with anything (Can only happen on input new_shape/new_stride) + return coalesce_256_impl(old_shape, old_stride, get(old_shape), get(old_stride)); + } else if constexpr (is_constant(old_stride) && + get(old_shape) * back(new_shape) <= Int<256>{})>::value) { + // Merge modes because the shapes and strides match and the merge is 256 or less + return coalesce_256_impl(old_shape, old_stride, + replace_back(new_shape, get(old_shape) * back(new_shape)), + new_stride); + } else { + // Can't replace or merge, so append a new mode + return coalesce_256_impl(old_shape, old_stride, + append(new_shape, get(old_shape)), + append(new_stride, get(old_stride))); + } + + CUTE_GCC_UNREACHABLE; +} + +// Combine all the modes that are possible to combine +// Does not respect the profile of the layout, but does preserve total size +template +CUTE_HOST_DEVICE constexpr +auto +coalesce_256(Layout const& layout) +{ + auto flat_shape = flatten(layout.shape()); + auto flat_stride = flatten(layout.stride()); + return coalesce_256_impl<1>(flat_shape, flat_stride, get<0>(flat_shape), get<0>(flat_stride)); +} + +template +CUTE_HOST_DEVICE constexpr +auto +coalesce_256(Tensor const& tensor) +{ + return make_tensor(tensor.data(), coalesce_256(tensor.layout())); +} + + +// Use a smem_inv_h to read through the GMEM tensor +// and construct a TMA Descriptor for the resulting instruction +// At the same time, construct the Tma Tensor's Stride to generate +// the TMA coordinates that the instruction consumes. +// +template CUTE_HOST_RTC @@ -428,63 +508,78 @@ make_tma_copy_desc(Tensor const& gtensor, // The original GM Layout const& smem_inv_h, // smem_idx to hier gmode Swizzle const& swizzle) // Swizzle fn on smem_idx { - using T = typename GEngine::value_type; - - // This is the gmem "vector" that corresponds to the smem vector in memory (smem_box_shape):(gmem_prob_stride) - Tensor tma_gstride = recast(gtensor.compose(smem_inv_h)); - - // If the sizes of smem_inv_h and tma_gstride don't match, then a non-trivial recast was performed. 
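// A minimal standalone sketch of the mode-merging rule that the new coalesce_256 helper above
// implements: adjacent (shape,stride) modes merge whenever the next stride equals shape*stride of
// the current mode and the merged extent stays within the 256-element TMA box limit. The function
// name coalesce_up_to_256 and the plain int representation are illustrative only, not CuTe APIs.
#include <cstdio>
#include <utility>
#include <vector>

// Merge flat (shape, stride) modes left-to-right under the <=256 constraint.
std::vector<std::pair<int,int>> coalesce_up_to_256(std::vector<std::pair<int,int>> const& modes) {
  std::vector<std::pair<int,int>> out;
  for (auto [s, d] : modes) {
    if (s == 1) continue;                                   // size-1 modes contribute nothing
    if (!out.empty() && d == out.back().first * out.back().second
                     && out.back().first * s <= 256) {
      out.back().first *= s;                                // merge: strides line up and extent fits in a box
    } else {
      out.push_back({s, d});                                // otherwise start a new mode
    }
  }
  if (out.empty()) out.push_back({1, 0});                   // degenerate case: everything was size-1
  return out;
}

int main() {
  // Example: (4,8,2):(1,4,64) in elements. The first two modes merge into 32:1 (4*8 <= 256);
  // the last mode cannot merge because its stride 64 != 32*1, so the result is (32,2):(1,64).
  auto result = coalesce_up_to_256({{4,1},{8,4},{2,64}});
  for (auto [s, d] : result) { std::printf("%d:%d ", s, d); }  // prints "32:1 2:64"
  std::printf("\n");
  return 0;
}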
- // In that case, require that the recasted modes all have size-1 so TMA can identity them and skip them. - for_each(zip(flatten(shape(smem_inv_h)), flatten(shape(tma_gstride))), [] (auto s_and_g) { - auto [s,g] = s_and_g; - CUTE_STATIC_ASSERT_V(s == g or g == Int<1>{}, - "A non-trivial recast was performed, but TMA cannot identify which modes to leave out."); - }); + // The smem vector is the same units as gtensor, so compose first and then recast + // tma_val_idx:gmem_strides + Tensor tile_gstride = recast(gtensor.compose(smem_inv_h)); + // Coalesce modes up to size-256 (the maximum TMA box extent in units of TmaInternalType) + // tma_box_shape:gmem_strides + Tensor tma_gstride = coalesce_256(tile_gstride); // Perform the tiling to the gmem vector again, but with indirections to the gtensor modes auto gbasis = make_identity_layout(shape(gtensor)); - auto tma_gbasis_tile_tmp = gbasis.compose(smem_inv_h); - // Instead of the recast (gbasis doesn't have type info), replace the shape with the already-recasted shape and coalesce out any size-1 modes - auto tma_gbasis_tile = coalesce(make_layout(shape(tma_gstride), stride(tma_gbasis_tile_tmp))); + auto tile_gbasis_tmp = gbasis.compose(smem_inv_h); + + // Instead of the recast (gbasis doesn't have type info), replace the shape with the already-recasted shape + // tma_box_shape:gmem_mode + auto tile_gbasis = make_layout(shape(tile_gstride), stride(tile_gbasis_tmp)); // Recast the original tensor for shape inspections - auto glayout_T = recast(gtensor).layout(); + auto gtensor_T = recast(gtensor); - // Find missing bases that don't belong to a size-1 mode of the recast input + // Find missing bases that don't appear in tile_gbasis // NOTE This is essentially ArithmeticTuple complement... - // NOTE in persuit of implementing an ArithmeticTuple logical_divide for smem_inv_h - auto tma_gbasis_full = fold(zip(flatten(shape(glayout_T)), flatten(stride(gbasis))), tma_gbasis_tile, - [](auto tma_g, auto s_and_d) { - auto [s,d] = s_and_d; - auto k = find(stride(tma_g), d); // Find the basis in tma_gstride - if constexpr (decltype(k != rank(tma_g) || is_constant<1, decltype(s)>{})::value) { - // If d was found or s is static-1, then don't append - return tma_g; + // NOTE in pursuit of implementing an ArithmeticTuple logical_divide for smem_inv_h + auto tile_gbasis_remaining_stride = filter_tuple(flatten(shape (gtensor_T)), flatten(stride(gtensor_T)), + flatten(stride(gbasis)), + [&](auto s, auto d, auto e) + { + if constexpr (is_constant<1, decltype(s)>::value || is_constant<0, decltype(d)>::value) { + return cute::tuple<>{}; // If size-1 or stride-0, then don't append } else { - // Else, append the missing basis - return append(tma_g, make_layout(Int<1>{}, d)); + using E = decltype(e); + auto has_e = any_of(stride(tile_gbasis), [] (auto tb) { return tb == E{}; }); + if constexpr (decltype(has_e)::value) { + return cute::tuple<>{}; // If d was found, then don't append + } else { + return cute::tuple(e); // Else, this is missing so append + } } }); + auto tile_gbasis_remaining_rank = rank(tile_gbasis_remaining_stride); + + // "Coalesce" the tile basis into a compatible shape with the tma + auto tma_gbasis_tile = tile_gbasis.compose(make_layout(wrap(shape(tma_gstride)))); - // Group the trailing modes to make this max rank-5 + // Append the remaining basis modes that contribute to the TMA with size-1 + auto tma_gbasis_full = make_layout(tuple_cat(wrap( shape(tma_gbasis_tile)), wrap(repeat(Int<1>{}))), + tuple_cat(wrap(stride(tma_gbasis_tile)), 
wrap(tile_gbasis_remaining_stride))); + + // Group the trailing modes to make this max rank-5 -- TMA rank limitation + // tma_box_shape:gmem_mode auto tma_gbasis = group(tma_gbasis_full); #if 0 - print("gtensor : "); print(gtensor); print("\n"); print("smem_inv_h : "); print(smem_inv_h); print("\n"); + print("gtensor : "); print(gtensor); print("\n"); + print("tile_gstride : "); print(tile_gstride); print("\n"); print("tma_gstride : "); print(tma_gstride); print("\n"); print("gbasis : "); print(gbasis); print("\n"); - print("tma_gb_tile : "); print(tma_gbasis_tile ); print("\n"); + print("tile_gbasis : "); print(tile_gbasis); print("\n"); print("tma_gbasis : "); print(tma_gbasis); print("\n"); #endif + // + // TMA desc creation + // + constexpr int tma_dim = decltype(rank(tma_gbasis))::value; // // TMA gmem desc info // - void* gmem_address = (void*) raw_pointer_cast(gtensor.data()); + void* gmem_address = (void*) raw_pointer_cast(gtensor_T.data()); + auto gmem_layout = gtensor_T.layout(); cute::array gmem_prob_shape = {1,1,1,1,1}; cute::array gmem_prob_stride = {0,0,0,0,0}; @@ -492,12 +587,12 @@ make_tma_copy_desc(Tensor const& gtensor, // The original GM for_each(make_seq{}, [&](auto i) { for_each(stride(tma_gbasis), [&](auto ej) { // Problem stride - uint64_t stride_j = basis_get(ej, stride(glayout_T)) * sizeof(T); + uint64_t stride_j = ceil_div(basis_get(ej, stride(gmem_layout)) * sizeof_bits_v, 8); uint64_t old_stride = gmem_prob_stride[i]; gmem_prob_stride[i] = gcd(gmem_prob_stride[i], stride_j); // Problem shape - uint64_t shape_j = basis_get(ej, shape(glayout_T)); + uint64_t shape_j = basis_get(ej, shape(gmem_layout)); if (gmem_prob_stride[i] != 0) { // Recurrence: g_shape = (s_i - 1) * (d_i / gcd_j d_j) + 1 gmem_prob_shape[i] = (gmem_prob_shape[i]-1) * (old_stride / gmem_prob_stride[i]) @@ -522,8 +617,8 @@ make_tma_copy_desc(Tensor const& gtensor, // The original GM assert(gmem_prob_shape[4] >= (uint64_t(1))); // Size must be min 1 assert(gmem_prob_shape[4] <= (uint64_t(1) << 32)); // Size must be max 2^32 - // TMA descriptor does not store the zeroth stride and assumes it is sizeof(T) == one element. - assert(gmem_prob_stride[0] == sizeof(T) && "Majorness of smem doesn't match majorness of gmem"); + // TMA descriptor does not store the zeroth stride and assumes it is 1 (TmaInternalType element). 
+ assert(gmem_prob_stride[0] == sizeof(TmaInternalType) && "Majorness of smem doesn't match majorness of gmem"); assert((gmem_prob_stride[1]) < (uint64_t(1) << 40)); // Stride must be max 2^40 assert((gmem_prob_stride[1] & 0b1111) == 0); // Stride must be multiple of 16B (128b) @@ -545,14 +640,16 @@ make_tma_copy_desc(Tensor const& gtensor, // The original GM smem_box_shape[i] *= size(tma_gbasis); }); - assert(smem_box_shape[0] >= (uint64_t(1))); // Size must be min 1 - assert(smem_box_shape[0] <= (uint64_t(1) << 8)); // Size must be max 2^8 = 256 - assert(smem_box_shape[0] >= (uint64_t(1))); // Size must be min 1 - assert(smem_box_shape[0] <= (uint64_t(1) << 8)); // Size must be max 2^8 = 256 - assert(smem_box_shape[0] >= (uint64_t(1))); // Size must be min 1 - assert(smem_box_shape[0] <= (uint64_t(1) << 8)); // Size must be max 2^8 = 256 - assert(smem_box_shape[0] >= (uint64_t(1))); // Size must be min 1 - assert(smem_box_shape[0] <= (uint64_t(1) << 8)); // Size must be max 2^8 = 256 + assert(smem_box_shape[0] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[0] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 + assert(smem_box_shape[1] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[1] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 + assert(smem_box_shape[2] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[2] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 + assert(smem_box_shape[3] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[3] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 + assert(smem_box_shape[4] >= (uint32_t(1))); // Size must be min 1 + assert(smem_box_shape[4] <= (uint32_t(1) << 8)); // Size must be max 2^8 = 256 assert(smem_box_stride[0] >= (uint32_t(1))); // Stride must be min 1 assert(smem_box_stride[0] <= (uint32_t(8))); // Stride must be max 2^3 = 8 @@ -565,88 +662,101 @@ make_tma_copy_desc(Tensor const& gtensor, // The original GM assert(smem_box_stride[4] >= (uint32_t(1))); // Stride must be min 1 assert(smem_box_stride[4] <= (uint32_t(8))); // Stride must be max 2^3 = 8 - // - // Construct the descriptor - // - - TmaDescriptor tma_desc = {0}; - - // - // TMA general info - // - -#if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__) - - CUtensorMapDataType tma_format = TMA::to_CUtensorMapDataType(); - CUtensorMapInterleave tma_interleave = CU_TENSOR_MAP_INTERLEAVE_NONE; - CUtensorMapL2promotion tma_l2Promotion = CU_TENSOR_MAP_L2_PROMOTION_L2_128B; - CUtensorMapFloatOOBfill tma_oobFill = CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE; - - // TMA smem swizzle type - CUtensorMapSwizzle smem_swizzle = TMA::to_CUtensorMapSwizzle(get_tma_swizzle_bits(swizzle)); - CUresult result = cuTensorMapEncodeTiled( - &tma_desc, - tma_format, - tma_dim, - gmem_address, - gmem_prob_shape.data(), - gmem_prob_stride.data() + 1, // gmem_prob_stride[0] implicitly 1 - smem_box_shape.data(), - smem_box_stride.data(), - tma_interleave, - smem_swizzle, - tma_l2Promotion, - tma_oobFill); - - if (result != CUDA_SUCCESS) { - std::cerr << "TMA Desc Addr: " << &tma_desc - << "\nformat " << tma_format - << "\ndim " << tma_dim - << "\ngmem_address " << gmem_address - << "\nglobalDim " << gmem_prob_shape - << "\nglobalStrides " << gmem_prob_stride - << "\nboxDim " << smem_box_shape - << "\nelementStrides " << smem_box_stride - << "\ninterleave " << tma_interleave - << "\nswizzle " << smem_swizzle - << "\nl2Promotion " << tma_l2Promotion - << "\noobFill " << tma_oobFill << std::endl; - std::cerr << "Error: Failed to 
initialize the TMA descriptor " << result << std::endl; - assert(false); - } - -#endif // (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__) + // + // Construct the descriptor + // + + TmaDescriptor tma_desc = {0}; + + // + // TMA general info + // + + #if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__) + + CUtensorMapDataType tma_format = TMA::to_CUtensorMapDataType(); + CUtensorMapInterleave tma_interleave = CU_TENSOR_MAP_INTERLEAVE_NONE; + CUtensorMapL2promotion tma_l2Promotion = CU_TENSOR_MAP_L2_PROMOTION_L2_128B; + CUtensorMapFloatOOBfill tma_oobFill = CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE; + + // TMA smem swizzle type + CUtensorMapSwizzle smem_swizzle = TMA::to_CUtensorMapSwizzle(get_tma_swizzle_bits(swizzle)); + CUresult result = cuTensorMapEncodeTiled( + &tma_desc, + tma_format, + tma_dim, + gmem_address, + gmem_prob_shape.data(), + gmem_prob_stride.data() + 1, // gmem_prob_stride[0] implicitly 1 + smem_box_shape.data(), + smem_box_stride.data(), + tma_interleave, + smem_swizzle, + tma_l2Promotion, + tma_oobFill); + + if (result != CUDA_SUCCESS) { + std::cerr << "TMA Desc Addr: " << &tma_desc + << "\nformat " << tma_format + << "\ndim " << tma_dim + << "\ngmem_address " << gmem_address + << "\nglobalDim " << gmem_prob_shape + << "\nglobalStrides " << gmem_prob_stride + << "\nboxDim " << smem_box_shape + << "\nelementStrides " << smem_box_stride + << "\ninterleave " << tma_interleave + << "\nswizzle " << smem_swizzle + << "\nl2Promotion " << tma_l2Promotion + << "\noobFill " << tma_oobFill << std::endl; + std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl; + assert(false); + } + + #endif // (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__) // Finally, get the inverse permutation of the E bases for the mocked gmem stride // NOTE This is essentially ArithmeticTuple inverse... 
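For orientation, a hedged host-side sketch of the same driver entry point the hunk above calls, encoding a plain 2-D descriptor. The 128x256 float tensor, the 64x64 box, and the swizzle/L2/OOB settings are invented; the call needs a CUDA 12 toolkit and a TMA-capable driver, so treat it as a sketch rather than a guaranteed-working probe.

// Minimal host-side TMA descriptor encode for a row-major m x n float matrix (values invented).
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>

int main() {
  int m = 128, n = 256;
  float* gmem = nullptr;
  cudaMalloc(&gmem, sizeof(float) * m * n);       // also initializes the driver context

  CUtensorMap desc = {};
  cuuint64_t global_dim[2]    = {cuuint64_t(n), cuuint64_t(m)};   // dim 0 is the contiguous mode
  cuuint64_t global_stride[1] = {cuuint64_t(n) * sizeof(float)};  // byte stride of dim 1; multiple of 16
  cuuint32_t box_dim[2]       = {64, 64};                         // per-dim box size, each <= 256
  cuuint32_t elem_stride[2]   = {1, 1};                           // element strides in [1, 8]

  CUresult res = cuTensorMapEncodeTiled(
      &desc,
      CU_TENSOR_MAP_DATA_TYPE_FLOAT32,
      /*tensorRank=*/2,
      gmem,
      global_dim,
      global_stride,              // rank-1 entries: stride of dim 0 is implicitly one element
      box_dim,
      elem_stride,
      CU_TENSOR_MAP_INTERLEAVE_NONE,
      CU_TENSOR_MAP_SWIZZLE_NONE,
      CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
      CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
  std::printf("cuTensorMapEncodeTiled returned %d\n", int(res));

  cudaFree(gmem);
  return 0;
}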
auto gmem_stride_bases = transform_leaf(stride(gbasis), [&](auto ei) { - auto si = basis_get(ei, shape(glayout_T)); - auto di = basis_get(ei, stride(glayout_T)); - auto tma_gbasis_stride = stride(tma_gbasis); - // Find j such that E is in stride(tma_gbasis) - [[maybe_unused]] auto j = find_if(tma_gbasis_stride, [&](auto tma_stride_j) { return any_of(tma_stride_j, [&](auto dj) { return dj == ei; }); }); - // Return the TMA basis this gmode contributes to - if constexpr (is_constant<1, decltype(si)>::value || decltype(j == rank(tma_gbasis_stride))::value) { - return Int<0>{}; // Return arithmetic identity -- no contribution to the TMA - } else - if constexpr (decltype(rank(tma_gbasis_stride) == Int<1>{})::value) { - return E{}; // We know that the scale factor is Int<1>{} + auto si = basis_get(ei, shape(gmem_layout)); + auto di = basis_get(ei, stride(gmem_layout)); + if constexpr (is_constant<1, decltype(si)>::value || is_constant<0, decltype(di)>::value) { + return Int<0>{}; // If size-1 or stride-0, return arithmetic identity -- no contribution to the TMA } else { - return E{} * int32_t(di * sizeof(T) / cute::max(gmem_prob_stride[j], 16)); + auto tma_gbasis_stride = stride(tma_gbasis); + // Find j such that E is in stride(tma_gbasis) + using EI = decltype(ei); + [[maybe_unused]] auto j = find_if(tma_gbasis_stride, [&](auto tma_stride_j) { return any_of(tma_stride_j, [&](auto dj) { return dj == EI{}; }); }); + if constexpr (decltype(j == rank(tma_gbasis_stride))::value) { + return Int<0>{}; // If not-found, return arithmetic identity -- no contribution to the TMA + } else + if constexpr (decltype(j == Int<0>{})::value) { + auto scale = ratio(size(tma_gstride), size(smem_inv_h)) * basis_get(ei, stride(gtensor)); + return E{} * scale; // Return TMA Coord basis -- with a recast scale factor + } else + if constexpr (decltype(rank(tma_gbasis_stride) == Int<1>{})::value) { + return E{}; // Return TMA Coord basis -- known scale of Int<1>{} + } else { + int32_t scale = ceil_div(int32_t(di * sizeof_bits_v / cute::max(gmem_prob_stride[j], 16)), 8); + return E{} * scale; // Return TMA Coord basis -- with a dynamic scale factor + } } }); -#if 0 - print("gmem_stride_bases : "); print(gmem_stride_bases); print("\n"); -#endif + #if 0 + print("tma_gbasis : "); print(gmem_stride_bases); print("\n"); + #endif - return cute::make_tuple(tma_desc, gmem_stride_bases); + using AuxParams = AuxTmaParams; + return cute::make_tuple(tma_desc, AuxParams{gmem_stride_bases}); } // The "logical TMA tid" is a map from the CTA rank to its logical id // within the instruction. It works like a mask or ordering on the // CTAs. For non-multicast TMA, all CTAs should map to 0. For // multicast TMA of size 4, CTAs will be mapped to {0,1,2,3}. 
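The comment above describes the CTA-rank to logical-TMA-tid map. The toy snippet below, assuming the CuTe headers are on the include path and an invented cluster of four CTAs, shows the two layouts that realize it: a stride-0 layout for non-multicast (every CTA maps to 0) and an identity layout for 4-way multicast.

// CTA rank -> logical TMA tid, expressed as plain CuTe layouts (cluster size 4 is invented).
#include <cute/layout.hpp>
#include <cstdio>

int main() {
  using namespace cute;

  auto no_mcast = make_layout(make_shape(Int<4>{}), make_stride(Int<0>{}));  // all CTAs -> tid 0
  auto mcast4   = make_layout(make_shape(Int<4>{}), make_stride(Int<1>{}));  // CTAs -> {0,1,2,3}

  for (int cta = 0; cta < 4; ++cta) {
    std::printf("cta %d -> no_mcast tid %d, mcast tid %d\n",
                cta, int(no_mcast(cta)), int(mcast4(cta)));
  }
  return 0;
}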
-template const& gtensor, // Full GMEM Tensor SLayout const& slayout, // CTA Tile of SMEM Layout const& cta_t_map, // T: CTA thr idx -> logical TMA tid - Layout const& cta_v_map) // V: CTA val idx -> gmem coord + Layout const& cta_v_map) // V: CTA val idx -> gmem mode { // // TMA parameter checking @@ -673,18 +783,19 @@ make_tma_copy_tiled(CopyOp, // // Invert the smem to get the largest contiguous vector in the smem layout + // smem idx -> smem coord auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout)); - // trunc_smem_idx -> trunc_smem_coord - // Map from smem idx to a gmem mode + // Compose with the V-Map to convert smem coord (CTA val idx) to gmem mode + // smem idx -> gmem mode auto sidx_to_gmode = coalesce(composition(cta_v_map, inv_smem_layout)); #if 0 - print("g_layout : "); print(gtensor.layout()); print("\n"); + print("g_tensor : "); print(gtensor); print("\n"); print("s_layout : "); print(slayout); print("\n"); print("cta_t_map : "); print(cta_t_map); print("\n"); print("cta_v_map : "); print(cta_v_map); print("\n"); - print("inv_smem : "); print(inv_smem_layout); print("\n"); + print("inv_s_layout : "); print(inv_smem_layout); print("\n"); print("sidx_to_gmode : "); print(sidx_to_gmode); print("\n"); #endif @@ -693,9 +804,11 @@ make_tma_copy_tiled(CopyOp, // // Generate a TupleBasis for the gtensor + // gmem coord -> gmem coord auto glayout_basis = make_identity_layout(shape(gtensor)); // Tile the modes of gtensor with the truncated cta_v_map o inv_smem_layout_trunc + // smem idx -> gmem coord auto tma_layout_full = flatten(composition(glayout_basis, sidx_to_gmode)); // Truncate any incompatibilities -- no starting in the middle of gmodes @@ -704,61 +817,60 @@ make_tma_copy_tiled(CopyOp, return not is_constant<1,decltype(v)>{}; }); static_assert(smem_rank > 0, "Could not find a common tile-gmem vectorization. Does the Tile select out major GMEM modes?"); - // TMA uses a maximum of 5 modes - // If the gtensor has more than 5 modes, we need to reserve the last TMA-mode as a "multimode" - constexpr int smem_tma_rank = cute::min(int(smem_rank), (rank(tma_layout_full) > Int<5>{} ? 
4 : 5)); // Keep only the static-1 basis modes into gmem - auto tma_layout_trunc = take<0,smem_tma_rank>(tma_layout_full); + auto tma_layout_trunc = take<0,smem_rank>(tma_layout_full); - // Split according to the portion each multicast CTA will be responsible for - auto tma_layout_vt = logical_divide(tma_layout_trunc, shape_div(size(tma_layout_trunc), cosize(cta_t_map))); + // Keep only the portion each multicast CTA will be responsible for + auto tma_layout_v = composition(tma_layout_trunc, shape_div(size(tma_layout_trunc), cosize(cta_t_map))); #if 0 print("glayout_basis : "); print(glayout_basis); print("\n"); print("tma_layout_full : "); print(tma_layout_full); print("\n"); print("tma_layout_trunc: "); print(tma_layout_trunc); print("\n"); - print("tma_layout_vt : "); print(tma_layout_vt); print("\n"); + print("tma_layout_v : "); print(tma_layout_v); print("\n"); #endif // - // Construct the TMA Desc and GMEM mode ordering + // Construct the TMA Desc and the strides of the TMA Tensor // - auto [tma_desc, gmem_stride_bases] = detail::make_tma_copy_desc(gtensor, layout<0>(tma_layout_vt), get_swizzle_portion(slayout)); + auto [tma_desc, aux_params] = detail::make_tma_copy_desc(gtensor, + tma_layout_v, + get_swizzle_portion(slayout)); // // Construct the Copy_Traits // using T = typename GEngine::value_type; - constexpr int num_bits_per_tma = decltype(size<0>(tma_layout_vt))::value * sizeof(T) * 8; - using Traits = Copy_Traits, decltype(gmem_stride_bases)>; + constexpr int num_bits_per_tma = decltype(size(tma_layout_trunc))::value * sizeof_bits_v; + using Traits = Copy_Traits, decltype(aux_params)>; + using Atom = Copy_Atom; + + Traits tma_traits{tma_desc, aux_params}; #if 0 - print("num_bits : "); print(NumBitsPerTMA{}); print("\n"); - print("g_stride_bases: "); print(gmem_stride_bases); print("\n"); + print("num_bits_per_tma : "); print(num_bits_per_tma); print("\n"); + print("g_stride_bases : "); print(tma_traits.aux_params_.g_stride_); print("\n"); #endif - Traits tma_traits{tma_desc, gmem_stride_bases}; - // // Construct the TiledCopy // auto cta_tiler = product_each(shape(cta_v_map)); - // (CTA V, CTA T) -> smem_coord - auto layout_vt = composition(inv_smem_layout, make_layout(shape(tma_layout_vt))); - // Scale that up to cover all of the smem_coords - // - // The smem vector might not cover all of the tile, - // so multiply it up to cover the entire tile. - // "T" here (the parallel index) is a CTA index. 
- auto layout_VT = tile_to_shape(layout_vt, make_shape(size(cta_v_map)/size<1>(layout_vt), size<1>(layout_vt))); - // Flip it and change the domain of the T from logical thr to thr_idx - auto layout_TV = make_layout(composition(layout<1>(layout_VT), cta_t_map), layout<0>(layout_VT)); + // CTA V -> smem_coord + auto layout_v = composition(inv_smem_layout, size(tma_layout_trunc)); + auto layout_V = tile_to_shape(make_layout(layout_v), size(cta_v_map)); + // CTA T -> smem idx + auto layout_t = make_layout(cosize(cta_t_map), shape_div(size(tma_layout_trunc), cosize(cta_t_map))); + // CTA TID -> smem coord + auto layout_T = composition(inv_smem_layout, composition(layout_t, cta_t_map)); + // Combine with the T mapping + auto layout_TV = make_layout(layout_T, layout_V); #if 0 print("cta_tiler : "); print(cta_tiler); print("\n"); @@ -766,8 +878,7 @@ make_tma_copy_tiled(CopyOp, print("layout_TV : "); print(layout_TV); print("\n"); #endif - using T = typename GEngine::value_type; - return TiledCopy, decltype(layout_TV), decltype(cta_tiler)>{tma_traits}; + return TiledCopy{tma_traits}; } } // end namespace detail @@ -844,7 +955,8 @@ make_tma_copy_tiled(CopyOp, copy(tma.with(barrier, mcast_mask), tAgA, tAsA); // copy with supporting TMA params */ -template (copy_op, + gtensor, + slayout, + make_layout(cluster_size), + make_identity_layout(cta_tile)); } // Explicit defaulting +template +CUTE_HOST_RTC +auto +make_tma_copy(CopyOp const& copy_op, + Tensor const& gtensor, + SLayout const& slayout, + CTA_Tile const& cta_tile, + Cluster_Size const& cluster_size) +{ + using TmaInternalType = typename GEngine::value_type; + return make_tma_copy(copy_op, + gtensor, + slayout, + cta_tile, + cluster_size); +} + template diff --git a/include/cute/atom/mma_atom.hpp b/include/cute/atom/mma_atom.hpp index 844d653eeb..68bd290e6d 100644 --- a/include/cute/atom/mma_atom.hpp +++ b/include/cute/atom/mma_atom.hpp @@ -155,7 +155,7 @@ struct MMA_Atom> if constexpr (has_dereference::value) { // If the intended FrgTypeA is a view (of the current tensor), forward the whole - static_assert(is_same::value_type>::value, "Expecting ValTypeA type"); + static_assert(is_same, typename remove_cvref_t::value_type>::value, "Expecting ValTypeA type"); return make_tensor(std::forward(atensor)); } else { // Else, the intended FrgTypeA is a value type, construct a new tensor with a fragment layout diff --git a/include/cute/atom/mma_traits_sm75.hpp b/include/cute/atom/mma_traits_sm75.hpp index 405e871fd2..63f834664b 100644 --- a/include/cute/atom/mma_traits_sm75.hpp +++ b/include/cute/atom/mma_traits_sm75.hpp @@ -49,11 +49,11 @@ struct MMA_Traits using Shape_MNK = Shape<_16,_8,_8>; using ThrID = Layout<_32>; using ALayout = Layout,Shape < _2,_2>>, - Stride,Stride<_16,_1>>>; + Stride,Stride<_16,_8>>>; using BLayout = Layout,_2>, Stride,_8>>; using CLayout = Layout,Shape < _2,_2>>, - Stride,Stride<_16,_1>>>; + Stride,Stride<_16,_8>>>; }; /////////////////////////////////////////////////////////////////////////////// diff --git a/include/cute/config.hpp b/include/cute/config.hpp index 4a12f1c584..ba2504cd22 100644 --- a/include/cute/config.hpp +++ b/include/cute/config.hpp @@ -30,7 +30,7 @@ **************************************************************************************************/ #pragma once -#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) || defined(__clang__) +#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) # define CUTE_HOST_DEVICE __forceinline__ __host__ __device__ # define CUTE_DEVICE __forceinline__ __device__ # define 
CUTE_HOST __forceinline__ __host__ diff --git a/include/cute/container/bit_field.hpp b/include/cute/container/bit_field.hpp index 5398e3271e..0cd3e4fe5d 100644 --- a/include/cute/container/bit_field.hpp +++ b/include/cute/container/bit_field.hpp @@ -72,8 +72,16 @@ struct bit_field // Number of bits in data_[idx] used for NumBits if straddling, else 0 static constexpr uint32_t bit_hi = (idx + 1 < N) ? (storage_type_bits - bit_lo) : 0; +private: + // MSVC issues warning C4293 ("shift count negative or too big, undefined behavior") + // if we use NumBits directly in the shift expression, even if the shift occurs + // in the branch of a ternary expression where NumBits is known to be less than + // the number of bits of the value being shifted. + static constexpr uint32_t MollifiedNumBits = NumBits > 63u ? 63u : NumBits; +public: + // NumBits mask - static constexpr value_type mask = (NumBits < 64) ? ((uint64_t(1) << NumBits) - 1) : uint64_t(-1); + static constexpr value_type mask = (NumBits < 64u) ? ((uint64_t(1) << MollifiedNumBits) - 1) : uint64_t(-1); // NumBits mask for BitStart static constexpr storage_type mask_lo = storage_type(mask) << bit_lo; // NumBits mask for leftover bits in data_[idx+1] if straddling, else 0 diff --git a/include/cute/container/tuple.hpp b/include/cute/container/tuple.hpp index 3455a41620..75829f4520 100644 --- a/include/cute/container/tuple.hpp +++ b/include/cute/container/tuple.hpp @@ -76,6 +76,10 @@ namespace detail template ::value> struct EBO; +template +CUTE_HOST_DEVICE constexpr C findt(EBO const&) +{ return {}; } + // Specialization for types T that have no data; // the "static tuple leaf." Valid T here include // integral_constant, Int, @@ -218,6 +222,20 @@ get(tuple&& t) noexcept return detail::getv(static_cast&&>(t)); } +// +// find a type X within a cute::tuple +// Requires X to be unique in tuple +// Returns a static integer +// + +template +CUTE_HOST_DEVICE constexpr +auto +find(tuple const& t) noexcept +{ + return detail::findt(t); +} + // // Custom is_tuple trait simply checks the existence of tuple_size // and assumes std::get(.), std::tuple_element @@ -225,7 +243,7 @@ get(tuple&& t) noexcept namespace detail { template -auto has_tuple_size( T*) -> integral_constant::value>; +auto has_tuple_size( T*) -> bool_constant<(0 <= tuple_size::value)>; auto has_tuple_size(...) -> false_type; } // end namespace detail @@ -347,6 +365,14 @@ tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4, return cute::make_tuple(get(t0)..., get(t1)..., get(t2)..., get(t3)..., get(t4)...); } +template +struct tuple_cat_static; + +template +struct tuple_cat_static, tuple> { + using type = tuple; +}; + } // end namespace detail CUTE_HOST_DEVICE constexpr @@ -370,9 +396,15 @@ CUTE_HOST_DEVICE constexpr auto tuple_cat(T0 const& t0, T1 const& t1) { - return detail::tuple_cat(t0, t1, + if constexpr (is_static::value && is_static::value && + is_tuple::value && is_tuple::value) { + return typename detail::tuple_cat_static::type{}; + } else + { + return detail::tuple_cat(t0, t1, make_index_sequence::value>{}, make_index_sequence::value>{}); + } } template @@ -416,7 +448,7 @@ CUTE_HOST_DEVICE constexpr auto tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4, T5 const& t5, Ts const&... 
ts) { - return cute::tuple_cat(cute::tuple_cat(t0,t1,t2,t3,t4), t5, ts...); + return cute::tuple_cat(cute::tuple_cat(t0,t1,t2,t3,t4), cute::tuple_cat(t5, ts...)); } #endif diff --git a/include/cute/int_tuple.hpp b/include/cute/int_tuple.hpp index 7875ac1581..4497034fcf 100644 --- a/include/cute/int_tuple.hpp +++ b/include/cute/int_tuple.hpp @@ -37,29 +37,20 @@ #include #include -namespace cute -{ - -template -using IntTuple = cute::tuple; +/** IntTuple is an integer or a tuple of IntTuples. + * This file holds utilities for working with IntTuples, + * but does not hold a concrete concept or class of IntTuple. + */ -// Construct an IntTuple with all value-elements -template -CUTE_HOST_DEVICE constexpr -IntTuple -make_int_tuple(Ts const&... t) +namespace cute { - return {t...}; -} -// CuTe does not treat integers as tuples. -// For example, is_tuple is false, and tuple_size doesn't compile. -// Nevertheless, CuTe defines rank(Integral) as 1 -// (where "Integral" is a shorthand for either run-time integers -// or CuTe's compile-time integer constants), -// so therefore get<0>(Integral) just returns its input. +// Implementation of get<0>(Integral). +// Even though is_tuple is false and tuple_size doesn't compile, +// CuTe defines rank(Integral) as 1, so it's useful for get<0>(Integral) to return its input template >::value)> -CUTE_HOST_DEVICE constexpr decltype(auto) +CUTE_HOST_DEVICE constexpr +decltype(auto) get(T&& t) noexcept { static_assert(I == 0, "Index out of range"); @@ -67,23 +58,12 @@ get(T&& t) noexcept } // Custom recursive get for anything that implements get(.) (for a single integer I). -template -CUTE_HOST_DEVICE constexpr decltype(auto) -get(Tuple&& t) noexcept -{ - using get_I0_result_t = cute::remove_cvref_t(static_cast(t)))>; - if constexpr (cute::is_integral::value) { - // Help MSVC deduce that the inner get(...) call is not a "local variable or temporary." - // The above if constexpr test repeats the constraint on the above get(T&&) overload. - // get<0, 0, ..., 0>(t) for cute::integral (either one of the built-in integer types like int, - // or one of CuTe's compile-time constant types) t, and for one or more zeros, just returns t. 
- static_assert(I1 == 0, "Index I1 is out of range"); - static_assert(((Is == 0) && ...), "At least one index in Is is out of range"); - return get(static_cast(t)); - } - else { - return get(get(static_cast(t))); - } +template +CUTE_HOST_DEVICE constexpr +decltype(auto) +get(T&& t) noexcept +{ + return get(get(static_cast(t))); } // @@ -347,6 +327,16 @@ ceil_div(IntTupleA const& a, IntTupleB const& b) } /** Division for Shapes + * Case Tuple Tuple: + * Perform shape_div element-wise + * Case Tuple Int: + * Fold the division of b across each element of a + * Example: shape_div((4,5,6),40) -> shape_div((1,5,6),10) -> shape_div((1,1,6),2) -> (1,1,3) + * Case Int Tuple: + * Return shape_div(a, product(b)) + * Case Int Int: + * Enforce the divisibility condition a % b == 0 || b % a == 0 when possible + * Return a / b with rounding away from 0 (that is, 1 or -1 when a < b) */ template CUTE_HOST_DEVICE constexpr @@ -357,39 +347,28 @@ shape_div(IntTupleA const& a, IntTupleB const& b) if constexpr (is_tuple::value) { // tuple tuple static_assert(tuple_size::value == tuple_size::value, "Mismatched ranks"); return transform(a, b, [](auto const& x, auto const& y) { return shape_div(x,y); }); - } else { // tuple int + } else { // tuple int auto const [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b), [] (auto const& init, auto const& ai) { return cute::make_tuple(append(get<0>(init), shape_div(ai, get<1>(init))), shape_div(get<1>(init), ai)); }); return result; } - } else { - if constexpr (is_tuple::value) { // int tuple - return shape_div(a, product(b)); - } else { // int int - //assert(a % b == 0 || b % a == 0); - return a / b != 0 ? a / b : signum(a) * signum(b); // divide with rounding away from zero - } + } else + if constexpr (is_tuple::value) { // int tuple + return shape_div(a, product(b)); + } else + if constexpr (is_static::value && is_static::value) { + static_assert(IntTupleA::value % IntTupleB::value == 0 || IntTupleB::value % IntTupleA::value == 0, "Static shape_div failure"); + return C{}; + } else { // int int + //assert(a % b == 0 || b % a == 0); // Wave dynamic assertion + return a / b != 0 ? a / b : signum(a) * signum(b); // Division with rounding away from zero } CUTE_GCC_UNREACHABLE; } -/** Division for Shapes that are static constants - * @pre t % u == 0 || u % t == 0 - * @result if t % u == 0, then t / u - * if u % t == 0, then signum(t) * signum(u) - */ -template -CUTE_HOST_DEVICE constexpr -constant -shape_div(constant const&, constant const&) -{ - static_assert(t % u == 0 || u % t == 0, "Static shape_div failure"); - return {}; -} - /** Minimum for Shapes */ template @@ -581,7 +560,7 @@ make_int_tuple(Indexable const& t, int n, T const& init) /** Fill the dynamic values of a Tuple with values from another Tuple * \code - * auto params = make_int_tuple(6,3,4); + * auto params = make_tuple(6,3,4); * cute::tuple, cute::tuple>, int, Int<2>> result; * fill_int_tuple_from(result, params); // (_1,(6,3,_3),4,_2) * \endcode @@ -893,7 +872,8 @@ increment(Coord& coord, Shape const& shape) struct ForwardCoordIteratorSentinal {}; -// A forward iterator for a coordinate that starts from zero and goes to shape +// A forward iterator for a starting coordinate in a shape's domain, and a shape. +// The starting coordinate may be zero but need not necessarily be. 
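A small sketch of the shape_div cases documented above, assuming the CuTe headers are available; the shapes come straight from the doc comment's own example.

// shape_div: fold a scalar divisor across a shape, and round away from zero for Int / Int.
#include <cute/int_tuple.hpp>
#include <cassert>

int main() {
  using namespace cute;

  // Tuple / Int: the divisor is folded across the modes left to right, as in the doc comment:
  // shape_div((4,5,6), 40) -> shape_div((1,5,6), 10) -> shape_div((1,1,6), 2) -> (1,1,3)
  auto r = shape_div(cute::make_tuple(Int<4>{}, Int<5>{}, Int<6>{}), Int<40>{});
  assert(int(get<0>(r)) == 1 && int(get<1>(r)) == 1 && int(get<2>(r)) == 3);

  // Int / Int with a < b rounds away from zero: 2/8 == 0, so the result is signum(2)*signum(8) == 1.
  assert(int(shape_div(Int<2>{}, Int<8>{})) == 1);

  return 0;
}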
template struct ForwardCoordIterator { @@ -905,7 +885,7 @@ struct ForwardCoordIterator CUTE_HOST_DEVICE constexpr ForwardCoordIterator& operator++() { increment(coord, shape); return *this; } - // Sentinal for the end of the implied range + // Sentinel for the end of the implied range CUTE_HOST_DEVICE constexpr bool operator< (ForwardCoordIteratorSentinal const&) const { return back(coord) < back(shape); } CUTE_HOST_DEVICE constexpr @@ -924,6 +904,15 @@ struct ForwardCoordIterator Shape const& shape; }; +// A forward iterator for a coordinate that starts from a provided coordinate +template +CUTE_HOST_DEVICE constexpr +auto +make_coord_iterator(Coord const& coord, Shape const& shape) +{ + return ForwardCoordIterator{coord,shape}; +} + // A forward iterator for a coordinate that starts from zero template CUTE_HOST_DEVICE constexpr @@ -931,7 +920,7 @@ auto make_coord_iterator(Shape const& shape) { auto coord = repeat_like(shape, int(0)); - return ForwardCoordIterator{coord,shape}; + return make_coord_iterator(coord, shape); } } // end namespace cute diff --git a/include/cute/layout.hpp b/include/cute/layout.hpp index 5b81cfd833..5072f0121e 100644 --- a/include/cute/layout.hpp +++ b/include/cute/layout.hpp @@ -43,16 +43,16 @@ namespace cute // Aliases template -using Shape = IntTuple; +using Shape = cute::tuple; template -using Stride = IntTuple; +using Stride = cute::tuple; template -using Step = IntTuple; +using Step = cute::tuple; template -using Coord = IntTuple; +using Coord = cute::tuple; template CUTE_HOST_DEVICE constexpr @@ -1034,29 +1034,29 @@ complement(Shape const& shape, Stride const& stride, CoSizeHi const& cosize_hi) // Should just be a sort and a fold... // Then we could even handle dynamic strides (but they would destroy all static strides) - auto result = fold(make_seq{}, - cute::make_tuple(shape, stride, cute::make_tuple(), cute::make_tuple(Int<1>{})), - [](auto const& init, auto i) - { - auto curr_stride = cute::min(get<1>(init)); - auto curr_idx = find(get<1>(init), curr_stride); - auto curr_shape = get(get<0>(init)); - - return cute::make_tuple(remove(get<0>(init)), // Remove the curr shape - remove(get<1>(init)), // Remove the curr stride - append(get<2>(init), curr_stride / get<3,i>(init)), // new shape = curr_stride / last_stride - append(get<3>(init), curr_shape * curr_stride)); // new stride = curr_shape * curr_stride - }); + auto [shape_, stride_, result_shape_, result_stride] = + fold(make_seq{}, + cute::make_tuple(shape, stride, cute::make_tuple(), cute::make_tuple(Int<1>{})), + [](auto const& init, auto i) + { + auto [shape, stride, result_shape, result_stride] = init; + auto min_stride = cute::min(stride); + auto min_idx = find(stride, min_stride); + + return cute::make_tuple(remove(shape), // Remove the min_idx from shape + remove(stride), // Remove the min_idx from stride + append(result_shape , min_stride / get(result_stride)), // new shape = min_stride / last_stride + append(result_stride, get(shape) * min_stride)); // new stride = curr_shape * min_stride + }); // Append the last shape mode - auto result_stride = get<3>(result); - auto result_shape = append(get<2>(result), get<1,0>(result) / back(result_stride)); // new shape = curr_stride / last_stride - - // Compute the rest_stride - auto rest_stride = get<0,0>(result) * get<1,0>(result); - //return make_layout(append(result_shape, ceil_div(cosize_hi, rest_stride)), append(result_stride, rest_stride)); - // Jump into coalesce and append (ceil_div(cosize_hi, rest_stride), rest_stride) - return 
detail::bw_coalesce(result_shape, result_stride, ceil_div(cosize_hi, rest_stride), rest_stride); + auto result_shape = append(result_shape_, get<0>(stride_) / get(result_stride)); // new shape = min_stride / last_stride + + // Compute the rest_shape and rest_stride + auto rest_stride = get<0>(shape_) * get<0>(stride_); + auto rest_shape = ceil_div(cosize_hi, rest_stride); + // Jump into coalesce and append (rest_shape, rest_stride) + return detail::bw_coalesce(result_shape, result_stride, rest_shape, rest_stride); } CUTE_GCC_UNREACHABLE; @@ -1608,16 +1608,15 @@ CUTE_HOST_DEVICE constexpr auto recast(Layout const& layout) { - if constexpr (sizeof(NewType) == sizeof(OldType)) { + if constexpr (sizeof_bits::value == sizeof_bits::value) { return layout; - } else if constexpr (sizeof(NewType) > sizeof(OldType)) { - static_assert(sizeof(NewType) % sizeof(OldType) == 0, "NewType must be a multiple of OldType"); - return upcast(layout); - } else if constexpr (sizeof(NewType) < sizeof(OldType)) { - static_assert(sizeof(OldType) % sizeof(NewType) == 0, "NewType must be a divisor of OldType"); - return downcast(layout); + } else if constexpr (sizeof_bits::value > sizeof_bits::value) { + static_assert(sizeof_bits::value % sizeof_bits::value == 0, "NewType must be a multiple of OldType"); + return upcast::value/sizeof_bits::value>(layout); + } else if constexpr (sizeof_bits::value < sizeof_bits::value) { + static_assert(sizeof_bits::value % sizeof_bits::value == 0, "NewType must be a divisor of OldType"); + return downcast::value/sizeof_bits::value>(layout); } - CUTE_GCC_UNREACHABLE; } diff --git a/include/cute/numeric/arithmetic_tuple.hpp b/include/cute/numeric/arithmetic_tuple.hpp index c2c73be7d8..ead3005cc8 100644 --- a/include/cute/numeric/arithmetic_tuple.hpp +++ b/include/cute/numeric/arithmetic_tuple.hpp @@ -387,8 +387,7 @@ abs(ScaledBasis const& e) { } // Multiplication -template ::value)> +template CUTE_HOST_DEVICE constexpr auto operator*(A const& a, ScaledBasis const& e) { @@ -396,8 +395,7 @@ operator*(A const& a, ScaledBasis const& e) { return ScaledBasis{r}; } -template ::value)> +template CUTE_HOST_DEVICE constexpr auto operator*(ScaledBasis const& e, B const& b) { diff --git a/include/cute/numeric/complex.hpp b/include/cute/numeric/complex.hpp index 43e4dd6356..b0d60a05fe 100644 --- a/include/cute/numeric/complex.hpp +++ b/include/cute/numeric/complex.hpp @@ -30,97 +30,19 @@ **************************************************************************************************/ #pragma once -#include - -//#if defined(__CUDA_ARCH__) -//# include -//#else -//# include -//#endif - -// Suppress warnings for code in Thrust headers. - -#if defined(_MSC_VER) - // We check for MSVC first, because MSVC also defines __GNUC__. - // It's common for non-GCC compilers that emulate GCC's behavior - // to define __GNUC__. - // - // thrust/complex.h triggers MSVC's warning on conversion - // from double to float (or const float) ("possible loss of data"). - // MSVC treats this as an error by default (at least with - // CUTLASS's default CMake configuration). -#pragma warning( push ) -#pragma warning( disable : 4244 ) -#elif defined(__GNUC__) - // With GCC + CUDA 11.4, builds show spurious "-Wconversion" - // warnings on line 656 of thrust/detail/type_traits.h. 
-#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" -#endif - -#if defined(__CUDACC_RTC__) -#include -#else -#include -#endif - -#if defined(_MSC_VER) -#pragma warning( pop ) -#elif defined(__GNUC__) -#pragma GCC diagnostic pop -#endif - #include +#include +#include namespace cute { -//#if defined(__CUDA_ARCH__) -//template -//using complex = cuda::std::complex; -//#else -//template -//using complex = std::complex; -//#endif - -//template -//using complex = thrust::complex; - -#if defined(__CUDACC_RTC__) -using cuda::std::complex; -#else -using thrust::complex; -#endif - -template -CUTE_HOST_DEVICE -T real(complex const& z) { - return z.real(); -} - -template -CUTE_HOST_DEVICE -T imag(complex const& z) { - return z.imag(); -} - -template -CUTE_HOST_DEVICE -complex conj(complex const& z) { - return complex(real(z), -imag(z)); -} - -// cute::conj forwards scalars -template -CUTE_HOST_DEVICE -T conj(T z) { - return z; -} - -//CUTE_HOST_DEVICE constexpr -//float conj(float z) { return z; } -//CUTE_HOST_DEVICE constexpr -//double conj(double z) { return z; } +using cutlass::complex; +using cutlass::is_complex; +using cutlass::RealType; +using cutlass::real; +using cutlass::imag; +using cutlass::conj; /// Fused multiply-add for complex numbers template @@ -131,10 +53,10 @@ fma(complex & d, complex const& b, complex const& c) { - d.real(c.real() + a.real() * b.real()); - d.imag(c.imag() + a.real() * b.imag()); - d.real(d.real() - a.imag() * b.imag()); - d.imag(d.imag() + a.imag() * b.real()); + d.real(fma( a.real(), b.real(), c.real())); + d.imag(fma( a.real(), b.imag(), c.imag())); + d.real(fma(-a.imag(), b.imag(), d.real())); + d.imag(fma( a.imag(), b.real(), d.imag())); } /// Fused multiply-add for triplets @@ -148,46 +70,4 @@ fma(complex const& a, return fma(c, a, b, c); } -/// Used to determine the real-valued underlying type of a numeric type T -template -struct RealType { - using Type = T; -}; - -/// Partial specialization for complex-valued type -template -struct RealType> { - using Type = T; -}; - -////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct is_complex { - static bool const value = false; -}; - -template -struct is_complex> { - static bool const value = true; -}; - -////////////////////////////////////////////////////////////////////////////////////////////////// -// Display utilities - -#if !defined(__CUDACC_RTC__) -template -CUTE_HOST std::ostream& operator<<(std::ostream& os, complex const& z) -{ - T _r = z.real(); - T _i = z.imag(); - - if (bool(_i)) { - return os << _r << "+i" << _i; - } else { - return os << _r; - } -} -#endif - } // end namespace cute diff --git a/include/cute/numeric/integral_constant.hpp b/include/cute/numeric/integral_constant.hpp index bb165111f0..a88892251b 100644 --- a/include/cute/numeric/integral_constant.hpp +++ b/include/cute/numeric/integral_constant.hpp @@ -30,15 +30,14 @@ **************************************************************************************************/ #pragma once -#include - -#include -#include +#include "cute/util/print.hpp" +#include "cute/util/type_traits.hpp" +#include "cute/numeric/math.hpp" namespace cute { -// Short name for fast compilation +// A constant value: short name and type-deduction for fast compilation template struct C { using type = C; @@ -48,29 +47,40 @@ struct C { CUTE_HOST_DEVICE constexpr value_type operator()() const noexcept { return value; } }; +// Deprecate template using constant = C; -template -using 
integral_constant = C; - template using bool_constant = C; using true_type = bool_constant; using false_type = bool_constant; +// A more std:: conforming integral_constant that enforces type but interops with C +template +struct integral_constant : C { + using type = integral_constant; + static constexpr T value = v; + using value_type = T; + // Disambiguate C::operator value_type() + //CUTE_HOST_DEVICE constexpr operator value_type() const noexcept { return value; } + CUTE_HOST_DEVICE constexpr value_type operator()() const noexcept { return value; } +}; + // // Traits // // Use cute::is_std_integral to match built-in integral types (int, int64_t, unsigned, etc) -// Use cute::is_integral to match both built-in integral types AND constant +// Use cute::is_integral to match both built-in integral types AND static integral types. template struct is_integral : bool_constant::value> {}; template -struct is_integral> : true_type {}; +struct is_integral > : true_type {}; +template +struct is_integral> : true_type {}; // is_static detects if an (abstract) value is defined completely by it's type (no members) @@ -80,20 +90,22 @@ struct is_static : bool_constant::value> {}; template constexpr bool is_static_v = is_static::value; -// is_constant detects if a type is a constant and if v is equal to a value +// is_constant detects if a type is a static integral type and if v is equal to a value template struct is_constant : false_type {}; +template +struct is_constant : is_constant {}; +template +struct is_constant : is_constant {}; +template +struct is_constant : is_constant {}; +template +struct is_constant : is_constant {}; template -struct is_constant > : bool_constant {}; -template -struct is_constant const > : bool_constant {}; -template -struct is_constant const&> : bool_constant {}; -template -struct is_constant &> : bool_constant {}; -template -struct is_constant &&> : bool_constant {}; +struct is_constant > : bool_constant {}; +template +struct is_constant> : bool_constant {}; // // Specializations @@ -403,9 +415,10 @@ conditional_return(TrueType const& t, FalseType const& f) { // Display utilities // -template -CUTE_HOST_DEVICE void print(C const&) { - printf("_%d", int(t)); +template +CUTE_HOST_DEVICE void print(C) { + printf("_"); + ::cute::print(Value); } #if !defined(__CUDACC_RTC__) diff --git a/include/cute/numeric/integral_ratio.hpp b/include/cute/numeric/integral_ratio.hpp new file mode 100644 index 0000000000..028ffffd66 --- /dev/null +++ b/include/cute/numeric/integral_ratio.hpp @@ -0,0 +1,175 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include +#include +#include + +namespace cute +{ + +/** Compile-time rational arithmetic type. + * Like cute::C for std::integral_constant, cute::R for std::ratio has a short name + * for error messages and compile times. + * The static data members @a num and @a den represent the reduced numerator and denominator + * of the rational value. Thus, two cute::R types with different @a n or @a d are distinct types + * even if they represent the same rational value. A cute::R exposes the reduced canonical type + * via its type member. That is, cute::R<3,6>::type is cute::R<1,2> and cute::R<6,3>::type is cute::C<2> + */ +template +class R { + static_assert(d != 0); + static constexpr auto an = abs(n); + static constexpr auto ad = abs(d); + static constexpr auto g = gcd(an, ad); + + public: + static constexpr auto num = signum(n) * signum(d) * an / g; + static constexpr auto den = ad / g; + // RI: den >= 1 && gcd(abs(num),den) == 1 + using type = typename conditional, R>::type; +}; + +template +CUTE_HOST_DEVICE constexpr +typename R::type +ratio(C, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator*(R, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator*(R, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator*(C, R) { + return {}; +} + +// Product with dynamic type needs to produce an integer... +template ::value)> +CUTE_HOST_DEVICE constexpr +auto +operator*(C const& c, R) { + return c * R::num / R::den; +} + +// Product with dynamic type needs to produce an integer... 
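A hedged sketch of the compile-time rational type R introduced above, assuming the CuTe headers are on the include path. It shows the reduction invariant (R<2,4> becomes R<1,2>, and R<6,3> collapses to the integer C<2>) and that multiplying a dynamic integer by a ratio yields an integer again.

// Compile-time rationals: reduced representation and integer-producing products.
#include <cute/numeric/integral_ratio.hpp>
#include <cute/numeric/integral_constant.hpp>
#include <cassert>

int main() {
  using namespace cute;

  auto half = ratio(Int<2>{}, Int<4>{});   // reduced to R<1,2>
  print(half); print("\n");                // prints the reduced numerator/denominator

  auto two = ratio(Int<6>{}, Int<3>{});    // denominator reduces to 1, so this is the integer C<2>
  static_assert(decltype(two)::value == 2, "R<6,3>::type collapses to C<2>");

  // Multiplying a dynamic integer by a ratio produces a plain integer again.
  int n = 6;
  assert(n * half == 3);

  return 0;
}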
+template ::value)> +CUTE_HOST_DEVICE constexpr +auto +operator*(R, C const& c) { + return c * R::num / R::den; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator+(R, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator+(R, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +operator+(C, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +bool_constant::num == R::num && R::den == R::den> +operator==(R, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +bool_constant::num == c && R::den == 1> +operator==(R, C) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +bool_constant::num == c && R::den == 1> +operator==(C, R) { + return {}; +} + +template +CUTE_HOST_DEVICE constexpr +typename R::type +abs(R) { + return {}; +} + +// +// Display utilities +// + +template +CUTE_HOST_DEVICE void print(R) { + print(C{}); print("/"); print(C{}); +} + +#if !defined(__CUDACC_RTC__) +template +CUTE_HOST std::ostream& operator<<(std::ostream& os, R) { + return os << "_" << C{} << "/" << C{}; +} +#endif + +} // end namespace cute diff --git a/include/cute/numeric/math.hpp b/include/cute/numeric/math.hpp index ec46fd79a4..fc717c9310 100644 --- a/include/cute/numeric/math.hpp +++ b/include/cute/numeric/math.hpp @@ -73,11 +73,26 @@ abs(T const& t) { CUTE_GCC_UNREACHABLE; } +// Returns 1 if x > 0, -1 if x < 0, and 0 if x is zero. +template ::value)> +CUTE_HOST_DEVICE constexpr +int +signum(T const& x) { + if constexpr (is_signed::value) { + return (T(0) < x) - (x < T(0)); + } else { + return T(0) < x; + } + + CUTE_GCC_UNREACHABLE; +} + // // C++17 operations // -// Greatest common divisor of two integers +// Greatest common divisor of two positive integers template ::value && is_std_integral::value)> @@ -92,7 +107,7 @@ gcd(T t, U u) { } } -// Least common multiple of two integers +// Least common multiple of two positive integers template ::value && is_std_integral::value)> @@ -280,23 +295,6 @@ shiftr(T x, int s) { return s >= 0 ? (x >> s) : (x << -s); } -// Returns 1 if x > 0, -1 if x < 0, and 0 if x is zero. -template ::value)> -CUTE_HOST_DEVICE constexpr -int -signum(T const& x) { - return T(0) < x; -} - -template ::value)> -CUTE_HOST_DEVICE constexpr -int -signum(T const& x) { - return (T(0) < x) - (x < T(0)); -} - // Safe divide // @pre t % u == 0 // @result t / u diff --git a/include/cute/pointer.hpp b/include/cute/pointer.hpp index 479ad699b5..6c6a738f10 100644 --- a/include/cute/pointer.hpp +++ b/include/cute/pointer.hpp @@ -58,6 +58,19 @@ raw_pointer_cast(T* ptr) { return ptr; } +// +// Extract the physical type from a logical elem type. 
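The signum change earlier in this hunk folds the signed and unsigned overloads into one if-constexpr body. The standalone restatement below mirrors that behavior with std::is_signed in place of CuTe's trait; names and values are illustrative only.

// Standalone restatement of signum: -1/0/+1 for signed types, 0/1 for unsigned types.
#include <cassert>
#include <type_traits>

template <class T>
constexpr int signum(T const& x) {
  if constexpr (std::is_signed<T>::value) {
    return (T(0) < x) - (x < T(0));
  } else {
    return T(0) < x;
  }
}

int main() {
  static_assert(signum(-7) == -1, "negative signed value");
  static_assert(signum(0)  ==  0, "zero");
  static_assert(signum(42) ==  1, "positive signed value");
  static_assert(signum(5u) ==  1, "unsigned values are never negative");
  return 0;
}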
+// +template +struct get_raw_type +{ + using type = T; +}; + +template +using get_raw_type_t = typename get_raw_type::type; + + // // Pointer categories // @@ -79,6 +92,8 @@ template struct device_ptr { using value_type = T; + + static const uint32_t ElementsPerStoredItem = sizeof(T) * 8 / sizeof_bits_v; CUTE_HOST_DEVICE constexpr device_ptr(T* ptr) : ptr_(ptr) {} @@ -91,11 +106,14 @@ struct device_ptr template CUTE_HOST_DEVICE constexpr - T& operator[](Index const& i) const { return ptr_[i]; } + T& operator[](Index const& i) const { + static_assert(sizeof_bits_v >= 8, "Use subbyte_iterator to access the element"); + return ptr_[i]; + } template CUTE_HOST_DEVICE constexpr - DerivedType operator+(Index const& i) const { return {ptr_ + i}; } + DerivedType operator+(Index const& i) const { return {ptr_ + i / ElementsPerStoredItem}; } CUTE_HOST_DEVICE constexpr friend ptrdiff_t operator-(device_ptr const& a, @@ -326,44 +344,44 @@ recast(rmem_ptr const& ptr) { template CUTE_HOST_DEVICE void print(T const* const ptr) { - printf("raw_ptr_%db(%p)", int(8*sizeof(T)), ptr); + printf("raw_ptr_%db(%p)", int(sizeof_bits::value), ptr); } template CUTE_HOST_DEVICE void print(gmem_ptr const& ptr) { - printf("gmem_ptr_%db(%p)", int(8*sizeof(T)), ptr.get()); + printf("gmem_ptr_%db(%p)", int(sizeof_bits::value), ptr.get()); } template CUTE_HOST_DEVICE void print(smem_ptr const& ptr) { - printf("smem_ptr_%db(%p)", int(8*sizeof(T)), ptr.get()); + printf("smem_ptr_%db(%p)", int(sizeof_bits::value), ptr.get()); } template CUTE_HOST_DEVICE void print(rmem_ptr const& ptr) { - printf("rmem_ptr_%db(%p)", int(8*sizeof(T)), ptr.get()); + printf("rmem_ptr_%db(%p)", int(sizeof_bits::value), ptr.get()); } #if !defined(__CUDACC_RTC__) template CUTE_HOST std::ostream& operator<<(std::ostream& os, gmem_ptr const& ptr) { - return os << "gmem_ptr_" << int(8*sizeof(T)) << "b"; + return os << "gmem_ptr_" << int(sizeof_bits::value) << "b"; } template CUTE_HOST std::ostream& operator<<(std::ostream& os, smem_ptr const& ptr) { - return os << "smem_ptr_" << int(8*sizeof(T)) << "b"; + return os << "smem_ptr_" << int(sizeof_bits::value) << "b"; } template CUTE_HOST std::ostream& operator<<(std::ostream& os, rmem_ptr const& ptr) { - return os << "rmem_ptr_" << int(8*sizeof(T)) << "b"; + return os << "rmem_ptr_" << int(sizeof_bits::value) << "b"; } #endif // !defined(__CUDACC_RTC__) diff --git a/include/cute/stride.hpp b/include/cute/stride.hpp index 06d4b97755..d5221339eb 100644 --- a/include/cute/stride.hpp +++ b/include/cute/stride.hpp @@ -75,6 +75,9 @@ crd2idx_itt(CInt const& coord, { if constexpr (sizeof...(Is) == 0) { // Avoid recursion and mod on single/last iter return crd2idx(coord, get(shape), get(stride)); + } else if constexpr (is_constant<0, CInt>::value) { + return crd2idx(_0{}, get(shape), get(stride)) + + (_0{} + ... + crd2idx(_0{}, get(shape), get(stride))); } else { // General case return crd2idx(coord % product(get(shape)), get(shape), get(stride)) + crd2idx_itt(coord / product(get(shape)), shape, stride, seq{}); diff --git a/include/cute/swizzle.hpp b/include/cute/swizzle.hpp index c8d910a03b..39ac311de2 100644 --- a/include/cute/swizzle.hpp +++ b/include/cute/swizzle.hpp @@ -218,41 +218,40 @@ recast(Swizzle const& swizzle) // consumed and which bits are free. Furthermore, it is useful to know whether // each of these bits is known statically or dynamically. -// MixedBits is an integer class where some bits are known statically and some -// bits are known dynamically. 
These sets of bits are disjoint and it is known -// statically which bits are known dynamically. +// MixedBits is an 32-bit unsigned integer class where some bits are known statically +// and some bits are known dynamically. These sets of bits are disjoint and it is +// known statically which bits are known dynamically. // MixedBits can only be manipulated through bitwise operations // Abstract value: StaticInt | (dynamic_int_ & StaticFlags) -template // 0: static, 1: dynamic +template // 0: static, 1: dynamic struct MixedBits { // Representation invariants static_assert(StaticFlags != 0, "Should be at least one dynamic bit in MixedBits."); static_assert((StaticInt & StaticFlags) == 0, "No static/dynamic overlap allowed in MixedBits."); - // assert((dynamic_int_ & ~F) == 0); - DynamicType dynamic_int_; + uint32_t dynamic_int_; + // assert((dynamic_int_ & ~StaticFlags) == 0); CUTE_HOST_DEVICE constexpr operator uint32_t() const noexcept { return StaticInt | dynamic_int_; } }; -template +// Return a value representing (C{} | (d & C)) potentially using MixedBits to track s and f. +// This maker does allow ((s & f) != 0) and enforces the MixedBits invariant before creation. +template CUTE_HOST_DEVICE constexpr auto -make_mixed_bits(constant const&, DynamicType const& d, constant const&) +make_mixed_bits(C, DynamicType const& d, C) { static_assert(is_integral::value); - if constexpr (is_static::value) { - static_assert((s & DynamicType::value & f) == 0, "No static/dynamic overlap allowed."); - return constant{} | (d & constant{}); // Just return a static int - } else if constexpr (f == 0) { - return constant{}; // Just return a static int + constexpr uint32_t new_f = uint32_t(f) & ~uint32_t(s); // StaticBits take precedence, M<0,f>{d} | C{} + if constexpr (new_f == 0 || is_static::value) { + return C{} | (d & C{}); // Just return a static int } else { - return MixedBits{d & f}; // MixedBits + return MixedBits{uint32_t(d) & new_f}; // MixedBits } CUTE_GCC_UNREACHABLE; @@ -263,28 +262,28 @@ make_mixed_bits(constant const&, DynamicType const& d, constant const& // // Equality -template +template CUTE_HOST_DEVICE constexpr auto -operator==(MixedBits const& m, constant const&) +operator==(MixedBits const& m, C) { - return (S0 == (S1 & ~F0)) && (m.dynamic_int_ == (S1 & F0)); + return (S0 == (uint32_t(S1) & ~F0)) && (m.dynamic_int_ == (uint32_t(S1) & F0)); } -template +template CUTE_HOST_DEVICE constexpr auto -operator==(constant const& s, MixedBits const& m) +operator==(C s, MixedBits const& m) { return m == s; } // Bitwise AND -template +template CUTE_HOST_DEVICE constexpr auto -operator&(MixedBits const& m0, MixedBits const& m1) +operator&(MixedBits const& m0, MixedBits const& m1) { // Truth table for (S0,D0,F0) & (S1,D1,F1) -> (S,D,F) // S0D0F0 | 0X0 | 001 | 011 | 1X0 | @@ -294,36 +293,36 @@ operator&(MixedBits const& m0, MixedBits const& m1) // 011 | 0X0 | 001 | 011 | 011 | // 1X0 | 0X0 | 001 | 011 | 1X0 | - return make_mixed_bits(constant{}, + return make_mixed_bits(C{}, //(S0 | m0.dynamic_int_) & (S1 | m1.dynamic_int_), ((S1 & F0) & m0.dynamic_int_) | ((S0 & F1) & m1.dynamic_int_) | (m0.dynamic_int_ & m1.dynamic_int_), - constant{}); + C<(S1 & F0) | (S0 & F1) | (F0 & F1)>{}); } -template +template CUTE_HOST_DEVICE constexpr auto -operator&(MixedBits const& m, constant const&) +operator&(MixedBits const& m, C) { - return make_mixed_bits(constant{}, + return make_mixed_bits(C{}, m.dynamic_int_, - constant{}); + C{}); } -template +template CUTE_HOST_DEVICE constexpr auto -operator&(constant const& 
s, MixedBits const& m) +operator&(C s, MixedBits const& m) { return m & s; } // Bitwise OR -template +template CUTE_HOST_DEVICE constexpr auto -operator|(MixedBits const& m0, MixedBits const& m1) +operator|(MixedBits const& m0, MixedBits const& m1) { // Truth table for (S0,D0,F0) | (S1,D1,F1) -> (S,D,F) // S0D0F0 | 0X0 | 001 | 011 | 1X0 | @@ -333,35 +332,35 @@ operator|(MixedBits const& m0, MixedBits const& m1) // 011 | 011 | 011 | 011 | 1X0 | // 1X0 | 1X0 | 1X0 | 1X0 | 1X0 | - return make_mixed_bits(constant{}, + return make_mixed_bits(C{}, ((~S1 & F0) & m0.dynamic_int_) | ((~S0 & F1) & m1.dynamic_int_), - constant{}); + C<(~S0 & F1) | (~S1 & F0)>{}); } -template +template CUTE_HOST_DEVICE constexpr auto -operator|(MixedBits const& m, constant const&) +operator|(MixedBits const& m, C) { - return make_mixed_bits(constant{}, + return make_mixed_bits(C{}, m.dynamic_int_, - constant{}); + C{}); } -template +template CUTE_HOST_DEVICE constexpr auto -operator|(constant const& s, MixedBits const& m) +operator|(C s, MixedBits const& m) { return m | s; } // Bitwise XOR -template +template CUTE_HOST_DEVICE constexpr auto -operator^(MixedBits const& m0, MixedBits const& m1) +operator^(MixedBits const& m0, MixedBits const& m1) { // Truth table for (S0,D0,F0) ^ (S1,D1,F1) -> (S,D,F) // S0D0F0 | 0X0 | 001 | 011 | 1X0 | @@ -371,53 +370,53 @@ operator^(MixedBits const& m0, MixedBits const& m1) // 011 | 011 | 011 | 001 | 001 | // 1X0 | 1X0 | 011 | 001 | 0X0 | - return make_mixed_bits(constant{}, + return make_mixed_bits(C<(~S0 & S1 & ~F0) | (S0 & ~S1 & ~F1)>{}, (S0 | m0.dynamic_int_) ^ (S1 | m1.dynamic_int_), - constant{}); + C{}); } -template +template CUTE_HOST_DEVICE constexpr auto -operator^(MixedBits const& m, constant const&) +operator^(MixedBits const& m, C) { - return make_mixed_bits(constant{}, - (S0 | m.dynamic_int_) ^ S1, - constant{}); + return make_mixed_bits(C<(~S0 & uint32_t(S1) & ~F0) | (S0 & ~uint32_t(S1))>{}, + (S0 | m.dynamic_int_) ^ uint32_t(S1), + C{}); } -template +template CUTE_HOST_DEVICE constexpr auto -operator^(constant const& s, MixedBits const& m) +operator^(C s, MixedBits const& m) { return m ^ s; } -template +template CUTE_HOST_DEVICE constexpr auto -operator<<(MixedBits const& m, constant const&) +operator<<(MixedBits const& m, C) { - return make_mixed_bits(constant{}, + return make_mixed_bits(C<(S0 << S1)>{}, m.dynamic_int_ << S1, - constant{}); + C<(F0 << S1)>{}); } -template +template CUTE_HOST_DEVICE constexpr auto -operator>>(MixedBits const& m, constant const&) +operator>>(MixedBits const& m, C) { - return make_mixed_bits(constant> S1)>{}, + return make_mixed_bits(C<(S0 >> S1)>{}, m.dynamic_int_ >> S1, - constant> S1)>{}); + C<(F0 >> S1)>{}); } -template +template CUTE_HOST_DEVICE constexpr auto -shiftl(MixedBits const& m, constant const& s) +shiftl(MixedBits const& m, C s) { if constexpr (S1 >= 0) { return m << s; @@ -426,10 +425,10 @@ shiftl(MixedBits const& m, constant const& s) } } -template +template CUTE_HOST_DEVICE constexpr auto -shiftr(MixedBits const& m, constant const& s) +shiftr(MixedBits const& m, C s) { if constexpr (S1 >= 0) { return m >> s; @@ -442,24 +441,24 @@ shiftr(MixedBits const& m, constant const& s) // upcast and downcast // -template +template CUTE_HOST_DEVICE constexpr auto -safe_div(MixedBits const& m, constant const& s) +safe_div(MixedBits const& m, C s) { - static_assert(has_single_bit(S1), "Only divide MixedBits by powers of two."); - return make_mixed_bits(safe_div(constant{}, s), + static_assert(has_single_bit(uint32_t(S1)), "Only 
divide MixedBits by powers of two."); + return make_mixed_bits(safe_div(C{}, s), safe_div(m.dynamic_int_, s), - safe_div(constant{}, s)); + safe_div(C{}, s)); } -template +template CUTE_HOST_DEVICE constexpr auto -upcast(MixedBits const& m) +upcast(MixedBits const& m) { static_assert(has_single_bit(N), "Only divide MixedBits by powers of two."); - return safe_div(m, constant{}); + return safe_div(m, C{}); } template ::value)> @@ -467,18 +466,18 @@ CUTE_HOST_DEVICE constexpr auto upcast(T const& m) { - return safe_div(m, constant{}); + return safe_div(m, C{}); } -template +template CUTE_HOST_DEVICE constexpr auto -downcast(MixedBits const& m) +downcast(MixedBits const& m) { static_assert(has_single_bit(N), "Only scale MixedBits by powers of two."); - return make_mixed_bits(constant{}, + return make_mixed_bits(C{}, m.dynamic_int_ * N, - constant{}); + C{}); } template ::value)> @@ -486,7 +485,7 @@ CUTE_HOST_DEVICE constexpr auto downcast(T const& m) { - return m * constant{}; + return m * C{}; } // @@ -525,17 +524,17 @@ to_mixed_bits(Layout const& layout, Coord const& coord) // Display utilities // -template -CUTE_HOST_DEVICE void print(MixedBits const& m) +template +CUTE_HOST_DEVICE void print(MixedBits const& m) { - printf("M_%u|(%u&%u)=%u", S, uint32_t(m.dynamic_int_), F, uint32_t(m)); + printf("M_%u|(%u&%u)=%u", S, m.dynamic_int_, F, uint32_t(m)); } #if !defined(__CUDACC_RTC__) template -CUTE_HOST std::ostream& operator<<(std::ostream& os, MixedBits const& m) +CUTE_HOST std::ostream& operator<<(std::ostream& os, MixedBits const& m) { - return os << "M_" << S << "|(" << uint32_t(m.dynamic_int_) << "&" << F << ")=" << uint32_t(m); + return os << "M_" << S << "|(" << m.dynamic_int_ << "&" << F << ")=" << uint32_t(m); } template diff --git a/include/cute/swizzle_layout.hpp b/include/cute/swizzle_layout.hpp index a5919716e2..be966d97e7 100644 --- a/include/cute/swizzle_layout.hpp +++ b/include/cute/swizzle_layout.hpp @@ -128,6 +128,7 @@ namespace detail { // Get just the Swizzle part of a composed layout. template +CUTE_HOST_DEVICE constexpr auto get_swizzle_portion(ComposedLayout,Offset,LayoutB>) { @@ -136,6 +137,7 @@ get_swizzle_portion(ComposedLayout,Offset,LayoutB>) // A non-swizzled layout's "Swizzle part" is the identity swizzle. 
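To illustrate the MixedBits machinery revised above, here is a hedged sketch assuming the CuTe headers are on the include path: the static bits and the flagged dynamic bits are tracked separately, yet the value still reads back as a plain uint32_t, and bitwise ops with static masks stay exact.

// MixedBits: static bits 0b1000, dynamic bits masked by 0b0111, observed as an ordinary integer.
#include <cute/swizzle.hpp>
#include <cassert>
#include <cstdint>

int main() {
  using namespace cute;

  int d = 0b0101;                                          // runtime value; only flagged bits are kept
  auto m = make_mixed_bits(C<0b1000>{}, d, C<0b0111>{});   // abstract value: 0b1000 | (d & 0b0111)
  assert(uint32_t(m) == 0b1101);

  // Bitwise ops preserve the static/dynamic split; ANDing with a static mask stays exact.
  auto lo = m & C<0b0011>{};
  assert(uint32_t(lo) == 0b0001);

  return 0;
}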
template +CUTE_HOST_DEVICE constexpr auto get_swizzle_portion(Layout) { diff --git a/include/cute/swizzle_ptr.hpp b/include/cute/swizzle_ptr.hpp index 50bfbfa2dd..fde7454f14 100644 --- a/include/cute/swizzle_ptr.hpp +++ b/include/cute/swizzle_ptr.hpp @@ -70,6 +70,8 @@ struct smem_ptr_swizzle { static_assert(is_empty::value, "Swizzle can't have state."); + static const uint32_t ElementsPerStoredItem = sizeof(T) * 8 / sizeof_bits_v; + CUTE_HOST_DEVICE constexpr T* get() const { @@ -98,6 +100,7 @@ struct smem_ptr_swizzle CUTE_HOST_DEVICE constexpr T& operator[](Int const& i) const { + static_assert(sizeof_bits_v >= 8, "Use subbyte_iterator to access the element"); return *apply_swizzle(get() + i); } @@ -105,7 +108,7 @@ struct smem_ptr_swizzle CUTE_HOST_DEVICE constexpr smem_ptr_swizzle operator+(Int const& i) const { - return {ptr_ + i}; + return {ptr_ + i / ElementsPerStoredItem}; } T* ptr_; @@ -286,14 +289,14 @@ CUTE_HOST_DEVICE void print(smem_ptr_flag_bits const& ptr) template CUTE_HOST_DEVICE void print(smem_ptr_swizzle> const& ptr) { - printf("smem_ptr_S<%d,%d,%d>_%db(%p)", B, M, S, int(8*sizeof(T)), ptr.get()); + printf("smem_ptr_S<%d,%d,%d>_%db(%p)", B, M, S, int(sizeof_bits::value), ptr.get()); } #if !defined(__CUDACC_RTC__) template CUTE_HOST std::ostream& operator<<(std::ostream& os, smem_ptr_swizzle> const&) { - return os << "smem_ptr_S<" << B << "," << M << "," << S << ">_" << int(8*sizeof(T)) << "b"; + return os << "smem_ptr_S<" << B << "," << M << "," << S << ">_" << int(sizeof_bits::value) << "b"; } #endif diff --git a/include/cute/util/print.hpp b/include/cute/util/print.hpp index f585bf3172..31dad07fd0 100644 --- a/include/cute/util/print.hpp +++ b/include/cute/util/print.hpp @@ -33,7 +33,6 @@ #include #include -#include // // CUDA compatible print and printf @@ -119,30 +118,75 @@ get_format(double) { CUTE_HOST_DEVICE void -print(char const& c) { +print(char c) { printf("%c", c); } -template ::value)> CUTE_HOST_DEVICE void -print(T const& a) { - printf("%d", int(a)); +print(signed char a) { + printf("%hhd", a); } -template CUTE_HOST_DEVICE void -print(char const* format, T const&... t) { - printf(format, t...); +print(unsigned char a) { + printf("%hhu", a); +} + +CUTE_HOST_DEVICE +void +print(short a) { + printf("%hd", a); +} + +CUTE_HOST_DEVICE +void +print(unsigned short a) { + printf("%hu", a); +} + +CUTE_HOST_DEVICE +void +print(int a) { + printf("%d", a); +} + +CUTE_HOST_DEVICE +void +print(unsigned int a) { + printf("%u", a); +} + +CUTE_HOST_DEVICE +void +print(long a) { + printf("%ld", a); +} + +CUTE_HOST_DEVICE +void +print(unsigned long a) { + printf("%lu", a); +} + +CUTE_HOST_DEVICE +void +print(long long a) { + printf("%lld", a); +} + +CUTE_HOST_DEVICE +void +print(unsigned long long a) { + printf("%llu", a); } template CUTE_HOST_DEVICE void -print(T const&... t) { - (print(t), ...); +print(char const* format, T const&... 
t) { + printf(format, t...); } CUTE_HOST_DEVICE diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h index 4d6c63102c..a08ba333c9 100644 --- a/include/cutlass/arch/mma_sm75.h +++ b/include/cutlass/arch/mma_sm75.h @@ -130,7 +130,7 @@ struct Mma< CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -196,7 +196,7 @@ struct Mma< CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -257,13 +257,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -318,13 +317,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -379,14 +377,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.s8.u8 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -441,13 +437,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -461,7 +456,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S8 * S8 + S32 template <> struct Mma< - gemm::GemmShape<8,8,16>, + gemm::GemmShape<8, 8, 16>, 32, int8_t, layout::RowMajor, @@ -471,7 +466,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,16>; + using Shape = gemm::GemmShape<8, 8, 16>; using ElementA = int8_t; using LayoutA = layout::RowMajor; @@ -508,13 +503,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -522,7 +516,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U8 * S8 + S32 template <> struct Mma< - gemm::GemmShape<8,8,16>, + gemm::GemmShape<8, 8, 16>, 32, uint8_t, layout::RowMajor, @@ -532,7 +526,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,16>; + using Shape = gemm::GemmShape<8, 8, 16>; using ElementA = uint8_t; using LayoutA = layout::RowMajor; @@ -569,13 +563,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -583,7 +576,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S8 * U8 + S32 template <> struct Mma< - gemm::GemmShape<8,8,16>, + gemm::GemmShape<8, 8, 16>, 32, int8_t, 
layout::RowMajor, @@ -593,7 +586,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,16>; + using Shape = gemm::GemmShape<8, 8, 16>; using ElementA = int8_t; using LayoutA = layout::RowMajor; @@ -630,13 +623,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -644,7 +636,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U8 * U8 + S32 template <> struct Mma< - gemm::GemmShape<8,8,16>, + gemm::GemmShape<8, 8, 16>, 32, uint8_t, layout::RowMajor, @@ -654,7 +646,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,16>; + using Shape = gemm::GemmShape<8, 8, 16>; using ElementA = uint8_t; using LayoutA = layout::RowMajor; @@ -691,13 +683,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -711,7 +702,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S4 * S4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, int4b_t, layout::RowMajor, @@ -721,7 +712,7 @@ struct Mma< layout::RowMajor, OpMultiplyAdd> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = int4b_t; using LayoutA = layout::RowMajor; @@ -751,19 +742,19 @@ struct Mma< unsigned const & A = reinterpret_cast(a); unsigned const & B = reinterpret_cast(b); + int const *C = reinterpret_cast(&c); int *D = reinterpret_cast(&d); asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -771,7 +762,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U4 * S4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, uint4b_t, layout::RowMajor, @@ -781,7 +772,7 @@ struct Mma< layout::RowMajor, OpMultiplyAdd> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = uint4b_t; using LayoutA = layout::RowMajor; @@ -818,13 +809,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -832,7 +822,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S4 * U4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, int4b_t, layout::RowMajor, @@ -842,7 +832,7 @@ struct Mma< layout::RowMajor, OpMultiplyAdd> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = int4b_t; using LayoutA = layout::RowMajor; @@ -879,13 +869,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), 
"r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -893,7 +882,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U4 * U4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, uint4b_t, layout::RowMajor, @@ -903,7 +892,7 @@ struct Mma< layout::RowMajor, OpMultiplyAdd> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = uint4b_t; using LayoutA = layout::RowMajor; @@ -940,13 +929,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -960,7 +948,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S4 * S4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, int4b_t, layout::RowMajor, @@ -970,7 +958,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = int4b_t; using LayoutA = layout::RowMajor; @@ -1007,13 +995,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -1021,7 +1008,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U4 * S4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, uint4b_t, layout::RowMajor, @@ -1031,7 +1018,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = uint4b_t; using LayoutA = layout::RowMajor; @@ -1068,13 +1055,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -1082,7 +1068,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = S4 * U4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, int4b_t, layout::RowMajor, @@ -1092,7 +1078,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 32>; using ElementA = int4b_t; using LayoutA = layout::RowMajor; @@ -1129,13 +1115,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -1143,7 +1128,7 @@ struct Mma< /// Matrix multiply-add operation: S32 = U4 * U4 + S32 template <> struct Mma< - gemm::GemmShape<8,8,32>, + gemm::GemmShape<8, 8, 32>, 32, uint4b_t, layout::RowMajor, @@ -1153,7 +1138,7 @@ struct Mma< layout::RowMajor, OpMultiplyAddSaturate> { - using Shape = gemm::GemmShape<8,8,32>; + using Shape = gemm::GemmShape<8, 8, 
32>; using ElementA = uint4b_t; using LayoutA = layout::RowMajor; @@ -1190,13 +1175,12 @@ struct Mma< asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" : "=r"(D[0]), "=r"(D[1]) : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); - #else CUTLASS_UNUSED(a); CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); + CUTLASS_NOT_IMPLEMENTED(); #endif } }; @@ -1287,7 +1271,7 @@ struct Mma< CUTLASS_UNUSED(b); CUTLASS_UNUSED(c); CUTLASS_UNUSED(d); - assert(0); // WMMA must be supported to issue binary matrix multiply-accumulate instructions. + CUTLASS_NOT_IMPLEMENTED(); // WMMA must be supported to issue binary matrix multiply-accumulate instructions. #endif // defined(CUTLASS_ARCH_WMMA_ENABLED) diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h index c01a7b07c4..18543b71a8 100644 --- a/include/cutlass/arch/mma_sm80.h +++ b/include/cutlass/arch/mma_sm80.h @@ -53,7 +53,16 @@ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) #define CUTLASS_ARCH_MMA_SM80_ENABLED + +#if (__CUDA_ARCH__ <= 900) +#define CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED +#endif +#if (__CUDA_ARCH__ <= 890) +#define CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED +#endif + #endif + #endif //////////////////////////////////////////////////////////////////////////////// @@ -2084,7 +2093,7 @@ struct Mma< FragmentC const &c ) const { -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) +#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED) uint32_t const *A = reinterpret_cast(&a); uint32_t const *B = reinterpret_cast(&b); @@ -2149,7 +2158,7 @@ struct Mma< FragmentC const &c ) const { -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) +#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED) uint32_t const *A = reinterpret_cast(&a); uint32_t const *B = reinterpret_cast(&b); @@ -2220,7 +2229,7 @@ struct Mma< FragmentC const &c ) const { -#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) +#if defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED) uint32_t const *A = reinterpret_cast(&a); uint32_t const *B = reinterpret_cast(&b); @@ -2244,7 +2253,7 @@ struct Mma< CUTLASS_UNUSED(d); assert(0); -#endif // defined(CUTLASS_ARCH_MMA_SM80_ENABLED) +#endif // defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED) } }; diff --git a/include/cutlass/array.h b/include/cutlass/array.h index 19d16cc251..e5132d9d83 100644 --- a/include/cutlass/array.h +++ b/include/cutlass/array.h @@ -33,10 +33,24 @@ and is safe to use in a union. */ +/* + Note: CUTLASS 3x increases the host compiler requirements to C++17. However, certain + existing integrations of CUTLASS require C++11 host compilers. + + Until this requirement can be lifted, certain headers with this annotation are required + to be remain consistent with C++11 syntax. + + C++11 compatibility is enforced by `cutlass_test_unit_core_cpp11`. +*/ + #pragma once #include "cutlass/cutlass.h" #include "cutlass/functional.h" -#include "cutlass/numeric_types.h" +#include "cutlass/numeric_size.h" +#include "cutlass/half.h" +#include "cutlass/integer_subbyte.h" +#include "cutlass/tfloat32.h" +#include "cutlass/bfloat16.h" #include "cutlass/half.h" namespace cutlass { diff --git a/include/cutlass/array_subbyte.h b/include/cutlass/array_subbyte.h index ac30422408..7ec158b16d 100644 --- a/include/cutlass/array_subbyte.h +++ b/include/cutlass/array_subbyte.h @@ -32,6 +32,15 @@ \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe to use in a union. */ +/* + Note: CUTLASS 3x increases the host compiler requirements to C++17. 
However, certain + existing integrations of CUTLASS require C++11 host compilers. + + Until this requirement can be lifted, certain headers with this annotation are required + to be remain consistent with C++11 syntax. + + C++11 compatibility is enforced by `cutlass_test_unit_core_cpp11`. +*/ #pragma once diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index 5250048222..63502571fc 100644 --- a/include/cutlass/barrier.h +++ b/include/cutlass/barrier.h @@ -342,7 +342,7 @@ struct SyncManager { CUTLASS_DEVICE static void wait_lt(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int count) { - BarrierSync::wait_lt_helper(lock_ptr, thread_idx, flag_idx, count); + BarrierSync::wait_lt(lock_ptr, thread_idx, flag_idx, count); } CUTLASS_DEVICE diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h index b660cd44c6..0c3397d288 100644 --- a/include/cutlass/bfloat16.h +++ b/include/cutlass/bfloat16.h @@ -33,6 +33,17 @@ \brief Defines a proxy class for storing non-standard 16-bit floating point values with 8 bits of exponent and 7 bit of mantissa. */ + +/* + Note: CUTLASS 3x increases the host compiler requirements to C++17. However, certain + existing integrations of CUTLASS require C++11 host compilers. + + Until this requirement can be lifted, certain headers with this annotation are required + to be remain consistent with C++11 syntax. + + C++11 compatibility is enforced by `cutlass_test_unit_core_cpp11`. +*/ + #pragma once #if defined(__CUDACC_RTC__) diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h index ffce5d09b7..729d242ceb 100644 --- a/include/cutlass/complex.h +++ b/include/cutlass/complex.h @@ -300,6 +300,13 @@ class complex CUTLASS_HOST_DEVICE T &imag() { return _imag; } + /// Set the real part of the complex number + CUTLASS_HOST_DEVICE + void real(T real) { _real = real; } + + /// Set the imaginary part of the complex number + CUTLASS_HOST_DEVICE + void imag(T imag) { _imag = imag; } #if !defined(__CUDACC_RTC__) /// Converts to cuFloatComplex @@ -431,34 +438,55 @@ CUTLASS_HOST_DEVICE R norm_accumulate(T const &x, R const & accumulator) { /// Norm accumulate specialized for complex types template CUTLASS_HOST_DEVICE R norm_accumulate(complex const &z, R const &accumulator) { - return accumulator + static_cast(real(z)) * static_cast(real(z)) + + return accumulator + static_cast(real(z)) * static_cast(real(z)) + static_cast(imag(z)) * static_cast(imag(z)); } -/// Returns the complex conjugate CUTLASS_HOST_DEVICE float conj(float const &z) { return z; } -/// Returns the complex conjugate CUTLASS_HOST_DEVICE double conj(double const &z) { return z; } +CUTLASS_HOST_DEVICE half_t conj(half_t const& z) { + return z; +} + +CUTLASS_HOST_DEVICE int32_t conj(int32_t const& z) { + return z; +} + +CUTLASS_HOST_DEVICE uint32_t conj(uint32_t const& z) { + return z; +} + +CUTLASS_HOST_DEVICE int4b_t conj(int4b_t const& z) { + return z; +} + +CUTLASS_HOST_DEVICE uint4b_t conj(uint4b_t const& z) { + return z; +} + +CUTLASS_HOST_DEVICE bfloat16_t conj(bfloat16_t const& z) { + return z; +} + +CUTLASS_HOST_DEVICE uint1b_t conj(uint1b_t const& z) { + return z; +} + +CUTLASS_HOST_DEVICE tfloat32_t conj(tfloat32_t const& z) { + return z; +} + /// Returns the complex conjugate template CUTLASS_HOST_DEVICE complex conj(complex const &z) { return complex(real(z), -imag(z)); } -/// Indentity transform for non-complex types -template -CUTLASS_HOST_DEVICE T conj(T const &z) { - static_assert( !platform::is_same::value && - !platform::is_same::value && - 
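[Editor's note] The complex.h hunk above adds real()/imag() setters and replaces the constrained identity template for conj with explicit per-type overloads. The convention in miniature, using standalone types rather than the CUTLASS ones:

#include <cstdint>

template <class T>
struct complex_sketch {
  T re, im;
  void real(T v) { re = v; }     // setter alongside the existing getter
  void imag(T v) { im = v; }
};

inline float   conj(float x)   { return x; }   // real types: conj is the identity
inline int32_t conj(int32_t x) { return x; }

template <class T>
complex_sketch<T> conj(complex_sketch<T> const& z) {
  return {z.re, T(-z.im)};                      // complex types: negate the imaginary part
}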
!platform::is_same>::value && - !platform::is_same>::value, "May not be a complex data type"); - return z; -} /// Projects the complex number z onto the Riemann sphere template @@ -511,10 +539,10 @@ CUTLASS_HOST_DEVICE complex sin(complex const &z) { return (exp(-z) - exp(z)) * complex(T(0), T(1) / T(2)); } -/// Comparison +/// Comparison template CUTLASS_HOST_DEVICE bool operator<(complex const &lhs, complex const &rhs) { - return true; + return true; } ////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/conv2d_problem_size.h b/include/cutlass/conv/conv2d_problem_size.h index e7d8360fa9..8c29767fb6 100644 --- a/include/cutlass/conv/conv2d_problem_size.h +++ b/include/cutlass/conv/conv2d_problem_size.h @@ -44,13 +44,22 @@ Map tensor sizes (Conv2d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator) Map tensor problem sizes (Conv2d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator) */ +/* + Note: CUTLASS 3x increases the host compiler requirements to C++17. However, certain + existing integrations of CUTLASS require C++11 host compilers. + + Until this requirement can be lifted, certain headers with this annotation are required + to be remain consistent with C++11 syntax. + + C++11 compatibility is enforced by `cutlass_test_unit_core_cpp11`. +*/ #pragma once #include "cutlass/cutlass.h" #include "cutlass/tensor_coord.h" #include "cutlass/fast_math.h" -#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_enumerated_types.h" #include "cutlass/matrix_coord.h" #include "cutlass/conv/convolution.h" #include "cutlass/functional.h" @@ -80,7 +89,7 @@ struct Conv2dProblemSize { public: CUTLASS_HOST_DEVICE - Conv2dProblemSize(): + Conv2dProblemSize(): N(0), H(0), W(0), C(0), P(0), Q(0), K(0), R(0), S(0), pad_h(0), pad_w(0), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), mode(Mode::kConvolution), split_k_slices(1), groups(1) { } @@ -125,7 +134,7 @@ struct Conv2dProblemSize { int split_k_slices = 1, int groups = 1 ): - N(N), H(H), W(W), C(C), K(K), R(R), S(S), P(P), Q(Q), + N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S), pad_h(pad_h), pad_w(pad_w), stride_h(stride_h), stride_w(stride_w), dilation_h(dilation_h), dilation_w(dilation_w), mode(mode), split_k_slices(split_k_slices), groups (groups) { } @@ -145,11 +154,11 @@ struct Conv2dProblemSize { int groups = 1 ): N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + P(output_size.h()), Q(output_size.w()), K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), pad_h(padding[0]), pad_w(padding[2]), stride_h(stride.row()), stride_w(stride.column()), dilation_h(dilation.row()), dilation_w(dilation.column()), - P(output_size.h()), Q(output_size.w()), mode(mode), split_k_slices(split_k_slices), groups(groups) {} /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord @@ -188,8 +197,8 @@ struct Conv2dProblemSize { int groups = 1 ): N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + P(output_size.h()), Q(output_size.w()), K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), - P(output_size.h()), Q(output_size.w()), pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), mode(mode), split_k_slices(split_k_slices), groups(groups) {} @@ -486,7 +495,6 @@ int depthwise_gemm_k_iterations( CUTLASS_HOST_DEVICE int implicit_gemm_k_iterations_per_channel( Operator conv_operator, - int threadblock_K, Conv2dProblemSize 
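[Editor's note] The Conv2dProblemSize constructor edits above reorder the mem-initializer lists to follow the member declaration order N, H, W, C, P, Q, K, R, S, which is the order members are actually initialized in. A compressed illustration of why that matters:

struct ProblemSizeSketch {
  int N, H, W, C, P, Q, K, R, S;   // members initialize in this (declaration) order,
                                   // regardless of how the initializer list is written
  ProblemSizeSketch(int N_, int H_, int W_, int C_,
                    int P_, int Q_, int K_, int R_, int S_)
    : N(N_), H(H_), W(W_), C(C_),
      P(P_), Q(Q_),                // listing P, Q before K, R, S now matches reality
      K(K_), R(R_), S(S_) {}
};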
const &problem_size, IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) { diff --git a/include/cutlass/conv/conv3d_problem_size.h b/include/cutlass/conv/conv3d_problem_size.h index 5bef4ffb71..4a2b20704c 100644 --- a/include/cutlass/conv/conv3d_problem_size.h +++ b/include/cutlass/conv/conv3d_problem_size.h @@ -44,6 +44,15 @@ Map tensor sizes (Conv3d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator) Map tensor problem sizes (Conv3d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator) */ +/* + Note: CUTLASS 3x increases the host compiler requirements to C++17. However, certain + existing integrations of CUTLASS require C++11 host compilers. + + Until this requirement can be lifted, certain headers with this annotation are required + to be remain consistent with C++11 syntax. + + C++11 compatibility is enforced by `cutlass_test_unit_core_cpp11`. +*/ #pragma once @@ -80,11 +89,11 @@ struct Conv3dProblemSize : public Conv2dProblemSize { public: CUTLASS_HOST_DEVICE Conv3dProblemSize(): + Conv2dProblemSize(), D(0), T(0), Z(0), pad_d(0), stride_d(1), - dilation_d(1), - Conv2dProblemSize() { } + dilation_d(1) { } /// Constructor for default padding, stride, dilation, and split-K CUTLASS_HOST_DEVICE @@ -102,10 +111,10 @@ struct Conv3dProblemSize : public Conv2dProblemSize { int R, int S, Mode mode - ): + ): + Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode), D(D), T(T), Z(Z), - pad_d(T / 2), stride_d(1), dilation_d(1), - Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode) { } + pad_d(T / 2), stride_d(1), dilation_d(1) { } /// Constructor CUTLASS_HOST_DEVICE @@ -134,15 +143,15 @@ struct Conv3dProblemSize : public Conv2dProblemSize { Mode mode, int split_k_slices = 1, int groups = 1 - ): - D(D), T(T), Z(Z), - pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d), + ): Conv2dProblemSize( - N, H, W, C, K, R, S, P, Q, - pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - mode, split_k_slices, groups) { } + N, H, W, C, K, R, S, P, Q, + pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + mode, split_k_slices, groups), + D(D), T(T), Z(Z), + pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d) { } /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D // set *user-defined* output size and sets Z, P, and Q (include all data members in ctor) @@ -158,8 +167,6 @@ struct Conv3dProblemSize : public Conv2dProblemSize { int split_k_slices = 1, int groups = 1 ): - D(input_size.d()), T(filter_size.d()), Z(output_size.d()), - pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]), Conv2dProblemSize( {input_size.n(), input_size.h(), input_size.w(), input_size.c()}, {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()}, @@ -167,8 +174,9 @@ struct Conv3dProblemSize : public Conv2dProblemSize { {stride[1], stride[2]}, {dilation[1], dilation[2]}, {output_size.n(), output_size.h(), output_size.w(), output_size.c()}, - mode, split_k_slices, groups - ) { } + mode, split_k_slices, groups), + D(input_size.d()), T(filter_size.d()), Z(output_size.d()), + pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]) { } /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D // *computes* output size and sets Z, P and Q (include all data members in ctor) @@ -183,18 +191,18 @@ struct Conv3dProblemSize : public Conv2dProblemSize { int split_k_slices = 1, int groups = 1 ): - D(input_size.d()), T(filter_size.d()), - pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]), 
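[Editor's note] Likewise, the Conv3dProblemSize constructors now name the Conv2dProblemSize base before the derived members: the base subobject is always constructed first, so the rewritten lists match the actual construction order. A toy version:

struct Base2dSketch {
  int P, Q;
  Base2dSketch(int P_, int Q_) : P(P_), Q(Q_) {}
};

struct Derived3dSketch : Base2dSketch {
  int Z;
  // The base subobject is constructed before any derived member, so writing it
  // first in the initializer list (as the hunks above now do) mirrors reality.
  Derived3dSketch(int P_, int Q_, int Z_) : Base2dSketch(P_, Q_), Z(Z_) {}
};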
Conv2dProblemSize( {input_size.n(), input_size.h(), input_size.w(), input_size.c()}, {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()}, {padding[1], padding[1], padding[2], padding[2]}, {stride[1], stride[2]}, {dilation[1], dilation[2]}, - mode, split_k_slices, groups - ) { + mode, split_k_slices, groups), + D(input_size.d()), T(filter_size.d()), + pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]) + { // set output Z - Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1; + Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1; } /// Equality operator (ignores mode and split_k_slice) diff --git a/include/cutlass/conv/convolution.h b/include/cutlass/conv/convolution.h index 2984901b9d..5b1e4d34c0 100644 --- a/include/cutlass/conv/convolution.h +++ b/include/cutlass/conv/convolution.h @@ -70,13 +70,23 @@ Map elements' data types (ImplicitGemm -> Conv): GemmToConvElementMap Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap */ +/* + Note: CUTLASS 3x increases the host compiler requirements to C++17. However, certain + existing integrations of CUTLASS require C++11 host compilers. + + Until this requirement can be lifted, certain headers with this annotation are required + to be remain consistent with C++11 syntax. + + C++11 compatibility is enforced by `cutlass_test_unit_core_cpp11`. +*/ + #pragma once #include "cutlass/cutlass.h" #include "cutlass/layout/tensor.h" #include "cutlass/tensor_coord.h" #include "cutlass/fast_math.h" -#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_enumerated_types.h" #include "cutlass/matrix_coord.h" namespace cutlass { diff --git a/include/cutlass/conv/kernel/direct_convolution.h b/include/cutlass/conv/kernel/direct_convolution.h index ef7a920e64..f5cce5939e 100644 --- a/include/cutlass/conv/kernel/direct_convolution.h +++ b/include/cutlass/conv/kernel/direct_convolution.h @@ -142,7 +142,7 @@ struct DirectConvolutionParams { ThreadblockShape::kN); gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel( - kConvolutionalOperator, ThreadblockShape::kK, args.problem_size, kIteratorAlgorithm); + kConvolutionalOperator, args.problem_size, kIteratorAlgorithm); ThreadblockSwizzle threadblock_swizzle; diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/include/cutlass/conv/kernel/implicit_gemm_convolution.h index 2669ff7758..79dac6dd83 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution.h @@ -250,7 +250,7 @@ struct ImplicitGemmConvolution { ThreadblockShape::kN); gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel( - kConvolutionalOperator, ThreadblockShape::kK, args.problem_size, kIteratorAlgorithm); + kConvolutionalOperator, args.problem_size, kIteratorAlgorithm); ThreadblockSwizzle threadblock_swizzle; diff --git a/include/cutlass/conv/threadblock/threadblock_swizzle.h b/include/cutlass/conv/threadblock/threadblock_swizzle.h index 4b886049d3..726a77c8b7 100644 --- a/include/cutlass/conv/threadblock/threadblock_swizzle.h +++ b/include/cutlass/conv/threadblock/threadblock_swizzle.h @@ -95,11 +95,11 @@ struct StridedDgradHorizontalThreadblockSwizzle : /// Returns the shape of the problem in units of logical tiles /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC) CUTLASS_HOST_DEVICE - gemm::GemmCoord get_tiled_shape( + static gemm::GemmCoord get_tiled_shape( cutlass::conv::Operator conv_operator, cutlass::conv::Conv2dProblemSize const 
&problem_size, gemm::GemmCoord tile_size, - int split_k_slices) const { + int split_k_slices) { gemm::GemmCoord implicit_gemm_problem_size = cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size); @@ -136,11 +136,11 @@ struct StridedDgradIdentityThreadblockSwizzle : /// Returns the shape of the problem in units of logical tiles /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC) CUTLASS_HOST_DEVICE - gemm::GemmCoord get_tiled_shape( + static gemm::GemmCoord get_tiled_shape( cutlass::conv::Operator conv_operator, cutlass::conv::Conv2dProblemSize const &problem_size, gemm::GemmCoord tile_size, - int split_k_slices) const { + int split_k_slices) { gemm::GemmCoord implicit_gemm_problem_size = cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size); @@ -174,10 +174,10 @@ struct DepthwiseDirect2dConvIdentityThreadblockSwizzle /// Returns the shape of the problem in units of logical tiles CUTLASS_HOST_DEVICE - gemm::GemmCoord get_tiled_shape(cutlass::conv::Operator conv_operator, + static gemm::GemmCoord get_tiled_shape(cutlass::conv::Operator conv_operator, cutlass::conv::Conv2dProblemSize const &problem_size, gemm::GemmCoord tile_size, - int split_k_slices) const { + int split_k_slices) { gemm::GemmCoord implicit_gemm_problem_size = cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size); diff --git a/include/cutlass/coord.h b/include/cutlass/coord.h index 455838533e..50fd51930b 100644 --- a/include/cutlass/coord.h +++ b/include/cutlass/coord.h @@ -32,6 +32,16 @@ \brief A Coord is a coordinate of arbitrary rank into a tensor or matrix */ +/* + Note: CUTLASS 3x increases the host compiler requirements to C++17. However, certain + existing integrations of CUTLASS require C++11 host compilers. + + Until this requirement can be lifted, certain headers with this annotation are required + to be remain consistent with C++11 syntax. + + C++11 compatibility is enforced by `cutlass_test_unit_core_cpp11`. +*/ + #pragma once #if defined(__CUDACC_RTC__) diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index c0a9685076..63617afa25 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -31,7 +31,15 @@ /*! \file \brief Helpers for printing cutlass/core objects */ +/* + Note: CUTLASS 3x increases the host compiler requirements to C++17. However, certain + existing integrations of CUTLASS require C++11 host compilers. + Until this requirement can be lifted, certain headers with this annotation are required + to be remain consistent with C++11 syntax. + + C++11 compatibility is enforced by `cutlass_test_unit_core_cpp11`. +*/ #pragma once #include @@ -45,11 +53,10 @@ #include "cutlass/matrix_shape.h" #include "cutlass/layout/pitch_linear.h" #include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_enumerated_types.h" #include "cutlass/conv/convolution.h" #include "cutlass/conv/conv2d_problem_size.h" #include "cutlass/conv/conv3d_problem_size.h" - /////////////////////////////////////////////////////////////////////////////////////////////////// /// Output operator for CUDA built-in dim3 type diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index bbef6fc2c6..75a46d56cf 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -33,6 +33,16 @@ \brief Basic include for CUTLASS. */ +/* + Note: CUTLASS 3x increases the host compiler requirements to C++17. However, certain + existing integrations of CUTLASS require C++11 host compilers. 
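[Editor's note] Making get_tiled_shape() static in these threadblock-swizzle structs lets callers size the launch grid without materializing a swizzle object. Roughly, with placeholder types:

struct TiledShapeSketch { int m, n, k; };

struct SwizzleSketch {
  // No object state is needed to size the grid, so the query can be static and
  // callable without constructing the swizzle (as the hunks above now allow).
  static TiledShapeSketch get_tiled_shape(int m, int n, int tile_m, int tile_n, int split_k) {
    return { (m + tile_m - 1) / tile_m, (n + tile_n - 1) / tile_n, split_k };
  }
};
// e.g. auto grid = SwizzleSketch::get_tiled_shape(M, N, 128, 128, /*split_k=*/1);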
+ + Until this requirement can be lifted, certain headers with this annotation are required + to be remain consistent with C++11 syntax. + + C++11 compatibility is enforced by `cutlass_test_unit_core_cpp11`. +*/ + #pragma once #include "cutlass/detail/helper_macros.hpp" diff --git a/include/cutlass/detail/helper_macros.hpp b/include/cutlass/detail/helper_macros.hpp index 0c3a9cd2f4..5e0ea623fa 100644 --- a/include/cutlass/detail/helper_macros.hpp +++ b/include/cutlass/detail/helper_macros.hpp @@ -141,4 +141,18 @@ namespace cutlass { #define CUTLASS_THREAD_LOCAL #endif +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if (201700L <= __cplusplus) +#define CUTLASS_CONSTEXPR_IF_CXX17 constexpr +#define CUTLASS_CXX17_OR_LATER 1 +#else +#define CUTLASS_CONSTEXPR_IF_CXX17 +#define CUTLASS_CXX17_OR_LATER 0 +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + }; // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/detail/layout.hpp b/include/cutlass/detail/layout.hpp index da76f0d655..2defe558c3 100644 --- a/include/cutlass/detail/layout.hpp +++ b/include/cutlass/detail/layout.hpp @@ -239,6 +239,23 @@ check_alignment(Shape const & shape, Stride const & stride) { : get_contiguous_shape(cute::get<1>(shape), cute::get<1>(stride)) % Alignment == 0; } +// Check if tensor shape satisfies a given major alignment + +template +CUTLASS_HOST_DEVICE constexpr +size_t +alignment_for_swizzle(cute::Swizzle) { + static_assert(B >= 0 and M >= 0); + return size_t(1) << size_t(B + M + cute::abs(S)); +} + +template +CUTLASS_HOST_DEVICE constexpr +size_t +alignment_for_swizzle(Layout layout) { + return alignment_for_swizzle(cute::detail::get_swizzle_portion(layout)); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass::detail diff --git a/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/include/cutlass/epilogue/collective/builders/sm90_builder.inl index 072da89969..dec4b9ff6e 100644 --- a/include/cutlass/epilogue/collective/builders/sm90_builder.inl +++ b/include/cutlass/epilogue/collective/builders/sm90_builder.inl @@ -259,6 +259,95 @@ struct Sm90TmaBuilderImpl { >; }; +/////////////////////////////////////////////////////////////////////////////// +// Descriptor classes for defining EVT nodes +// Some of the epilogue visitor nodes require non-intuitive template arguments +// such as CopyOpS2R for AuxLoad node. Traditionaly, these are resolved by the +// builder classes. 
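[Editor's note] The new cutlass::detail::alignment_for_swizzle helper above reduces to the arithmetic 1 << (B + M + |S|) for a cute::Swizzle<B,M,S>. A standalone equivalent for sanity-checking values:

#include <cstddef>

constexpr std::size_t alignment_for_swizzle_sketch(int B, int M, int S) {
  return std::size_t(1) << std::size_t(B + M + (S < 0 ? -S : S));
}

static_assert(alignment_for_swizzle_sketch(0, 4, 3) == 128,  "1 << (0+4+3)");
static_assert(alignment_for_swizzle_sketch(3, 4, 3) == 1024, "1 << (3+4+3)");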
Here we provide a set of descriptor classes that resolve +// these template arguments from more intuitive types such as Stride, Layout + +// Get TileShape, EpilogueTile, Dispatch Policy, StagesC, and STagesD +template< + typename TileShape_MNK, + typename EpilogueTileType, + typename ElementC, + typename ElementD, + typename Schedule +> +struct EpilogueDescriptor { + using TileShape = TileShape_MNK; + using EpilogueTile = + decltype( + detail::sm90_compute_tile_shape_or_override< + ElementD, EpilogueTileType, Schedule + >() + ); + using DispatchPolicy = + decltype( + detail::sm90_get_tma_dispatch_policy< + TileShape_MNK, EpilogueTile, + ElementC, ElementD, Schedule + >() + ); + constexpr static int StagesC = DispatchPolicy::StagesC; + constexpr static int StagesD = DispatchPolicy::StagesD; +}; + +// Get Stride, SmemLayout, and CopyOpS2R for AuxLoad node +template< + typename EpilogueDescriptor, + typename StrideOrLayoutTag, + typename ElementAux +> +struct AuxLoadDescriptor { + constexpr static int Stages = EpilogueDescriptor::StagesC; + using EpilogueTile = typename EpilogueDescriptor::EpilogueTile; + using Element = ElementAux; + using Stride = cutlass::detail::TagToStrideC_t; + using SmemLayoutAtom = + decltype( + detail::sm90_get_epilogue_smem_swizzle_layout_atom< + Stride, ElementAux, typename EpilogueDescriptor::EpilogueTile + >() + ); + using CopyOpS2R = + decltype(detail::sm90_get_smem_load_op_for_source()); +}; + +// Get Stride, SmemLayout, and CopyOpS2R for AuxStore node +template< + typename EpilogueDescriptor, + typename StrideOrLayoutTag, + typename ElementAux +> +struct AuxStoreDescriptor { + constexpr static int Stages = EpilogueDescriptor::StagesD; + using EpilogueTile = typename EpilogueDescriptor::EpilogueTile; + using Element = ElementAux; + using Stride = cutlass::detail::TagToStrideC_t; + using SmemLayoutAtom = + decltype( + detail::sm90_get_epilogue_smem_swizzle_layout_atom< + Stride, ElementAux, typename EpilogueDescriptor::EpilogueTile + >() + ); + using CopyOpR2S = + decltype(detail::sm90_get_smem_store_op_for_accumulator()); +}; + +template< + typename EpilogueDescriptor, + typename ElementVector +> +struct RowBroadcastDescriptor { + constexpr static int Stages = ceil_div( + EpilogueDescriptor::StagesC, + size(shape_div(take<0, 2>(typename EpilogueDescriptor::TileShape{}), typename EpilogueDescriptor::EpilogueTile{})) + ) + 1; + + using Element = ElementVector; +}; + } // namespace detail /////////////////////////////////////////////////////////////////////////////// @@ -426,7 +515,8 @@ private: ElementD, GmemLayoutTagD, AlignmentD, - EpilogueSchedule + EpilogueSchedule, + FusionOperation >; public: diff --git a/include/cutlass/epilogue/collective/collective_builder.hpp b/include/cutlass/epilogue/collective/collective_builder.hpp index 46ad166b2e..02cb795b79 100644 --- a/include/cutlass/epilogue/collective/collective_builder.hpp +++ b/include/cutlass/epilogue/collective/collective_builder.hpp @@ -45,6 +45,7 @@ struct EpilogueTileAuto {}; // Used to let the builder pick the epilogue schedule automatically. 
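[Editor's note] The descriptor structs added above (EpilogueDescriptor, AuxLoadDescriptor, AuxStoreDescriptor, RowBroadcastDescriptor) let EVT node authors derive the non-obvious template parameters, such as stage counts and smem copy ops, from intuitive inputs. The pattern in miniature, with placeholder members:

struct EpilogueDescriptorSketch {
  static constexpr int StagesC = 4;
  static constexpr int StagesD = 2;
  using EpilogueTile = void;            // stands in for a cute::Shape in the real code
};

template <class Desc, class ElementAux>
struct AuxLoadDescriptorSketch {
  static constexpr int Stages = Desc::StagesC;        // aux loads share the C stage count
  using EpilogueTile = typename Desc::EpilogueTile;   // forwarded to resolve the smem layout
  using Element      = ElementAux;
};

static_assert(AuxLoadDescriptorSketch<EpilogueDescriptorSketch, float>::Stages == 4, "");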
// Can be overridden with kernel schedule tags in cutlass/gemm/dispatch_policy.hpp struct EpilogueScheduleAuto {}; +struct EpilogueIm2ColScheduleAuto {}; template < class ArchTag, diff --git a/include/cutlass/epilogue/collective/detail.hpp b/include/cutlass/epilogue/collective/detail.hpp index af77479c77..62d2ef755b 100644 --- a/include/cutlass/epilogue/collective/detail.hpp +++ b/include/cutlass/epilogue/collective/detail.hpp @@ -126,14 +126,14 @@ class Sm90TmaWarpSpecializedAdapter : public EpilogueOp { CUTLASS_HOST_DEVICE static constexpr int get_load_pipe_increment([[maybe_unused]] TileShapeMNK) { - return 0; + return 1; } template CUTLASS_HOST_DEVICE static constexpr int get_store_pipe_increment([[maybe_unused]] TileShapeMNK) { - return 0; + return 1; } CUTLASS_DEVICE diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp index 5bdfab882f..fe146a8546 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp @@ -40,6 +40,7 @@ #include "cutlass/epilogue/collective/detail.hpp" #include "cutlass/epilogue/thread/scale_type.h" #include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/detail/layout.hpp" #include "cutlass/trace.h" #include "cute/tensor.hpp" @@ -119,40 +120,52 @@ class CollectiveEpilogue< static_assert(rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]"); private: - using InternalElementC = cute::conditional_t,ElementD,ElementC>; // prevents void ref breakages + using SmemElementC = cute::conditional_t,ElementD,ElementC>; // prevents void ref breakages constexpr static int StagesC = StagesC_; constexpr static int StagesD = StagesD_; + constexpr static bool ReuseSmemC = ReuseSmemC_; constexpr static bool is_source_supported = not cute::is_void_v; - // internal optimization to reuse C shared memory for storing D - using SmemLayoutAtomBitsC = decltype(downcast::value>(SmemLayoutAtomC{})); - using SmemLayoutAtomBitsD = decltype(downcast::value>(SmemLayoutAtomD{})); - constexpr static bool support_smem_reuse = is_source_supported && - sizeof(InternalElementC) == sizeof(ElementD) && - StrideC{} == StrideD{} && - StagesD <= StagesC && - cute::is_same_v; - constexpr static bool ReuseSmemC = DispatchPolicy::ReuseSmemC; - static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met"); - constexpr static bool is_m_major_C = detail::is_m_major(); constexpr static bool is_m_major_D = detail::is_m_major(); -public: using SmemLayoutC = decltype(tile_to_shape( SmemLayoutAtomC{}, make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), cute::conditional_t, Step<_1,_2,_3>>{} )); using SmemLayoutD = decltype(tile_to_shape( SmemLayoutAtomD{}, - make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), + make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), cute::conditional_t, Step<_1,_2,_3>>{} )); + constexpr static bool support_smem_reuse = is_source_supported && StagesD <= StagesC + && cosize(take<0,2>(SmemLayoutC{})) == cosize(take<0,2>(SmemLayoutD{})); + static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met"); + + constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); + constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{}); + + struct TensorStorageWithC { + 
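[Editor's note] The detail.hpp fix above (returning 1 instead of 0 from get_load_pipe_increment / get_store_pipe_increment) matters because the caller advances the load and store pipeline state by this amount per tile; an increment of 0 would never move the pipeline index. A simplified picture of that advance, under that reading:

struct PipelineStateSketch { int index = 0; int count = 0; };

inline void advance(PipelineStateSketch& state, int increment, int stages) {
  state.count += increment;              // increment == 0 would pin the pipeline
  state.index  = state.count % stages;   // to a single stage forever
}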
alignas(SmemAlignmentC) array_aligned smem_C; + alignas(SmemAlignmentD) array_aligned smem_D; + + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + }; + + struct TensorStorageWithoutC { + alignas(SmemAlignmentD) array_aligned smem_D; + + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + }; + +public: // TMA pipeline for loading C using LoadPipeline = cutlass::PipelineTransactionAsync; using LoadPipelineState = cutlass::PipelineState; constexpr static uint32_t TmaTransactionBytes = - size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof(InternalElementC)); + size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof(SmemElementC)); // TMA pipeline for storing D using StorePipeline = cute::conditional_t; struct SharedStorage { - struct TensorStorage : aligned_struct<128> { - cute::conditional_t, - array_aligned> smem_C; - alignas(128) cute::conditional_t, - array_aligned> smem_D; - - using FusionStorage = typename FusionCallbacks::SharedStorage; - alignas(128) FusionStorage thread; - } tensors; + using TensorStorage = + cute::conditional_t; + TensorStorage tensors; using PipelineStorage = typename LoadPipeline::SharedStorage; PipelineStorage pipeline; @@ -192,7 +197,7 @@ class CollectiveEpilogue< struct Params { using TMA_C = decltype(make_tma_copy( CopyOpG2S{}, - make_tensor(static_cast(nullptr), + make_tensor(static_cast(nullptr), repeat_like(StrideC{}, int32_t(0)), StrideC{}), SmemLayoutC{}(_,_,0))); using TMA_D = decltype(make_tma_copy( @@ -316,21 +321,22 @@ class CollectiveEpilogue< int thread_idx, TensorStorage& shared_tensors) { using namespace cute; - using _X = Underscore; // Indexing variables auto [M, N, K, L] = problem_shape_mnkl; auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; // Represent the full source tensor, slice to get the tile this CTA is currently responsible for - Tensor mC_mnl = params.tma_load_c.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) - Tensor gC_mnl = local_tile(mC_mnl, tile_shape_MNK, make_coord(_,_,_), Step<_1,_1,_X>{}); // (CTA_M,CTA_N,m,n,l) - Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (CTA_M,CTA_N) + Tensor mC = params.tma_load_c.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor gC = local_tile(mC, take<0,2>(CtaTileMNK{}), make_coord(m_coord,n_coord,l_coord)); // (CTA_M,CTA_N) // Apply epilogue subtile, get matching smem tensor - auto ptr_sC = make_smem_ptr(shared_tensors.smem_C.data()); + SmemElementC* ptr_sC = reinterpret_cast(shared_tensors.smem_D.data()); + if constexpr (not ReuseSmemC and is_source_supported) { + ptr_sC = shared_tensors.smem_C.data(); + } Tensor gC_epi = local_tile(gC, EpilogueTile{}, _); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - Tensor sC_epi = make_tensor(ptr_sC, SmemLayoutC{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_) ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{}); @@ -420,8 +426,9 @@ class CollectiveEpilogue< int thread_idx, TensorStorage& shared_tensors) { using namespace cute; - using _X = Underscore; using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; static_assert(is_rmem::value, "Accumulator must be RF resident."); static_assert(rank(AccLayout{}) == 3, "Accumulator must be 
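[Editor's note] The epilogue above now selects its shared-memory struct with a conditional_t over ReuseSmemC and, when reuse is enabled, stages the C tile through smem_D instead of a dedicated buffer. The selection logic boiled down to plain arrays:

#include <type_traits>

struct WithC    { float smem_C[1024]; float smem_D[1024]; };
struct WithoutC { float smem_D[1024]; };

template <bool ReuseSmemC>
using TensorStorageSketch = std::conditional_t<ReuseSmemC, WithoutC, WithC>;

template <bool ReuseSmemC, class Storage>
float* source_tile_smem(Storage& s) {
  if constexpr (ReuseSmemC) { return s.smem_D; }  // C is staged through D's buffer
  else                      { return s.smem_C; }  // otherwise use the dedicated C buffer
}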
MMA-partitioned: (MMA,MMA_M,MMA_N)"); @@ -439,16 +446,22 @@ class CollectiveEpilogue< auto epi_tile_n = size<1>(EpilogueTile{}); // Represent the full output tensor, slice to get the tile this CTA is responsible for - Tensor mD_mnl = params.tma_store_d.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) - Tensor gD_mnl = local_tile(mD_mnl, tile_shape_MNK, make_coord(_,_,_), Step<_1,_1,_X>{}); // (CTA_M,CTA_N,m,n,l) - Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (CTA_M,CTA_N) - - // Apply epilogue subtiling, construct corresponding pipelined smem tensors - auto ptr_sC = make_smem_ptr(shared_tensors.smem_C.data()); - auto ptr_sD = make_smem_ptr(shared_tensors.smem_D.data()); + Tensor mD = params.tma_store_d.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor gD = local_tile(mD, take<0,2>(CtaTileMNK{}), make_coord(m_coord,n_coord,l_coord)); // (CTA_M,CTA_N) + + // Apply epilogue subtiling Tensor gD_epi = local_tile(gD, EpilogueTile{}, _); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - Tensor sC_epi = make_tensor(ptr_sC, SmemLayoutC{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) - Tensor sD_epi = make_tensor(ptr_sD, SmemLayoutD{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) + + // Construct the corresponding pipelined smem tensors + SmemElementC* ptr_sC = reinterpret_cast(shared_tensors.smem_D.data()); + if constexpr (not ReuseSmemC and is_source_supported) { + ptr_sC = shared_tensors.smem_C.data(); + } + ElementD* ptr_sD = shared_tensors.smem_D.data(); + Tensor sC_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sD_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) // Get the smallest tiled copy we can use to retile the accumulators using CopyAtomC = Copy_Atom; @@ -458,14 +471,11 @@ class CollectiveEpilogue< TiledCopy tiled_r2s = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx); Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((R2S,R2S_V),MMA_M,MMA_N) - Tensor tRS_sD = conditional_return( - thread_r2s.partition_D(recast(sC_epi)), // (R2S,R2S_M,R2S_N,PIPE_C) - thread_r2s.partition_D(sD_epi) ); // (R2S,R2S_M,R2S_N,PIPE_D) + Tensor tRS_sD = thread_r2s.partition_D(sD_epi); // (R2S,R2S_M,R2S_N,PIPE_D) - // Allocate register tensors - auto tRS_rD_shape = take<0,3>(shape(thread_r2s.partition_S(sD_epi))); - Tensor tRS_rC = make_tensor(tRS_rD_shape); // (R2S,R2S_M,R2S_N) - Tensor tRS_rD = make_tensor(tRS_rD_shape); // (R2S,R2S_M,R2S_N) + // Allocate D registers + Layout tRS_rD_layout = make_layout(take<0,3>(shape(thread_r2s.partition_S(sD_epi)))); + Tensor tRS_rD = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) // Vectorized fragment view constexpr int FragmentSize = DispatchPolicy::FragmentSize; @@ -474,16 +484,23 @@ class CollectiveEpilogue< CUTE_STATIC_ASSERT(size<0>(tRS_rAcc) % FragmentSize == 0, "Fragment size does not vectorize properly"); // (t)hread-partition for (s)mem to (r)egister copy (tSR_) - TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); - Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) - Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N) + Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) + Layout tSR_rC_layout = 
thread_s2r.retile_D(tRS_rD).layout(); // (S2R,S2R_M,S2R_N) + + // Allocate C registers + // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type + // to eliminate some redundant pack+unpack instruction sequences for sub-word types + constexpr bool IsDirectS2R = cute::is_same_v + && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1; + using RegisterElementC = cute::conditional_t; + Tensor tRS_rC = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N) // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_) ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{}); - Tensor bSG_sD = conditional_return( - thrblk_s2g.partition_S(recast(sC_epi)), // (S2G,S2G_M,S2G_N,PIPE_C) - thrblk_s2g.partition_S(sD_epi) ); // (S2G,S2G_M,S2G_N,PIPE_D) + Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi); // (S2G,S2G_M,S2G_N,PIPE_D) Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi); // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) CUTE_STATIC_ASSERT(mma_tile_m == epi_tile_m, "EPI_TILE_M must equal MMA_TILE_M"); diff --git a/include/cutlass/epilogue/fusion/callbacks.hpp b/include/cutlass/epilogue/fusion/callbacks.hpp index e9b8f65194..979a5257cf 100644 --- a/include/cutlass/epilogue/fusion/callbacks.hpp +++ b/include/cutlass/epilogue/fusion/callbacks.hpp @@ -62,6 +62,7 @@ struct FusionCallbacksTraits { using Operation = T; using CtaTile_MNK = void; using EpilogueTile_MN = void; + using ElementCompute = void; }; template < @@ -78,6 +79,7 @@ struct FusionCallbacksTraits< using Operation = Operation_; using CtaTile_MNK = CtaTile_MNK_; using EpilogueTile_MN = EpilogueTile_MN_; + using ElementCompute = typename Operation::ElementCompute; }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/fusion/operations.hpp b/include/cutlass/epilogue/fusion/operations.hpp index 14db464397..848d9a1146 100644 --- a/include/cutlass/epilogue/fusion/operations.hpp +++ b/include/cutlass/epilogue/fusion/operations.hpp @@ -60,7 +60,7 @@ struct FusionOperation { using ElementBias = void; static constexpr int AlignmentBias = 0; static constexpr bool IsPerRowBiasSupported = false; - template using ActivationFn = void; + using ActivationFn = void; static constexpr bool IsEltActSupported = false; using ElementAux = void; @@ -108,8 +108,7 @@ template< > struct LinCombEltAct : LinearCombination { - template - using ActivationFn = ActivationFn_; + using ActivationFn = ActivationFn_; static constexpr bool IsEltActSupported = true; }; @@ -142,8 +141,7 @@ template< struct LinCombPerRowBiasEltAct : LinCombPerRowBias { - template - using ActivationFn = ActivationFn_; + using ActivationFn = ActivationFn_; static constexpr bool IsEltActSupported = true; }; diff --git a/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp index b2290a40fb..84f75f92ac 100644 --- a/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp @@ -217,6 +217,9 @@ struct FusionCallbacks< ElementScalar const* alpha_ptr = nullptr; ElementScalar const* beta_ptr = nullptr; + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + operator typename Impl::Arguments() const { return { // unary op: activation(beta * C + (alpha * acc)) 
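[Editor's note] The IsDirectS2R / RegisterElementC logic above is an optimization: if the smem-to-register copy of C is elementwise anyway (no common vector), C fragments are held directly in the compute type so sub-word inputs are not packed and unpacked twice. The type selection alone, as a sketch:

#include <type_traits>

template <class ElementC, class ElementCompute, bool IsDirectS2R>
using RegisterElementCSketch =
    std::conditional_t<IsDirectS2R, ElementCompute, ElementC>;

// e.g. an elementwise copy of an int8 source with float compute keeps float registers:
static_assert(std::is_same_v<RegisterElementCSketch<signed char, float, true>,  float>, "");
static_assert(std::is_same_v<RegisterElementCSketch<signed char, float, false>, signed char>, "");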
@@ -230,7 +233,7 @@ struct FusionCallbacks< }, // end binary op {} // ternary args : multiply_add }, // end ternary op - {} // unary args: activation + activation // unary args: activation }; // end unary op } }; @@ -258,7 +261,7 @@ using Sm90LinCombPerRowBias = Sm90EVT, // alpha * acc + bias Sm90ScalarBroadcast, // alpha Sm90AccFetch, // acc - Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, Stride<_1,_0,_0>, AlignmentBias> // bias + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, Stride<_1,_0,int>, AlignmentBias> // bias > >; @@ -293,7 +296,10 @@ struct FusionCallbacks< ElementScalar beta = ElementScalar(0); ElementScalar const* alpha_ptr = nullptr; ElementScalar const* beta_ptr = nullptr; + + using StrideBias = Stride<_1,_0,int>; ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; operator typename Impl::Arguments() const { return @@ -303,7 +309,7 @@ struct FusionCallbacks< { // ternary op : alpha * acc + bias {{alpha}, {alpha_ptr}}, // leaf args : alpha {}, // leaf args : acc - {bias_ptr}, // leaf args : bias + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {} // ternary args : multiply_add }, // end ternary op {} // ternary args : multiply_add @@ -373,7 +379,13 @@ struct FusionCallbacks< ElementScalar beta = ElementScalar(0); ElementScalar const* alpha_ptr = nullptr; ElementScalar const* beta_ptr = nullptr; + + using StrideBias = Stride<_1,_0,int>; ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); operator typename Impl::Arguments() const { return @@ -384,12 +396,12 @@ struct FusionCallbacks< { // ternary op : alpha * acc + bias {{alpha}, {alpha_ptr}}, // leaf args : alpha {}, // leaf args : acc - {bias_ptr}, // leaf args : bias + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {} // ternary args : multiply_add }, // end ternary op {} // ternary args : multiply_add }, // end ternary op - {} // unary args : activation + activation // unary args : activation }; // end unary op } }; @@ -461,10 +473,9 @@ struct FusionCallbacks< ElementOutput, ElementCompute, ElementAux, ElementBias, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle > { - using StrideAux = cutlass::gemm::TagToStrideC_t; using Impl = Sm90LinCombPerRowBiasEltActAux< - CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpR2S, ActivationFn, ElementOutput, ElementCompute, ElementAux, ElementBias, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle >; using Operation = @@ -478,7 +489,15 @@ struct FusionCallbacks< ElementScalar beta = ElementScalar(0); ElementScalar const* alpha_ptr = nullptr; ElementScalar const* beta_ptr = nullptr; + + using StrideBias = Stride<_1,_0,int>; ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + using StrideAux = cutlass::gemm::TagToStrideC_t; ElementAux* aux_ptr = nullptr; StrideAux dAux = {}; @@ -492,14 +511,14 @@ struct FusionCallbacks< { // ternary op : alpha * acc + bias {{alpha}, {alpha_ptr}}, // leaf args : alpha {}, // leaf args : acc - {bias_ptr}, // leaf args : bias + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {} // ternary args : multiply_add }, // end ternary op {} // ternary args : multiply_add }, // end ternary op {aux_ptr, dAux} // 
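[Editor's note] These FusionCallbacks hunks thread a new activation member (the activation functor's own Arguments) through the visitor tree instead of passing an empty initializer, so activation hyperparameters can be set per launch. A hedged sketch of that plumbing, with placeholder types:

struct ClampArgsSketch { float lo = 0.0f, hi = 6.0f; };   // stands in for ActivationFn<...>::Arguments

struct FusionArgumentsSketch {
  float alpha = 1.0f;
  float beta  = 0.0f;
  ClampArgsSketch activation;   // new: forwarded as the unary op's args rather than {}
};

// e.g. FusionArgumentsSketch args; args.activation.hi = 4.0f;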
unary args : store }, // end unary op - {} // unary args : activation + activation // unary args : activation }; // end unary op } }; @@ -528,7 +547,7 @@ using Sm90PerRowLinCombPerRowBias = Sm90EVT, // alpha * acc + bias Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar, Stride<_1,_0,_0>, AlignmentScalar>, // alpha Sm90AccFetch, // acc - Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, Stride<_1,_0,_0>, AlignmentBias> // bias + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, Stride<_1,_0,int>, AlignmentBias> // bias > >; @@ -591,7 +610,13 @@ struct FusionCallbacks< ElementScalar beta = ElementScalar(0); ElementScalar const* alpha_ptr = nullptr; ElementScalar const* beta_ptr = nullptr; + + using StrideBias = Stride<_1,_0,int>; ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); operator typename Impl::Arguments() const { return @@ -600,14 +625,14 @@ struct FusionCallbacks< {beta_ptr, beta}, // leaf args : beta {}, // leaf args : C { // ternary op : alpha * acc + bias - {alpha_ptr, alpha}, // leaf args : alpha - {}, // leaf args : acc - {bias_ptr}, // leaf args : bias + {alpha_ptr, alpha}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {} // ternary args : multiply_add }, // end ternary op {} // ternary args : multiply_add }, // end ternary op - {} // unary args : activation + activation // unary args : activation }; // end unary op } }; @@ -650,7 +675,7 @@ using Sm90ScaledLinCombPerRowBias = Sm90EVT, // alpha * acc + bias Sm90ScalarBroadcast, 3>, // scale_a * scale_b * alpha Sm90AccFetch, // acc - Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, Stride<_1,_0,_0>, AlignmentBias> // bias + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, Stride<_1,_0,int>, AlignmentBias> // bias > >; @@ -728,7 +753,12 @@ struct FusionCallbacks< ElementScalar const* scale_c_ptr = nullptr; ElementScalar const* scale_d_ptr = nullptr; + using StrideBias = Stride<_1,_0,int>; ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); operator typename Impl::Arguments() const { return @@ -742,14 +772,14 @@ struct FusionCallbacks< { // ternary op : (scale_a * scale_b * alpha) * acc + bias {{scale_a, scale_b, alpha}, {scale_a_ptr, scale_b_ptr, alpha_ptr} - }, // leaf args : (scale_a * scale_b * alpha) - {}, // leaf args : acc - {bias_ptr}, // leaf args : bias + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {} // ternary args : multiply_add }, // end ternary op {} // ternary args : multiply_add }, // end ternary op - {} // unary args : activation + activation // unary args : activation }, // end unary op {{scale_d}, {scale_d_ptr} @@ -855,10 +885,10 @@ struct FusionCallbacks< ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle > { - using StrideAux = cutlass::gemm::TagToStrideC_t; using Impl = Sm90ScaledLinCombPerRowBiasEltActAmaxAux< - CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, ActivationFn, ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementScalar, AlignmentAux, 
AlignmentBias, RoundStyle >; using Operation = @@ -885,9 +915,17 @@ struct FusionCallbacks< ElementScalar scale_aux = ElementScalar(1); ElementScalar const* scale_aux_ptr = nullptr; + using StrideBias = Stride<_1,_0,int>; ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + ElementAmax* amax_D_ptr = nullptr; ElementAmax* amax_aux_ptr = nullptr; + + using StrideAux = cutlass::gemm::TagToStrideC_t; ElementAux* aux_ptr = nullptr; StrideAux dAux = {}; @@ -905,9 +943,9 @@ struct FusionCallbacks< { // ternary op : (scale_a * scale_b * alpha) * acc + bias {{scale_a, scale_b, alpha}, {scale_a_ptr, scale_b_ptr, alpha_ptr} - }, // leaf args : (scale_a * scale_b * alpha) - {}, // leaf args : acc - {bias_ptr}, // leaf args : bias + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {} // ternary args : multiply_add }, // end ternary op {} // ternary args : multiply_add @@ -924,7 +962,7 @@ struct FusionCallbacks< { // unary op : reduce(activation(Z)) { // unary op : activation(Z) {}, // leaf args : Z - {} // unary args : activation + activation // unary args : activation }, // end unary op {amax_D_ptr_} // unary args : reduce }, // end unary op diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp index 9d3dabd799..0d62a4bdcb 100644 --- a/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp @@ -99,7 +99,9 @@ struct Sm90Compute : Sm90VisitorImpl<> { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor @@ -118,6 +120,123 @@ struct Sm90Compute : Sm90VisitorImpl<> { }; +// partial specialization for compute fns that define an Arguments member, e.g. 
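[Editor's note] The bias leaves above now take {bias_ptr, ElementBias(0), dBias} with StrideBias = Stride<_1,_0,int>, i.e. a default value plus a runtime batch stride, so a per-row bias can vary across the batch dimension. The addressing this implies, in plain index form and under that reading:

#include <cstdint>

inline float load_bias_sketch(float const* bias_ptr, int m, int l, std::int64_t batch_stride) {
  // Null pointer -> the default value; otherwise a column-broadcast bias,
  // indexed by row m and batch l via the runtime batch stride.
  return bias_ptr ? bias_ptr[m + l * batch_stride] : 0.0f;
}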
activation hyperparameters +template< + template class ComputeFn, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle +> +struct Sm90Compute< + ComputeFn, + ElementOutput, + ElementCompute, + RoundStyle, + cute::void_t::Arguments> +> { + + struct SharedStorage { }; + + using Arguments = typename ComputeFn::Arguments; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90Compute() { } + + CUTLASS_HOST_DEVICE + Sm90Compute(Params const& params, SharedStorage const& shared_storage) + : params(params) {} + + Params const params; + + template < + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class EpilogueTile + > + CUTLASS_DEVICE auto + get_producer_load_callbacks( + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_mnk, + TileCoordMNKL tile_coord_mnkl, + EpilogueTile epi_tile, + int thread_idx) { + return EmptyProducerLoadCallbacks{}; + } + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(Params const& params) + : params(params) {} + + Params const& params; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const&... frg_inputs) { + return transform_apply(cute::make_tuple(frg_inputs...), + [&] (auto&& frg_input) { + using ElementInput = typename cute::remove_cvref_t::Element; + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + return convert_input(frg_input); + }, + [&] (auto&&... 
cvt_frg_inputs) { + using ComputeOutput = ComputeFn>; + using ConvertOutput = NumericArrayConverter; + ComputeOutput compute_output{}; + ConvertOutput convert_output{}; + + return convert_output(compute_output(cvt_frg_inputs..., params)); + } + ); + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class EpilogueTile, + class TiledCopy, + class SrcTensor + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks( + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_mnk, + TileCoordMNKL tile_coord_mnkl, + EpilogueTile epi_tile, + TiledCopy tiled_copy, + int thread_idx, + SrcTensor const& tCrC) { + return ConsumerStoreCallbacks(params); + } + +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// // // Performance Optimized Specializations @@ -215,7 +334,9 @@ struct Sm90TreeVisitor< template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp index 28559027a7..348a62befe 100644 --- a/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp @@ -71,7 +71,9 @@ struct Sm90AccFetch : Sm90VisitorImpl<> { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor @@ -129,7 +131,9 @@ struct Sm90SrcFetch : Sm90VisitorImpl<> { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor @@ -181,7 +185,8 @@ struct Sm90AuxLoad { cute::conditional_t, Step<_1,_2,_3>>{} )); struct SharedStorage { - alignas(128) array_aligned smem_aux; + alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{})) + array_aligned smem_aux; }; struct Arguments { @@ -222,9 +227,9 @@ struct Sm90AuxLoad { Sm90AuxLoad() { } CUTLASS_HOST_DEVICE - Sm90AuxLoad(Params const& params, SharedStorage& shared_storage) + Sm90AuxLoad(Params const& params, SharedStorage const& shared_storage) : params_ptr(¶ms), - smem_aux(shared_storage.smem_aux.data()) { } + smem_aux(const_cast(shared_storage.smem_aux.data())) { } Params const* params_ptr; Element* smem_aux; @@ -273,7 +278,9 @@ struct Sm90AuxLoad { }; template < - class TileShapeMNK + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL > CUTLASS_DEVICE auto get_producer_load_callbacks( @@ -284,8 +291,9 @@ struct Sm90AuxLoad { int thread_idx) { auto [M, N, K, L] = problem_shape_mnkl; + auto [m, n, k, l] = tile_coord_mnkl; Tensor mAux = params_ptr->tma_load_aux.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) - Tensor gAux = sm90_tensor_to_cta_tile(mAux, tile_shape_mnk, tile_coord_mnkl); // (CTA_M,CTA_N) + Tensor gAux = local_tile(mAux, take<0,2>(tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) Tensor gAux_epi = local_tile(gAux, epi_tile, _); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) Tensor sAux_epi = make_tensor(make_smem_ptr(smem_aux), 
SmemLayout{}); // (EPI_TILE_M,EPI_TILE_N,PIPE) @@ -339,7 +347,9 @@ struct Sm90AuxLoad { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class TiledCopy, class SrcTensor > @@ -363,7 +373,8 @@ struct Sm90AuxLoad { make_tiled_copy_S(Copy_Atom{}, tiled_copy), make_tiled_copy_D(Copy_Atom{}, tiled_copy) ); - Tensor sAux_epi = make_tensor(make_smem_ptr(smem_aux), SmemLayout{}); // (EPI_TILE_M,EPI_TILE_N,PIPE) + Tensor sAux_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(smem_aux), SmemLayout{})); // (EPI_TILE_M,EPI_TILE_N,PIPE) auto tSR_sAux = tiled_s2r.get_slice(thread_idx).partition_S(sAux_epi); // (S2R,S2R_M,S2R_N,PIPE) @@ -378,6 +389,7 @@ struct Sm90AuxLoad { ///////////////////////////////////////////////////////////////////////////////////////////////// // Scalar broadcast +// Supports reduction over multiple broadcasts to support fusions such as fp8 scaling factors template< class Element, class StrideMNL = Stride<_0,_0,_0>, @@ -387,7 +399,8 @@ template< struct Sm90ScalarBroadcast { static_assert( (cute::is_same_v>) || // scalar broadcast, e.g. alpha - (cute::is_same_v>)); // batched scalar broadcast, e.g. per-batch alpha + (cute::is_same_v>) || // batched scalar broadcast, e.g. per-batch alpha + (cute::is_same_v>)); struct SharedStorage { }; @@ -419,7 +432,7 @@ struct Sm90ScalarBroadcast { Sm90ScalarBroadcast() { } CUTLASS_HOST_DEVICE - Sm90ScalarBroadcast(Params const& params, SharedStorage& shared_storage) + Sm90ScalarBroadcast(Params const& params, SharedStorage const& shared_storage) : params_ptr(¶ms) { // Get the scalar for non-batched broadcast if constexpr (cute::is_same_v>) { @@ -431,7 +444,9 @@ struct Sm90ScalarBroadcast { Params const* params_ptr; template < + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile > CUTLASS_DEVICE auto @@ -442,7 +457,9 @@ struct Sm90ScalarBroadcast { EpilogueTile epi_tile, int thread_idx) { // Get the scalar for batched broadcast - if constexpr (cute::is_same_v>) { + if constexpr ( + cute::is_same_v> || + cute::is_same_v>) { auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; update_scalar(l_coord); } @@ -470,7 +487,9 @@ struct Sm90ScalarBroadcast { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor @@ -486,7 +505,9 @@ struct Sm90ScalarBroadcast { SrcTensor const& tCrC) { // Get the scalar for batched broadcast - if constexpr (cute::is_same_v>) { + if constexpr ( + cute::is_same_v> || + cute::is_same_v>) { auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; update_scalar(l_coord); } @@ -541,7 +562,7 @@ struct Sm90RowBroadcast { // Accumulator doesn't distribute row elements evenly amongst threads so we must buffer in smem struct SharedStorage { - array_aligned(CtaTileShapeMNK{}) * Stages> smem_row; + alignas(16) array_aligned(CtaTileShapeMNK{}) * Stages> smem_row; }; struct Arguments { @@ -562,9 +583,9 @@ struct Sm90RowBroadcast { Sm90RowBroadcast() { } CUTLASS_HOST_DEVICE - Sm90RowBroadcast(Params const& params, SharedStorage& shared_storage) + Sm90RowBroadcast(Params const& params, SharedStorage const& shared_storage) : params(params), - smem_row(shared_storage.smem_row.data()) { } + smem_row(const_cast(shared_storage.smem_row.data())) { } Params params; Element* 
smem_row; @@ -613,7 +634,9 @@ struct Sm90RowBroadcast { }; template < + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile > CUTLASS_DEVICE auto @@ -625,8 +648,9 @@ struct Sm90RowBroadcast { int thread_idx) { auto [M, N, K, L] = problem_shape_mnkl; + auto [m, n, k, l] = tile_coord_mnkl; Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow); - Tensor gRow = sm90_tensor_to_cta_tile(mRow, tile_shape_mnk, tile_coord_mnkl); // (CTA_M,CTA_N) + Tensor gRow = local_tile(mRow, take<0,2>(tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) Tensor sRow = make_tensor(make_smem_ptr(smem_row), // (CTA_M,CTA_N,PIPE) make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{}))); @@ -680,7 +704,9 @@ struct Sm90RowBroadcast { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor @@ -757,13 +783,15 @@ struct Sm90ColBroadcast { Sm90ColBroadcast() { } CUTLASS_HOST_DEVICE - Sm90ColBroadcast(Params const& params, SharedStorage& shared_storage) + Sm90ColBroadcast(Params const& params, SharedStorage const& shared_storage) : params(params) { } Params params; template < + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile > CUTLASS_DEVICE auto @@ -819,7 +847,9 @@ struct Sm90ColBroadcast { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp index 7da6d09c49..8e1ffb08a2 100644 --- a/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp @@ -83,7 +83,8 @@ struct Sm90AuxStore { cute::conditional_t, Step<_1,_2,_3>>{} )); struct SharedStorage { - alignas(128) array_aligned smem_aux; + alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{})) + array_aligned smem_aux; }; struct Arguments { @@ -125,9 +126,9 @@ struct Sm90AuxStore { Sm90AuxStore() { } CUTLASS_HOST_DEVICE - Sm90AuxStore(Params const& params, SharedStorage& shared_storage) + Sm90AuxStore(Params const& params, SharedStorage const& shared_storage) : params_ptr(¶ms), - smem_aux(shared_storage.smem_aux.data()) { } + smem_aux(const_cast(shared_storage.smem_aux.data())) { } Params const* params_ptr; Element* smem_aux; @@ -143,7 +144,9 @@ struct Sm90AuxStore { } template < - class TileShapeMNK + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL > CUTLASS_DEVICE auto get_producer_load_callbacks( @@ -233,7 +236,9 @@ struct Sm90AuxStore { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class TiledCopy, class SrcTensor > @@ -248,14 +253,16 @@ struct Sm90AuxStore { SrcTensor const& tCrC) { auto [M, N, K, L] = problem_shape_mnkl; + auto [m, n, k, l] = tile_coord_mnkl; Tensor mAux = params_ptr->tma_store_aux.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) - Tensor gAux = sm90_tensor_to_cta_tile(mAux, tile_shape_mnk, tile_coord_mnkl); // (CTA_M,CTA_N) + Tensor gAux = 
local_tile(mAux, take<0,2>(tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) Tensor tC_gAux = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) gAux, epi_tile, tiled_copy, thread_idx); Tensor tC_rAux = make_tensor(take<0,3>(shape(tC_gAux))); // (CPY,CPY_M,CPY_N) - Tensor sAux_epi = make_tensor(make_smem_ptr(smem_aux), SmemLayout{}); // (EPI_TILE_M,EPI_TILE_N,PIPE) + Tensor sAux_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(smem_aux), SmemLayout{})); // (EPI_TILE_M,EPI_TILE_N,PIPE) Tensor gAux_epi = local_tile(gAux, epi_tile, _); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) auto tiled_r2s = conditional_return( @@ -297,8 +304,8 @@ template < struct Sm90ScalarReduction { static_assert( (cute::is_same_v>) || // scalar reduction, e.g. tensor max element - (cute::is_same_v>)); // batched scalar reduction, e.g. per-batch max element - + (cute::is_same_v>) || // batched scalar reduction, e.g. per-batch max element + (cute::is_same_v>)); struct SharedStorage { }; struct Arguments { @@ -329,13 +336,15 @@ struct Sm90ScalarReduction { Sm90ScalarReduction() { } CUTLASS_HOST_DEVICE - Sm90ScalarReduction(Params const& params, SharedStorage& shared_storage) + Sm90ScalarReduction(Params const& params, SharedStorage const& shared_storage) : params(params) { } Params const params; template < + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile > CUTLASS_DEVICE auto @@ -417,7 +426,9 @@ struct Sm90ScalarReduction { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor @@ -502,13 +513,15 @@ struct Sm90RowReduction { Sm90RowReduction() { } CUTLASS_HOST_DEVICE - Sm90RowReduction(Params const& params, SharedStorage& shared_storage) + Sm90RowReduction(Params const& params, SharedStorage const& shared_storage) : params(params) { } Params params; template < + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile > CUTLASS_DEVICE auto @@ -619,7 +632,9 @@ struct Sm90RowReduction { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor @@ -707,13 +722,15 @@ struct Sm90ColReduction { Sm90ColReduction() { } CUTLASS_HOST_DEVICE - Sm90ColReduction(Params const& params, SharedStorage& shared_storage) + Sm90ColReduction(Params const& params, SharedStorage const& shared_storage) : params(params) { } Params params; template < + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile > CUTLASS_DEVICE auto @@ -765,10 +782,11 @@ struct Sm90ColReduction { Array frg_I = convert_input(frg_input); Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n); CUTLASS_PRAGMA_UNROLL for (int i = 0; i < FragmentSize; ++i) { - if (elem_less(tCcCol(i), residue_mn)) { + if (elem_less(tCcCol_mn(i), residue_mn)) { ElementCompute& tCrCol_vmn = tCrCol_mn(epi_v * FragmentSize + i); tCrCol_vmn = reduce_input(tCrCol_vmn, frg_I[i]); } @@ -808,7 +826,9 @@ struct Sm90ColReduction { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor diff --git 
a/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp index 7750701e18..85b69333d6 100644 --- a/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp @@ -51,34 +51,12 @@ using cute::tuple; namespace detail { -// Convenience aliases -using ProblemShapeMNKL = tuple; -using TileCoordMNKL = tuple; - ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partitioning Helpers // ///////////////////////////////////////////////////////////////////////////////////////////////// -template < - class Engine, class LayoutMNL, - class TileShapeMNK -> -CUTLASS_HOST_DEVICE -constexpr auto -sm90_tensor_to_cta_tile( - Tensor mT, // (M,N,L) - TileShapeMNK tile_shape_mnk, // (CTA_M,CTA_N,CTA_K) - TileCoordMNKL tile_coord_mnkl) { - using _X = Underscore; - - auto [m, n, k, l] = tile_coord_mnkl; - Tensor mT_mnl = local_tile(mT, tile_shape_mnk, make_coord(_,_,_), Step<_1,_1,_X>{}); // (CTA_M,CTA_N) - - return mT_mnl(_,_,m,n,l); -} - template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy class CtaTileMN, @@ -106,6 +84,7 @@ template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy class Engine, class LayoutMNL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy > @@ -118,7 +97,8 @@ sm90_partition_for_epilogue( EpilogueTile epi_tile, // (EPI_TILE_M,EPI_TILE_N) TiledCopy tiled_copy, int thread_idx) { - Tensor cT = sm90_tensor_to_cta_tile(mT, tile_shape_mnk, tile_coord_mnkl); // (CTA_M,CTA_N) + auto [m, n, k, l] = tile_coord_mnkl; + Tensor cT = local_tile(mT, take<0,2>(tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) Tensor tCcT = sm90_partition_for_epilogue(cT, epi_tile, tiled_copy, thread_idx); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) @@ -156,7 +136,7 @@ struct Sm90VisitorImplBase { Sm90VisitorImplBase() {} CUTLASS_HOST_DEVICE - Sm90VisitorImplBase(Params const& params, SharedStorage& shared_storage) + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) : ops(transform_apply(tuple{}, params, shared_storage, [] (auto&& op, auto const& op_params, auto&& op_storage) { using Op = cute::remove_cvref_t; @@ -262,7 +242,9 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase { // Producer load callbacks factory // All operations must redefine this, but most can just dispatch to the base impl template < + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile > CUTLASS_DEVICE auto @@ -363,7 +345,9 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase { // All operations must redefine this template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor @@ -446,7 +430,9 @@ struct Sm90TreeVisitor : Sm90VisitorImpl { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class EpilogueTile, class TiledCopy, class SrcTensor @@ -516,7 +502,9 @@ struct Sm90SplitTreeVisitor : Sm90VisitorImpl { template < bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class ProblemShapeMNKL, class TileShapeMNK, + class TileCoordMNKL, class 
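// [Editor's illustrative note -- not part of the patch] The removed sm90_tensor_to_cta_tile
// helper above is replaced inline with cute::local_tile over the MN modes only, i.e. for an
// (M,N,L) tensor mT, CTA tile shape (CTA_M,CTA_N,CTA_K), and tile coordinate (m,n,k,l):
//
//   auto [m, n, k, l] = tile_coord_mnkl;
//   Tensor cT = local_tile(mT, take<0,2>(tile_shape_mnk), make_coord(m, n, l));  // (CTA_M,CTA_N)
//
// which selects the (CTA_M,CTA_N) tile at block coordinate (m,n) within batch l, matching what
// the old helper computed via local_tile with a Step<_1,_1,_X> projection.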
EpilogueTile, class TiledCopy, class SrcTensor @@ -651,9 +641,11 @@ namespace detail { template struct Sm90VisitorImplBase { - struct SharedStorage { - typename Op0::SharedStorage op_0; - }; + // Retain tuple for SharedStorage because empty structs have 1B alignment + // tuples use multiple inheritance, avoids this problem + using SharedStorage = tuple< + typename Op0::SharedStorage + >; struct Arguments { typename Op0::Arguments op_0; @@ -675,9 +667,9 @@ struct Sm90VisitorImplBase { Sm90VisitorImplBase() {} CUTLASS_HOST_DEVICE - Sm90VisitorImplBase(Params const& params, SharedStorage& shared_storage) + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) : ops({ - Op0(params.op_0, shared_storage.op_0) + Op0(params.op_0, get<0>(shared_storage)) }) {} tuple ops; @@ -686,10 +678,10 @@ struct Sm90VisitorImplBase { template struct Sm90VisitorImplBase { - struct SharedStorage { - typename Op0::SharedStorage op_0; - typename Op1::SharedStorage op_1; - }; + using SharedStorage = tuple< + typename Op0::SharedStorage, + typename Op1::SharedStorage + >; struct Arguments { typename Op0::Arguments op_0; @@ -714,10 +706,10 @@ struct Sm90VisitorImplBase { Sm90VisitorImplBase() {} CUTLASS_HOST_DEVICE - Sm90VisitorImplBase(Params const& params, SharedStorage& shared_storage) + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) : ops({ - Op0(params.op_0, shared_storage.op_0), - Op1(params.op_1, shared_storage.op_1) + Op0(params.op_0, get<0>(shared_storage)), + Op1(params.op_1, get<1>(shared_storage)) }) {} tuple ops; @@ -726,11 +718,11 @@ struct Sm90VisitorImplBase { template struct Sm90VisitorImplBase { - struct SharedStorage { - typename Op0::SharedStorage op_0; - typename Op1::SharedStorage op_1; - typename Op2::SharedStorage op_2; - }; + using SharedStorage = tuple< + typename Op0::SharedStorage, + typename Op1::SharedStorage, + typename Op2::SharedStorage + >; struct Arguments { typename Op0::Arguments op_0; @@ -758,11 +750,11 @@ struct Sm90VisitorImplBase { Sm90VisitorImplBase() {} CUTLASS_HOST_DEVICE - Sm90VisitorImplBase(Params const& params, SharedStorage& shared_storage) + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) : ops({ - Op0(params.op_0, shared_storage.op_0), - Op1(params.op_1, shared_storage.op_1), - Op2(params.op_2, shared_storage.op_2) + Op0(params.op_0, get<0>(shared_storage)), + Op1(params.op_1, get<1>(shared_storage)), + Op2(params.op_2, get<2>(shared_storage)) }) {} tuple ops; @@ -771,12 +763,12 @@ struct Sm90VisitorImplBase { template struct Sm90VisitorImplBase { - struct SharedStorage { - typename Op0::SharedStorage op_0; - typename Op1::SharedStorage op_1; - typename Op2::SharedStorage op_2; - typename Op3::SharedStorage op_3; - }; + using SharedStorage = tuple< + typename Op0::SharedStorage, + typename Op1::SharedStorage, + typename Op2::SharedStorage, + typename Op3::SharedStorage + >; struct Arguments { typename Op0::Arguments op_0; @@ -807,12 +799,12 @@ struct Sm90VisitorImplBase { Sm90VisitorImplBase() {} CUTLASS_HOST_DEVICE - Sm90VisitorImplBase(Params const& params, SharedStorage& shared_storage) + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) : ops({ - Op0(params.op_0, shared_storage.op_0), - Op1(params.op_1, shared_storage.op_1), - Op2(params.op_2, shared_storage.op_2), - Op3(params.op_3, shared_storage.op_3) + Op0(params.op_0, get<0>(shared_storage)), + Op1(params.op_1, get<1>(shared_storage)), + Op2(params.op_2, get<2>(shared_storage)), + 
Op3(params.op_3, get<3>(shared_storage)) }) {} tuple ops; diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index 526d46b569..221aa0f3cd 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -49,38 +49,6 @@ namespace cutlass { namespace epilogue { namespace thread { -///////////////////////////////////////////////////////////////////////////////////////////////// -template -struct LinearCombinationGenericParams { - T alpha; ///< scales accumulators - T beta; ///< scales source tensor - T const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory - T const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - - // - // Methods - // - - CUTLASS_HOST_DEVICE - LinearCombinationGenericParams(): - alpha(T(1)), - beta(T(0)), - alpha_ptr(nullptr), - beta_ptr(nullptr) { } - - CUTLASS_HOST_DEVICE - LinearCombinationGenericParams( - T alpha, - T beta = T(0) - ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { } - - CUTLASS_HOST_DEVICE - LinearCombinationGenericParams( - T const *alpha_ptr, - T const *beta_ptr = nullptr - ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } -}; - ///////////////////////////////////////////////////////////////////////////////////////////////// // Identity operator @@ -92,27 +60,67 @@ struct Identity { T operator()(T value) const { return value; } +}; - using Params = LinearCombinationGenericParams; +template +struct Identity > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &value) const { + return value; + } +}; + +/// Scale operator +template +struct Scale { + struct Arguments { + T scale = T(1); + }; CUTLASS_HOST_DEVICE - T operator()(T const &value, Params const ¶ms_) const { - return this->operator()(value); + T operator()(T const& value, T const& scale) const { + multiplies mul; + return mul(scale, value); + } + + CUTLASS_HOST_DEVICE + T operator()(T const& value, Arguments const& args = Arguments()) const { + return this->operator()(value, args.scale); } }; template -struct Identity > { +struct Scale> { + using Arguments = typename Scale::Arguments; + CUTLASS_HOST_DEVICE - Array operator()(Array const &value) const { - return value; + Array operator()(Array const& values, T const& scale) const { + multiplies> mul; + return mul(scale, values); } - using Params = LinearCombinationGenericParams; + CUTLASS_HOST_DEVICE + Array operator()(Array const& values, Arguments const& args = Arguments()) const { + return this->operator()(values, args.scale); + } +}; + +/// Specialization to compose other activations with a defined unary operator +/// e.g. Scale> +template
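// [Editor's illustrative sketch -- not part of the patch] The Scale functor introduced above
// carries its hyperparameter in a nested Arguments struct so the EVT fusion can forward it at
// runtime. A standalone usage sketch, assuming cutlass/epilogue/thread/activation.h is included:
//
//   cutlass::epilogue::thread::Scale<float> scale_op;
//   float y = scale_op(3.0f, 2.0f);   // scale * value == 6.0f
//
//   cutlass::epilogue::thread::Scale<cutlass::Array<float, 4>> scale_vec;
//   // operates element-wise on the Array, multiplying each lane by the scale
//
// The composing specialization that begins below (e.g. wrapping another activation such as
// ReLu) is intended to apply the wrapped activation first and then the scale, reusing
// Scale<T>::Arguments for its hyperparameter.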