diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu index 8fc193195c..886c17701a 100644 --- a/examples/04_tile_iterator/tile_iterator.cu +++ b/examples/04_tile_iterator/tile_iterator.cu @@ -50,7 +50,6 @@ #include #include #include -#include // CUTLASS includes #include "cutlass/transform/threadblock/predicated_tile_iterator.h" diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu index b739121694..bd74ce12da 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -124,6 +124,7 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu index c7dc7b275d..9a9dc88888 100644 --- a/examples/10_planar_complex/planar_complex.cu +++ b/examples/10_planar_complex/planar_complex.cu @@ -74,7 +74,6 @@ */ #include -#include #include #include "cutlass/cutlass.h" diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu index 1dd358464f..272390f26b 100644 --- a/examples/11_planar_complex_array/planar_complex_array.cu +++ b/examples/11_planar_complex_array/planar_complex_array.cu @@ -72,7 +72,6 @@ */ #include -#include #include #include "cutlass/cutlass.h" diff --git a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu index a35a394623..66b0dee500 100644 --- a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu +++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu @@ -111,6 +111,7 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/22_quaternion_conv/quaternion_conv.cu b/examples/22_quaternion_conv/quaternion_conv.cu index cd2a48d9aa..756d465124 100644 --- a/examples/22_quaternion_conv/quaternion_conv.cu +++ b/examples/22_quaternion_conv/quaternion_conv.cu @@ -30,6 +30,7 @@ **************************************************************************************************/ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu index bb880f4fed..41ea3200a1 100644 --- a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu +++ b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu @@ -41,6 +41,7 @@ epilogue/threadblock/epilogue_gemm_k_reduction.h */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu index 661efcf7be..fe756fbadd 100644 --- a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu +++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu @@ -52,6 +52,7 @@ line is the same. */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu index da3ec1ca84..72d7284f6f 100644 --- a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu +++ b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu @@ -49,6 +49,7 @@ technical details. */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu index b2996f2d39..a197e2efce 100644 --- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu +++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu @@ -36,6 +36,7 @@ compared with CUDA Cores. See example 27 for the trick of 3xTF32. */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/30_wgrad_split_k/30_wgrad_split_k.cu b/examples/30_wgrad_split_k/30_wgrad_split_k.cu index b49446cc03..5016adf292 100644 --- a/examples/30_wgrad_split_k/30_wgrad_split_k.cu +++ b/examples/30_wgrad_split_k/30_wgrad_split_k.cu @@ -40,6 +40,7 @@ to correctly instantiate the GEMM template. */ #include +#include #include #include "cutlass/cutlass.h" diff --git a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu index 8ceea63834..f22e235f59 100644 --- a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu +++ b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu @@ -69,6 +69,7 @@ #include #include +#include #include #include diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h index 125c177aea..9f0e04f94b 100644 --- a/test/unit/conv/device/conv2d_testbed.h +++ b/test/unit/conv/device/conv2d_testbed.h @@ -33,6 +33,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h index db27199906..2aa60f0bea 100644 --- a/test/unit/conv/device/conv2d_testbed_interleaved.h +++ b/test/unit/conv/device/conv2d_testbed_interleaved.h @@ -33,6 +33,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/conv/device/conv2d_with_broadcast_testbed.h b/test/unit/conv/device/conv2d_with_broadcast_testbed.h index 1561ed8eee..dd12bf6056 100644 --- a/test/unit/conv/device/conv2d_with_broadcast_testbed.h +++ b/test/unit/conv/device/conv2d_with_broadcast_testbed.h @@ -37,6 +37,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/conv/device/conv2d_with_reduction_testbed.h b/test/unit/conv/device/conv2d_with_reduction_testbed.h index d2ccc9f1f5..a147275bd7 100644 --- a/test/unit/conv/device/conv2d_with_reduction_testbed.h +++ b/test/unit/conv/device/conv2d_with_reduction_testbed.h @@ -33,6 +33,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h index 1c511c194b..f9cc3563c9 100644 --- a/test/unit/conv/device/conv3d_testbed.h +++ b/test/unit/conv/device/conv3d_testbed.h @@ -33,6 +33,8 @@ */ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu index 9373e7da1f..4b3feccacf 100644 --- a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu +++ b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_simt.cu b/test/unit/epilogue/threadblock/epilogue_simt.cu index bca6c24727..386f2871fa 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu index 880d490ce7..84f9110f44 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu index 48c8be179b..4aa27befe2 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu index 696af7b545..5d185dd5e4 100644 --- a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu index 828e394107..415f7dd73c 100644 --- a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu index b64901545a..922bebc12e 100644 --- a/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu @@ -33,8 +33,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu index 4606b84561..1e9e5c8738 100644 --- a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu +++ b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu @@ -35,8 +35,6 @@ #ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/epilogue/threadblock/output_tile_threadmap.cu b/test/unit/epilogue/threadblock/output_tile_threadmap.cu index 7d434eaa65..c33e0a89d4 100644 --- a/test/unit/epilogue/threadblock/output_tile_threadmap.cu +++ b/test/unit/epilogue/threadblock/output_tile_threadmap.cu @@ -32,8 +32,6 @@ \brief Unit tests for thread-level GEMM */ -#include - #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h index 561859a409..e6893026a6 100644 --- a/test/unit/gemm/device/testbed_complex.h +++ b/test/unit/gemm/device/testbed_complex.h @@ -35,7 +35,6 @@ #pragma once #include -#include #include #include diff --git a/test/unit/gemm/device/testbed_grouped.h b/test/unit/gemm/device/testbed_grouped.h index 2641e8d181..5ec4161841 100644 --- a/test/unit/gemm/device/testbed_grouped.h +++ b/test/unit/gemm/device/testbed_grouped.h @@ -36,6 +36,7 @@ #pragma once #include +#include #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/test/unit/gemm/device/testbed_sanity.h b/test/unit/gemm/device/testbed_sanity.h index d7f63c3cf3..e397602451 100644 --- a/test/unit/gemm/device/testbed_sanity.h +++ b/test/unit/gemm/device/testbed_sanity.h @@ -33,7 +33,6 @@ */ #include -#include #include #include "../../common/cutlass_unit_test.h" diff --git a/test/unit/gemm/device/testbed_splitk.h b/test/unit/gemm/device/testbed_splitk.h index 8ad85a9eb1..fcc136c1aa 100644 --- a/test/unit/gemm/device/testbed_splitk.h +++ b/test/unit/gemm/device/testbed_splitk.h @@ -35,7 +35,6 @@ #pragma once #include -#include #include #include "../../common/cutlass_unit_test.h" diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h b/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h index a9cf2c6241..c8343f8fcf 100644 --- a/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h +++ b/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h @@ -35,6 +35,8 @@ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed.h b/test/unit/gemm/threadblock/mma_pipelined_testbed.h index bdc0d873e9..c36e803029 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_testbed.h +++ b/test/unit/gemm/threadblock/mma_pipelined_testbed.h @@ -34,6 +34,8 @@ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h b/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h index d0e0e05a75..1d509d5cc7 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h +++ b/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h @@ -35,6 +35,8 @@ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/aligned_buffer.h" diff --git a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h index 59539062d6..1d8ef51c33 100644 --- a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h +++ b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h @@ -34,6 +34,8 @@ #pragma once +#include + #include "../../common/cutlass_unit_test.h" #include "cutlass/cutlass.h" diff --git a/tools/util/include/cutlass/util/distribution.h b/tools/util/include/cutlass/util/distribution.h index 4ebf58bba1..773487eeb9 100644 --- a/tools/util/include/cutlass/util/distribution.h +++ b/tools/util/include/cutlass/util/distribution.h @@ -34,7 +34,7 @@ \brief This header contains a class to parametrize a statistical distribution function. */ -#include +#include namespace cutlass {