
Commit f4b0a33
add unit test for non int4 load
mengchi.hmc committed Apr 23, 2021
1 parent bb35a3b commit f4b0a33
Showing 6 changed files with 311 additions and 2 deletions.
7 changes: 6 additions & 1 deletion include/cutlass/conv/kernel/default_conv2d_fprop.h
@@ -615,6 +615,11 @@ struct DefaultConv2dFprop <
using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
using MmaPolicy = typename MmaCore::MmaPolicy;

static cutlass::arch::CacheOperation::Kind const CacheOpB =
((sizeof_bits<ElementB>::value * AlignmentB) == 128)
? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;

// Define the Mma
using Mma = threadblock::ImplicitGemmMultistage<
ThreadblockShape,
@@ -623,7 +628,7 @@ struct DefaultConv2dFprop <
arch::CacheOperation::Always,
IteratorB,
SmemIteratorB,
-arch::CacheOperation::Always,
+CacheOpB,
MmaPolicy,
Stages
>;
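
For orientation, a standalone sketch (not part of the commit) of the selection rule the new CacheOpB constant encodes: a 128-bit access per thread can use cp.async.cg (CacheOperation::Global, bypassing L1), while any narrower access must fall back to cp.async.ca (CacheOperation::Always). The sizeof_bits and CacheOperation stand-ins below are simplified, and unsigned short is a hypothetical 16-bit stand-in for cutlass::half_t.

// Standalone sketch: how the 128-bit check picks the cp.async cache
// operation. PTX cp.async.cg supports only 16-byte transfers, so anything
// narrower must use cp.async.ca.
#include <cstdio>

enum class CacheOperation { Always /* cp.async.ca */, Global /* cp.async.cg */ };

template <typename T>
struct sizeof_bits { static constexpr int value = 8 * sizeof(T); };

template <typename Element, int AlignmentInElements>
constexpr CacheOperation select_cache_op() {
  return (sizeof_bits<Element>::value * AlignmentInElements == 128)
             ? CacheOperation::Global
             : CacheOperation::Always;
}

using half_like = unsigned short;  // 16-bit stand-in for cutlass::half_t

static_assert(select_cache_op<half_like, 8>() == CacheOperation::Global,
              "8 halves = 128 bits: may bypass L1");
static_assert(select_cache_op<half_like, 2>() == CacheOperation::Always,
              "2 halves = 32 bits: must go through L1");

int main() { std::printf("cache-op selection checks passed\n"); }
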
@@ -341,6 +341,8 @@ class Conv2dDgradOutputGradientTileAccessIteratorAnalytic <
// Parameters structure
//

static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;

struct Params {

Layout layout;
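
The kAccessesPerVector constant added above splits one logical thread-map access into several physical vector accesses whenever the access type is narrower than the thread map's access width. A minimal sketch of the arithmetic, with hypothetical widths in place of the real ThreadMap and AccessType:

// Hypothetical stand-ins for ThreadMap and AccessType; the point is only the
// division: a logical 8-element access issued through a 2-element vector type
// becomes 4 physical accesses.
struct ThreadMap  { static constexpr int kElementsPerAccess = 8; };
struct AccessType { static constexpr int kElements = 2; };

static constexpr int kAccessesPerVector =
    ThreadMap::kElementsPerAccess / AccessType::kElements;

static_assert(kAccessesPerVector == 4, "8 elements / 2 per vector = 4 accesses");

int main() { return 0; }
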
@@ -176,6 +176,7 @@ class Conv2dFpropFilterTileAccessIteratorOptimized{
}
}

CUTLASS_PRAGMA_UNROLL
for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx);
}
@@ -212,7 +213,6 @@ class Conv2dFpropFilterTileAccessIteratorOptimized{
#else
if (clear) {
-predicates_[index] = 0;
predicates_[index] = 0;
}
#endif
}
@@ -247,6 +247,7 @@ class Conv2dFpropFilterTileAccessIteratorOptimized{
filter_c_ += params_.filter_c_delta;
}

CUTLASS_PRAGMA_UNROLL
for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx);
}
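
Both hunks in this file put CUTLASS_PRAGMA_UNROLL on the loop that masks out access slices whose channel offset would run past problem_size_.C; with kAccessesPerVector known at compile time, the loop fully unrolls into straight-line predicate updates. A plain-C++ sketch of the guard itself, with hypothetical sizes:

// Sketch of the per-vector predicate guard (all values hypothetical): slice v
// is masked off once filter_c + v * AccessSize reaches the channel count C,
// mirroring what clear_mask_() does in the iterator.
#include <cstdio>

int main() {
  constexpr int kAccessesPerVector = 4;
  constexpr int AccessSize = 2;  // elements per physical access
  int const C = 5;               // problem_size_.C
  int const filter_c = 0;        // this thread's starting channel offset

  for (int v = 0; v < kAccessesPerVector; ++v) {
    bool const in_bounds = (filter_c + v * AccessSize < C);
    std::printf("slice %d: %s\n", v, in_bounds ? "load" : "masked");
  }
  return 0;
}
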
@@ -117,5 +117,89 @@ TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16,
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////
TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2,
128x128_64x3_64x64x64) {

/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;

/// Device-level Conv2d instance
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
cutlass::gemm::GemmShape<128, 128, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<16, 8, 16>,
cutlass::epilogue::thread::LinearCombination<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
2,
2
>::Kernel;

using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;

/// Run all unit test sizes with device-level Conv2d instance
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////
TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4,
128x128_64x3_64x64x64) {

/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;

/// Device-level Conv2d instance
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
cutlass::gemm::GemmShape<128, 128, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<16, 8, 16>,
cutlass::epilogue::thread::LinearCombination<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
4,
4
>::Kernel;

using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;

/// Run all unit test sizes with device-level Conv2d instance
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////
#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED
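
The trailing pair of template arguments in these new test instantiations (2, 2 and 4, 4 above; by default the alignment corresponds to a full 128-bit access) are AlignmentA and AlignmentB in elements, which is what feeds the CacheOpB selection in the first file of this commit. A quick sketch of the resulting access widths for half_t, under the assumption that access width is simply element bits times alignment:

// Hypothetical helper: access width in bits from element width and alignment
// (alignment counted in elements); cutlass::half_t is 16 bits wide.
#include <cstdio>

constexpr int access_bits(int element_bits, int alignment) {
  return element_bits * alignment;
}

int main() {
  std::printf("align8: %3d-bit loads -> CacheOperation::Global\n", access_bits(16, 8));
  std::printf("align4: %3d-bit loads -> CacheOperation::Always\n", access_bits(16, 4));
  std::printf("align2: %3d-bit loads -> CacheOperation::Always\n", access_bits(16, 2));
  return 0;
}

The SM75 f16 tests below exercise align1/2/4 the same way, and the tf32 tests align1/2; since tfloat32_t is 32 bits wide, alignment 4 would already give the full 128-bit access.
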
@@ -117,5 +117,134 @@ TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////

TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align1,
128x128_32x2_64x64x32) {

/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = float;
using ElementAccumulator = float;
using ElementCompute = float;

/// Device-level Conv2d instance
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 32>,
cutlass::gemm::GemmShape<64, 64, 32>,
cutlass::gemm::GemmShape<16, 8, 8>,
cutlass::epilogue::thread::LinearCombination<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
1,
1
>::Kernel;

using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;

/// Run all unit test sizes with device-level Conv2d instance
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////

TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align2,
128x128_32x2_64x64x32) {

/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = float;
using ElementAccumulator = float;
using ElementCompute = float;

/// Device-level Conv2d instance
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 32>,
cutlass::gemm::GemmShape<64, 64, 32>,
cutlass::gemm::GemmShape<16, 8, 8>,
cutlass::epilogue::thread::LinearCombination<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
2,
2
>::Kernel;

using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;

/// Run all unit test sizes with device-level Conv2d instance
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////

TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4,
128x128_32x2_64x64x32) {

/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = float;
using ElementAccumulator = float;
using ElementCompute = float;

/// Device-level Conv2d instance
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 32>,
cutlass::gemm::GemmShape<64, 64, 32>,
cutlass::gemm::GemmShape<16, 8, 8>,
cutlass::epilogue::thread::LinearCombination<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
4,
4
>::Kernel;

using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;

/// Run all unit test sizes with device-level Conv2d instance
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////
#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED
@@ -77,5 +77,93 @@ TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32,
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////

TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1,
128x128_32x3_64x64x32) {

/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::tfloat32_t;
using ElementB = cutlass::tfloat32_t;
using ElementC = float;
using ElementAccumulator = float;
using ElementCompute = float;

/// Device-level Conv2d instance
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
cutlass::gemm::GemmShape<128, 128, 16>,
cutlass::gemm::GemmShape<64, 64, 16>,
cutlass::gemm::GemmShape<16, 8, 8>,
cutlass::epilogue::thread::LinearCombination<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
1,
1
>::Kernel;

using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;


/// Run all unit test sizes with device-level Conv2d instance
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////

TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2,
128x128_32x3_64x64x32) {

/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::tfloat32_t;
using ElementB = cutlass::tfloat32_t;
using ElementC = float;
using ElementAccumulator = float;
using ElementCompute = float;

/// Device-level Conv2d instance
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
cutlass::gemm::GemmShape<128, 128, 16>,
cutlass::gemm::GemmShape<64, 64, 16>,
cutlass::gemm::GemmShape<16, 8, 8>,
cutlass::epilogue::thread::LinearCombination<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized,
2,
2
>::Kernel;

using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;


/// Run all unit test sizes with device-level Conv2d instance
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
}

////////////////////////////////////////////////////////////////////////////////
#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED
