Skip to content

Commit

Permalink
b2b bias vector support (NVIDIA#482)
Browse files Browse the repository at this point in the history
* b2b bias vector support

* add files

Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
  • Loading branch information
hwu36 and hwu36 authored Apr 30, 2022
1 parent 86ce09a commit ec2b4fd
Show file tree
Hide file tree
Showing 34 changed files with 1,099 additions and 327 deletions.
23 changes: 23 additions & 0 deletions examples/13_two_tensor_op_fusion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,29 @@ When applying the above constraint to convolutions, it is required that the 2nd
kernel doesn't have halos such that data used by each threadblock doesn't depend on any other
threadblock. Typically this requires the 2nd Convolution uses 1x1 filter without any paddings.

# Build and run

- Run `cmake` at the top-level CUTLASS directory
- `make 13_two_tensor_op_fusion`
- Run individual benchmarks
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_f16_sm75_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_f16_sm75_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_f16_sm80_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_f16_sm80_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_s8_sm75_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_s8_sm75_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_s8_sm80_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_convs_s8_sm80_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_f16_sm75_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_f16_sm75_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_f16_sm80_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_f16_sm80_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm75_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm75_shmem`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm80_rf`
- `./examples/13_two_tensor_op_fusion/13_fused_two_gemms_s8_sm80_shmem`


# Copyright

Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Expand Down
31 changes: 24 additions & 7 deletions examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
#include "cutlass/core_io.h"
#include "cutlass/util/tensor_view_io.h"

#include "reference/device/tensor_scale_bias.h"
#include "helper.h"

#define CHECK_GT(val1, val2) \
Expand Down Expand Up @@ -153,6 +154,7 @@ class B2bNonFusedConv2dRun {
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
std::cerr << "Not implemented\n";
}
}

Expand Down Expand Up @@ -407,6 +409,7 @@ class B2bFusedConv2dRun {
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C0;
cutlass::HostTensor<typename B2bConv2d::ElementScaleBias, typename B2bConv2d::LayoutScaleBias> tensor_Scale0;
cutlass::HostTensor<typename B2bConv2d::ElementScaleBias, typename B2bConv2d::LayoutScaleBias> tensor_Bias0;
cutlass::HostTensor<ElementAccumulator, typename B2bConv2d::LayoutC> tensor_Z0_reference;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D0_reference;

cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1;
Expand Down Expand Up @@ -487,6 +490,7 @@ class B2bFusedConv2dRun {
if(alpha0 == ElementCompute(0)) //per-channel scale
tensor_Scale0.resize({1, problem_size_0.K});
tensor_Bias0.resize({1, problem_size_0.K});
tensor_Z0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
Expand Down Expand Up @@ -607,22 +611,35 @@ class B2bFusedConv2dRun {
typename B2bConv2d::LayoutA,
typename B2bConv2d::ElementB,
typename B2bConv2d::LayoutB,
typename B2bConv2d::ElementC,
ElementAccumulator,
typename B2bConv2d::LayoutC,
ElementCompute,
ElementAccumulator,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_Z0_reference.device_ref(),
tensor_Z0_reference.device_ref(),
ElementAccumulator(1), // intermediate alpha = 1
ElementAccumulator(0) // beta = 0
);

cutlass::reference::device::TensorScaleBiasConv2d<
ElementAccumulator,
typename B2bConv2d::ElementC,
typename B2bConv2d::LayoutC,
ElementCompute,
typename B2bConv2d::LayoutScaleBias
>(
problem_size_0,
tensor_Z0_reference.device_ref(),
tensor_D0_reference.device_ref(),
alpha0,
beta0,
nullptr, // stream
alpha0,
tensor_Scale0.device_ref(),
tensor_Bias0.device_ref());
tensor_Bias0.device_ref()
);

if(relu) {
cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view());
Expand Down
Loading

0 comments on commit ec2b4fd

Please sign in to comment.