Integrate upstream cutlass_extensions changes (#7)
* Integrate upstream cutlass_extensions changes

* Update cuda flag to compile new cutlass kernel

* Pass group_size argument to cutlass kernel
vinx13 authored Nov 16, 2023
1 parent 145bee4 commit 9e34a94
Showing 30 changed files with 2,332 additions and 1,061 deletions.
74 changes: 73 additions & 1 deletion cutlass_extensions/include/cutlass_extensions/arch/mma.h
@@ -33,6 +33,7 @@
*/

#pragma once
#include "cutlass_extensions/weight_only_quant_op.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

@@ -44,5 +45,76 @@ namespace arch
// Tag which triggers MMA which will trigger
struct OpMultiplyAddDequantizeInterleavedBToA;

/*
Below we have extra tags to signal what kind of dequantization we want to do
(per-column scale, fine-grained scale only, fine-grained scale with zeros). This still
lets us use the existing template infrastructure (incl. that in CUTLASS). However, we
split the tagged operator back into OpMultiplyAddDequantizeInterleavedBToA along
with the quantization op before instantiating the GEMM pieces.
Note that this is somewhat of a hack, but it SIGNIFICANTLY reduces the amount of
code we need to duplicate.
*/
struct OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
struct OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
struct OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;

// The default just forwards the original operator
template <typename MmaOp, WeightOnlyQuantOp QuantOp_>
struct TagOperator
{
using TaggedOperator = MmaOp;
};

// Specializations below attach more information to the operator
template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY>
{
using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
};

template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>
{
using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
};

template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS>
{
using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;
};

// Here we instantiate some structs to "detag" the tagged operator, splitting it back into the original
// operator + the extra information. If no extra info was tagged, the dequant op defaults to
// per-column scaling.
template <typename TaggedMmaOp>
struct DetagOperator
{
using Operator = TaggedMmaOp;
static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_percol_scale>
{
using Operator = OpMultiplyAddDequantizeInterleavedBToA;
static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scale>
{
using Operator = OpMultiplyAddDequantizeInterleavedBToA;
static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias>
{
using Operator = OpMultiplyAddDequantizeInterleavedBToA;
static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS;
};

} // namespace arch
} // namespace cutlass
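To make the tag/detag mechanism above concrete, here is a minimal usage sketch. It is not part of the commit; it assumes the include root is cutlass_extensions/include and that WeightOnlyQuantOp is declared in namespace cutlass by weight_only_quant_op.h. Tagging folds the quantization mode into a single operator type, and detagging recovers both pieces.

// Hypothetical illustration only: tag an MMA operator with a quantization mode,
// then recover both the operator and the mode with DetagOperator.
#include <type_traits>
#include "cutlass_extensions/arch/mma.h"

using cutlass::WeightOnlyQuantOp;
using cutlass::arch::DetagOperator;
using cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA;
using cutlass::arch::TagOperator;

// TagOperator folds the quant op into one operator type, so the existing template
// plumbing only has to carry a single type parameter.
using Tagged = TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
    WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS>::TaggedOperator;

// DetagOperator splits it back into the original operator plus the quant op
// right before the GEMM pieces are instantiated.
using Detagged = DetagOperator<Tagged>;
static_assert(
    std::is_same<Detagged::Operator, OpMultiplyAddDequantizeInterleavedBToA>::value,
    "detag recovers the original operator");
static_assert(Detagged::QuantOp == WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS,
    "detag recovers the quantization mode");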
28 changes: 18 additions & 10 deletions cutlass_extensions/include/cutlass_extensions/compute_occupancy.h
@@ -20,28 +20,35 @@
#include "cutlass/device_kernel.h"
#include "utils/cuda_utils.h"

namespace fastertransformer
namespace tensorrt_llm
{
namespace cutlass_extensions
{

template <typename GemmKernel>
inline int compute_occupancy_for_kernel()
{

using fastertransformer::check;
int smem_size = int(sizeof(typename GemmKernel::SharedStorage));

if (smem_size > (48 << 10))
{
cudaError_t status
= cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
if (status == cudaError::cudaErrorInvalidValue)
cudaFuncAttributes attr;
int device = 0;
int max_smem_per_block = 0;
check_cuda_error(cudaGetDevice(&device));
check_cuda_error(
cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
check_cuda_error(cudaFuncGetAttributes(&attr, cutlass::Kernel<GemmKernel>));
if (smem_size + attr.sharedSizeBytes >= static_cast<size_t>(max_smem_per_block))
{
// Clear the error bit since we can ignore this.
// This should mean that smem_size > cudaDevAttrMaxSharedMemoryPerBlockOptin. In that case, we return an
// occupancy of 0. This will cause the heuristic to ignore this configuration.
status = cudaGetLastError();
// This should mean that
// cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)
// wouldn't work. In that case, we return an occupancy of 0. This will cause the heuristic to ignore this
// configuration.
return 0;
}
check_cuda_error(status);
}

int max_active_blocks = -1;
@@ -51,4 +58,5 @@ inline int compute_occupancy_for_kernel()
     return max_active_blocks;
 }
 
-} // namespace fastertransformer
+} // namespace cutlass_extensions
+} // namespace tensorrt_llm
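For reference, a self-contained sketch of the same occupancy guard outside the CUTLASS templates (hypothetical kernel name and block size; error checks omitted): if the kernel's static shared memory plus the requested dynamic shared memory does not fit under the device's opt-in per-block limit, report an occupancy of 0 so the config-selection heuristic skips that configuration.

// Standalone sketch, not part of the commit. my_kernel stands in for
// cutlass::Kernel<GemmKernel>; the block size of 128 is an arbitrary assumption.
#include <cuda_runtime.h>

__global__ void my_kernel() {}

int occupancy_or_zero(int dyn_smem_bytes)
{
    int device = 0;
    int max_smem_per_block = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);

    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, my_kernel);

    // Same check as the new code above: static + dynamic shared memory must fit under
    // the opt-in per-block limit, otherwise raising
    // cudaFuncAttributeMaxDynamicSharedMemorySize would fail anyway, so the
    // configuration is reported as unusable.
    if (static_cast<size_t>(dyn_smem_bytes) + attr.sharedSizeBytes
        >= static_cast<size_t>(max_smem_per_block))
    {
        return 0;
    }

    int max_active_blocks = -1;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks, my_kernel, /*blockSize=*/128, dyn_smem_bytes);
    return max_active_blocks;
}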