Integrate upstream cutlass_extensions changes (#7)
* Integrate upstream cutlass_extensions changes

* Update cuda flag to compile new cutlass kernel

* Pass group_size argument to cutlass kernel
vinx13 authored Nov 16, 2023
1 parent 145bee4 commit 9e34a94
Showing 30 changed files with 2,332 additions and 1,061 deletions.
74 changes: 73 additions & 1 deletion cutlass_extensions/include/cutlass_extensions/arch/mma.h
@@ -33,6 +33,7 @@
*/

#pragma once
#include "cutlass_extensions/weight_only_quant_op.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

@@ -44,5 +45,76 @@ namespace arch
// Tag which triggers MMA which will trigger
struct OpMultiplyAddDequantizeInterleavedBToA;

/*
Below we have extra tags to signal what kind of dequantization we want to do
(per-column scale, fine-grained scale only, fine-grained scale with zeros). This still
lets us use the existing template infrastructure (incl. that in CUTLASS). However, we
split the tagged operator back into OpMultiplyAddDequantizeInterleavedBToA along
with the quantization op before instantiating the GEMM pieces.
Note that this is somewhat of a hack, but it SIGNIFICANTLY reduces the amount of
code we need to duplicate.
*/
struct OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
struct OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
struct OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;

// The default just forwards the original operator
template <typename MmaOp, WeightOnlyQuantOp QuantOp_>
struct TagOperator
{
using TaggedOperator = MmaOp;
};

// Specializations below attach more information to the operator
template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY>
{
using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_percol_scale;
};

template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>
{
using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scale;
};

template <>
struct TagOperator<OpMultiplyAddDequantizeInterleavedBToA, WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS>
{
using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias;
};

// Here we instantiate some structs to "detag" the tagged operator, splitting it back into the original
// operator + the extra information. If no extra info was tagged, the dequant op defaults to
// per-column scaling.
template <typename TaggedMmaOp>
struct DetagOperator
{
using Operator = TaggedMmaOp;
static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_percol_scale>
{
using Operator = OpMultiplyAddDequantizeInterleavedBToA;
static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scale>
{
using Operator = OpMultiplyAddDequantizeInterleavedBToA;
static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias>
{
using Operator = OpMultiplyAddDequantizeInterleavedBToA;
static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS;
};

} // namespace arch
} // namespace cutlass
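To make the tag/detag mechanism above concrete, here is a minimal usage sketch. It is not part of the commit; it assumes the include root is cutlass_extensions/include and that WeightOnlyQuantOp is declared in namespace cutlass by weight_only_quant_op.h. Tagging folds the quantization mode into a single operator type, and detagging recovers both pieces.

// Hypothetical illustration only: tag an MMA operator with a quantization mode,
// then recover both the operator and the mode with DetagOperator.
#include <type_traits>
#include "cutlass_extensions/arch/mma.h"

using cutlass::WeightOnlyQuantOp;
using cutlass::arch::DetagOperator;
using cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA;
using cutlass::arch::TagOperator;

// TagOperator folds the quant op into one operator type, so the existing template
// plumbing only has to carry a single type parameter.
using Tagged = TagOperator<OpMultiplyAddDequantizeInterleavedBToA,
    WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS>::TaggedOperator;

// DetagOperator splits it back into the original operator plus the quant op
// right before the GEMM pieces are instantiated.
using Detagged = DetagOperator<Tagged>;
static_assert(
    std::is_same<Detagged::Operator, OpMultiplyAddDequantizeInterleavedBToA>::value,
    "detag recovers the original operator");
static_assert(Detagged::QuantOp == WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS,
    "detag recovers the quantization mode");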
28 changes: 18 additions & 10 deletions cutlass_extensions/include/cutlass_extensions/compute_occupancy.h
@@ -20,28 +20,35 @@
#include "cutlass/device_kernel.h"
#include "utils/cuda_utils.h"

namespace fastertransformer
namespace tensorrt_llm
{
namespace cutlass_extensions
{

template <typename GemmKernel>
inline int compute_occupancy_for_kernel()
{

using fastertransformer::check;
int smem_size = int(sizeof(typename GemmKernel::SharedStorage));

if (smem_size > (48 << 10))
{
cudaError_t status
= cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
if (status == cudaError::cudaErrorInvalidValue)
cudaFuncAttributes attr;
int device = 0;
int max_smem_per_block = 0;
check_cuda_error(cudaGetDevice(&device));
check_cuda_error(
cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
check_cuda_error(cudaFuncGetAttributes(&attr, cutlass::Kernel<GemmKernel>));
if (smem_size + attr.sharedSizeBytes >= static_cast<size_t>(max_smem_per_block))
{
// Clear the error bit since we can ignore this.
// This should mean that smem_size > cudaDevAttrMaxSharedMemoryPerBlockOptin. In that case, we return an
// occupancy of 0. This will cause the heuristic to ignore this configuration.
status = cudaGetLastError();
// This should mean that
// cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)
// wouldn't work. In that case, we return an occupancy of 0. This will cause the heuristic to ignore this
// configuration.
return 0;
}
check_cuda_error(status);
}

int max_active_blocks = -1;
@@ -51,4 +58,5 @@ inline int compute_occupancy_for_kernel()
     return max_active_blocks;
 }
 
-} // namespace fastertransformer
+} // namespace cutlass_extensions
+} // namespace tensorrt_llm
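For reference, a self-contained sketch of the same occupancy guard outside the CUTLASS templates (hypothetical kernel name and block size; error checks omitted): if the kernel's static shared memory plus the requested dynamic shared memory does not fit under the device's opt-in per-block limit, report an occupancy of 0 so the config-selection heuristic skips that configuration.

// Standalone sketch, not part of the commit. my_kernel stands in for
// cutlass::Kernel<GemmKernel>; the block size of 128 is an arbitrary assumption.
#include <cuda_runtime.h>

__global__ void my_kernel() {}

int occupancy_or_zero(int dyn_smem_bytes)
{
    int device = 0;
    int max_smem_per_block = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);

    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, my_kernel);

    // Same check as the new code above: static + dynamic shared memory must fit under
    // the opt-in per-block limit, otherwise raising
    // cudaFuncAttributeMaxDynamicSharedMemorySize would fail anyway, so the
    // configuration is reported as unusable.
    if (static_cast<size_t>(dyn_smem_bytes) + attr.sharedSizeBytes
        >= static_cast<size_t>(max_smem_per_block))
    {
        return 0;
    }

    int max_active_blocks = -1;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks, my_kernel, /*blockSize=*/128, dyn_smem_bytes);
    return max_active_blocks;
}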