@@ -23,6 +23,7 @@ include "mlir/Interfaces/InferIntRangeInterface.td"
2323def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
2424def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
2525def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
26+ def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>;
2627
2728//===----------------------------------------------------------------------===//
2829// NVVM dialect definitions
@@ -2592,6 +2593,110 @@ def NVVM_Breakpoint : NVVM_Op<"breakpoint"> {
25922593 let assemblyFormat = "attr-dict";
25932594}
25942595
2596+ //===----------------------------------------------------------------------===//
2597+ // NVVM TCGEN05 Ops
2598+ //===----------------------------------------------------------------------===//
2599+ // Num CTAs in a group participating in the TCGEN05 operation.
2600+ // This corresponds to the "cta_group::1", "cta_group::2"
2601+ // modifiers in the PTX instructions.
2602+ def Tcgen05GroupCTA_1 : I32EnumAttrCase<"CTA_1", 0, "cta_1">;
2603+ def Tcgen05GroupCTA_2 : I32EnumAttrCase<"CTA_2", 1, "cta_2">;
2604+
2605+ def Tcgen05GroupKind : I32EnumAttr<"Tcgen05GroupKind",
2606+ "NVVM Tcgen05 group kind",
2607+ [Tcgen05GroupCTA_1, Tcgen05GroupCTA_2]> {
2608+ let genSpecializedAttr = 0;
2609+ let cppNamespace = "::mlir::NVVM";
2610+ }
2611+ def Tcgen05GroupKindAttr :
2612+ EnumAttr<NVVM_Dialect, Tcgen05GroupKind, "tcgen05_group"> {
2613+ let assemblyFormat = "`<` $value `>`";
2614+ }
2615+
2616+ def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
2617+ let summary = "Tcgen05 alloc operation";
2618+ let description = [{
2619+ The `tcgen05.alloc` Op allocates tensor core memory for
2620+ the amount specified by `nCols` and writes the destination
2621+ address to the `addr` argument. The `nCols` operand specifies the
2622+ number of columns to be allocated and it must be a power-of-two.
2623+ [For more information, refer to the PTX ISA]
2624+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
2625+ }];
2626+
2627+ let arguments = (ins
2628+ AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr,
2629+ I32:$nCols,
2630+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
2631+
2632+ let assemblyFormat = "$addr `,` $nCols attr-dict `:` type(operands)";
2633+
2634+ let extraClassDeclaration = [{
2635+ static llvm::Intrinsic::ID
2636+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
2637+ llvm::SmallVector<llvm::Value *> &args);
2638+ }];
2639+ string llvmBuilder = [{
2640+ llvm::SmallVector<llvm::Value *> args;
2641+ auto id = NVVM::Tcgen05AllocOp::getIntrinsicIDAndArgs(
2642+ *op, moduleTranslation, args);
2643+ createIntrinsicCall(builder, id, args);
2644+ }];
2645+ }
2646+
2647+ def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc"> {
2648+ let summary = "Tcgen05 dealloc operation";
2649+ let description = [{
2650+ The `tcgen05.dealloc` Op de-allocates the tensor core memory
2651+ specified by `tmemAddr`, which must be from a previous tensor
2652+ memory allocation. The `nCols` operand specifies the number
2653+ of columns to be de-allocated, and it must be a power-of-two.
2654+ [For more information, refer to the PTX ISA]
2655+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
2656+ }];
2657+
2658+ let arguments = (ins LLVM_PointerTensor:$taddr, I32:$nCols,
2659+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
2660+
2661+ let assemblyFormat = "$taddr `,` $nCols attr-dict `:` type(operands)";
2662+
2663+ let extraClassDeclaration = [{
2664+ static llvm::Intrinsic::ID
2665+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
2666+ llvm::SmallVector<llvm::Value *> &args);
2667+ }];
2668+ string llvmBuilder = [{
2669+ llvm::SmallVector<llvm::Value *> args;
2670+ auto id = NVVM::Tcgen05DeallocOp::getIntrinsicIDAndArgs(
2671+ *op, moduleTranslation, args);
2672+ createIntrinsicCall(builder, id, args);
2673+ }];
2674+ }
2675+
2676+ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_permit"> {
2677+ let summary = "Tcgen05 Op to relinquish the right to allocate";
2678+ let description = [{
2679+ The `tcgen05.relinquish_alloc_permit` Op specifies that the CTA
2680+ of the executing thread is relinquishing the right to allocate
2681+ Tensor Memory. So, it is illegal for a CTA to perform `tcgen05.alloc`
2682+ after any of its constituent threads execute `tcgen05.relinquish_alloc_permit`.
2683+ [For more information, refer to the PTX ISA]
2684+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
2685+ }];
2686+
2687+ let arguments = (ins
2688+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
2689+
2690+ let assemblyFormat = "attr-dict";
2691+
2692+ string llvmBuilder = [{
2693+ auto id = ($group == NVVM::Tcgen05GroupKind::CTA_1) ?
2694+ llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg1 :
2695+ llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg2;
2696+ createIntrinsicCall(builder, id);
2697+ }];
2698+ }
2699+
25952700//===----------------------------------------------------------------------===//
25962701// NVVM target attribute.
25972702//===----------------------------------------------------------------------===//
0 commit comments