Skip to content

[mlir][ROCDL] Plumb through AMDGPU memory access metadata #110916

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into llvm:main
Oct 9, 2024

Conversation

krzysz00
Copy link
Contributor

@krzysz00 krzysz00 commented Oct 2, 2024

The LLVM backend has moved from function-wide attributes for making assurances about potentially unsafe atomic operations (like "unsafe-fp-atomics") to metadata on individual atomic operations.

This commit adds support for generating this metadata from MLIR.

The LLVM backend has moved from function-wide attributes for making
assurances about potentially unsafe atomic operations (like
"unsafe-fp-atomics") to metadata on individual atomic operations.

This commit adds support for generating this metadata from MLIR.
@llvmbot
Copy link
Member

llvmbot commented Oct 2, 2024

@llvm/pr-subscribers-mlir-llvm

Author: Krzysztof Drewniak (krzysz00)

Changes

The LLVM backend has moved from function-wide attributes for making assurances about potentially unsafe atomic operations (like "unsafe-fp-atomics") to metadata on individual atomic operations.

This commit adds support for generating this metadata from MLIR.


Full diff: https://github.com/llvm/llvm-project/pull/110916.diff

4 Files Affected:

  • (modified) mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td (+1)
  • (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td (+7-2)
  • (modified) mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp (+26-1)
  • (modified) mlir/test/Target/LLVMIR/rocdl.mlir (+23)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index 2da45eba77655b..fae2fe9cc3f8d6 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -1055,6 +1055,7 @@ def LLVM_ConstantRangeAttr : LLVM_Attr<"ConstantRange", "constant_range"> {
     Syntax:
     ```
     `<` `i`(width($lower)) $lower `,` $upper `>`
+    ```
   }];
 
   let builders = [
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index aae2cf88ded041..1d515b2b7c801c 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -58,7 +58,12 @@ def ROCDL_Dialect : Dialect {
      "::mlir::StringAttr":$flat_work_group_size,
      "::mlir::IntegerAttr":$max_flat_work_group_size,
      "::mlir::IntegerAttr":$waves_per_eu,
-     "::mlir::BoolAttr":$unsafe_fp_atomics
+     "::mlir::BoolAttr":$unsafe_fp_atomics,
+     // Correspond to LLVM metadata of the same name
+     "::mlir::UnitAttr":$last_use,
+     "::mlir::UnitAttr":$no_remote_memory,
+     "::mlir::UnitAttr":$no_fine_grained_memory,
+     "::mlir::UnitAttr":$ignore_denormal_mode
   );
 
   let useDefaultAttributePrinterParser = 1;
@@ -88,7 +93,7 @@ class ROCDL_IntrPure1Op<string mnemonic> :
 
 class ROCDL_IntrOp<string mnemonic, list<int> overloadedResults,
   list<int> overloadedOperands, list<Trait> traits, int numResults,
-  int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list<int> immArgPositions = [], 
+  int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list<int> immArgPositions = [],
   list<string> immArgAttrNames = []> :
   LLVM_IntrOpBase<ROCDL_Dialect,  mnemonic,
     "amdgcn_" # !subst(".", "_", mnemonic), overloadedResults,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index ec21fbf714c24a..88a9d4c2a7ef23 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -77,6 +77,7 @@ class ROCDLDialectLLVMIRTranslationInterface
                  NamedAttribute attribute,
                  LLVM::ModuleTranslation &moduleTranslation) const final {
     auto *dialect = dyn_cast<ROCDL::ROCDLDialect>(attribute.getNameDialect());
+    llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
     if (dialect->getKernelAttrHelper().getName() == attribute.getName()) {
       auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
       if (!func)
@@ -198,7 +199,6 @@ class ROCDLDialectLLVMIRTranslationInterface
       if (!value)
         return op->emitOpError(Twine(attribute.getName()) +
                                " must be a dense i32 array attribute");
-      llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
       SmallVector<llvm::Metadata *, 3> metadata;
       llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
       for (int32_t i : value.asArrayRef()) {
@@ -210,6 +210,31 @@ class ROCDLDialectLLVMIRTranslationInterface
       llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata);
       llvmFunc->setMetadata("reqd_work_group_size", node);
     }
+
+    // Atomic and nontemporal metadata
+    if (dialect->getLastUseAttrHelper().getName() == attribute.getName()) {
+      for (llvm::Instruction *i : instructions)
+        i->setMetadata("amdgpu.last.use", llvm::MDNode::get(llvmContext, {}));
+    }
+    if (dialect->getNoRemoteMemoryAttrHelper().getName() ==
+        attribute.getName()) {
+      for (llvm::Instruction *i : instructions)
+        i->setMetadata("amdgpu.no.remote.memory",
+                       llvm::MDNode::get(llvmContext, {}));
+    }
+    if (dialect->getNoFineGrainedMemoryAttrHelper().getName() ==
+        attribute.getName()) {
+      for (llvm::Instruction *i : instructions)
+        i->setMetadata("amdgpu.no.fine.grained.memory",
+                       llvm::MDNode::get(llvmContext, {}));
+    }
+    if (dialect->getIgnoreDenormalModeAttrHelper().getName() ==
+        attribute.getName()) {
+      for (llvm::Instruction *i : instructions)
+        i->setMetadata("amdgpu.ignore.denormal.mode",
+                       llvm::MDNode::get(llvmContext, {}));
+    }
+
     return success();
   }
 };
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 08c2d4e6477970..97276b087b7e93 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -564,11 +564,34 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
 }
 
 llvm.func @rocdl_16bit_packed_floats(%sourceA: f32, %sourceB: f32) -> vector<2xf16> {
+  // CHECK-LABEL: @rocdl_16bit_packed_floats
   // CHECK: call <2 x half> @llvm.amdgcn.cvt.pkrtz(float {{.*}}, float {{.*}})
   %source = rocdl.cvt.pkrtz %sourceA, %sourceB  : vector<2xf16>
   llvm.return %source : vector<2xf16>
 }
 
+llvm.func @rocdl_atomic_attrs(%ptr: !llvm.ptr<1>, %data: f32) {
+  // CHECK-LABEL: @rocdl_atomic_attrs
+  // CHECK: atomicrmw
+  // CHECK-SAME: !amdgpu.ignore.denormal.mode
+  // CHECK-SAME: !amdgpu.no.fine.grained.memory
+  // CHECK-SAME: !amdgpu.no.remote.memory
+  llvm.atomicrmw fadd %ptr, %data monotonic {
+    rocdl.ignore_denormal_mode,
+    rocdl.no_fine_grained_memory,
+    rocdl.no_remote_memory} : !llvm.ptr<1>, f32
+  llvm.return
+}
+
+llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 {
+  // CHECK-LABEL: @rocdl_last_use
+  // CHECK: %[[ret:.+]] = load
+  // CHECK-SAME: !amdgpu.last.use
+  // CHECK: ret i32 %[[ret]]
+  %ret = llvm.load %ptr {rocdl.last_use} : !llvm.ptr<1> -> i32
+  llvm.return %ret : i32
+}
+
 // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
 // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"

@llvmbot
Copy link
Member

llvmbot commented Oct 2, 2024

@llvm/pr-subscribers-mlir

Author: Krzysztof Drewniak (krzysz00)

Changes

The LLVM backend has moved from function-wide attributes for making assurances about potentially unsafe atomic operations (like "unsafe-fp-atomics") to metadata on individual atomic operations.

This commit adds support for generating this metadata from MLIR.


Full diff: https://github.com/llvm/llvm-project/pull/110916.diff

4 Files Affected:

  • (modified) mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td (+1)
  • (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td (+7-2)
  • (modified) mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp (+26-1)
  • (modified) mlir/test/Target/LLVMIR/rocdl.mlir (+23)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index 2da45eba77655b..fae2fe9cc3f8d6 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -1055,6 +1055,7 @@ def LLVM_ConstantRangeAttr : LLVM_Attr<"ConstantRange", "constant_range"> {
     Syntax:
     ```
     `<` `i`(width($lower)) $lower `,` $upper `>`
+    ```
   }];
 
   let builders = [
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index aae2cf88ded041..1d515b2b7c801c 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -58,7 +58,12 @@ def ROCDL_Dialect : Dialect {
      "::mlir::StringAttr":$flat_work_group_size,
      "::mlir::IntegerAttr":$max_flat_work_group_size,
      "::mlir::IntegerAttr":$waves_per_eu,
-     "::mlir::BoolAttr":$unsafe_fp_atomics
+     "::mlir::BoolAttr":$unsafe_fp_atomics,
+     // Correspond to LLVM metadata of the same name
+     "::mlir::UnitAttr":$last_use,
+     "::mlir::UnitAttr":$no_remote_memory,
+     "::mlir::UnitAttr":$no_fine_grained_memory,
+     "::mlir::UnitAttr":$ignore_denormal_mode
   );
 
   let useDefaultAttributePrinterParser = 1;
@@ -88,7 +93,7 @@ class ROCDL_IntrPure1Op<string mnemonic> :
 
 class ROCDL_IntrOp<string mnemonic, list<int> overloadedResults,
   list<int> overloadedOperands, list<Trait> traits, int numResults,
-  int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list<int> immArgPositions = [], 
+  int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list<int> immArgPositions = [],
   list<string> immArgAttrNames = []> :
   LLVM_IntrOpBase<ROCDL_Dialect,  mnemonic,
     "amdgcn_" # !subst(".", "_", mnemonic), overloadedResults,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index ec21fbf714c24a..88a9d4c2a7ef23 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -77,6 +77,7 @@ class ROCDLDialectLLVMIRTranslationInterface
                  NamedAttribute attribute,
                  LLVM::ModuleTranslation &moduleTranslation) const final {
     auto *dialect = dyn_cast<ROCDL::ROCDLDialect>(attribute.getNameDialect());
+    llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
     if (dialect->getKernelAttrHelper().getName() == attribute.getName()) {
       auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
       if (!func)
@@ -198,7 +199,6 @@ class ROCDLDialectLLVMIRTranslationInterface
       if (!value)
         return op->emitOpError(Twine(attribute.getName()) +
                                " must be a dense i32 array attribute");
-      llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
       SmallVector<llvm::Metadata *, 3> metadata;
       llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
       for (int32_t i : value.asArrayRef()) {
@@ -210,6 +210,31 @@ class ROCDLDialectLLVMIRTranslationInterface
       llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata);
       llvmFunc->setMetadata("reqd_work_group_size", node);
     }
+
+    // Atomic and nontemporal metadata
+    if (dialect->getLastUseAttrHelper().getName() == attribute.getName()) {
+      for (llvm::Instruction *i : instructions)
+        i->setMetadata("amdgpu.last.use", llvm::MDNode::get(llvmContext, {}));
+    }
+    if (dialect->getNoRemoteMemoryAttrHelper().getName() ==
+        attribute.getName()) {
+      for (llvm::Instruction *i : instructions)
+        i->setMetadata("amdgpu.no.remote.memory",
+                       llvm::MDNode::get(llvmContext, {}));
+    }
+    if (dialect->getNoFineGrainedMemoryAttrHelper().getName() ==
+        attribute.getName()) {
+      for (llvm::Instruction *i : instructions)
+        i->setMetadata("amdgpu.no.fine.grained.memory",
+                       llvm::MDNode::get(llvmContext, {}));
+    }
+    if (dialect->getIgnoreDenormalModeAttrHelper().getName() ==
+        attribute.getName()) {
+      for (llvm::Instruction *i : instructions)
+        i->setMetadata("amdgpu.ignore.denormal.mode",
+                       llvm::MDNode::get(llvmContext, {}));
+    }
+
     return success();
   }
 };
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 08c2d4e6477970..97276b087b7e93 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -564,11 +564,34 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
 }
 
 llvm.func @rocdl_16bit_packed_floats(%sourceA: f32, %sourceB: f32) -> vector<2xf16> {
+  // CHECK-LABEL: @rocdl_16bit_packed_floats
   // CHECK: call <2 x half> @llvm.amdgcn.cvt.pkrtz(float {{.*}}, float {{.*}})
   %source = rocdl.cvt.pkrtz %sourceA, %sourceB  : vector<2xf16>
   llvm.return %source : vector<2xf16>
 }
 
+llvm.func @rocdl_atomic_attrs(%ptr: !llvm.ptr<1>, %data: f32) {
+  // CHECK-LABEL: @rocdl_atomic_attrs
+  // CHECK: atomicrmw
+  // CHECK-SAME: !amdgpu.ignore.denormal.mode
+  // CHECK-SAME: !amdgpu.no.fine.grained.memory
+  // CHECK-SAME: !amdgpu.no.remote.memory
+  llvm.atomicrmw fadd %ptr, %data monotonic {
+    rocdl.ignore_denormal_mode,
+    rocdl.no_fine_grained_memory,
+    rocdl.no_remote_memory} : !llvm.ptr<1>, f32
+  llvm.return
+}
+
+llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 {
+  // CHECK-LABEL: @rocdl_last_use
+  // CHECK: %[[ret:.+]] = load
+  // CHECK-SAME: !amdgpu.last.use
+  // CHECK: ret i32 %[[ret]]
+  %ret = llvm.load %ptr {rocdl.last_use} : !llvm.ptr<1> -> i32
+  llvm.return %ret : i32
+}
+
 // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
 // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"

Copy link
Contributor

@qedawkins qedawkins left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

Co-authored-by: Quinn Dawkins <quinn.dawkins@gmail.com>
@krzysz00 krzysz00 merged commit 774893d into llvm:main Oct 9, 2024
9 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants