-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AMDGPU] gfx1250 vmem prefetch target intrinsics and builtins #150466
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] gfx1250 vmem prefetch target intrinsics and builtins #150466
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-backend-amdgpu — Author: Stanislav Mekhanoshin (rampitec). Changes: Patch is 20.39 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/150466.diff — 12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 878543566f0e3..0b16e1264ce6b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -642,6 +642,9 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")
+
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 75e9710f96705..e96dd669788d2 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -108,7 +108,7 @@
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32
+// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index a21862c4a9395..81f39f987e3b6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -440,6 +440,25 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) {
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}
+// CHECK-LABEL: @test_prefetch(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[GPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[FPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR_ADDR]] to ptr
+// CHECK-NEXT: [[GPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GPTR_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[FPTR:%.*]], ptr [[FPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[GPTR:%.*]], ptr [[GPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.flat.prefetch(ptr [[TMP0]], i32 0)
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) [[TMP1]], i32 8)
+// CHECK-NEXT: ret void
+//
+void test_prefetch(generic void *fptr, global void *gptr) {
+ __builtin_amdgcn_flat_prefetch(fptr, 0);
+ __builtin_amdgcn_global_prefetch(gptr, 8);
+}
+
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 9711b3bdded6b..12a0f3c27fca2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -1,5 +1,5 @@
// REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));
@@ -36,6 +36,11 @@ void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
}
+void test_prefetch(generic void *fptr, global void *gptr, int cpol) {
+ __builtin_amdgcn_flat_prefetch(fptr, cpol); // expected-error {{'__builtin_amdgcn_flat_prefetch' must be a constant integer}}
+ __builtin_amdgcn_global_prefetch(gptr, cpol); // expected-error {{'__builtin_amdgcn_global_prefetch' must be a constant integer}}
+}
+
void test_cvt_f32_fp8_e5m3(global int* out, int a)
{
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8bfa34584c3a4..f313c6b73ce41 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3045,6 +3045,24 @@ def int_amdgcn_ds_bpermute_fi_b32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+def int_amdgcn_flat_prefetch : ClangBuiltin<"__builtin_amdgcn_flat_prefetch">,
+ Intrinsic<[],
+ [llvm_ptr_ty, // Pointer
+ llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+ IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
+ "", [SDNPMemOperand]
+ >;
+
+def int_amdgcn_global_prefetch : ClangBuiltin<"__builtin_amdgcn_global_prefetch">,
+ Intrinsic<[],
+ [LLVMQualPointerType<1>, // Pointer
+ llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+ IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
+ "", [SDNPMemOperand]
+ >;
+
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 9b05f7c339738..a10dca2f3986a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5437,6 +5437,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch:
+ return getDefaultMappingVOP(MI);
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db827f4fd7c46..8054ea489ac51 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2174,6 +2174,25 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
+multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
+ def : GCNPat <
+ (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
+ >;
+
+ def : GCNPat <
+ (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
+ let AddedComplexity = 11;
+ }
+}
+
+let SubtargetPredicate = HasVmemPrefInsts in {
+ // Patterns for target intrinsics
+ defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
+ defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
+} // End SubtargetPredicate = HasVmemPrefInsts
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f1a8ee118356e..1d7612cc2077f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1548,7 +1548,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_s_prefetch_data: {
+ case Intrinsic::amdgcn_s_prefetch_data:
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 4ca7444a73b35..e5c896feb953c 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -451,6 +451,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["permlane16-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["atomic-buffer-pk-add-bf16-inst"] = true;
+ Features["vmem-pref-insts"] = true;
Features["atomic-fadd-rtn-insts"] = true;
Features["atomic-buffer-global-pk-add-f16-insts"] = true;
Features["atomic-flat-pk-add-16-insts"] = true;
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
new file mode 100644
index 0000000000000..8007597a32fbe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+
+---
+name: flat_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: flat_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: global_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: global_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
new file mode 100644
index 0000000000000..89555d3060883
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 %col)
+
+define amdgpu_ps void @flat_prefetch(ptr %ptr) {
+; GCN-LABEL: flat_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr(ptr inreg %ptr) {
+; GCN-LABEL: flat_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr %ptr, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr %ptr, i32 %offset
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset_offset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr %gep1, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se_nt(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_dev_ht(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 18)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sys_lu(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_sys_lu:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 27)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
new file mode 100644
index 0000000000000..047a6ccf10d91
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 %col)
+
+define amdgpu_ps void @global_prefetch(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr(ptr addrspace(1) inreg %ptr) {
+; GCN-LABEL: global_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_offset(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset_offset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr addrspace(1) %gep1, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se_nt(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_dev_ht(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(...
[truncated]
@llvm/pr-subscribers-clang — Author: Stanislav Mekhanoshin (rampitec). Changes: Patch is 20.39 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/150466.diff — 12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 878543566f0e3..0b16e1264ce6b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -642,6 +642,9 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")
+
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 75e9710f96705..e96dd669788d2 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -108,7 +108,7 @@
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32
+// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index a21862c4a9395..81f39f987e3b6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -440,6 +440,25 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) {
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}
+// CHECK-LABEL: @test_prefetch(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[GPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[FPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR_ADDR]] to ptr
+// CHECK-NEXT: [[GPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GPTR_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[FPTR:%.*]], ptr [[FPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[GPTR:%.*]], ptr [[GPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.flat.prefetch(ptr [[TMP0]], i32 0)
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) [[TMP1]], i32 8)
+// CHECK-NEXT: ret void
+//
+void test_prefetch(generic void *fptr, global void *gptr) {
+ __builtin_amdgcn_flat_prefetch(fptr, 0);
+ __builtin_amdgcn_global_prefetch(gptr, 8);
+}
+
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 9711b3bdded6b..12a0f3c27fca2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -1,5 +1,5 @@
// REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));
@@ -36,6 +36,11 @@ void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
}
+void test_prefetch(generic void *fptr, global void *gptr, int cpol) {
+ __builtin_amdgcn_flat_prefetch(fptr, cpol); // expected-error {{'__builtin_amdgcn_flat_prefetch' must be a constant integer}}
+ __builtin_amdgcn_global_prefetch(gptr, cpol); // expected-error {{'__builtin_amdgcn_global_prefetch' must be a constant integer}}
+}
+
void test_cvt_f32_fp8_e5m3(global int* out, int a)
{
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8bfa34584c3a4..f313c6b73ce41 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3045,6 +3045,24 @@ def int_amdgcn_ds_bpermute_fi_b32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+def int_amdgcn_flat_prefetch : ClangBuiltin<"__builtin_amdgcn_flat_prefetch">,
+ Intrinsic<[],
+ [llvm_ptr_ty, // Pointer
+ llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+ IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
+ "", [SDNPMemOperand]
+ >;
+
+def int_amdgcn_global_prefetch : ClangBuiltin<"__builtin_amdgcn_global_prefetch">,
+ Intrinsic<[],
+ [LLVMQualPointerType<1>, // Pointer
+ llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+ IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
+ "", [SDNPMemOperand]
+ >;
+
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 9b05f7c339738..a10dca2f3986a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5437,6 +5437,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch:
+ return getDefaultMappingVOP(MI);
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db827f4fd7c46..8054ea489ac51 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2174,6 +2174,25 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
+multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
+ def : GCNPat <
+ (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
+ >;
+
+ def : GCNPat <
+ (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
+ let AddedComplexity = 11;
+ }
+}
+
+let SubtargetPredicate = HasVmemPrefInsts in {
+ // Patterns for target intrinsics
+ defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
+ defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
+} // End SubtargetPredicate = HasVmemPrefInsts
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f1a8ee118356e..1d7612cc2077f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1548,7 +1548,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_s_prefetch_data: {
+ case Intrinsic::amdgcn_s_prefetch_data:
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 4ca7444a73b35..e5c896feb953c 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -451,6 +451,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["permlane16-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["atomic-buffer-pk-add-bf16-inst"] = true;
+ Features["vmem-pref-insts"] = true;
Features["atomic-fadd-rtn-insts"] = true;
Features["atomic-buffer-global-pk-add-f16-insts"] = true;
Features["atomic-flat-pk-add-16-insts"] = true;
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
new file mode 100644
index 0000000000000..8007597a32fbe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+
+---
+name: flat_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: flat_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: global_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: global_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
new file mode 100644
index 0000000000000..89555d3060883
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 %col)
+
+define amdgpu_ps void @flat_prefetch(ptr %ptr) {
+; GCN-LABEL: flat_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr(ptr inreg %ptr) {
+; GCN-LABEL: flat_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr %ptr, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr %ptr, i32 %offset
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset_offset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr %gep1, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se_nt(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_dev_ht(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 18)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sys_lu(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_sys_lu:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 27)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
new file mode 100644
index 0000000000000..047a6ccf10d91
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 %col)
+
+define amdgpu_ps void @global_prefetch(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr(ptr addrspace(1) inreg %ptr) {
+; GCN-LABEL: global_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_offset(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset_offset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr addrspace(1) %gep1, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se_nt(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_dev_ht(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(...
[truncated]
|
|
@llvm/pr-subscribers-llvm-ir Author: Stanislav Mekhanoshin (rampitec) ChangesPatch is 20.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150466.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 878543566f0e3..0b16e1264ce6b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -642,6 +642,9 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")
+
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 75e9710f96705..e96dd669788d2 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -108,7 +108,7 @@
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32
+// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index a21862c4a9395..81f39f987e3b6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -440,6 +440,25 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) {
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}
+// CHECK-LABEL: @test_prefetch(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[GPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[FPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR_ADDR]] to ptr
+// CHECK-NEXT: [[GPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GPTR_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[FPTR:%.*]], ptr [[FPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[GPTR:%.*]], ptr [[GPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.flat.prefetch(ptr [[TMP0]], i32 0)
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) [[TMP1]], i32 8)
+// CHECK-NEXT: ret void
+//
+void test_prefetch(generic void *fptr, global void *gptr) {
+ __builtin_amdgcn_flat_prefetch(fptr, 0);
+ __builtin_amdgcn_global_prefetch(gptr, 8);
+}
+
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 9711b3bdded6b..12a0f3c27fca2 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -1,5 +1,5 @@
// REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));
@@ -36,6 +36,11 @@ void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
}
+void test_prefetch(generic void *fptr, global void *gptr, int cpol) {
+ __builtin_amdgcn_flat_prefetch(fptr, cpol); // expected-error {{'__builtin_amdgcn_flat_prefetch' must be a constant integer}}
+ __builtin_amdgcn_global_prefetch(gptr, cpol); // expected-error {{'__builtin_amdgcn_global_prefetch' must be a constant integer}}
+}
+
void test_cvt_f32_fp8_e5m3(global int* out, int a)
{
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8bfa34584c3a4..f313c6b73ce41 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3045,6 +3045,24 @@ def int_amdgcn_ds_bpermute_fi_b32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+def int_amdgcn_flat_prefetch : ClangBuiltin<"__builtin_amdgcn_flat_prefetch">,
+ Intrinsic<[],
+ [llvm_ptr_ty, // Pointer
+ llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+ IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
+ "", [SDNPMemOperand]
+ >;
+
+def int_amdgcn_global_prefetch : ClangBuiltin<"__builtin_amdgcn_global_prefetch">,
+ Intrinsic<[],
+ [LLVMQualPointerType<1>, // Pointer
+ llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+ IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
+ "", [SDNPMemOperand]
+ >;
+
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 9b05f7c339738..a10dca2f3986a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5437,6 +5437,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch:
+ return getDefaultMappingVOP(MI);
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db827f4fd7c46..8054ea489ac51 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2174,6 +2174,25 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
+multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
+ def : GCNPat <
+ (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
+ >;
+
+ def : GCNPat <
+ (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
+ let AddedComplexity = 11;
+ }
+}
+
+let SubtargetPredicate = HasVmemPrefInsts in {
+ // Patterns for target intrinsics
+ defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
+ defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
+} // End SubtargetPredicate = HasVmemPrefInsts
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f1a8ee118356e..1d7612cc2077f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1548,7 +1548,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_s_prefetch_data: {
+ case Intrinsic::amdgcn_s_prefetch_data:
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 4ca7444a73b35..e5c896feb953c 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -451,6 +451,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["permlane16-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["atomic-buffer-pk-add-bf16-inst"] = true;
+ Features["vmem-pref-insts"] = true;
Features["atomic-fadd-rtn-insts"] = true;
Features["atomic-buffer-global-pk-add-f16-insts"] = true;
Features["atomic-flat-pk-add-16-insts"] = true;
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
new file mode 100644
index 0000000000000..8007597a32fbe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+
+---
+name: flat_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: flat_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: global_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: global_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
new file mode 100644
index 0000000000000..89555d3060883
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 %col)
+
+define amdgpu_ps void @flat_prefetch(ptr %ptr) {
+; GCN-LABEL: flat_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr(ptr inreg %ptr) {
+; GCN-LABEL: flat_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr %ptr, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr %ptr, i32 %offset
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset_offset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr %gep1, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se_nt(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_dev_ht(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 18)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sys_lu(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_sys_lu:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 27)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
new file mode 100644
index 0000000000000..047a6ccf10d91
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 %col)
+
+define amdgpu_ps void @global_prefetch(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr(ptr addrspace(1) inreg %ptr) {
+; GCN-LABEL: global_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_offset(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset_offset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr addrspace(1) %gep1, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se_nt(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_dev_ht(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(...
[truncated]
|
c140539 to
d245f4b
Compare
d245f4b to
bfba490
Compare

No description provided.