Skip to content

Commit 94cd66e

Browse files
authored
[SYCL][CUDA] Workaround for the problem with memory reordering (#1334)
Currently there is a bug in LLVM for the PTX target. PTX-target-specific intrinsics like llvm.nvvm.barrier0 are treated like regular LLVM intrinsics in Globals AA. As a result, there are situations when Globals AA concludes that a barrier intrinsic doesn't modify internal globals. This allows LLVM transformations like GVN to perform illegal memory reordering. This is a workaround until a permanent fix is implemented in the LLVM project. Signed-off-by: Artur Gainullin <artur.gainullin@intel.com>
1 parent 12d6901 commit 94cd66e

File tree

2 files changed

+67
-0
lines changed

2 files changed

+67
-0
lines changed

llvm/lib/Analysis/GlobalsModRef.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,17 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
534534
if (!F->isIntrinsic()) {
535535
KnowNothing = true;
536536
break;
537+
} else if (F->getName().contains("nvvm.barrier") or
538+
F->getName().contains("nvvm.membar")) {
539+
// Even if it is an intrinsic, consider that nothing is known for
540+
// NVVM barrier intrinsics to prevent illegal optimizations.
541+
// This is a workaround for the bug on PTX target: barrier
542+
// intrinsics are implemented as llvm intrinsics; as a result, there
543+
// are cases when globals alias analysis can produce a result that
544+
// barrier doesn't modify an internal global, which causes illegal
545+
// reordering of memory accesses.
546+
KnowNothing = true;
547+
break;
537548
}
538549
}
539550
continue;
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt < %s -globals-aa -gvn -S | FileCheck %s
3+
4+
; Check that load from global variable is not moved across barrier.
5+
6+
7+
target triple = "nvptx"
8+
9+
@foo.l.0 = internal unnamed_addr addrspace(3) global i32 undef, align 4
10+
11+
define dso_local spir_kernel void @foo(i32 addrspace(1)* nocapture %0) {
12+
; CHECK-LABEL: @foo(
13+
; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @_Z13get_global_idj(i32 0) #0
14+
; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @_Z12get_local_idj(i32 0) #0
15+
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
16+
; CHECK-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP7:%.*]]
17+
; CHECK: 5:
18+
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], 5
19+
; CHECK-NEXT: store i32 [[TMP6]], i32 addrspace(3)* @foo.l.0, align 4
20+
; CHECK-NEXT: br label [[TMP7]]
21+
; CHECK: 7:
22+
; CHECK-NEXT: tail call void @llvm.nvvm.barrier0() #2
23+
; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32 addrspace(3)* @foo.l.0, align 4
24+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0:%.*]], i32 [[TMP2]]
25+
; CHECK-NEXT: store i32 [[TMP8]], i32 addrspace(1)* [[TMP9]], align 4
26+
; CHECK-NEXT: ret void
27+
;
28+
%2 = tail call i32 @_Z13get_global_idj(i32 0) #0
29+
%3 = tail call i32 @_Z12get_local_idj(i32 0) #0
30+
%4 = icmp eq i32 %3, 0
31+
br i1 %4, label %5, label %7
32+
33+
5: ; preds = %1
34+
%6 = add i32 %2, 5
35+
store i32 %6, i32 addrspace(3)* @foo.l.0, align 4
36+
br label %7
37+
38+
7: ; preds = %5, %1
39+
tail call void @llvm.nvvm.barrier0() #1
40+
%8 = load i32, i32 addrspace(3)* @foo.l.0, align 4
41+
%9 = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %2
42+
store i32 %8, i32 addrspace(1)* %9, align 4
43+
ret void
44+
}
45+
46+
; Function Attrs: convergent nounwind readnone
47+
declare dso_local i32 @_Z13get_global_idj(i32) local_unnamed_addr #0
48+
49+
; Function Attrs: convergent nounwind readnone
50+
declare dso_local i32 @_Z12get_local_idj(i32) local_unnamed_addr #0
51+
52+
; Function Attrs: convergent
53+
declare dso_local void @llvm.nvvm.barrier0() local_unnamed_addr #1
54+
55+
attributes #0 = { convergent nounwind readnone }
56+
attributes #1 = { convergent }

0 commit comments

Comments
 (0)