[AtomicExpand] Add bitcasts when expanding load atomic vector #120716
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-arm

Author: None (jofrn)

Changes

AtomicExpand fails for aligned `load atomic <n x T>` because it does not find a compatible library call. This change adds appropriate bitcasts so that the call can be lowered. It also adds support for 128 bit lowering in tablegen to support SSE/AVX.
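As a minimal illustration (mirroring the x86 test added below; the function name here is illustrative), this is the kind of IR that previously failed to expand:

```llvm
; An aligned atomic vector load. AtomicExpand previously found no
; compatible libcall lowering for the vector type.
define <2 x ptr> @example(ptr %x) {
  %ret = load atomic <2 x ptr>, ptr %x acquire, align 16
  ret <2 x ptr> %ret
}
```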
Full diff: https://github.com/llvm/llvm-project/pull/120716.diff

3 Files Affected:
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index a75fa688d87a8d..d860bbf2685ed2 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1884,7 +1884,9 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
IRBuilder<> Builder(I);
IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
- bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
+ const bool IsAtomic = isa<LoadInst>(I) ? cast<LoadInst>(I)->isAtomic() : false;
+ const bool UseSizedLibcall = !(I->getType()->isVectorTy() && IsAtomic) &&
+ canUseSizedAtomicCall(Size, Alignment, DL);
Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
const Align AllocaAlignment = DL.getPrefTypeAlign(SizedIntTy);
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 560dfde356c29d..7f0f3008d2d5c2 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -983,3 +983,60 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) {
store atomic double %val1, ptr %ptr seq_cst, align 8
ret void
}
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 {
+; ARM-LABEL: atomic_vec1_ptr:
+; ARM: @ %bb.0:
+; ARM-NEXT: ldr r0, [r0]
+; ARM-NEXT: dmb ish
+; ARM-NEXT: bx lr
+;
+; ARMOPTNONE-LABEL: atomic_vec1_ptr:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT: ldr r0, [r0]
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: bx lr
+;
+; THUMBTWO-LABEL: atomic_vec1_ptr:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT: ldr r0, [r0]
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: bx lr
+;
+; THUMBONE-LABEL: atomic_vec1_ptr:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT: push {r7, lr}
+; THUMBONE-NEXT: movs r1, #0
+; THUMBONE-NEXT: mov r2, r1
+; THUMBONE-NEXT: bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT: pop {r7, pc}
+;
+; ARMV4-LABEL: atomic_vec1_ptr:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT: push {r11, lr}
+; ARMV4-NEXT: sub sp, sp, #8
+; ARMV4-NEXT: add r2, sp, #4
+; ARMV4-NEXT: mov r1, r0
+; ARMV4-NEXT: mov r0, #4
+; ARMV4-NEXT: mov r3, #2
+; ARMV4-NEXT: bl __atomic_load
+; ARMV4-NEXT: ldr r0, [sp, #4]
+; ARMV4-NEXT: add sp, sp, #8
+; ARMV4-NEXT: pop {r11, lr}
+; ARMV4-NEXT: mov pc, lr
+;
+; ARMV6-LABEL: atomic_vec1_ptr:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT: ldr r0, [r0]
+; ARMV6-NEXT: mov r1, #0
+; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT: bx lr
+;
+; THUMBM-LABEL: atomic_vec1_ptr:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT: ldr r0, [r0]
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: bx lr
+ %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+ ret <1 x ptr> %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 3157e25a3eee20..e92d9ab4add1bc 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -399,17 +399,58 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
ret <2 x i32> %ret
}
+define <2 x ptr> @atomic_vec2_ptr_align(ptr %x) nounwind {
+; CHECK3-LABEL: atomic_vec2_ptr_align:
+; CHECK3: ## %bb.0:
+; CHECK3-NEXT: subq $24, %rsp
+; CHECK3-NEXT: movq %rdi, %rsi
+; CHECK3-NEXT: movq %rsp, %rdx
+; CHECK3-NEXT: movl $16, %edi
+; CHECK3-NEXT: movl $2, %ecx
+; CHECK3-NEXT: callq ___atomic_load
+; CHECK3-NEXT: movaps (%rsp), %xmm0
+; CHECK3-NEXT: addq $24, %rsp
+; CHECK3-NEXT: retq
+;
+; CHECK0-LABEL: atomic_vec2_ptr_align:
+; CHECK0: ## %bb.0:
+; CHECK0-NEXT: subq $24, %rsp
+; CHECK0-NEXT: movq %rdi, %rsi
+; CHECK0-NEXT: movl $16, %edi
+; CHECK0-NEXT: movq %rsp, %rdx
+; CHECK0-NEXT: movl $2, %ecx
+; CHECK0-NEXT: callq ___atomic_load
+; CHECK0-NEXT: movdqa (%rsp), %xmm0
+; CHECK0-NEXT: addq $24, %rsp
+; CHECK0-NEXT: retq
+ %ret = load atomic <2 x ptr>, ptr %x acquire, align 16
+ ret <2 x ptr> %ret
+}
+
define <4 x float> @atomic_vec4_float_align(ptr %x) nounwind {
-; CHECK-LABEL: atomic_vec4_float_align:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movl $2, %esi
-; CHECK-NEXT: callq ___atomic_load_16
-; CHECK-NEXT: movq %rdx, %xmm1
-; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
+; CHECK3-LABEL: atomic_vec4_float_align:
+; CHECK3: ## %bb.0:
+; CHECK3-NEXT: subq $24, %rsp
+; CHECK3-NEXT: movq %rdi, %rsi
+; CHECK3-NEXT: movq %rsp, %rdx
+; CHECK3-NEXT: movl $16, %edi
+; CHECK3-NEXT: movl $2, %ecx
+; CHECK3-NEXT: callq ___atomic_load
+; CHECK3-NEXT: movaps (%rsp), %xmm0
+; CHECK3-NEXT: addq $24, %rsp
+; CHECK3-NEXT: retq
+;
+; CHECK0-LABEL: atomic_vec4_float_align:
+; CHECK0: ## %bb.0:
+; CHECK0-NEXT: subq $24, %rsp
+; CHECK0-NEXT: movq %rdi, %rsi
+; CHECK0-NEXT: movl $16, %edi
+; CHECK0-NEXT: movq %rsp, %rdx
+; CHECK0-NEXT: movl $2, %ecx
+; CHECK0-NEXT: callq ___atomic_load
+; CHECK0-NEXT: movaps (%rsp), %xmm0
+; CHECK0-NEXT: addq $24, %rsp
+; CHECK0-NEXT: retq
%ret = load atomic <4 x float>, ptr %x acquire, align 16
ret <4 x float> %ret
}
✅ With the latest revision this PR passed the C/C++ code formatter.
bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
const bool IsAtomic =
    isa<LoadInst>(I) ? cast<LoadInst>(I)->isAtomic() : false;
const bool UseSizedLibcall = !(I->getType()->isVectorTy() && IsAtomic) &&
This shouldn't need to consider whether it's atomic or the type. The sized call should work fine. Some casts may be necessary.
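For illustration, the shape the reviewer seems to be suggesting, shown at the IR level (a sketch only; the function name and 32-bit pointer width are assumptions): the sized call returns an integer of the access width, which is then cast back to the original type.

```llvm
declare i32 @__atomic_load_4(ptr, i32)

; Sketch: lower load atomic <1 x ptr> (32-bit pointers) through the
; sized libcall, then cast the i32 result back to the vector type.
define <1 x ptr> @sized_call_sketch(ptr %x) {
  %int = call i32 @__atomic_load_4(ptr %x, i32 2) ; 2 = acquire
  %p = inttoptr i32 %int to ptr
  %ret = insertelement <1 x ptr> poison, ptr %p, i64 0
  ret <1 x ptr> %ret
}
```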
        Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS)));
    V = Builder.CreateIntToPtr(BC, I->getType());
  } else
    V = Builder.CreateBitOrPointerCast(Result, I->getType());
Suggested change:
-    V = Builder.CreateBitOrPointerCast(Result, I->getType());
+    V = Builder.CreateBitOrPointerCast(Result, VTy);
    unsigned AS = PtrTy->getAddressSpace();
    Value *BC = Builder.CreateBitCast(
        Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS)));
    V = Builder.CreateIntToPtr(BC, I->getType());
Suggested change:
-    V = Builder.CreateIntToPtr(BC, I->getType());
+    V = Builder.CreateIntToPtr(BC, VTy);
  %ret = load atomic <2 x i16>, ptr %x acquire, align 8
  ret <2 x i16> %ret
}

define <2 x half> @atomic_vec2_half(ptr %x) nounwind {
; CHECK-LABEL: define <2 x half> @atomic_vec2_half(
; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = load atomic <2 x half>, ptr [[X]] acquire, align 8
; CHECK-NEXT: ret <2 x half> [[RET]]
;
  %ret = load atomic <2 x half>, ptr %x acquire, align 8
The 2 x i16 and 2 x half cases aren't expanded; I guess the default expansion rule missed these for some reason?
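If the default rule did fire, the expected expansion for the <2 x half> case would presumably be an equally sized integer atomic load plus a bitcast, along these lines (illustrative only, not output from this patch):

```llvm
; Hypothetical expanded form: atomically load the 32 bits, then
; reinterpret them as the vector type.
define <2 x half> @atomic_vec2_half_expanded(ptr %x) {
  %bits = load atomic i32, ptr %x acquire, align 8
  %ret = bitcast i32 %bits to <2 x half>
  ret <2 x half> %ret
}
```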
  auto *PtrTy = dyn_cast<PointerType>(I->getType()->getScalarType());
  auto *VTy = dyn_cast<VectorType>(I->getType());
  if (VTy && PtrTy && !Result->getType()->isVectorTy()) {
There should probably be a utility for this somewhere
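Such a utility might look roughly like this (a sketch assuming `Result` is the scalar integer produced by the libcall; the helper name is hypothetical):

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: cast a libcall's integer result back to the
// requested type, routing vectors of pointers through an integer
// vector because bitcast cannot produce pointer vectors directly.
static Value *castToOriginalType(IRBuilder<> &Builder, Value *Result,
                                 Type *OrigTy, const DataLayout &DL) {
  auto *VTy = dyn_cast<VectorType>(OrigTy);
  auto *PtrTy = dyn_cast<PointerType>(OrigTy->getScalarType());
  if (VTy && PtrTy && !Result->getType()->isVectorTy()) {
    unsigned AS = PtrTy->getAddressSpace();
    Value *BC = Builder.CreateBitCast(
        Result,
        VTy->getWithNewType(DL.getIntPtrType(OrigTy->getContext(), AS)));
    return Builder.CreateIntToPtr(BC, VTy);
  }
  return Builder.CreateBitOrPointerCast(Result, OrigTy);
}
```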
def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
          (VMOVAPDrm addr:$src)>; // load atomic <2 x i64>
def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
          (VMOVAPDrm addr:$src)>; // load atomic <4 x i32>
These are required for 128 bit vectors in SSE/AVX. The tests added in this PR require both the AtomicExpand change and these td records.
These require AVX/AVX512 variants (see below), but x86 doesn't guarantee atomicity for anything above 8 bytes (and those must be aligned to avoid crossing a cache line).
I think all known AVX-capable x86 targets allow 16-byte aligned atomics; it's not official, but we assume it in X86TargetLowering::shouldExpandAtomicLoadInIR.
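For reference, a condensed paraphrase of that check (not verbatim LLVM source; the subtarget query is simplified to a boolean here):

```cpp
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Paraphrased sketch: with AVX, a 16-byte aligned atomic load is kept
// as a plain 128-bit load (no expansion), relying on the unofficial
// guarantee that aligned 16-byte SSE/AVX accesses are atomic.
static TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadSketch(const LoadInst *LI, bool HasAVX) {
  if (LI->getType()->getPrimitiveSizeInBits() == 128 && HasAVX)
    return TargetLoweringBase::AtomicExpansionKind::None;
  // Otherwise request cmpxchg-based expansion for oversized loads.
  return TargetLoweringBase::AtomicExpansionKind::CmpXChg;
}
```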
AtomicExpand fails for aligned `load atomic <n x T>` because it does not find a compatible library call. This change adds appropriate bitcasts so that the call can be lowered. It also adds support for 128 bit lowering in tablegen to support SSE/AVX. commit-id:f430c1af
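Concretely, for the aligned <2 x ptr> case the resulting expansion is roughly the following (a sketch consistent with the x86 tests above; the generic __atomic_load libcall takes size, source, destination, and memory order):

```llvm
declare void @__atomic_load(i64, ptr, ptr, i32)

; Sketch of the generic libcall expansion: load through a temporary
; alloca, then cast the integer back to the pointer-vector type.
define <2 x ptr> @atomic_vec2_ptr_expanded(ptr %x) {
  %tmp = alloca i128, align 16
  call void @__atomic_load(i64 16, ptr %x, ptr %tmp, i32 2) ; 2 = acquire
  %bits = load i128, ptr %tmp, align 16
  %iv = bitcast i128 %bits to <2 x i64>
  %ret = inttoptr <2 x i64> %iv to <2 x ptr>
  ret <2 x ptr> %ret
}
```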
def atomic_load_128_v2i64 :
  PatFrag<(ops node:$ptr),
          (atomic_load node:$ptr)> {
  let IsAtomic = true;
  let MemoryVT = v2i64;
}

def atomic_load_128_v4i32 :
  PatFrag<(ops node:$ptr),
          (atomic_load node:$ptr)> {
  let IsAtomic = true;
  let MemoryVT = v4i32;
}
This patch should not require adding this, or touching any of the backend patterns
The tests that use them must also have the changes from AtomicExpand.