[X86] Enable unaligned loads on x86 using cmpxchg #142645
@llvm/pr-subscribers-backend-x86

Author: AZero13 (AZero13)

Changes

We can do this by using cmpxchg; it is really the only way. The big concern is that x86 CPUs may either raise an exception on such an access or handle it, and I am unsure how to deal with that.

Full diff: https://github.com/llvm/llvm-project/pull/142645.diff

4 Files Affected:
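A minimal standalone sketch of the load-via-cmpxchg idiom described above, written against the GCC/Clang `__atomic_compare_exchange_n` builtin; this is an illustration only, not code taken from the patch:

```cpp
#include <cstdint>

// Sketch only: an atomic load of a possibly misaligned 32-bit value expressed
// as a compare-exchange with expected == desired == 0. If the comparison
// fails, `lock cmpxchg` writes the current memory contents into `expected`;
// if it succeeds, `expected` already equals them. Either way `expected` holds
// the observed value, so CAS(p, 0, 0) behaves as a load.
uint32_t load_via_cmpxchg(uint32_t *p) {
  uint32_t expected = 0;
  // Whether a misaligned (potentially cache-line-splitting) locked access is
  // acceptable is exactly the concern raised in the PR description.
  __atomic_compare_exchange_n(p, &expected, 0u, /*weak=*/false,
                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  return expected;
}
```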
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..de4dc63d5cbbd 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -232,8 +232,21 @@ template <typename Inst>
static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
unsigned Size = getAtomicOpSize(I);
Align Alignment = I->getAlign();
- return Alignment >= Size &&
- Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
+
+ // X86 we can do unaligned loads
+ return Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8 &&
+ (Alignment >= Size || TLI->supportsUnalignedAtomics());
+}
+
+template <typename Inst>
+static bool canLowerAtomicAsUnaligned(const TargetLowering *TLI, Inst *I) {
+ if (!TLI->supportsUnalignedAtomics())
+ return false;
+ unsigned Size = getAtomicOpSize(I);
+ Align Alignment = I->getAlign();
+
+ // X86 we can do unaligned loads
+ return Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8 && (Alignment < Size);
}
bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
@@ -510,6 +523,10 @@ AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
}
bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
+
+ if (canLowerAtomicAsUnaligned(TLI, LI))
+ return expandAtomicLoadToCmpXchg(LI);
+
switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
case TargetLoweringBase::AtomicExpansionKind::None:
return false;
@@ -532,6 +549,11 @@ bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
}
bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
+ if (canLowerAtomicAsUnaligned(TLI, SI)) {
+ expandAtomicStore(SI);
+ return true;
+ }
+
switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
case TargetLoweringBase::AtomicExpansionKind::None:
return false;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2399936ffd827..9c19cd4240f72 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -137,6 +137,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Set up the TargetLowering object.
+ // X86 supports unaligned atomic memory accesses via cmpxchg8b and cmpxchg16b
+ setSupportsUnalignedAtomics(true);
+
// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
diff --git a/llvm/test/CodeGen/X86/atomic-unaligned.ll b/llvm/test/CodeGen/X86/atomic-unaligned.ll
index f02041cc5fc8f..3931746af8ff0 100644
--- a/llvm/test/CodeGen/X86/atomic-unaligned.ll
+++ b/llvm/test/CodeGen/X86/atomic-unaligned.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
; Quick test to ensure that atomics which are not naturally-aligned
@@ -5,11 +6,19 @@
; sized libcalls.
define void @test_i32(ptr %a) nounwind {
; CHECK-LABEL: test_i32:
-; CHECK: callq __atomic_load
-; CHECK: callq __atomic_store
-; CHECK: callq __atomic_exchange
-; CHECK: callq __atomic_compare_exchange
-; CHECK: callq __atomic_compare_exchange
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: lock cmpxchgl %ecx, (%rdi)
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: xchgl %eax, (%rdi)
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: xchgl %eax, (%rdi)
+; CHECK-NEXT: lock addl $2, (%rdi)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: lock cmpxchgl %ecx, (%rdi)
+; CHECK-NEXT: retq
%t0 = load atomic i32, ptr %a seq_cst, align 2
store atomic i32 1, ptr %a seq_cst, align 2
%t1 = atomicrmw xchg ptr %a, i32 1 seq_cst, align 2
@@ -20,10 +29,74 @@ define void @test_i32(ptr %a) nounwind {
define void @test_i128(ptr %a) nounwind {
; CHECK-LABEL: test_i128:
-; CHECK: callq __atomic_load
-; CHECK: callq __atomic_store
-; CHECK: callq __atomic_exchange
-; CHECK: callq __atomic_compare_exchange
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $32, %rsp
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsp, %r14
+; CHECK-NEXT: movl $16, %edi
+; CHECK-NEXT: movq %rbx, %rsi
+; CHECK-NEXT: movq %r14, %rdx
+; CHECK-NEXT: movl $5, %ecx
+; CHECK-NEXT: callq __atomic_load@PLT
+; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq $1, (%rsp)
+; CHECK-NEXT: movq %rsp, %rdx
+; CHECK-NEXT: movl $16, %edi
+; CHECK-NEXT: movq %rbx, %rsi
+; CHECK-NEXT: movl $5, %ecx
+; CHECK-NEXT: callq __atomic_store@PLT
+; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq $1, (%rsp)
+; CHECK-NEXT: movq %rsp, %rdx
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT: movl $16, %edi
+; CHECK-NEXT: movq %rbx, %rsi
+; CHECK-NEXT: movq %r15, %rcx
+; CHECK-NEXT: movl $5, %r8d
+; CHECK-NEXT: callq __atomic_exchange@PLT
+; CHECK-NEXT: movq (%rbx), %rdx
+; CHECK-NEXT: movq 8(%rbx), %rcx
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB1_1: # %atomicrmw.start
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: addq $2, %rax
+; CHECK-NEXT: movq %rdx, (%rsp)
+; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: adcq $0, %rcx
+; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $16, %edi
+; CHECK-NEXT: movq %rbx, %rsi
+; CHECK-NEXT: movq %r14, %rdx
+; CHECK-NEXT: movq %r15, %rcx
+; CHECK-NEXT: movl $5, %r8d
+; CHECK-NEXT: movl $5, %r9d
+; CHECK-NEXT: callq __atomic_compare_exchange@PLT
+; CHECK-NEXT: movq (%rsp), %rdx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je .LBB1_1
+; CHECK-NEXT: # %bb.2: # %atomicrmw.end
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movaps %xmm0, (%rsp)
+; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rsp, %rdx
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: movl $16, %edi
+; CHECK-NEXT: movq %rbx, %rsi
+; CHECK-NEXT: movl $5, %r8d
+; CHECK-NEXT: movl $5, %r9d
+; CHECK-NEXT: callq __atomic_compare_exchange@PLT
+; CHECK-NEXT: addq $32, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: retq
%t0 = load atomic i128, ptr %a seq_cst, align 8
store atomic i128 1, ptr %a seq_cst, align 8
%t1 = atomicrmw xchg ptr %a, i128 1 seq_cst, align 8
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index e8e0ee0b7ef49..a3d706474b588 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -61,6 +61,42 @@ define void @store_i16(ptr %ptr, i16 %v) {
ret void
}
+define i16 @load_i16_unaligned(ptr %ptr) {
+; CHECK-O0-LABEL: load_i16_unaligned:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: xorl %eax, %eax
+; CHECK-O0-NEXT: movw %ax, %cx
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: lock cmpxchgw %cx, (%rdi)
+; CHECK-O0-NEXT: sete %cl
+; CHECK-O0-NEXT: retq
+;
+; CHECK-O3-LABEL: load_i16_unaligned:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: xorl %ecx, %ecx
+; CHECK-O3-NEXT: xorl %eax, %eax
+; CHECK-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; CHECK-O3-NEXT: retq
+ %v = load atomic i16, ptr %ptr unordered, align 1
+ ret i16 %v
+}
+
+
+define void @store_i16_unaligned(ptr %ptr, i16 %v) {
+; CHECK-O0-LABEL: store_i16_unaligned:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw %si, %ax
+; CHECK-O0-NEXT: xchgw %ax, (%rdi)
+; CHECK-O0-NEXT: retq
+;
+; CHECK-O3-LABEL: store_i16_unaligned:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: xchgw %si, (%rdi)
+; CHECK-O3-NEXT: retq
+ store atomic i16 %v, ptr %ptr unordered, align 1
+ ret void
+}
+
define i32 @load_i32(ptr %ptr) {
; CHECK-LABEL: load_i32:
; CHECK: # %bb.0:
@@ -79,6 +115,34 @@ define void @store_i32(ptr %ptr, i32 %v) {
ret void
}
+define i32 @load_i32_unaligned(ptr %ptr) {
+; CHECK-O0-LABEL: load_i32_unaligned:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: xorl %ecx, %ecx
+; CHECK-O0-NEXT: movl %ecx, %eax
+; CHECK-O0-NEXT: lock cmpxchgl %ecx, (%rdi)
+; CHECK-O0-NEXT: sete %cl
+; CHECK-O0-NEXT: retq
+;
+; CHECK-O3-LABEL: load_i32_unaligned:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: xorl %ecx, %ecx
+; CHECK-O3-NEXT: xorl %eax, %eax
+; CHECK-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; CHECK-O3-NEXT: retq
+ %v = load atomic i32, ptr %ptr unordered, align 1
+ ret i32 %v
+}
+
+define void @store_i32_unaligned(ptr %ptr, i32 %v) {
+; CHECK-LABEL: store_i32_unaligned:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xchgl %esi, (%rdi)
+; CHECK-NEXT: retq
+ store atomic i32 %v, ptr %ptr unordered, align 1
+ ret void
+}
+
define i64 @load_i64(ptr %ptr) {
; CHECK-LABEL: load_i64:
; CHECK: # %bb.0:
@@ -97,6 +161,35 @@ define void @store_i64(ptr %ptr, i64 %v) {
ret void
}
+define i64 @load_i64_unaligned(ptr %ptr) {
+; CHECK-O0-LABEL: load_i64_unaligned:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: xorl %eax, %eax
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: movq %rcx, %rax
+; CHECK-O0-NEXT: lock cmpxchgq %rcx, (%rdi)
+; CHECK-O0-NEXT: sete %cl
+; CHECK-O0-NEXT: retq
+;
+; CHECK-O3-LABEL: load_i64_unaligned:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: xorl %ecx, %ecx
+; CHECK-O3-NEXT: xorl %eax, %eax
+; CHECK-O3-NEXT: lock cmpxchgq %rcx, (%rdi)
+; CHECK-O3-NEXT: retq
+ %v = load atomic i64, ptr %ptr unordered, align 1
+ ret i64 %v
+}
+
+define void @store_i64_unaligned(ptr %ptr, i64 %v) {
+; CHECK-LABEL: store_i64_unaligned:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xchgq %rsi, (%rdi)
+; CHECK-NEXT: retq
+ store atomic i64 %v, ptr %ptr unordered, align 1
+ ret void
+}
+
;; The tests in the rest of this file are intended to show transforms which we
;; either *can't* do for legality, or don't currently implement. The later
;; are noted carefully where relevant.
@@ -633,12 +726,12 @@ define i64 @load_fold_sdiv2(ptr %p, i64 %v2) {
; CHECK-O3-NEXT: movq %rax, %rcx
; CHECK-O3-NEXT: orq %rsi, %rcx
; CHECK-O3-NEXT: shrq $32, %rcx
-; CHECK-O3-NEXT: je .LBB35_1
+; CHECK-O3-NEXT: je .LBB41_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: cqto
; CHECK-O3-NEXT: idivq %rsi
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB35_1:
+; CHECK-O3-NEXT: .LBB41_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %esi
@@ -664,12 +757,12 @@ define i64 @load_fold_sdiv3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq %rax, %rdx
; CHECK-O3-NEXT: orq %rcx, %rdx
; CHECK-O3-NEXT: shrq $32, %rdx
-; CHECK-O3-NEXT: je .LBB36_1
+; CHECK-O3-NEXT: je .LBB42_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: cqto
; CHECK-O3-NEXT: idivq %rcx
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB36_1:
+; CHECK-O3-NEXT: .LBB42_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %ecx
@@ -719,12 +812,12 @@ define i64 @load_fold_udiv2(ptr %p, i64 %v2) {
; CHECK-O3-NEXT: movq %rax, %rcx
; CHECK-O3-NEXT: orq %rsi, %rcx
; CHECK-O3-NEXT: shrq $32, %rcx
-; CHECK-O3-NEXT: je .LBB38_1
+; CHECK-O3-NEXT: je .LBB44_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divq %rsi
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB38_1:
+; CHECK-O3-NEXT: .LBB44_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %esi
@@ -751,12 +844,12 @@ define i64 @load_fold_udiv3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq %rax, %rdx
; CHECK-O3-NEXT: orq %rcx, %rdx
; CHECK-O3-NEXT: shrq $32, %rdx
-; CHECK-O3-NEXT: je .LBB39_1
+; CHECK-O3-NEXT: je .LBB45_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divq %rcx
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB39_1:
+; CHECK-O3-NEXT: .LBB45_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %ecx
@@ -816,13 +909,13 @@ define i64 @load_fold_srem2(ptr %p, i64 %v2) {
; CHECK-O3-NEXT: movq %rax, %rcx
; CHECK-O3-NEXT: orq %rsi, %rcx
; CHECK-O3-NEXT: shrq $32, %rcx
-; CHECK-O3-NEXT: je .LBB41_1
+; CHECK-O3-NEXT: je .LBB47_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: cqto
; CHECK-O3-NEXT: idivq %rsi
; CHECK-O3-NEXT: movq %rdx, %rax
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB41_1:
+; CHECK-O3-NEXT: .LBB47_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %esi
@@ -849,13 +942,13 @@ define i64 @load_fold_srem3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq %rax, %rdx
; CHECK-O3-NEXT: orq %rcx, %rdx
; CHECK-O3-NEXT: shrq $32, %rdx
-; CHECK-O3-NEXT: je .LBB42_1
+; CHECK-O3-NEXT: je .LBB48_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: cqto
; CHECK-O3-NEXT: idivq %rcx
; CHECK-O3-NEXT: movq %rdx, %rax
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB42_1:
+; CHECK-O3-NEXT: .LBB48_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %ecx
@@ -912,13 +1005,13 @@ define i64 @load_fold_urem2(ptr %p, i64 %v2) {
; CHECK-O3-NEXT: movq %rax, %rcx
; CHECK-O3-NEXT: orq %rsi, %rcx
; CHECK-O3-NEXT: shrq $32, %rcx
-; CHECK-O3-NEXT: je .LBB44_1
+; CHECK-O3-NEXT: je .LBB50_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divq %rsi
; CHECK-O3-NEXT: movq %rdx, %rax
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB44_1:
+; CHECK-O3-NEXT: .LBB50_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %esi
@@ -946,13 +1039,13 @@ define i64 @load_fold_urem3(ptr %p1, ptr %p2) {
; CHECK-O3-NEXT: movq %rax, %rdx
; CHECK-O3-NEXT: orq %rcx, %rdx
; CHECK-O3-NEXT: shrq $32, %rdx
-; CHECK-O3-NEXT: je .LBB45_1
+; CHECK-O3-NEXT: je .LBB51_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divq %rcx
; CHECK-O3-NEXT: movq %rdx, %rax
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB45_1:
+; CHECK-O3-NEXT: .LBB51_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %ecx
@@ -1469,13 +1562,13 @@ define void @rmw_fold_sdiv2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: movq %rax, %rcx
; CHECK-O3-NEXT: orq %rsi, %rcx
; CHECK-O3-NEXT: shrq $32, %rcx
-; CHECK-O3-NEXT: je .LBB74_1
+; CHECK-O3-NEXT: je .LBB80_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: cqto
; CHECK-O3-NEXT: idivq %rsi
; CHECK-O3-NEXT: movq %rax, (%rdi)
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB74_1:
+; CHECK-O3-NEXT: .LBB80_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %esi
@@ -1521,13 +1614,13 @@ define void @rmw_fold_udiv2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: movq %rax, %rcx
; CHECK-O3-NEXT: orq %rsi, %rcx
; CHECK-O3-NEXT: shrq $32, %rcx
-; CHECK-O3-NEXT: je .LBB76_1
+; CHECK-O3-NEXT: je .LBB82_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divq %rsi
; CHECK-O3-NEXT: movq %rax, (%rdi)
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB76_1:
+; CHECK-O3-NEXT: .LBB82_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %esi
@@ -1599,13 +1692,13 @@ define void @rmw_fold_srem2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: movq %rax, %rcx
; CHECK-O3-NEXT: orq %rsi, %rcx
; CHECK-O3-NEXT: shrq $32, %rcx
-; CHECK-O3-NEXT: je .LBB78_1
+; CHECK-O3-NEXT: je .LBB84_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: cqto
; CHECK-O3-NEXT: idivq %rsi
; CHECK-O3-NEXT: movq %rdx, (%rdi)
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB78_1:
+; CHECK-O3-NEXT: .LBB84_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %esi
@@ -1667,13 +1760,13 @@ define void @rmw_fold_urem2(ptr %p, i64 %v) {
; CHECK-O3-NEXT: movq %rax, %rcx
; CHECK-O3-NEXT: orq %rsi, %rcx
; CHECK-O3-NEXT: shrq $32, %rcx
-; CHECK-O3-NEXT: je .LBB80_1
+; CHECK-O3-NEXT: je .LBB86_1
; CHECK-O3-NEXT: # %bb.2:
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divq %rsi
; CHECK-O3-NEXT: movq %rdx, (%rdi)
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB80_1:
+; CHECK-O3-NEXT: .LBB86_1:
; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-O3-NEXT: xorl %edx, %edx
; CHECK-O3-NEXT: divl %esi
@@ -2323,11 +2416,11 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O0-NEXT: movl (%rdi), %eax
; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O0-NEXT: cmpl %eax, %esi
-; CHECK-O0-NEXT: jne .LBB116_2
+; CHECK-O0-NEXT: jne .LBB122_2
; CHECK-O0-NEXT: # %bb.1: # %taken
; CHECK-O0-NEXT: movb $1, %al
; CHECK-O0-NEXT: retq
-; CHECK-O0-NEXT: .LBB116_2: # %untaken
+; CHECK-O0-NEXT: .LBB122_2: # %untaken
; CHECK-O0-NEXT: xorl %eax, %eax
; CHECK-O0-NEXT: # kill: def $al killed $al killed $eax
; CHECK-O0-NEXT: retq
@@ -2337,11 +2430,11 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O3-NEXT: movl (%rdi), %eax
; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O3-NEXT: cmpl %eax, %esi
-; CHECK-O3-NEXT: jne .LBB116_2
+; CHECK-O3-NEXT: jne .LBB122_2
; CHECK-O3-NEXT: # %bb.1: # %taken
; CHECK-O3-NEXT: movb $1, %al
; CHECK-O3-NEXT: retq
-; CHECK-O3-NEXT: .LBB116_2: # %untaken
+; CHECK-O3-NEXT: .LBB122_2: # %untaken
; CHECK-O3-NEXT: xorl %eax, %eax
; CHECK-O3-NEXT: retq
%v2 = load atomic i32, ptr %p unordered, align 4
✅ With the latest revision this PR passed the C/C++ code formatter.
> We can do this by using cmpxchg; it is really the only way. The big concern is that x86 CPUs may either raise an exception on such an access or handle it, and I am unsure how to deal with that.

What does gcc do? If something compiled with gcc uses libatomic and something compiled with llvm uses cmpxchg, I don't think they can be linked together.
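To make the linking concern concrete, here is a hypothetical C++ example (the packed struct and names are illustrative assumptions, not from the thread): the same underaligned atomic access may be lowered to a libatomic call by one compiler and, with this patch, to an inline `lock cmpxchg` by the other; if libatomic serializes such accesses through its internal locks, those locks would not synchronize with the inline instruction.

```cpp
#include <cstdint>

// Hypothetical illustration of the mix-and-match concern: a 4-byte field at
// an odd offset, accessed atomically from translation units built by
// different compilers.
struct __attribute__((packed)) Packed {
  char tag;
  uint32_t counter; // offset 1: underaligned for a 4-byte atomic access
};

uint32_t read_counter(Packed *s) {
  // One compiler might emit a call into libatomic here; with this patch, the
  // other would emit an inline `lock cmpxchg`. If the libatomic path takes an
  // internal lock, the two schemes do not synchronize with each other.
  return __atomic_load_n(&s->counter, __ATOMIC_SEQ_CST);
}
```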
> the actual condition is that the value must fit in a single cache line

How can I ensure this?
LLVM has no way to know that other than emitting a runtime branch to check the alignment.
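For illustration, that runtime check could look like the following sketch, assuming a 64-byte cache line (typical for current x86 parts, but an assumption here); this is not code proposed in the patch:

```cpp
#include <cstddef>
#include <cstdint>

// Sketch of the runtime test: an N-byte access stays within one cache line
// iff its offset within the line leaves room for all N bytes. A cautious
// lowering could branch on this and fall back to the libcall otherwise.
constexpr std::size_t kCacheLineBytes = 64; // assumption, not queried from CPUID

bool fits_in_one_cache_line(const void *p, std::size_t size) {
  const std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p);
  return (addr % kCacheLineBytes) + size <= kCacheLineBytes;
}
```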