
[X86] Enable unaligned loads on x86 using cmpxchg #142645

Open
AZero13 wants to merge 3 commits into main from the atomic-unaligned branch

Conversation

@AZero13 (Contributor) commented Jun 3, 2025

We can do this by using cmpxchg; it is really the only way. The big concern is that x86 CPUs may either raise a CPU exception on such an access or handle it, and I am unsure how to deal with that.
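For background (an editor's sketch, not part of the patch): a compare-exchange whose expected and desired values are both zero never modifies memory, yet still returns the current contents atomically, which is why lock cmpxchg can stand in for an atomic load. Using the Clang/GCC __atomic builtins, the idea looks roughly like this (illustrative names only):

  #include <cstdint>

  // Emulate an atomic 32-bit load with a compare-exchange: if *p == 0 it is
  // "replaced" with 0 (a no-op); otherwise `expected` is updated to the
  // current value. Either way the current contents are read atomically.
  uint32_t load_via_cmpxchg(uint32_t *p) {
    uint32_t expected = 0;
    __atomic_compare_exchange_n(p, &expected, 0u, /*weak=*/false,
                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    return expected;
  }

This is essentially the xor + lock cmpxchgl sequence the updated tests below check for.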

@llvmbot (Member) commented Jun 3, 2025

@llvm/pr-subscribers-backend-x86

Author: AZero13 (AZero13)

Changes

We can do this by using cmpxchg; it is really the only way. The big concern is that x86 CPUs may either raise a CPU exception on such an access or handle it, and I am unsure how to deal with that.


Full diff: https://github.com/llvm/llvm-project/pull/142645.diff

4 Files Affected:

  • (modified) llvm/lib/CodeGen/AtomicExpandPass.cpp (+24-2)
  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+3)
  • (modified) llvm/test/CodeGen/X86/atomic-unaligned.ll (+82-9)
  • (modified) llvm/test/CodeGen/X86/atomic-unordered.ll (+121-28)
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..de4dc63d5cbbd 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -232,8 +232,21 @@ template <typename Inst>
 static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
   unsigned Size = getAtomicOpSize(I);
   Align Alignment = I->getAlign();
-  return Alignment >= Size &&
-         Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
+
+  // Targets that support unaligned atomics (e.g. X86) do not require natural alignment.
+  return Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8 &&
+         (Alignment >= Size || TLI->supportsUnalignedAtomics());
+}
+
+template <typename Inst>
+static bool canLowerAtomicAsUnaligned(const TargetLowering *TLI, Inst *I) {
+  if (!TLI->supportsUnalignedAtomics())
+    return false;
+  unsigned Size = getAtomicOpSize(I);
+  Align Alignment = I->getAlign();
+
+  // On X86, unaligned atomic loads/stores can be lowered (e.g. via cmpxchg).
+  return Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8 && (Alignment < Size);
 }
 
 bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
@@ -510,6 +523,10 @@ AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
 }
 
 bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
+
+  if (canLowerAtomicAsUnaligned(TLI, LI))
+    return expandAtomicLoadToCmpXchg(LI);
+
   switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
   case TargetLoweringBase::AtomicExpansionKind::None:
     return false;
@@ -532,6 +549,11 @@ bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
 }
 
 bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
+  if (canLowerAtomicAsUnaligned(TLI, SI)) {
+    expandAtomicStore(SI);
+    return true;
+  }
+
   switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
   case TargetLoweringBase::AtomicExpansionKind::None:
     return false;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2399936ffd827..9c19cd4240f72 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -137,6 +137,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   // Set up the TargetLowering object.
 
+  // X86 supports unaligned atomic memory accesses via cmpxchg8b and cmpxchg16b
+  setSupportsUnalignedAtomics(true);
+
   // X86 is weird. It always uses i8 for shift amounts and setcc results.
   setBooleanContents(ZeroOrOneBooleanContent);
   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
diff --git a/llvm/test/CodeGen/X86/atomic-unaligned.ll b/llvm/test/CodeGen/X86/atomic-unaligned.ll
index f02041cc5fc8f..3931746af8ff0 100644
--- a/llvm/test/CodeGen/X86/atomic-unaligned.ll
+++ b/llvm/test/CodeGen/X86/atomic-unaligned.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
 
 ; Quick test to ensure that atomics which are not naturally-aligned
@@ -5,11 +6,19 @@
 ; sized libcalls.
 define void @test_i32(ptr %a) nounwind {
 ; CHECK-LABEL: test_i32:
-; CHECK: callq __atomic_load
-; CHECK: callq __atomic_store
-; CHECK: callq __atomic_exchange
-; CHECK: callq __atomic_compare_exchange
-; CHECK: callq __atomic_compare_exchange
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock cmpxchgl %ecx, (%rdi)
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    xchgl %eax, (%rdi)
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    xchgl %eax, (%rdi)
+; CHECK-NEXT:    lock addl $2, (%rdi)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock cmpxchgl %ecx, (%rdi)
+; CHECK-NEXT:    retq
   %t0 = load atomic i32, ptr %a seq_cst, align 2
   store atomic i32 1, ptr %a seq_cst, align 2
   %t1 = atomicrmw xchg ptr %a, i32 1 seq_cst, align 2
@@ -20,10 +29,74 @@ define void @test_i32(ptr %a) nounwind {
 
 define void @test_i128(ptr %a) nounwind {
 ; CHECK-LABEL: test_i128:
-; CHECK: callq __atomic_load
-; CHECK: callq __atomic_store
-; CHECK: callq __atomic_exchange
-; CHECK: callq __atomic_compare_exchange
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $32, %rsp
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    movq %rsp, %r14
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movq %r14, %rdx
+; CHECK-NEXT:    movl $5, %ecx
+; CHECK-NEXT:    callq __atomic_load@PLT
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $1, (%rsp)
+; CHECK-NEXT:    movq %rsp, %rdx
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movl $5, %ecx
+; CHECK-NEXT:    callq __atomic_store@PLT
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $1, (%rsp)
+; CHECK-NEXT:    movq %rsp, %rdx
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movq %r15, %rcx
+; CHECK-NEXT:    movl $5, %r8d
+; CHECK-NEXT:    callq __atomic_exchange@PLT
+; CHECK-NEXT:    movq (%rbx), %rdx
+; CHECK-NEXT:    movq 8(%rbx), %rcx
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB1_1: # %atomicrmw.start
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    addq $2, %rax
+; CHECK-NEXT:    movq %rdx, (%rsp)
+; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    adcq $0, %rcx
+; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movq %r14, %rdx
+; CHECK-NEXT:    movq %r15, %rcx
+; CHECK-NEXT:    movl $5, %r8d
+; CHECK-NEXT:    movl $5, %r9d
+; CHECK-NEXT:    callq __atomic_compare_exchange@PLT
+; CHECK-NEXT:    movq (%rsp), %rdx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movaps %xmm0, (%rsp)
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rsp, %rdx
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movl $5, %r8d
+; CHECK-NEXT:    movl $5, %r9d
+; CHECK-NEXT:    callq __atomic_compare_exchange@PLT
+; CHECK-NEXT:    addq $32, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    retq
   %t0 = load atomic i128, ptr %a seq_cst, align 8
   store atomic i128 1, ptr %a seq_cst, align 8
   %t1 = atomicrmw xchg ptr %a, i128 1 seq_cst, align 8
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index e8e0ee0b7ef49..a3d706474b588 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -61,6 +61,42 @@ define void @store_i16(ptr %ptr, i16 %v) {
   ret void
 }
 
+define i16 @load_i16_unaligned(ptr %ptr) {
+; CHECK-O0-LABEL: load_i16_unaligned:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    xorl %eax, %eax
+; CHECK-O0-NEXT:    movw %ax, %cx
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    lock cmpxchgw %cx, (%rdi)
+; CHECK-O0-NEXT:    sete %cl
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-O3-LABEL: load_i16_unaligned:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    xorl %ecx, %ecx
+; CHECK-O3-NEXT:    xorl %eax, %eax
+; CHECK-O3-NEXT:    lock cmpxchgw %cx, (%rdi)
+; CHECK-O3-NEXT:    retq
+  %v = load atomic i16, ptr %ptr unordered, align 1
+  ret i16 %v
+}
+
+
+define void @store_i16_unaligned(ptr %ptr, i16 %v) {
+; CHECK-O0-LABEL: store_i16_unaligned:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movw %si, %ax
+; CHECK-O0-NEXT:    xchgw %ax, (%rdi)
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-O3-LABEL: store_i16_unaligned:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    xchgw %si, (%rdi)
+; CHECK-O3-NEXT:    retq
+  store atomic i16 %v, ptr %ptr unordered, align 1
+  ret void
+}
+
 define i32 @load_i32(ptr %ptr) {
 ; CHECK-LABEL: load_i32:
 ; CHECK:       # %bb.0:
@@ -79,6 +115,34 @@ define void @store_i32(ptr %ptr, i32 %v) {
   ret void
 }
 
+define i32 @load_i32_unaligned(ptr %ptr) {
+; CHECK-O0-LABEL: load_i32_unaligned:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    xorl %ecx, %ecx
+; CHECK-O0-NEXT:    movl %ecx, %eax
+; CHECK-O0-NEXT:    lock cmpxchgl %ecx, (%rdi)
+; CHECK-O0-NEXT:    sete %cl
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-O3-LABEL: load_i32_unaligned:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    xorl %ecx, %ecx
+; CHECK-O3-NEXT:    xorl %eax, %eax
+; CHECK-O3-NEXT:    lock cmpxchgl %ecx, (%rdi)
+; CHECK-O3-NEXT:    retq
+  %v = load atomic i32, ptr %ptr unordered, align 1
+  ret i32 %v
+}
+
+define void @store_i32_unaligned(ptr %ptr, i32 %v) {
+; CHECK-LABEL: store_i32_unaligned:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xchgl %esi, (%rdi)
+; CHECK-NEXT:    retq
+  store atomic i32 %v, ptr %ptr unordered, align 1
+  ret void
+}
+
 define i64 @load_i64(ptr %ptr) {
 ; CHECK-LABEL: load_i64:
 ; CHECK:       # %bb.0:
@@ -97,6 +161,35 @@ define void @store_i64(ptr %ptr, i64 %v) {
   ret void
 }
 
+define i64 @load_i64_unaligned(ptr %ptr) {
+; CHECK-O0-LABEL: load_i64_unaligned:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    xorl %eax, %eax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    movq %rcx, %rax
+; CHECK-O0-NEXT:    lock cmpxchgq %rcx, (%rdi)
+; CHECK-O0-NEXT:    sete %cl
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-O3-LABEL: load_i64_unaligned:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    xorl %ecx, %ecx
+; CHECK-O3-NEXT:    xorl %eax, %eax
+; CHECK-O3-NEXT:    lock cmpxchgq %rcx, (%rdi)
+; CHECK-O3-NEXT:    retq
+  %v = load atomic i64, ptr %ptr unordered, align 1
+  ret i64 %v
+}
+
+define void @store_i64_unaligned(ptr %ptr, i64 %v) {
+; CHECK-LABEL: store_i64_unaligned:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xchgq %rsi, (%rdi)
+; CHECK-NEXT:    retq
+  store atomic i64 %v, ptr %ptr unordered, align 1
+  ret void
+}
+
 ;; The tests in the rest of this file are intended to show transforms which we
 ;; either *can't* do for legality, or don't currently implement.  The later
 ;; are noted carefully where relevant.
@@ -633,12 +726,12 @@ define i64 @load_fold_sdiv2(ptr %p, i64 %v2) {
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    orq %rsi, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    je .LBB35_1
+; CHECK-O3-NEXT:    je .LBB41_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    cqto
 ; CHECK-O3-NEXT:    idivq %rsi
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB35_1:
+; CHECK-O3-NEXT:  .LBB41_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %esi
@@ -664,12 +757,12 @@ define i64 @load_fold_sdiv3(ptr %p1, ptr %p2) {
 ; CHECK-O3-NEXT:    movq %rax, %rdx
 ; CHECK-O3-NEXT:    orq %rcx, %rdx
 ; CHECK-O3-NEXT:    shrq $32, %rdx
-; CHECK-O3-NEXT:    je .LBB36_1
+; CHECK-O3-NEXT:    je .LBB42_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    cqto
 ; CHECK-O3-NEXT:    idivq %rcx
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB36_1:
+; CHECK-O3-NEXT:  .LBB42_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %ecx
@@ -719,12 +812,12 @@ define i64 @load_fold_udiv2(ptr %p, i64 %v2) {
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    orq %rsi, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    je .LBB38_1
+; CHECK-O3-NEXT:    je .LBB44_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divq %rsi
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB38_1:
+; CHECK-O3-NEXT:  .LBB44_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %esi
@@ -751,12 +844,12 @@ define i64 @load_fold_udiv3(ptr %p1, ptr %p2) {
 ; CHECK-O3-NEXT:    movq %rax, %rdx
 ; CHECK-O3-NEXT:    orq %rcx, %rdx
 ; CHECK-O3-NEXT:    shrq $32, %rdx
-; CHECK-O3-NEXT:    je .LBB39_1
+; CHECK-O3-NEXT:    je .LBB45_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divq %rcx
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB39_1:
+; CHECK-O3-NEXT:  .LBB45_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %ecx
@@ -816,13 +909,13 @@ define i64 @load_fold_srem2(ptr %p, i64 %v2) {
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    orq %rsi, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    je .LBB41_1
+; CHECK-O3-NEXT:    je .LBB47_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    cqto
 ; CHECK-O3-NEXT:    idivq %rsi
 ; CHECK-O3-NEXT:    movq %rdx, %rax
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB41_1:
+; CHECK-O3-NEXT:  .LBB47_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %esi
@@ -849,13 +942,13 @@ define i64 @load_fold_srem3(ptr %p1, ptr %p2) {
 ; CHECK-O3-NEXT:    movq %rax, %rdx
 ; CHECK-O3-NEXT:    orq %rcx, %rdx
 ; CHECK-O3-NEXT:    shrq $32, %rdx
-; CHECK-O3-NEXT:    je .LBB42_1
+; CHECK-O3-NEXT:    je .LBB48_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    cqto
 ; CHECK-O3-NEXT:    idivq %rcx
 ; CHECK-O3-NEXT:    movq %rdx, %rax
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB42_1:
+; CHECK-O3-NEXT:  .LBB48_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %ecx
@@ -912,13 +1005,13 @@ define i64 @load_fold_urem2(ptr %p, i64 %v2) {
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    orq %rsi, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    je .LBB44_1
+; CHECK-O3-NEXT:    je .LBB50_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divq %rsi
 ; CHECK-O3-NEXT:    movq %rdx, %rax
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB44_1:
+; CHECK-O3-NEXT:  .LBB50_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %esi
@@ -946,13 +1039,13 @@ define i64 @load_fold_urem3(ptr %p1, ptr %p2) {
 ; CHECK-O3-NEXT:    movq %rax, %rdx
 ; CHECK-O3-NEXT:    orq %rcx, %rdx
 ; CHECK-O3-NEXT:    shrq $32, %rdx
-; CHECK-O3-NEXT:    je .LBB45_1
+; CHECK-O3-NEXT:    je .LBB51_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divq %rcx
 ; CHECK-O3-NEXT:    movq %rdx, %rax
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB45_1:
+; CHECK-O3-NEXT:  .LBB51_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %ecx
@@ -1469,13 +1562,13 @@ define void @rmw_fold_sdiv2(ptr %p, i64 %v) {
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    orq %rsi, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    je .LBB74_1
+; CHECK-O3-NEXT:    je .LBB80_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    cqto
 ; CHECK-O3-NEXT:    idivq %rsi
 ; CHECK-O3-NEXT:    movq %rax, (%rdi)
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB74_1:
+; CHECK-O3-NEXT:  .LBB80_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %esi
@@ -1521,13 +1614,13 @@ define void @rmw_fold_udiv2(ptr %p, i64 %v) {
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    orq %rsi, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    je .LBB76_1
+; CHECK-O3-NEXT:    je .LBB82_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divq %rsi
 ; CHECK-O3-NEXT:    movq %rax, (%rdi)
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB76_1:
+; CHECK-O3-NEXT:  .LBB82_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %esi
@@ -1599,13 +1692,13 @@ define void @rmw_fold_srem2(ptr %p, i64 %v) {
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    orq %rsi, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    je .LBB78_1
+; CHECK-O3-NEXT:    je .LBB84_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    cqto
 ; CHECK-O3-NEXT:    idivq %rsi
 ; CHECK-O3-NEXT:    movq %rdx, (%rdi)
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB78_1:
+; CHECK-O3-NEXT:  .LBB84_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %esi
@@ -1667,13 +1760,13 @@ define void @rmw_fold_urem2(ptr %p, i64 %v) {
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    orq %rsi, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    je .LBB80_1
+; CHECK-O3-NEXT:    je .LBB86_1
 ; CHECK-O3-NEXT:  # %bb.2:
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divq %rsi
 ; CHECK-O3-NEXT:    movq %rdx, (%rdi)
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB80_1:
+; CHECK-O3-NEXT:  .LBB86_1:
 ; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-O3-NEXT:    xorl %edx, %edx
 ; CHECK-O3-NEXT:    divl %esi
@@ -2323,11 +2416,11 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
 ; CHECK-O0-NEXT:    movl (%rdi), %eax
 ; CHECK-O0-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O0-NEXT:    cmpl %eax, %esi
-; CHECK-O0-NEXT:    jne .LBB116_2
+; CHECK-O0-NEXT:    jne .LBB122_2
 ; CHECK-O0-NEXT:  # %bb.1: # %taken
 ; CHECK-O0-NEXT:    movb $1, %al
 ; CHECK-O0-NEXT:    retq
-; CHECK-O0-NEXT:  .LBB116_2: # %untaken
+; CHECK-O0-NEXT:  .LBB122_2: # %untaken
 ; CHECK-O0-NEXT:    xorl %eax, %eax
 ; CHECK-O0-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-O0-NEXT:    retq
@@ -2337,11 +2430,11 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
 ; CHECK-O3-NEXT:    movl (%rdi), %eax
 ; CHECK-O3-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O3-NEXT:    cmpl %eax, %esi
-; CHECK-O3-NEXT:    jne .LBB116_2
+; CHECK-O3-NEXT:    jne .LBB122_2
 ; CHECK-O3-NEXT:  # %bb.1: # %taken
 ; CHECK-O3-NEXT:    movb $1, %al
 ; CHECK-O3-NEXT:    retq
-; CHECK-O3-NEXT:  .LBB116_2: # %untaken
+; CHECK-O3-NEXT:  .LBB122_2: # %untaken
 ; CHECK-O3-NEXT:    xorl %eax, %eax
 ; CHECK-O3-NEXT:    retq
   %v2 = load atomic i32, ptr %p unordered, align 4


github-actions bot commented Jun 3, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@AZero13 force-pushed the atomic-unaligned branch from d1f23d5 to 7775d9a on June 3, 2025 17:46
@AZero13 requested a review from topperc on June 3, 2025 18:53
@AZero13 changed the title from "Enable unaligned loads on x86 using cmpxchg" to "[X86] Enable unaligned loads on x86 using cmpxchg" on June 3, 2025
@topperc (Collaborator) commented Jun 3, 2025

What does gcc do? If something compiled with gcc uses libatomic and something compiled with llvm uses cmpxchg, I don't think they can be linked together.

@AZero13 (Contributor, Author) commented Jun 4, 2025

The actual condition is that the value must fit in a single cache line, which I think means no straddling 64-byte boundaries (aligned values always fit in a single cache line).

How can I ensure this?

@topperc (Collaborator) commented Jun 4, 2025

the actual condition is that the value must fit in a single cache line which I think means no straddling 64-byte boundaries (aligned values always fit in a single cache line)

How can I ensure this?

LLVM has no way to know that other than emitting a runtime branch to check the alignment.
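(Editor's sketch of what such a runtime check could look like, assuming the usual 64-byte line size; the helper name is illustrative, not an existing LLVM API.)

  #include <cstddef>
  #include <cstdint>

  // True iff an access of `size` bytes starting at `p` stays within a single
  // 64-byte cache line, i.e. its first and last byte fall into the same
  // 64-byte-aligned block. Naturally aligned accesses of size <= 64 always pass.
  bool fitsInOneCacheLine(const void *p, std::size_t size) {
    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p);
    return (addr / 64) == ((addr + size - 1) / 64);
  }

One option, as topperc suggests, would be to branch on a check like this at runtime and fall back to the existing libcall path when the access straddles a line.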
