
Commit 7775d9a

Enable unaligned loads on x86 using cmpxchg
We can do this by using cmpxchg; it is really the only way. The big concern is that an unaligned locked access which straddles a cache line becomes a split lock, and x86 CPUs can either handle that in hardware or raise an exception (for example, when split-lock detection is enabled), so I am unsure how to deal with that.
1 parent 80f8e1e commit 7775d9a
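
The trick, in miniature: a compare-exchange whose expected and desired values are both zero never modifies memory, yet always returns an atomic snapshot of the location, so it can stand in for an atomic load even when the pointer is not naturally aligned. Below is a minimal C++ sketch of the idea using the GCC/Clang __atomic builtins rather than LLVM's IR-level expandAtomicLoadToCmpXchg; the wrapper name atomic_load_via_cas is illustrative only.

```cpp
#include <cstdint>

// Sketch only: emulate an atomic 32-bit load with a compare-and-swap,
// the same trick expandAtomicLoadToCmpXchg applies at the IR level.
// cmpxchg(ptr, 0, 0) either succeeds (the value was 0 and stays 0) or
// fails and reports the current value; either way we obtain an atomic
// snapshot of *ptr without logically modifying it.
uint32_t atomic_load_via_cas(uint32_t *ptr) {
  uint32_t expected = 0;
  // On failure the builtin writes the observed value back into
  // `expected`; on success the observed value was already 0.
  __atomic_compare_exchange_n(ptr, &expected, 0, /*weak=*/false,
                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  return expected;
}
```

Note the cost this implies: the load becomes a locked read-modify-write, so it needs a writable mapping and always asserts ownership of the line(s) it touches, which is where the split-lock concern above comes from.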

File tree

4 files changed: +140 −148 lines


llvm/lib/CodeGen/AtomicExpandPass.cpp

Lines changed: 25 additions & 2 deletions
@@ -232,8 +232,22 @@ template <typename Inst>
 static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
   unsigned Size = getAtomicOpSize(I);
   Align Alignment = I->getAlign();
-  return Alignment >= Size &&
-         Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
+
+  // X86 we can do unaligned loads
+  return Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8 &&
+         (Alignment >= Size || TLI->supportsUnalignedAtomics());
+}
+
+template <typename Inst>
+static bool canLowerAtomicAsUnaligned(const TargetLowering *TLI, Inst *I) {
+  if (!TLI->supportsUnalignedAtomics())
+    return false;
+  unsigned Size = getAtomicOpSize(I);
+  Align Alignment = I->getAlign();
+
+  // X86 we can do unaligned loads
+  return Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8 &&
+         (Alignment < Size);
 }
 
 bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
@@ -510,6 +524,10 @@ AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
 }
 
 bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
+
+  if (canLowerAtomicAsUnaligned(TLI, LI))
+    return expandAtomicLoadToCmpXchg(LI);
+
   switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
   case TargetLoweringBase::AtomicExpansionKind::None:
     return false;
@@ -532,6 +550,11 @@ bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
 }
 
 bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
+  if (canLowerAtomicAsUnaligned(TLI, SI)) {
+    expandAtomicStore(SI);
+    return true;
+  }
+
   switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
   case TargetLoweringBase::AtomicExpansionKind::None:
     return false;
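
The store side needs no new expansion helper: expandAtomicStore turns the store into an atomic exchange whose result is dropped, which x86 lowers to a plain xchg (implicitly locked), as the regenerated tests below show. A C-level sketch of the same transformation, again a hedged illustration using the __atomic builtins rather than the pass itself (the wrapper name atomic_store_via_xchg is illustrative only):

```cpp
#include <cstdint>

// Sketch only: an atomic store emulated as an exchange whose previous
// value is discarded. On x86, xchg with a memory operand is implicitly
// locked, so this behaves as a seq_cst store even at an unaligned
// address (modulo the split-lock caveat from the commit message).
void atomic_store_via_xchg(uint32_t *ptr, uint32_t v) {
  (void)__atomic_exchange_n(ptr, v, __ATOMIC_SEQ_CST);
}
```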

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 3 additions & 0 deletions
@@ -137,6 +137,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   // Set up the TargetLowering object.
 
+  // X86 supports unaligned atomic memory accesses via cmpxchg8b and cmpxchg16b
+  setSupportsUnalignedAtomics(true);
+
   // X86 is weird. It always uses i8 for shift amounts and setcc results.
   setBooleanContents(ZeroOrOneBooleanContent);
   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
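
The comment above names cmpxchg8b/cmpxchg16b because those are the widest compare-exchange instructions x86 offers, and the 16-byte form additionally requires the cx16 feature. A hedged illustration of a 16-byte CAS at the source level follows (the function name cas16 is illustrative; compiler behavior varies, e.g. Clang with -mcx16 can inline lock cmpxchg16b, while without cx16 the operation becomes a __atomic_* libcall, which is why the i128 cases in atomic-unaligned.ll below still check for __atomic_load and friends):

```cpp
// Sketch only. __int128 and the __atomic builtins are GCC/Clang
// extensions; whether this inlines to lock cmpxchg16b or falls back to
// a libatomic call depends on the compiler and on -mcx16.
bool cas16(__int128 *p, __int128 *expected, __int128 desired) {
  return __atomic_compare_exchange_n(p, expected, desired, /*weak=*/false,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}
```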

llvm/test/CodeGen/X86/atomic-unaligned.ll

Lines changed: 82 additions & 9 deletions
@@ -1,15 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
 
 ; Quick test to ensure that atomics which are not naturally-aligned
 ; emit unsized libcalls, and aren't emitted as native instructions or
 ; sized libcalls.
 define void @test_i32(ptr %a) nounwind {
 ; CHECK-LABEL: test_i32:
-; CHECK: callq __atomic_load
-; CHECK: callq __atomic_store
-; CHECK: callq __atomic_exchange
-; CHECK: callq __atomic_compare_exchange
-; CHECK: callq __atomic_compare_exchange
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock cmpxchgl %ecx, (%rdi)
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    xchgl %eax, (%rdi)
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    xchgl %eax, (%rdi)
+; CHECK-NEXT:    lock addl $2, (%rdi)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock cmpxchgl %ecx, (%rdi)
+; CHECK-NEXT:    retq
   %t0 = load atomic i32, ptr %a seq_cst, align 2
   store atomic i32 1, ptr %a seq_cst, align 2
   %t1 = atomicrmw xchg ptr %a, i32 1 seq_cst, align 2
@@ -20,10 +29,74 @@ define void @test_i32(ptr %a) nounwind {
 
 define void @test_i128(ptr %a) nounwind {
 ; CHECK-LABEL: test_i128:
-; CHECK: callq __atomic_load
-; CHECK: callq __atomic_store
-; CHECK: callq __atomic_exchange
-; CHECK: callq __atomic_compare_exchange
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $32, %rsp
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    movq %rsp, %r14
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movq %r14, %rdx
+; CHECK-NEXT:    movl $5, %ecx
+; CHECK-NEXT:    callq __atomic_load@PLT
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $1, (%rsp)
+; CHECK-NEXT:    movq %rsp, %rdx
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movl $5, %ecx
+; CHECK-NEXT:    callq __atomic_store@PLT
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $1, (%rsp)
+; CHECK-NEXT:    movq %rsp, %rdx
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movq %r15, %rcx
+; CHECK-NEXT:    movl $5, %r8d
+; CHECK-NEXT:    callq __atomic_exchange@PLT
+; CHECK-NEXT:    movq (%rbx), %rdx
+; CHECK-NEXT:    movq 8(%rbx), %rcx
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB1_1: # %atomicrmw.start
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    addq $2, %rax
+; CHECK-NEXT:    movq %rdx, (%rsp)
+; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    adcq $0, %rcx
+; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movq %r14, %rdx
+; CHECK-NEXT:    movq %r15, %rcx
+; CHECK-NEXT:    movl $5, %r8d
+; CHECK-NEXT:    movl $5, %r9d
+; CHECK-NEXT:    callq __atomic_compare_exchange@PLT
+; CHECK-NEXT:    movq (%rsp), %rdx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movaps %xmm0, (%rsp)
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rsp, %rdx
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movl $16, %edi
+; CHECK-NEXT:    movq %rbx, %rsi
+; CHECK-NEXT:    movl $5, %r8d
+; CHECK-NEXT:    movl $5, %r9d
+; CHECK-NEXT:    callq __atomic_compare_exchange@PLT
+; CHECK-NEXT:    addq $32, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    retq
   %t0 = load atomic i128, ptr %a seq_cst, align 8
   store atomic i128 1, ptr %a seq_cst, align 8
   %t1 = atomicrmw xchg ptr %a, i128 1 seq_cst, align 8

llvm/test/CodeGen/X86/atomic-unordered.ll

Lines changed: 30 additions & 137 deletions
@@ -64,30 +64,18 @@ define void @store_i16(ptr %ptr, i16 %v) {
 define i16 @load_i16_unaligned(ptr %ptr) {
 ; CHECK-O0-LABEL: load_i16_unaligned:
 ; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $2, %edi
-; CHECK-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; CHECK-O0-NEXT:    xorl %ecx, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; CHECK-O0-NEXT:    popq %rcx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O0-NEXT:    xorl %eax, %eax
+; CHECK-O0-NEXT:    movw %ax, %cx
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    lock cmpxchgw %cx, (%rdi)
+; CHECK-O0-NEXT:    sete %cl
 ; CHECK-O0-NEXT:    retq
 ;
 ; CHECK-O3-LABEL: load_i16_unaligned:
 ; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; CHECK-O3-NEXT:    movl $2, %edi
 ; CHECK-O3-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
-; CHECK-O3-NEXT:    popq %rcx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O3-NEXT:    xorl %eax, %eax
+; CHECK-O3-NEXT:    lock cmpxchgw %cx, (%rdi)
 ; CHECK-O3-NEXT:    retq
   %v = load atomic i16, ptr %ptr unordered, align 1
   ret i16 %v
@@ -97,33 +85,13 @@ define i16 @load_i16_unaligned(ptr %ptr) {
 define void @store_i16_unaligned(ptr %ptr, i16 %v) {
 ; CHECK-O0-LABEL: store_i16_unaligned:
 ; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    movl %esi, %eax
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-O0-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; CHECK-O0-NEXT:    movl $2, %edi
-; CHECK-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; CHECK-O0-NEXT:    xorl %ecx, %ecx
-; CHECK-O0-NEXT:    callq __atomic_store@PLT
-; CHECK-O0-NEXT:    popq %rax
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O0-NEXT:    movw %si, %ax
+; CHECK-O0-NEXT:    xchgw %ax, (%rdi)
 ; CHECK-O0-NEXT:    retq
 ;
 ; CHECK-O3-LABEL: store_i16_unaligned:
 ; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    movq %rdi, %rax
-; CHECK-O3-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; CHECK-O3-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; CHECK-O3-NEXT:    movl $2, %edi
-; CHECK-O3-NEXT:    movq %rax, %rsi
-; CHECK-O3-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-NEXT:    callq __atomic_store@PLT
-; CHECK-O3-NEXT:    popq %rax
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O3-NEXT:    xchgw %si, (%rdi)
 ; CHECK-O3-NEXT:    retq
   store atomic i16 %v, ptr %ptr unordered, align 1
   ret void
@@ -150,65 +118,27 @@ define void @store_i32(ptr %ptr, i32 %v) {
 define i32 @load_i32_unaligned(ptr %ptr) {
 ; CHECK-O0-LABEL: load_i32_unaligned:
 ; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $4, %edi
-; CHECK-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; CHECK-O0-NEXT:    xorl %ecx, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-O0-NEXT:    popq %rcx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O0-NEXT:    movl %ecx, %eax
+; CHECK-O0-NEXT:    lock cmpxchgl %ecx, (%rdi)
+; CHECK-O0-NEXT:    sete %cl
 ; CHECK-O0-NEXT:    retq
 ;
 ; CHECK-O3-LABEL: load_i32_unaligned:
 ; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; CHECK-O3-NEXT:    movl $4, %edi
 ; CHECK-O3-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-O3-NEXT:    popq %rcx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O3-NEXT:    xorl %eax, %eax
+; CHECK-O3-NEXT:    lock cmpxchgl %ecx, (%rdi)
 ; CHECK-O3-NEXT:    retq
   %v = load atomic i32, ptr %ptr unordered, align 1
   ret i32 %v
 }
 
 define void @store_i32_unaligned(ptr %ptr, i32 %v) {
-; CHECK-O0-LABEL: store_i32_unaligned:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    movl %esi, %eax
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
-; CHECK-O0-NEXT:    movl $4, %edi
-; CHECK-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; CHECK-O0-NEXT:    xorl %ecx, %ecx
-; CHECK-O0-NEXT:    callq __atomic_store@PLT
-; CHECK-O0-NEXT:    popq %rax
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O0-NEXT:    retq
-;
-; CHECK-O3-LABEL: store_i32_unaligned:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    movq %rdi, %rax
-; CHECK-O3-NEXT:    movl %esi, {{[0-9]+}}(%rsp)
-; CHECK-O3-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; CHECK-O3-NEXT:    movl $4, %edi
-; CHECK-O3-NEXT:    movq %rax, %rsi
-; CHECK-O3-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-NEXT:    callq __atomic_store@PLT
-; CHECK-O3-NEXT:    popq %rax
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O3-NEXT:    retq
+; CHECK-LABEL: store_i32_unaligned:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xchgl %esi, (%rdi)
+; CHECK-NEXT:    retq
   store atomic i32 %v, ptr %ptr unordered, align 1
   ret void
 }
@@ -234,65 +164,28 @@ define void @store_i64(ptr %ptr, i64 %v) {
 define i64 @load_i64_unaligned(ptr %ptr) {
 ; CHECK-O0-LABEL: load_i64_unaligned:
 ; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $8, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    xorl %ecx, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movq (%rsp), %rax
-; CHECK-O0-NEXT:    popq %rcx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O0-NEXT:    xorl %eax, %eax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    movq %rcx, %rax
+; CHECK-O0-NEXT:    lock cmpxchgq %rcx, (%rdi)
+; CHECK-O0-NEXT:    sete %cl
 ; CHECK-O0-NEXT:    retq
 ;
 ; CHECK-O3-LABEL: load_i64_unaligned:
 ; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $8, %edi
 ; CHECK-O3-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movq (%rsp), %rax
-; CHECK-O3-NEXT:    popq %rcx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-O3-NEXT:    xorl %eax, %eax
+; CHECK-O3-NEXT:    lock cmpxchgq %rcx, (%rdi)
 ; CHECK-O3-NEXT:    retq
   %v = load atomic i64, ptr %ptr unordered, align 1
   ret i64 %v
 }
 
 define void @store_i64_unaligned(ptr %ptr, i64 %v) {
-; CHECK-O0-LABEL: store_i64_unaligned:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    movq %rsi, %rax
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movq %rax, (%rsp)
-; CHECK-O0-NEXT:    movl $8, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    xorl %ecx, %ecx
-; CHECK-O0-NEXT:    callq __atomic_store@PLT
-; CHECK-O0-NEXT:    popq %rax
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O0-NEXT:    retq
-;
-; CHECK-O3-LABEL: store_i64_unaligned:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    movq %rdi, %rax
-; CHECK-O3-NEXT:    movq %rsi, (%rsp)
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $8, %edi
-; CHECK-O3-NEXT:    movq %rax, %rsi
-; CHECK-O3-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-NEXT:    callq __atomic_store@PLT
-; CHECK-O3-NEXT:    popq %rax
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O3-NEXT:    retq
+; CHECK-LABEL: store_i64_unaligned:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xchgq %rsi, (%rdi)
+; CHECK-NEXT:    retq
   store atomic i64 %v, ptr %ptr unordered, align 1
   ret void
 }
