
Commit 3b17d04

[AArch64][SVE] Don't require 16-byte aligned SVE loads/stores with +strict-align (#119732)
Instead, allow any alignment >= the element size (in bytes). This is all that is needed for (predicated) vector loads even if unaligned accesses are disabled. See: https://developer.arm.com/documentation/ddi0602/2024-09/Shared-Pseudocode/aarch64-functions-memory?lang=en#impl-aarch64.Mem.read.3

Specifically:

```
// Check alignment on size of element accessed, not overall access size.
constant integer alignment = if accdesc.ispair then size DIV 2 else size;
```

The `size` passed to `Mem` by SVE load/store instructions is the element size.
1 parent aff3e68 commit 3b17d04
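As a hedged illustration (plain C++ invented for this note, not LLVM code and not the Arm pseudocode library), the quoted pseudocode's alignment rule can be rendered like this:

```cpp
#include <cstdio>

// The Arm pseudocode quoted above, rendered in C++: the alignment Mem
// checks is the size of the single access (halved for pair accesses),
// and for SVE ld1/st1 that size is the element size, not the vector size.
static unsigned requiredAlignmentBytes(unsigned accessSizeBytes, bool isPair) {
  return isPair ? accessSizeBytes / 2 : accessSizeBytes;
}

int main() {
  // ld1h accesses 2-byte elements, so 2-byte alignment suffices even
  // though the whole nxv8i16 vector spans 16 or more bytes.
  std::printf("ld1h needs align %u\n", requiredAlignmentBytes(2, false));
  // ld1d accesses 8-byte elements: 4-byte alignment would be insufficient.
  std::printf("ld1d needs align %u\n", requiredAlignmentBytes(8, false));
}
```

The tests below exercise exactly these element alignments, from align 1 for nxv16i8 up to align 8 for nxv2i64.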

3 files changed: +106 −0 lines changed
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 13 additions & 0 deletions
```
@@ -2569,6 +2569,19 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
     unsigned *Fast) const {
+
+  // Allow SVE loads/stores where the alignment >= the size of the element type,
+  // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
+  // for stores that come from IR, only require element-size alignment (even if
+  // unaligned accesses are disabled). Without this, these will be forced to
+  // have 16-byte alignment with +strict-align (and fail to lower as we don't
+  // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
+  if (VT.isScalableVector()) {
+    unsigned ElementSizeBits = VT.getScalarSizeInBits();
+    if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
+      return true;
+  }
+
   if (Subtarget->requiresStrictAlign())
     return false;
```
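For readers skimming the diff, here is a hedged, self-contained model (plain C++ written for this note, not the actual SelectionDAG code) of the decision flow the patched hook now implements:

```cpp
#include <cstdio>

// Standalone model of allowsMisalignedMemoryAccesses after this patch:
// the scalable-vector element check runs *before* the strict-align
// bail-out. Names and the Access struct are invented for illustration.
struct Access {
  bool ScalableVector;  // is the type an SVE scalable vector?
  unsigned ElementBits; // scalar element size in bits
  unsigned AlignBytes;  // alignment of the memory access in bytes
};

static bool allowsMisaligned(const Access &A, bool StrictAlign) {
  if (A.ScalableVector && A.ElementBits % 8 == 0 &&
      A.AlignBytes >= A.ElementBits / 8)
    return true;  // new early-accept added by this commit
  if (StrictAlign)
    return false; // pre-existing +strict-align bail-out
  return true;    // (remaining target-specific logic elided)
}

int main() {
  // nxv8i16 at align 2: accepted even under +strict-align (ld1h/st1h).
  std::printf("%d\n", allowsMisaligned({true, 16, 2}, true)); // 1
  // nxv2i64 at align 4: the element is 8 bytes, so strict-align rejects it.
  std::printf("%d\n", allowsMisaligned({true, 64, 4}, true)); // 0
}
```

Note the ordering: placing the scalable-vector check ahead of the requiresStrictAlign() bail-out is what lets element-aligned SVE accesses through under +strict-align.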

Lines changed: 62 additions & 0 deletions
```
@@ -0,0 +1,62 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+strict-align < %s | FileCheck %s

define void @nxv16i8(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i8>, ptr %ldptr, align 1
  store <vscale x 16 x i8> %l3, ptr %stptr, align 1
  ret void
}

define void @nxv8i16(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 8 x i16>, ptr %ldptr, align 2
  store <vscale x 8 x i16> %l3, ptr %stptr, align 2
  ret void
}

define void @nxv4i32(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 4 x i32>, ptr %ldptr, align 4
  store <vscale x 4 x i32> %l3, ptr %stptr, align 4
  ret void
}

define void @nxv2i64(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i64>, ptr %ldptr, align 8
  store <vscale x 2 x i64> %l3, ptr %stptr, align 8
  ret void
}

define void @nxv16i1(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv16i1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr p0, [x0]
; CHECK-NEXT:    str p0, [x1]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i1>, ptr %ldptr, align 2
  store <vscale x 16 x i1> %l3, ptr %stptr, align 2
  ret void
}
```
Lines changed: 31 additions & 0 deletions
```
@@ -0,0 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: not --crash llc -mtriple=aarch64-linux-gnu -mattr=+sve,+strict-align < %s 2>&1 | FileCheck %s --check-prefix=CHECK-FIXME

; REQUIRES: asserts

; FIXME: Support TLI.expandUnalignedLoad()/TLI.expandUnalignedStore() for SVE.
; CHECK-FIXME: LLVM ERROR: Invalid size request on a scalable vector.

define void @unaligned_nxv16i1(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: unaligned_nxv16i1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr p0, [x0]
; CHECK-NEXT:    str p0, [x1]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i1>, ptr %ldptr, align 1
  store <vscale x 16 x i1> %l3, ptr %stptr, align 1
  ret void
}

define void @unaligned_nxv2i64(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: unaligned_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i64>, ptr %ldptr, align 4
  store <vscale x 2 x i64> %l3, ptr %stptr, align 4
  ret void
}
```
