Skip to content

Commit 0748f42

Browse files
authored
[AArch64][GlobalISel] Legalize 128-bit types for FABS (#104753)
This patch adds a common lower action for `G_FABS`, which generates `and x8, x8, #0x7fffffffffffffff` to clear the sign bit. The action does not support vectors since `G_AND` does not support fp128. This approach differs from what SDAG does: SDAG stores the value onto the stack, clears the sign bit in the most significant byte, and loads the value back into a register. That involves multiple memory operations and is likely slower.
1 parent 3d5e1ec commit 0748f42

File tree

5 files changed

+207
-4
lines changed

5 files changed

+207
-4
lines changed

llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,7 @@ class LegalizerHelper {
437437
LegalizeResult lowerAbsToAddXor(MachineInstr &MI);
438438
LegalizeResult lowerAbsToMaxNeg(MachineInstr &MI);
439439
LegalizeResult lowerAbsToCNeg(MachineInstr &MI);
440+
LegalizeResult lowerFAbs(MachineInstr &MI);
440441
LegalizeResult lowerVectorReduction(MachineInstr &MI);
441442
LegalizeResult lowerMemcpyInline(MachineInstr &MI);
442443
LegalizeResult lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen = 0);

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4255,6 +4255,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
42554255
return lowerShlSat(MI);
42564256
case G_ABS:
42574257
return lowerAbsToAddXor(MI);
4258+
case G_FABS:
4259+
return lowerFAbs(MI);
42584260
case G_SELECT:
42594261
return lowerSelect(MI);
42604262
case G_IS_FPCLASS:
@@ -8761,6 +8763,22 @@ LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
87618763
return Legalized;
87628764
}
87638765

8766+
/// Lower G_FABS to an integer G_AND that clears the sign bit.
///
/// The mask is the signed-maximum value for the destination's scalar width,
/// i.e. every bit set except the most significant one. The G_AND is emitted
/// directly into the original destination register, after which \p MI is
/// erased.
///
/// \param MI the G_FABS instruction; operand 0 is the destination and
///           operand 1 is the source.
/// \returns Legalized unconditionally.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
  const Register Dst = MI.getOperand(0).getReg();
  const Register Src = MI.getOperand(1).getReg();
  const LLT DstTy = MRI.getType(Dst);

  // 0b0111...1 at the scalar width: keeps everything except the sign bit.
  const APInt ClearSignMask =
      APInt::getSignedMaxValue(DstTy.getScalarSizeInBits());
  auto MaskCst = MIRBuilder.buildConstant(DstTy, ClearSignMask);

  MIRBuilder.buildAnd(Dst, Src, MaskCst);

  MI.eraseFromParent();
  return Legalized;
}
8781+
87648782
LegalizerHelper::LegalizeResult
87658783
LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
87668784
Register SrcReg = MI.getOperand(1).getReg();

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -242,9 +242,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
242242
.widenScalarToNextPow2(0);
243243

244244
getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
245-
G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM,
246-
G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR,
247-
G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC,
245+
G_FSQRT, G_FMAXNUM, G_FMINNUM, G_FMAXIMUM,
246+
G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT,
247+
G_FNEARBYINT, G_INTRINSIC_TRUNC,
248248
G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
249249
.legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
250250
.legalIf([=](const LegalityQuery &Query) {
@@ -258,6 +258,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
258258
.clampNumElements(0, v2s64, v2s64)
259259
.moreElementsToNextPow2(0);
260260

261+
getActionDefinitionsBuilder(G_FABS)
262+
.legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
263+
.legalIf([=](const LegalityQuery &Query) {
264+
const auto &Ty = Query.Types[0];
265+
return (Ty == v8s16 || Ty == v4s16) && HasFP16;
266+
})
267+
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
268+
.lowerIf(scalarOrEltWiderThan(0, 64))
269+
.minScalarOrElt(0, MinFPScalar)
270+
.clampNumElements(0, v4s16, v8s16)
271+
.clampNumElements(0, v2s32, v4s32)
272+
.clampNumElements(0, v2s64, v2s64)
273+
.moreElementsToNextPow2(0);
274+
261275
getActionDefinitionsBuilder(G_FREM)
262276
.libcallFor({s32, s64})
263277
.minScalar(0, s32)

llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
12
# RUN: llc -mtriple=aarch64-- -run-pass=legalizer %s \
23
# RUN: -mcpu=cortex-a75 -o - 2>&1 | FileCheck %s --check-prefixes=CHECK
34

@@ -538,7 +539,6 @@
538539
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
539540
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
540541
# DEBUG-NEXT: G_FABS (opcode {{[0-9]+}}): 1 type index, 0 imm indices
541-
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
542542
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
543543
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
544544
# DEBUG-NEXT: G_FCOPYSIGN (opcode {{[0-9]+}}): 2 type indices
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel=0 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3+
; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel=1 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4+
5+
define fp128 @fabs_f128(fp128 %a) {
6+
; CHECK-SD-LABEL: fabs_f128:
7+
; CHECK-SD: // %bb.0: // %entry
8+
; CHECK-SD-NEXT: str q0, [sp, #-16]!
9+
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
10+
; CHECK-SD-NEXT: ldrb w8, [sp, #15]
11+
; CHECK-SD-NEXT: and w8, w8, #0x7f
12+
; CHECK-SD-NEXT: strb w8, [sp, #15]
13+
; CHECK-SD-NEXT: ldr q0, [sp], #16
14+
; CHECK-SD-NEXT: ret
15+
;
16+
; CHECK-GI-LABEL: fabs_f128:
17+
; CHECK-GI: // %bb.0: // %entry
18+
; CHECK-GI-NEXT: mov x8, v0.d[1]
19+
; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
20+
; CHECK-GI-NEXT: and x8, x8, #0x7fffffffffffffff
21+
; CHECK-GI-NEXT: mov v0.d[1], x8
22+
; CHECK-GI-NEXT: ret
23+
entry:
24+
%c = call fp128 @llvm.fabs.f128(fp128 %a)
25+
ret fp128 %c
26+
}
27+
28+
define <1 x fp128> @fabs_v1f128(<1 x fp128> %a) {
29+
; CHECK-SD-LABEL: fabs_v1f128:
30+
; CHECK-SD: // %bb.0: // %entry
31+
; CHECK-SD-NEXT: str q0, [sp, #-16]!
32+
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
33+
; CHECK-SD-NEXT: ldrb w8, [sp, #15]
34+
; CHECK-SD-NEXT: and w8, w8, #0x7f
35+
; CHECK-SD-NEXT: strb w8, [sp, #15]
36+
; CHECK-SD-NEXT: ldr q0, [sp], #16
37+
; CHECK-SD-NEXT: ret
38+
;
39+
; CHECK-GI-LABEL: fabs_v1f128:
40+
; CHECK-GI: // %bb.0: // %entry
41+
; CHECK-GI-NEXT: mov x8, v0.d[1]
42+
; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
43+
; CHECK-GI-NEXT: and x8, x8, #0x7fffffffffffffff
44+
; CHECK-GI-NEXT: mov v0.d[1], x8
45+
; CHECK-GI-NEXT: ret
46+
entry:
47+
%c = call <1 x fp128> @llvm.fabs.v1f128(<1 x fp128> %a)
48+
ret <1 x fp128> %c
49+
}
50+
51+
define <2 x fp128> @fabs_v2f128(<2 x fp128> %a) {
52+
; CHECK-SD-LABEL: fabs_v2f128:
53+
; CHECK-SD: // %bb.0: // %entry
54+
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
55+
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
56+
; CHECK-SD-NEXT: ldrb w8, [sp, #15]
57+
; CHECK-SD-NEXT: and w8, w8, #0x7f
58+
; CHECK-SD-NEXT: strb w8, [sp, #15]
59+
; CHECK-SD-NEXT: ldrb w8, [sp, #31]
60+
; CHECK-SD-NEXT: and w8, w8, #0x7f
61+
; CHECK-SD-NEXT: strb w8, [sp, #31]
62+
; CHECK-SD-NEXT: ldp q0, q1, [sp], #32
63+
; CHECK-SD-NEXT: ret
64+
;
65+
; CHECK-GI-LABEL: fabs_v2f128:
66+
; CHECK-GI: // %bb.0: // %entry
67+
; CHECK-GI-NEXT: mov x8, v0.d[1]
68+
; CHECK-GI-NEXT: mov x9, v1.d[1]
69+
; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
70+
; CHECK-GI-NEXT: mov v1.d[0], v1.d[0]
71+
; CHECK-GI-NEXT: and x8, x8, #0x7fffffffffffffff
72+
; CHECK-GI-NEXT: and x9, x9, #0x7fffffffffffffff
73+
; CHECK-GI-NEXT: mov v0.d[1], x8
74+
; CHECK-GI-NEXT: mov v1.d[1], x9
75+
; CHECK-GI-NEXT: ret
76+
entry:
77+
%c = call <2 x fp128> @llvm.fabs.v2f128(<2 x fp128> %a)
78+
ret <2 x fp128> %c
79+
}
80+
81+
define <3 x fp128> @fabs_v3f128(<3 x fp128> %a) {
82+
; CHECK-SD-LABEL: fabs_v3f128:
83+
; CHECK-SD: // %bb.0: // %entry
84+
; CHECK-SD-NEXT: stp q0, q1, [sp, #-48]!
85+
; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
86+
; CHECK-SD-NEXT: ldrb w8, [sp, #15]
87+
; CHECK-SD-NEXT: str q2, [sp, #32]
88+
; CHECK-SD-NEXT: and w8, w8, #0x7f
89+
; CHECK-SD-NEXT: strb w8, [sp, #15]
90+
; CHECK-SD-NEXT: ldrb w8, [sp, #31]
91+
; CHECK-SD-NEXT: and w8, w8, #0x7f
92+
; CHECK-SD-NEXT: strb w8, [sp, #31]
93+
; CHECK-SD-NEXT: ldrb w8, [sp, #47]
94+
; CHECK-SD-NEXT: ldp q0, q1, [sp]
95+
; CHECK-SD-NEXT: and w8, w8, #0x7f
96+
; CHECK-SD-NEXT: strb w8, [sp, #47]
97+
; CHECK-SD-NEXT: ldr q2, [sp, #32]
98+
; CHECK-SD-NEXT: add sp, sp, #48
99+
; CHECK-SD-NEXT: ret
100+
;
101+
; CHECK-GI-LABEL: fabs_v3f128:
102+
; CHECK-GI: // %bb.0: // %entry
103+
; CHECK-GI-NEXT: mov x8, v0.d[1]
104+
; CHECK-GI-NEXT: mov x9, v1.d[1]
105+
; CHECK-GI-NEXT: mov x10, v2.d[1]
106+
; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
107+
; CHECK-GI-NEXT: mov v1.d[0], v1.d[0]
108+
; CHECK-GI-NEXT: mov v2.d[0], v2.d[0]
109+
; CHECK-GI-NEXT: and x8, x8, #0x7fffffffffffffff
110+
; CHECK-GI-NEXT: and x9, x9, #0x7fffffffffffffff
111+
; CHECK-GI-NEXT: and x10, x10, #0x7fffffffffffffff
112+
; CHECK-GI-NEXT: mov v0.d[1], x8
113+
; CHECK-GI-NEXT: mov v1.d[1], x9
114+
; CHECK-GI-NEXT: mov v2.d[1], x10
115+
; CHECK-GI-NEXT: ret
116+
entry:
117+
%c = call <3 x fp128> @llvm.fabs.v3f128(<3 x fp128> %a)
118+
ret <3 x fp128> %c
119+
}
120+
121+
define <4 x fp128> @fabs_v4f128(<4 x fp128> %a) {
122+
; CHECK-SD-LABEL: fabs_v4f128:
123+
; CHECK-SD: // %bb.0: // %entry
124+
; CHECK-SD-NEXT: stp q0, q1, [sp, #-64]!
125+
; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
126+
; CHECK-SD-NEXT: ldrb w8, [sp, #15]
127+
; CHECK-SD-NEXT: stp q2, q3, [sp, #32]
128+
; CHECK-SD-NEXT: and w8, w8, #0x7f
129+
; CHECK-SD-NEXT: strb w8, [sp, #15]
130+
; CHECK-SD-NEXT: ldrb w8, [sp, #31]
131+
; CHECK-SD-NEXT: and w8, w8, #0x7f
132+
; CHECK-SD-NEXT: strb w8, [sp, #31]
133+
; CHECK-SD-NEXT: ldrb w8, [sp, #47]
134+
; CHECK-SD-NEXT: ldp q0, q1, [sp]
135+
; CHECK-SD-NEXT: and w8, w8, #0x7f
136+
; CHECK-SD-NEXT: strb w8, [sp, #47]
137+
; CHECK-SD-NEXT: ldrb w8, [sp, #63]
138+
; CHECK-SD-NEXT: and w8, w8, #0x7f
139+
; CHECK-SD-NEXT: strb w8, [sp, #63]
140+
; CHECK-SD-NEXT: ldp q2, q3, [sp, #32]
141+
; CHECK-SD-NEXT: add sp, sp, #64
142+
; CHECK-SD-NEXT: ret
143+
;
144+
; CHECK-GI-LABEL: fabs_v4f128:
145+
; CHECK-GI: // %bb.0: // %entry
146+
; CHECK-GI-NEXT: mov x8, v0.d[1]
147+
; CHECK-GI-NEXT: mov v7.d[0], v0.d[0]
148+
; CHECK-GI-NEXT: mov x9, v1.d[1]
149+
; CHECK-GI-NEXT: mov x10, v2.d[1]
150+
; CHECK-GI-NEXT: mov x11, v3.d[1]
151+
; CHECK-GI-NEXT: mov v1.d[0], v1.d[0]
152+
; CHECK-GI-NEXT: mov v2.d[0], v2.d[0]
153+
; CHECK-GI-NEXT: mov v3.d[0], v3.d[0]
154+
; CHECK-GI-NEXT: and x8, x8, #0x7fffffffffffffff
155+
; CHECK-GI-NEXT: mov v7.d[1], x8
156+
; CHECK-GI-NEXT: and x8, x9, #0x7fffffffffffffff
157+
; CHECK-GI-NEXT: and x9, x10, #0x7fffffffffffffff
158+
; CHECK-GI-NEXT: and x10, x11, #0x7fffffffffffffff
159+
; CHECK-GI-NEXT: mov v1.d[1], x8
160+
; CHECK-GI-NEXT: mov v2.d[1], x9
161+
; CHECK-GI-NEXT: mov v3.d[1], x10
162+
; CHECK-GI-NEXT: mov v0.16b, v7.16b
163+
; CHECK-GI-NEXT: ret
164+
entry:
165+
%c = call <4 x fp128> @llvm.fabs.v4f128(<4 x fp128> %a)
166+
ret <4 x fp128> %c
167+
}
168+
169+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
170+
; CHECK: {{.*}}

0 commit comments

Comments
 (0)