Skip to content

Commit 6c1bac9

Browse files
sdesmalen-arm authored and llvmbot committed
[AArch64] Avoid NEON dot product in streaming[-compatible] functions (#101677)
The NEON dot product is not valid in streaming mode. A follow-up patch will improve codegen for these operations. (cherry picked from commit 12937b1)
1 parent 69555e0 commit 6c1bac9

File tree

2 files changed

+146
-0
lines changed

2 files changed

+146
-0
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -17719,6 +17719,9 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
1771917719
// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
1772017720
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
1772117721
const AArch64Subtarget *ST) {
17722+
if (!ST->isNeonAvailable())
17723+
return SDValue();
17724+
1772217725
if (!ST->hasDotProd())
1772317726
return performVecReduceAddCombineWithUADDLP(N, DAG);
1772417727

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mattr=+sve < %s | FileCheck %s
3+
; RUN: llc -mattr=+dotprod,+sve < %s | FileCheck %s -check-prefix=DOT
4+
; RUN: llc -mattr=+dotprod,+sve -force-streaming-compatible < %s | FileCheck %s --check-prefix=STREAMING-SVE
5+
; RUN: llc -mattr=+dotprod,+sme -force-streaming < %s | FileCheck %s --check-prefix=STREAMING-SVE
6+
7+
target triple = "aarch64-unknown-linux-gnu"
8+
9+
; Unsigned widening add-reduction: zero-extends every i8 lane of %a to i32 and
; sums all 32 lanes with llvm.vector.reduce.add.v32i32.
; Expected lowering for each RUN configuration:
;   - default prefix (+sve, no dotprod): NEON ushll/uaddl widening adds, then addv.
;   - dotprod prefix (+dotprod,+sve): two udot instructions against a vector of
;     byte 1s accumulating into a zeroed register, then addv.
;   - streaming prefix (-force-streaming-compatible or -force-streaming): no udot;
;     SVE uunpklo/ext/add sequences followed by a predicated uaddv.
; NOTE(review): the function name says v16i8 but the argument is <32 x i8> -
; confirm the name is intentional; renaming requires regenerating the assertions.
define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
10+
; CHECK-LABEL: reduce_uaddv_v16i8:
11+
; CHECK: // %bb.0:
12+
; CHECK-NEXT: ushll2 v2.8h, v1.16b, #0
13+
; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0
14+
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
15+
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
16+
; CHECK-NEXT: uaddl2 v4.4s, v3.8h, v2.8h
17+
; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h
18+
; CHECK-NEXT: uaddl2 v5.4s, v0.8h, v1.8h
19+
; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
20+
; CHECK-NEXT: add v1.4s, v5.4s, v4.4s
21+
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
22+
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
23+
; CHECK-NEXT: addv s0, v0.4s
24+
; CHECK-NEXT: fmov w0, s0
25+
; CHECK-NEXT: ret
26+
;
27+
; DOT-LABEL: reduce_uaddv_v16i8:
28+
; DOT: // %bb.0:
29+
; DOT-NEXT: movi v2.16b, #1
30+
; DOT-NEXT: movi v3.2d, #0000000000000000
31+
; DOT-NEXT: udot v3.4s, v1.16b, v2.16b
32+
; DOT-NEXT: udot v3.4s, v0.16b, v2.16b
33+
; DOT-NEXT: addv s0, v3.4s
34+
; DOT-NEXT: fmov w0, s0
35+
; DOT-NEXT: ret
36+
;
37+
; STREAMING-SVE-LABEL: reduce_uaddv_v16i8:
38+
; STREAMING-SVE: // %bb.0:
39+
; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
40+
; STREAMING-SVE-NEXT: uunpklo z2.h, z1.b
41+
; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
42+
; STREAMING-SVE-NEXT: uunpklo z3.h, z0.b
43+
; STREAMING-SVE-NEXT: ptrue p0.s, vl4
44+
; STREAMING-SVE-NEXT: ext z1.b, z1.b, z1.b, #8
45+
; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8
46+
; STREAMING-SVE-NEXT: uunpklo z1.h, z1.b
47+
; STREAMING-SVE-NEXT: uunpklo z0.h, z0.b
48+
; STREAMING-SVE-NEXT: uunpklo z4.s, z2.h
49+
; STREAMING-SVE-NEXT: ext z2.b, z2.b, z2.b, #8
50+
; STREAMING-SVE-NEXT: uunpklo z6.s, z3.h
51+
; STREAMING-SVE-NEXT: ext z3.b, z3.b, z3.b, #8
52+
; STREAMING-SVE-NEXT: mov z5.d, z1.d
53+
; STREAMING-SVE-NEXT: uunpklo z7.s, z0.h
54+
; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8
55+
; STREAMING-SVE-NEXT: uunpklo z2.s, z2.h
56+
; STREAMING-SVE-NEXT: uunpklo z3.s, z3.h
57+
; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s
58+
; STREAMING-SVE-NEXT: ext z5.b, z5.b, z1.b, #8
59+
; STREAMING-SVE-NEXT: uunpklo z1.s, z1.h
60+
; STREAMING-SVE-NEXT: uunpklo z0.s, z0.h
61+
; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s
62+
; STREAMING-SVE-NEXT: uunpklo z5.s, z5.h
63+
; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s
64+
; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s
65+
; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s
66+
; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s
67+
; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s
68+
; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
69+
; STREAMING-SVE-NEXT: fmov x0, d0
70+
; STREAMING-SVE-NEXT: // kill: def $w0 killed $w0 killed $x0
71+
; STREAMING-SVE-NEXT: ret
72+
%1 = zext <32 x i8> %a to <32 x i32>
73+
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
74+
ret i32 %2
75+
}
76+
77+
; Signed widening add-reduction: sign-extends every i8 lane of %a to i32 and
; sums all 32 lanes with llvm.vector.reduce.add.v32i32.
; Expected lowering for each RUN configuration:
;   - default prefix (+sve, no dotprod): NEON sshll/saddl widening adds, then addv.
;   - dotprod prefix (+dotprod,+sve): two sdot instructions against a vector of
;     byte 1s accumulating into a zeroed register, then addv.
;   - streaming prefix (-force-streaming-compatible or -force-streaming): no sdot;
;     SVE sunpklo/ext/add sequences; the final reduction of the already
;     sign-extended i32 lanes is still a predicated uaddv.
; NOTE(review): the function name says v16i8 but the argument is <32 x i8> -
; confirm the name is intentional; renaming requires regenerating the assertions.
define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
78+
; CHECK-LABEL: reduce_saddv_v16i8:
79+
; CHECK: // %bb.0:
80+
; CHECK-NEXT: sshll2 v2.8h, v1.16b, #0
81+
; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0
82+
; CHECK-NEXT: sshll v1.8h, v1.8b, #0
83+
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
84+
; CHECK-NEXT: saddl2 v4.4s, v3.8h, v2.8h
85+
; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h
86+
; CHECK-NEXT: saddl2 v5.4s, v0.8h, v1.8h
87+
; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
88+
; CHECK-NEXT: add v1.4s, v5.4s, v4.4s
89+
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
90+
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
91+
; CHECK-NEXT: addv s0, v0.4s
92+
; CHECK-NEXT: fmov w0, s0
93+
; CHECK-NEXT: ret
94+
;
95+
; DOT-LABEL: reduce_saddv_v16i8:
96+
; DOT: // %bb.0:
97+
; DOT-NEXT: movi v2.16b, #1
98+
; DOT-NEXT: movi v3.2d, #0000000000000000
99+
; DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
100+
; DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
101+
; DOT-NEXT: addv s0, v3.4s
102+
; DOT-NEXT: fmov w0, s0
103+
; DOT-NEXT: ret
104+
;
105+
; STREAMING-SVE-LABEL: reduce_saddv_v16i8:
106+
; STREAMING-SVE: // %bb.0:
107+
; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
108+
; STREAMING-SVE-NEXT: sunpklo z2.h, z1.b
109+
; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
110+
; STREAMING-SVE-NEXT: sunpklo z3.h, z0.b
111+
; STREAMING-SVE-NEXT: ptrue p0.s, vl4
112+
; STREAMING-SVE-NEXT: ext z1.b, z1.b, z1.b, #8
113+
; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8
114+
; STREAMING-SVE-NEXT: sunpklo z1.h, z1.b
115+
; STREAMING-SVE-NEXT: sunpklo z0.h, z0.b
116+
; STREAMING-SVE-NEXT: sunpklo z4.s, z2.h
117+
; STREAMING-SVE-NEXT: ext z2.b, z2.b, z2.b, #8
118+
; STREAMING-SVE-NEXT: sunpklo z6.s, z3.h
119+
; STREAMING-SVE-NEXT: ext z3.b, z3.b, z3.b, #8
120+
; STREAMING-SVE-NEXT: mov z5.d, z1.d
121+
; STREAMING-SVE-NEXT: sunpklo z7.s, z0.h
122+
; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8
123+
; STREAMING-SVE-NEXT: sunpklo z2.s, z2.h
124+
; STREAMING-SVE-NEXT: sunpklo z3.s, z3.h
125+
; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s
126+
; STREAMING-SVE-NEXT: ext z5.b, z5.b, z1.b, #8
127+
; STREAMING-SVE-NEXT: sunpklo z1.s, z1.h
128+
; STREAMING-SVE-NEXT: sunpklo z0.s, z0.h
129+
; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s
130+
; STREAMING-SVE-NEXT: sunpklo z5.s, z5.h
131+
; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s
132+
; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s
133+
; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s
134+
; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s
135+
; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s
136+
; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
137+
; STREAMING-SVE-NEXT: fmov x0, d0
138+
; STREAMING-SVE-NEXT: // kill: def $w0 killed $w0 killed $x0
139+
; STREAMING-SVE-NEXT: ret
140+
%1 = sext <32 x i8> %a to <32 x i32>
141+
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
142+
ret i32 %2
143+
}

0 commit comments

Comments
 (0)