Skip to content

Commit 8a67870

Browse files
committed
[AArch64][SVE] Fold ADD+CNTB to INCB/DECB
Currently, given:

```cpp
uint64_t incb(uint64_t x) {
  return x+svcntb();
}
```

LLVM generates:

```gas
incb:
        addvl   x0, x0, #1
        ret
```

Which is functionally equivalent to:

```gas
incb:
        incb    x0
        ret
```

However, on microarchitectures like the Neoverse V2 and Neoverse V3, the second form (with INCB) can have significantly better latency and throughput. On the Neoverse V2, for example, ADDVL has a latency and throughput of 2, whereas INCB has a latency of 1 and a throughput of 4 (and similarly for the Neoverse V3, though in this case the throughput is further increased to 8). The same applies to DECB.

This patch adds patterns to prefer the INCB/DECB forms over ADDVL where applicable.
1 parent 14335be commit 8a67870

File tree

9 files changed

+81
-64
lines changed

9 files changed

+81
-64
lines changed

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,13 @@ def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_V
142142

143143
// SVE CNT/INC/RDVL
144144
def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">;
145+
def sve_cntb_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 16>">;
145146
def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">;
146147
def sve_cntw_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 4>">;
147148
def sve_cntd_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 2>">;
148149

149150
// SVE DEC
151+
def sve_cntb_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -16>">;
150152
def sve_cnth_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -8>">;
151153
def sve_cntw_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -4>">;
152154
def sve_cntd_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -2>">;
@@ -2678,28 +2680,31 @@ let Predicates = [HasSVE_or_SME] in {
26782680
}
26792681

26802682
let Predicates = [HasSVE_or_SME, UseScalarIncVL], AddedComplexity = 5 in {
2681-
def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
2682-
(ADDVL_XXI GPR64:$op, $imm)>;
2683-
2684-
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
2685-
(EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
2686-
GPR32:$op, sub_32), $imm),
2687-
sub_32)>;
2688-
2683+
def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm i32:$imm))),
2684+
(INCB_XPiI GPR64:$op, 31, $imm)>;
26892685
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
26902686
(INCH_XPiI GPR64:$op, 31, $imm)>;
26912687
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))),
26922688
(INCW_XPiI GPR64:$op, 31, $imm)>;
26932689
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm i32:$imm))),
26942690
(INCD_XPiI GPR64:$op, 31, $imm)>;
26952691

2692+
def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm_neg i32:$imm))),
2693+
(DECB_XPiI GPR64:$op, 31, $imm)>;
26962694
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
26972695
(DECH_XPiI GPR64:$op, 31, $imm)>;
26982696
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
26992697
(DECW_XPiI GPR64:$op, 31, $imm)>;
27002698
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
27012699
(DECD_XPiI GPR64:$op, 31, $imm)>;
27022700

2701+
def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
2702+
(ADDVL_XXI GPR64:$op, $imm)>;
2703+
2704+
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm i32:$imm))))),
2705+
(EXTRACT_SUBREG (INCB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
2706+
GPR32:$op, sub_32), 31, $imm),
2707+
sub_32)>;
27032708
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm i32:$imm))))),
27042709
(EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
27052710
GPR32:$op, sub_32), 31, $imm),
@@ -2713,6 +2718,10 @@ let Predicates = [HasSVE_or_SME] in {
27132718
GPR32:$op, sub_32), 31, $imm),
27142719
sub_32)>;
27152720

2721+
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm_neg i32:$imm))))),
2722+
(EXTRACT_SUBREG (DECB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
2723+
GPR32:$op, sub_32), 31, $imm),
2724+
sub_32)>;
27162725
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
27172726
(EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
27182727
GPR32:$op, sub_32), 31, $imm),
@@ -2725,6 +2734,11 @@ let Predicates = [HasSVE_or_SME] in {
27252734
(EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
27262735
GPR32:$op, sub_32), 31, $imm),
27272736
sub_32)>;
2737+
2738+
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
2739+
(EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
2740+
GPR32:$op, sub_32), $imm),
2741+
sub_32)>;
27282742
}
27292743

27302744
// For big endian, only BITCASTs involving same sized vector types with same

llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ define void @quux() #1 {
6565
; CHECK-NEXT: mov sp, x9
6666
; CHECK-NEXT: sub x10, x29, #104
6767
; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill
68-
; CHECK-NEXT: addvl x9, x8, #1
68+
; CHECK-NEXT: mov x9, x8
69+
; CHECK-NEXT: incb x9
6970
; CHECK-NEXT: mov w0, w9
7071
; CHECK-NEXT: // implicit-def: $x9
7172
; CHECK-NEXT: mov w9, w0
@@ -160,7 +161,8 @@ define void @quux() #1 {
160161
; CHECK-NEXT: mov x9, sp
161162
; CHECK-NEXT: subs x9, x9, #16
162163
; CHECK-NEXT: mov sp, x9
163-
; CHECK-NEXT: addvl x9, x8, #2
164+
; CHECK-NEXT: mov x9, x8
165+
; CHECK-NEXT: incb x9, all, mul #2
164166
; CHECK-NEXT: mov w0, w9
165167
; CHECK-NEXT: // implicit-def: $x9
166168
; CHECK-NEXT: mov w9, w0

llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,9 +271,9 @@ define void @ldr_with_off_15(ptr %ptr) {
271271
define void @ldr_with_off_15mulvl(ptr %ptr) {
272272
; CHECK-LABEL: ldr_with_off_15mulvl:
273273
; CHECK: // %bb.0:
274+
; CHECK-NEXT: incb x0, all, mul #15
274275
; CHECK-NEXT: mov w12, #15 // =0xf
275-
; CHECK-NEXT: addvl x8, x0, #15
276-
; CHECK-NEXT: ldr za[w12, 0], [x8]
276+
; CHECK-NEXT: ldr za[w12, 0], [x0]
277277
; CHECK-NEXT: ret
278278
%vscale = call i64 @llvm.vscale.i64()
279279
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @ldr_with_off_15mulvl(ptr %ptr) {
285285
define void @ldr_with_off_16mulvl(ptr %ptr) {
286286
; CHECK-LABEL: ldr_with_off_16mulvl:
287287
; CHECK: // %bb.0:
288+
; CHECK-NEXT: incb x0, all, mul #16
288289
; CHECK-NEXT: mov w12, #16 // =0x10
289-
; CHECK-NEXT: addvl x8, x0, #16
290-
; CHECK-NEXT: ldr za[w12, 0], [x8]
290+
; CHECK-NEXT: ldr za[w12, 0], [x0]
291291
; CHECK-NEXT: ret
292292
%vscale = call i64 @llvm.vscale.i64()
293293
%mulvl = mul i64 %vscale, 256

llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,9 +271,9 @@ define void @str_with_off_15(ptr %ptr) {
271271
define void @str_with_off_15mulvl(ptr %ptr) {
272272
; CHECK-LABEL: str_with_off_15mulvl:
273273
; CHECK: // %bb.0:
274+
; CHECK-NEXT: incb x0, all, mul #15
274275
; CHECK-NEXT: mov w12, #15 // =0xf
275-
; CHECK-NEXT: addvl x8, x0, #15
276-
; CHECK-NEXT: str za[w12, 0], [x8]
276+
; CHECK-NEXT: str za[w12, 0], [x0]
277277
; CHECK-NEXT: ret
278278
%vscale = call i64 @llvm.vscale.i64()
279279
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @str_with_off_15mulvl(ptr %ptr) {
285285
define void @str_with_off_16mulvl(ptr %ptr) {
286286
; CHECK-LABEL: str_with_off_16mulvl:
287287
; CHECK: // %bb.0:
288+
; CHECK-NEXT: incb x0, all, mul #16
288289
; CHECK-NEXT: mov w12, #16 // =0x10
289-
; CHECK-NEXT: addvl x8, x0, #16
290-
; CHECK-NEXT: str za[w12, 0], [x8]
290+
; CHECK-NEXT: str za[w12, 0], [x0]
291291
; CHECK-NEXT: ret
292292
%vscale = call i64 @llvm.vscale.i64()
293293
%mulvl = mul i64 %vscale, 256

llvm/test/CodeGen/AArch64/sve-lsrchain.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
8585
; CHECK-NEXT: ldr z5, [x4, #3, mul vl]
8686
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
8787
; CHECK-NEXT: str z4, [x16, #3, mul vl]
88-
; CHECK-NEXT: addvl x16, x16, #4
88+
; CHECK-NEXT: incb x16, all, mul #4
8989
; CHECK-NEXT: cmp x16, x11
9090
; CHECK-NEXT: b.lo .LBB0_4
9191
; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us

llvm/test/CodeGen/AArch64/sve-vl-arith.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ define i64 @incb_scalar_i64(i64 %a) {
123123
;
124124
; CHECK-LABEL: incb_scalar_i64:
125125
; CHECK: // %bb.0:
126-
; CHECK-NEXT: addvl x0, x0, #1
126+
; CHECK-NEXT: incb x0
127127
; CHECK-NEXT: ret
128128
%vscale = call i64 @llvm.vscale.i64()
129129
%mul = mul i64 %vscale, 16
@@ -193,7 +193,7 @@ define i64 @decb_scalar_i64(i64 %a) {
193193
;
194194
; CHECK-LABEL: decb_scalar_i64:
195195
; CHECK: // %bb.0:
196-
; CHECK-NEXT: addvl x0, x0, #-2
196+
; CHECK-NEXT: decb x0, all, mul #2
197197
; CHECK-NEXT: ret
198198
%vscale = call i64 @llvm.vscale.i64()
199199
%mul = mul i64 %vscale, 32
@@ -264,7 +264,7 @@ define i32 @incb_scalar_i32(i32 %a) {
264264
; CHECK-LABEL: incb_scalar_i32:
265265
; CHECK: // %bb.0:
266266
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
267-
; CHECK-NEXT: addvl x0, x0, #3
267+
; CHECK-NEXT: incb x0, all, mul #3
268268
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
269269
; CHECK-NEXT: ret
270270

@@ -350,7 +350,7 @@ define i32 @decb_scalar_i32(i32 %a) {
350350
; CHECK-LABEL: decb_scalar_i32:
351351
; CHECK: // %bb.0:
352352
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
353-
; CHECK-NEXT: addvl x0, x0, #-4
353+
; CHECK-NEXT: decb x0, all, mul #4
354354
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
355355
; CHECK-NEXT: ret
356356

llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ define <vscale x 4 x i32> @test_svld1uwq_i32_si(<vscale x 1 x i1> %pred, ptr %ba
3333
define <vscale x 4 x i32> @test_svld1uwq_i32_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
3434
; CHECK-LABEL: test_svld1uwq_i32_out_of_bound:
3535
; CHECK: // %bb.0:
36-
; CHECK-NEXT: addvl x8, x0, #2
37-
; CHECK-NEXT: ld1w { z0.q }, p0/z, [x8]
36+
; CHECK-NEXT: incb x0, all, mul #2
37+
; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0]
3838
; CHECK-NEXT: ret
3939
%gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8
4040
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep)
@@ -101,8 +101,8 @@ define <vscale x 2 x i64> @test_svld1udq_i64_si(<vscale x 1 x i1> %pred, ptr %ba
101101
define <vscale x 2 x i64> @test_svld1udq_i64_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
102102
; CHECK-LABEL: test_svld1udq_i64_out_of_bound:
103103
; CHECK: // %bb.0:
104-
; CHECK-NEXT: addvl x8, x0, #-5
105-
; CHECK-NEXT: ld1d { z0.q }, p0/z, [x8]
104+
; CHECK-NEXT: decb x0, all, mul #5
105+
; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0]
106106
; CHECK-NEXT: ret
107107
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
108108
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> %pred, ptr %gep)

llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ define void @test_svst1wq_i32_si(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred
3030
define void @test_svst1wq_i32_out_of_bound(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %base) {
3131
; CHECK-LABEL: test_svst1wq_i32_out_of_bound:
3232
; CHECK: // %bb.0:
33-
; CHECK-NEXT: addvl x8, x0, #2
34-
; CHECK-NEXT: st1w { z0.q }, p0, [x8]
33+
; CHECK-NEXT: incb x0, all, mul #2
34+
; CHECK-NEXT: st1w { z0.q }, p0, [x0]
3535
; CHECK-NEXT: ret
3636
%gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8
3737
call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %gep)
@@ -91,8 +91,8 @@ define void @test_svst1dq_i64_si(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred
9191
define void @test_svst1dq_i64_out_of_bound(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %base) {
9292
; CHECK-LABEL: test_svst1dq_i64_out_of_bound:
9393
; CHECK: // %bb.0:
94-
; CHECK-NEXT: addvl x8, x0, #-5
95-
; CHECK-NEXT: st1d { z0.q }, p0, [x8]
94+
; CHECK-NEXT: decb x0, all, mul #5
95+
; CHECK-NEXT: st1d { z0.q }, p0, [x0]
9696
; CHECK-NEXT: ret
9797
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
9898
call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %gep)

0 commit comments

Comments
 (0)