
Commit 9f54332

AArch64: Add support for boolean reductions for Adv. SIMD
The vectorizer has learned how to do boolean reductions of masks to a C bool
for the operations OR, XOR and AND.  This implements the new optabs for
Adv. SIMD.

Adv. SIMD today can already vectorize such loops, but does so through
SHIFT-AND-INSERT to perform the reduction step-wise and in order.  As an
example, an OR reduction today does:

        movi    v3.4s, 0
        ext     v5.16b, v30.16b, v3.16b, #8
        orr     v5.16b, v5.16b, v30.16b
        ext     v29.16b, v5.16b, v3.16b, #4
        orr     v29.16b, v29.16b, v5.16b
        ext     v4.16b, v29.16b, v3.16b, #2
        orr     v4.16b, v4.16b, v29.16b
        ext     v3.16b, v4.16b, v3.16b, #1
        orr     v3.16b, v3.16b, v4.16b
        fmov    w1, s3
        and     w1, w1, 1

When reducing to a boolean, however, we don't need the step-wise reduction and
can just look at the bit patterns.  For OR, for example, we now generate:

        umaxp   v3.4s, v3.4s, v3.4s
        fmov    x1, d3
        cmp     x1, 0
        cset    w0, ne

For the remaining codegen see test vect-reduc-bool-9.c.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (reduc_sbool_and_scal_<mode>,
	reduc_sbool_ior_scal_<mode>, reduc_sbool_xor_scal_<mode>): New.
	* config/aarch64/iterators.md (VALLI): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-reduc-bool-1.c: New test.
	* gcc.target/aarch64/vect-reduc-bool-2.c: New test.
	* gcc.target/aarch64/vect-reduc-bool-3.c: New test.
	* gcc.target/aarch64/vect-reduc-bool-4.c: New test.
	* gcc.target/aarch64/vect-reduc-bool-5.c: New test.
	* gcc.target/aarch64/vect-reduc-bool-6.c: New test.
	* gcc.target/aarch64/vect-reduc-bool-7.c: New test.
	* gcc.target/aarch64/vect-reduc-bool-8.c: New test.
	* gcc.target/aarch64/vect-reduc-bool-9.c: New test.
1 parent 9b3606d commit 9f54332
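
For context, the kind of loop that now reaches these optabs mirrors the new
vect-reduc-bool tests added below; a minimal standalone sketch (the function
name here is illustrative, not part of the patch):

        #include <stdbool.h>

        char p[128];

        /* OR-reduce a comparison mask to a C bool; with -O3 and Adv. SIMD
           autovectorization this is the shape of loop that can now use the
           new boolean reduction optabs instead of the step-wise sequence.  */
        bool
        any_nonzero (int n)
        {
          bool r = false;
          for (int i = 0; i < n; ++i)
            r |= (p[i] != 0);
          return r;
        }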

11 files changed: +563 −0 lines changed

gcc/config/aarch64/aarch64-simd.md

Lines changed: 97 additions & 0 deletions
@@ -3469,6 +3469,103 @@
   DONE;
 })
 
+;; AND tree reductions.
+;; Check whether after a min pairwise reduction all the lanes are 1.
+;;
+;; uminp	v1.4s, v1.4s, v1.4s
+;; fmov	x1, d1
+;; cmn	x1, #1
+;; cset	w0, eq
+;;
+(define_expand "reduc_sbool_and_scal_<mode>"
+  [(set (match_operand:QI 0 "register_operand")
+	(unspec:QI [(match_operand:VALLI 1 "register_operand")]
+		   UNSPEC_ANDV))]
+  "TARGET_SIMD"
+{
+  rtx tmp = operands[1];
+  /* 128-bit vectors need to be compressed to 64-bits first.  */
+  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+    {
+      /* Always reduce using a V4SI.  */
+      rtx reduc = gen_lowpart (V4SImode, tmp);
+      rtx res = gen_reg_rtx (V4SImode);
+      emit_insn (gen_aarch64_uminpv4si (res, reduc, reduc));
+      emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+    }
+  rtx val = gen_reg_rtx (DImode);
+  emit_move_insn (val, gen_lowpart (DImode, tmp));
+  rtx cc_reg = aarch64_gen_compare_reg (EQ, val, constm1_rtx);
+  rtx cmp = gen_rtx_fmt_ee (EQ, SImode, cc_reg, constm1_rtx);
+  rtx tmp2 = gen_reg_rtx (SImode);
+  emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg));
+  emit_move_insn (operands[0], gen_lowpart (QImode, tmp2));
+  DONE;
+})
+
+;; IOR tree reductions.
+;; Check that after a MAX pairwise reduction any lane is not 0.
+;;
+;; umaxp	v1.4s, v1.4s, v1.4s
+;; fmov	x1, d1
+;; cmp	x1, 0
+;; cset	w0, ne
+;;
+(define_expand "reduc_sbool_ior_scal_<mode>"
+  [(set (match_operand:QI 0 "register_operand")
+	(unspec:QI [(match_operand:VALLI 1 "register_operand")]
+		   UNSPEC_IORV))]
+  "TARGET_SIMD"
+{
+  rtx tmp = operands[1];
+  /* 128-bit vectors need to be compressed to 64-bits first.  */
+  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+    {
+      /* Always reduce using a V4SI.  */
+      rtx reduc = gen_lowpart (V4SImode, tmp);
+      rtx res = gen_reg_rtx (V4SImode);
+      emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
+      emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+    }
+  rtx val = gen_reg_rtx (DImode);
+  emit_move_insn (val, gen_lowpart (DImode, tmp));
+  rtx cc_reg = aarch64_gen_compare_reg (NE, val, const0_rtx);
+  rtx cmp = gen_rtx_fmt_ee (NE, SImode, cc_reg, const0_rtx);
+  rtx tmp2 = gen_reg_rtx (SImode);
+  emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg));
+  emit_move_insn (operands[0], gen_lowpart (QImode, tmp2));
+  DONE;
+})
+
+;; Unpredicated predicate XOR tree reductions.
+;; Check to see if the number of active lanes in the predicates is a multiple
+;; of 2.  We use a normal reduction after masking with 0x1.
+;;
+;; movi	v1.16b, 0x1
+;; and	v2.16b, v2.16b, v1.16b
+;; addv	b3, v2.16b
+;; fmov	w1, s3
+;; and	w0, w1, 1
+;;
+(define_expand "reduc_sbool_xor_scal_<mode>"
+  [(set (match_operand:QI 0 "register_operand")
+	(unspec:QI [(match_operand:VALLI 1 "register_operand")]
+		   UNSPEC_XORV))]
+  "TARGET_SIMD"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  rtx one_reg = force_reg (<MODE>mode, CONST1_RTX (<MODE>mode));
+  emit_move_insn (tmp, gen_rtx_AND (<MODE>mode, operands[1], one_reg));
+  rtx tmp2 = gen_reg_rtx (<VEL>mode);
+  emit_insn (gen_reduc_plus_scal_<mode> (tmp2, tmp));
+  rtx tmp3 = gen_reg_rtx (DImode);
+  emit_move_insn (tmp3, gen_rtx_AND (DImode,
+				     lowpart_subreg (DImode, tmp2, <VEL>mode),
+				     const1_rtx));
+  emit_move_insn (operands[0], gen_lowpart (QImode, tmp3));
+  DONE;
+})
+
 ;; SADDLV and UADDLV can be expressed as an ADDV instruction that first
 ;; sign or zero-extends its elements.
 (define_insn "aarch64_<su>addlv<mode>"
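
The single-compare approach works because the masks handed to these optabs
have lanes that are either all-zeros or all-ones.  A minimal scalar sketch of
the three checks, assuming a 64-bit chunk holding 8-bit mask lanes (function
names are illustrative, not part of the patch):

        #include <stdint.h>
        #include <stdbool.h>
        #include <assert.h>

        /* IOR reduction: any lane active <=> any bit set in the chunk.  */
        bool ior_reduce (uint64_t mask) { return mask != 0; }

        /* AND reduction: all lanes active <=> every bit set, i.e. the value
           equals -1 (the CMN #1 / CSET EQ sequence above).  */
        bool and_reduce (uint64_t mask) { return mask == UINT64_MAX; }

        /* XOR reduction: parity of the number of active lanes; mask each
           lane with 1, sum, and keep bit 0 (the MOVI/AND/ADDV/AND sequence
           above).  */
        bool xor_reduce (uint64_t mask)
        {
          unsigned sum = 0;
          for (int i = 0; i < 64; i += 8)
            sum += (mask >> i) & 1;
          return sum & 1;
        }

        int main (void)
        {
          uint64_t all = UINT64_MAX;           /* every lane all-ones */
          uint64_t one = 0x00000000000000ffu;  /* a single active lane */

          assert (ior_reduce (one) && !ior_reduce (0));
          assert (and_reduce (all) && !and_reduce (one));
          assert (xor_reduce (one) && !xor_reduce (all)); /* 8 lanes -> even */
          return 0;
        }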

gcc/config/aarch64/iterators.md

Lines changed: 3 additions & 0 deletions
@@ -217,6 +217,9 @@
 ;; All Advanced SIMD modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
+;; All Advanced SIMD integer modes
+(define_mode_iterator VALLI [VDQ_BHSI V2DI])
+
 ;; All Advanced SIMD modes suitable for moving, loading, and storing.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
				 V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/
+
+char p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      __builtin_abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      __builtin_abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/
+
+short p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      __builtin_abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      __builtin_abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/
+
+int p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      __builtin_abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      __builtin_abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/
+
+long long p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      __builtin_abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      __builtin_abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/
+
+char p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
