Skip to content

Commit 81fd62d

Browse files
middle-end: Implement preferred_div_as_shifts_over_mult [PR108583]
This now implements a hook preferred_div_as_shifts_over_mult that indicates whether a target prefers that the vectorizer decomposes division as shifts rather than multiplication when possible. In order to be able to use this we need to check whether the current precision has enough bits to do the operation without any of the additions overflowing. We use range information to determine this and only do the operation if we're sure an overflow won't occur. This now uses ranger to do this range check. This seems to work better than vect_get_range_info which uses range_query, but I have not switched the interface of vect_get_range_info over in this PR fix. As Andy said before, initializing a ranger instance is cheap but not free, and if the intention is to call it often during a pass it should be instantiated at pass startup and passed along to the places that need it. This is a big refactoring and doesn't seem right to do in this PR. But we should do it in GCC 14. Currently we only instantiate it after a long series of much cheaper checks. gcc/ChangeLog: PR target/108583 * target.def (preferred_div_as_shifts_over_mult): New. * doc/tm.texi.in: Document it. * doc/tm.texi: Regenerate. * targhooks.cc (default_preferred_div_as_shifts_over_mult): New. * targhooks.h (default_preferred_div_as_shifts_over_mult): New. * tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it. gcc/testsuite/ChangeLog: PR target/108583 * gcc.dg/vect/vect-div-bitmask-4.c: New test. * gcc.dg/vect/vect-div-bitmask-5.c: New test.
1 parent 0b3c630 commit 81fd62d

File tree

8 files changed

+190
-0
lines changed

8 files changed

+190
-0
lines changed

gcc/doc/tm.texi

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6146,6 +6146,12 @@ instruction pattern. There is no need for the hook to handle these two
61466146
implementation approaches itself.
61476147
@end deftypefn
61486148

6149+
@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
6150+
Sometimes it is possible to implement a vector division using a sequence
6151+
of two addition-shift pairs, giving four instructions in total.
6152+
Return true if taking this approach for @var{type} is likely
6153+
to be better than using a sequence involving highpart multiplication.
6154+
Default is false if @code{can_mult_highpart_p}, otherwise true.
61496155
@end deftypefn
61506156

61516157
@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})

gcc/doc/tm.texi.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4175,6 +4175,7 @@ address; but often a machine-dependent strategy can generate better code.
41754175

41764176
@hook TARGET_VECTORIZE_VEC_PERM_CONST
41774177

4178+
@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
41784179

41794180
@hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
41804181

gcc/target.def

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1868,6 +1868,18 @@ correct for most targets.",
18681868
poly_uint64, (const_tree type),
18691869
default_preferred_vector_alignment)
18701870

1871+
/* Returns whether the target has a preference for decomposing divisions using
1872+
shifts rather than multiplies. */
1873+
DEFHOOK
1874+
(preferred_div_as_shifts_over_mult,
1875+
"Sometimes it is possible to implement a vector division using a sequence\n\
1876+
of two addition-shift pairs, giving four instructions in total.\n\
1877+
Return true if taking this approach for @var{type} is likely\n\
1878+
to be better than using a sequence involving highpart multiplication.\n\
1879+
Default is false if @code{can_mult_highpart_p}, otherwise true.",
1880+
bool, (const_tree type),
1881+
default_preferred_div_as_shifts_over_mult)
1882+
18711883
/* Return true if vector alignment is reachable (by peeling N
18721884
iterations) for the given scalar type. */
18731885
DEFHOOK

gcc/targhooks.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1488,6 +1488,15 @@ default_preferred_vector_alignment (const_tree type)
14881488
return TYPE_ALIGN (type);
14891489
}
14901490

1491+
/* The default implementation of
1492+
TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.
Returns the negation of can_mult_highpart_p for the mode and signedness
of TYPE, i.e. the shift-based decomposition is preferred by default only
when can_mult_highpart_p answers false for TYPE. */
1493+
1494+
bool
1495+
default_preferred_div_as_shifts_over_mult (const_tree type)
1496+
{
1497+
return !can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));
1498+
}
1499+
14911500
/* By default assume vectors of element TYPE require a multiple of the natural
14921501
alignment of TYPE. TYPE is naturally aligned if IS_PACKED is false. */
14931502
bool

gcc/targhooks.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
5353
extern unsigned HOST_WIDE_INT default_shift_truncation_mask
5454
(machine_mode);
5555
extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
56+
extern bool default_preferred_div_as_shifts_over_mult
57+
(const_tree);
5658
extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
5759

5860
extern tree default_stack_protect_guard (void);
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/* { dg-require-effective-target vect_int } */
2+
3+
#include <stdint.h>
4+
#include "tree-vect.h"
5+
6+
typedef unsigned __attribute__((__vector_size__ (16))) V;
7+
8+
/* Divide the vector V by the scalar I and return the quotient vector.
   The function is marked noinline and noclone, presumably so that the
   division is compiled on its own rather than being folded into the
   constant arguments supplied by main.  */
static __attribute__((__noinline__)) __attribute__((__noclone__)) V
9+
foo (V v, unsigned short i)
10+
{
11+
v /= i;
12+
return v;
13+
}
14+
15+
/* Run foo on a vector whose elements are all 0xffffffff with divisor
   0xffff and abort unless every element of the result is 0x00010001
   (0xffffffff / 0xffff == 0x00010001).  Returns 0 on success.  */
int
16+
main (void)
17+
{
18+
V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
19+
/* sizeof (v) / sizeof (v[0]) is the number of elements in V.  */
for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
20+
if (v[i] != 0x00010001)
21+
__builtin_abort ();
22+
return 0;
23+
}
24+
25+
/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/* { dg-require-effective-target vect_int } */
2+
3+
#include <stdint.h>
4+
#include <stdio.h>
5+
#include "tree-vect.h"
6+
7+
#define N 50
8+
#define TYPE uint8_t
9+
10+
#ifndef DEBUG
11+
#define DEBUG 0
12+
#endif
13+
14+
#define BASE ((TYPE) -1 < 0 ? -126 : 4)
15+
16+
17+
/* Replace each of the first N elements of PIXEL with
   (PIXEL[i] + LEVEL) / 0xff.  This copy is compiled with optimize("O1");
   main compares its output against fun2, which has the same body but is
   compiled with optimize("O3").  */
__attribute__((noipa, noinline, optimize("O1")))
18+
void fun1(TYPE* restrict pixel, TYPE level, int n)
19+
{
20+
for (int i = 0; i < n; i+=1)
21+
pixel[i] = (pixel[i] + level) / 0xff;
22+
}
23+
24+
/* Replace each of the first N elements of PIXEL with
   (PIXEL[i] + LEVEL) / 0xff.  Same body as fun1, but compiled with
   optimize("O3"); main checks that both functions produce identical
   results.  */
__attribute__((noipa, noinline, optimize("O3")))
26+
void fun2(TYPE* restrict pixel, TYPE level, int n)
27+
{
28+
for (int i = 0; i < n; i+=1)
29+
pixel[i] = (pixel[i] + level) / 0xff;
30+
}
30+
31+
/* Fill two arrays with identical data, run fun1 on one and fun2 on the
   other with the same arguments, and abort if any element differs.
   Returns 0 when the two results agree.  */
int main ()
32+
{
33+
TYPE a[N];
34+
TYPE b[N];
35+
36+
/* Give both arrays the same contents, BASE + i * 13 stored as TYPE.  */
for (int i = 0; i < N; ++i)
37+
{
38+
a[i] = BASE + i * 13;
39+
b[i] = BASE + i * 13;
40+
if (DEBUG)
41+
printf ("%d: 0x%x\n", i, a[i]);
42+
}
43+
44+
/* Same level (N / 2) and length for both variants.  */
fun1 (a, N / 2, N);
45+
fun2 (b, N / 2, N);
46+
47+
/* The O1 and O3 variants must agree element by element.  */
for (int i = 0; i < N; ++i)
48+
{
49+
if (DEBUG)
50+
printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
51+
52+
if (a[i] != b[i])
53+
__builtin_abort ();
54+
}
55+
return 0;
56+
}
57+
58+
/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */

gcc/tree-vect-patterns.cc

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3934,6 +3934,83 @@ vect_recog_divmod_pattern (vec_info *vinfo,
39343934
return pattern_stmt;
39353935
}
39363936

3937+
if ((cst = uniform_integer_cst_p (oprnd1))
3938+
&& TYPE_UNSIGNED (itype)
3939+
&& rhs_code == TRUNC_DIV_EXPR
3940+
&& vectype
3941+
&& targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))
3942+
{
3943+
/* We can use the relationship:
3944+
3945+
x // N == ((x+N+2) // (N+1) + x) // (N+1) for 0 <= x < N(N+3)
3946+
3947+
to optimize cases where N+1 is a power of 2, and where // (N+1)
3948+
is therefore a shift right. When operating in modes that are
3949+
multiples of a byte in size, there are two cases:
3950+
3951+
(1) N(N+3) is not representable, in which case the question
3952+
becomes whether the replacement expression overflows.
3953+
It is enough to test that x+N+2 does not overflow,
3954+
i.e. that x < MAX-(N+1).
3955+
3956+
(2) N(N+3) is representable, in which case it is the (only)
3957+
bound that we need to check.
3958+
3959+
??? For now we just handle the case where // (N+1) is a shift
3960+
right by half the precision, since some architectures can
3961+
optimize the associated addition and shift combinations
3962+
into single instructions. */
3963+
3964+
auto wcst = wi::to_wide (cst);
3965+
int pow = wi::exact_log2 (wcst + 1);
3966+
if (pow == prec / 2)
3967+
{
3968+
gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
3969+
3970+
gimple_ranger ranger;
3971+
int_range_max r;
3972+
3973+
/* Check that no overflow will occur. If we don't have range
3974+
information we can't perform the optimization. */
3975+
3976+
if (ranger.range_of_expr (r, oprnd0, stmt))
3977+
{
3978+
wide_int max = r.upper_bound ();
3979+
wide_int one = wi::shwi (1, prec);
3980+
wide_int adder = wi::add (one, wi::lshift (one, pow));
3981+
wi::overflow_type ovf;
3982+
wi::add (max, adder, UNSIGNED, &ovf);
3983+
if (ovf == wi::OVF_NONE)
3984+
{
3985+
*type_out = vectype;
3986+
tree tadder = wide_int_to_tree (itype, adder);
3987+
tree rshift = wide_int_to_tree (itype, pow);
3988+
3989+
tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
3990+
gassign *patt1
3991+
= gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
3992+
append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
3993+
3994+
tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
3995+
patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
3996+
rshift);
3997+
append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
3998+
3999+
tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
4000+
patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
4001+
oprnd0);
4002+
append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
4003+
4004+
tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
4005+
pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
4006+
new_lhs3, rshift);
4007+
4008+
return pattern_stmt;
4009+
}
4010+
}
4011+
}
4012+
}
4013+
39374014
if (prec > HOST_BITS_PER_WIDE_INT
39384015
|| integer_zerop (oprnd1))
39394016
return NULL;

0 commit comments

Comments
 (0)