
Commit f23dc72

AArch64: Update div-bitmask to implement new optab instead of target hook [PR108583]
This replaces the custom division hook with just an implementation through
add_highpart.  For NEON we implement the add highpart (addition + extraction
of the upper highpart of the register in the same precision) as ADD + LSR.

This representation allows us to easily optimize the sequence using existing
sequences.  This gets us a pretty decent sequence using SRA:

        umull   v1.8h, v0.8b, v3.8b
        umull2  v0.8h, v0.16b, v3.16b
        add     v5.8h, v1.8h, v2.8h
        add     v4.8h, v0.8h, v2.8h
        usra    v1.8h, v5.8h, 8
        usra    v0.8h, v4.8h, 8
        uzp2    v1.16b, v1.16b, v0.16b

To get the optimal sequence, however, we match (a + ((b + c) >> n)) where n is
half the precision of the mode of the operation into addhn + uaddw, which is a
generally good optimization on its own and gets us back to:

.L4:
        ldr     q0, [x3]
        umull   v1.8h, v0.8b, v5.8b
        umull2  v0.8h, v0.16b, v5.16b
        addhn   v3.8b, v1.8h, v4.8h
        addhn   v2.8b, v0.8h, v4.8h
        uaddw   v1.8h, v1.8h, v3.8b
        uaddw   v0.8h, v0.8h, v2.8b
        uzp2    v1.16b, v1.16b, v0.16b
        str     q1, [x3], 16
        cmp     x3, x4
        bne     .L4

For SVE2 we optimize the initial sequence to the same ADD + LSR, which gets us:

.L3:
        ld1b    z0.h, p0/z, [x0, x3]
        mul     z0.h, p1/m, z0.h, z2.h
        add     z1.h, z0.h, z3.h
        usra    z0.h, z1.h, #8
        lsr     z0.h, z0.h, #8
        st1b    z0.h, p0, [x0, x3]
        inch    x3
        whilelo p0.h, w3, w2
        b.any   .L3
.L1:
        ret

and to get the optimal sequence I match (a + b) >> n (same constraint on n) to
addhnb, which gets us to:

.L3:
        ld1b    z0.h, p0/z, [x0, x3]
        mul     z0.h, p1/m, z0.h, z2.h
        addhnb  z1.b, z0.h, z3.h
        addhnb  z0.b, z0.h, z1.h
        st1b    z0.h, p0, [x0, x3]
        inch    x3
        whilelo p0.h, w3, w2
        b.any   .L3

There are multiple RTL representations possible for these optimizations; I did
not represent them using a zero_extend because we seem very inconsistent about
this in the backend.  Since they are unspecs we won't match them from vector
ops anyway.  I figured maintainers would prefer this, but my maintainer ouija
board is still out for repairs :)

There are no new tests, as new correctness tests were added to the mid-end and
the existing codegen tests for this already exist.

gcc/ChangeLog:

	PR target/108583
	* config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): Remove.
	(*bitmask_shift_plus<mode>): New.
	* config/aarch64/aarch64-sve2.md (*bitmask_shift_plus<mode>): New.
	(@aarch64_bitmask_udiv<mode>3): Remove.
	* config/aarch64/aarch64.cc
	(aarch64_vectorize_can_special_div_by_constant,
	TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST): Remove.
	(TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT,
	aarch64_vectorize_preferred_div_as_shifts_over_mult): New.
1 parent 81fd62d commit f23dc72

3 files changed: +52 -137 lines changed
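Background for the diffs below: the comment being removed from aarch64-simd.md describes the identity behind this codegen, namely that dividing a widened byte product by 0xff can be done with two additions and two shifts. As a quick illustration only (not part of the commit; the standalone program below is hypothetical and written for this page), the identity can be checked exhaustively for all byte products:

#include <assert.h>
#include <stdint.h>

/* Illustrative check: for every x that is a product of two bytes,
   x / 0xff == (x + ((x + 257) >> 8)) >> 8 when evaluated in the wider
   16-bit type, which is the identity the removed comment describes.  */
int
main (void)
{
  for (unsigned a = 0; a <= 255; a++)
    for (unsigned b = 0; b <= 255; b++)
      {
        uint16_t x = (uint16_t) (a * b);
        uint16_t via_shifts
          = (uint16_t) ((x + ((uint16_t) (x + 257) >> 8)) >> 8);
        assert (via_shifts == x / 0xff);
      }
  return 0;
}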

gcc/config/aarch64/aarch64-simd.md

Lines changed: 19 additions & 52 deletions

@@ -4867,60 +4867,27 @@
 }
 )
 
-;; div optimizations using narrowings
-;; we can do the division e.g. shorts by 255 faster by calculating it as
-;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
-;; double the precision of x.
-;;
-;; If we imagine a short as being composed of two blocks of bytes then
-;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
-;; adding 1 to each sub component:
-;;
-;;          short value of 16-bits
-;; ┌──────────────┬────────────────┐
-;; │              │                │
-;; └──────────────┴────────────────┘
-;;   8-bit part1 ▲  8-bit part2   ▲
-;;               │                │
-;;               │                │
-;;              +1               +1
-;;
-;; after the first addition, we have to shift right by 8, and narrow the
-;; results back to a byte.  Remember that the addition must be done in
-;; double the precision of the input.  Since 8 is half the size of a short
-;; we can use a narrowing halfing instruction in AArch64, addhn which also
-;; does the addition in a wider precision and narrows back to a byte.  The
-;; shift itself is implicit in the operation as it writes back only the top
-;; half of the result. i.e. bits 2*esize-1:esize.
-;;
-;; Since we have narrowed the result of the first part back to a byte, for
-;; the second addition we can use a widening addition, uaddw.
-;;
-;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
-;;
-;; The shift is later optimized by combine to a uzp2 with movi #0.
-(define_expand "@aarch64_bitmask_udiv<mode>3"
-  [(match_operand:VQN 0 "register_operand")
-   (match_operand:VQN 1 "register_operand")
-   (match_operand:VQN 2 "immediate_operand")]
+;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector
+(define_insn_and_split "*bitmask_shift_plus<mode>"
+  [(set (match_operand:VQN 0 "register_operand" "=&w")
+        (plus:VQN
+          (lshiftrt:VQN
+            (plus:VQN (match_operand:VQN 1 "register_operand" "w")
+                      (match_operand:VQN 2 "register_operand" "w"))
+            (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top" ""))
+          (match_operand:VQN 4 "register_operand" "w")))]
   "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(const_int 0)]
 {
-  unsigned HOST_WIDE_INT size
-    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
-  rtx elt = unwrap_const_vec_duplicate (operands[2]);
-  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
-    FAIL;
-
-  rtx addend = gen_reg_rtx (<MODE>mode);
-  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
-  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
-  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
-  rtx tmp2 = gen_reg_rtx (<MODE>mode);
-  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
-  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
-  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
-  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
-  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  rtx tmp;
+  if (can_create_pseudo_p ())
+    tmp = gen_reg_rtx (<VNARROWQ>mode);
+  else
+    tmp = gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0]));
+  emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], operands[2]));
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], tmp));
   DONE;
 })
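To make the split above easier to read, here is a rough Advanced SIMD intrinsics analogue of the addhn + uaddw pair it emits, assuming uint16x8_t inputs and n == 8. The helper name is invented for illustration; it is not code from the commit.

#include <arm_neon.h>

/* Sketch of ((a + b) >> 8) + c on 16-bit lanes, as the new
   define_insn_and_split expands it: addhn adds the 16-bit lanes and
   keeps the high byte of each (truncating) sum, i.e. (a + b) >> 8
   narrowed to bytes; uaddw then zero-extends that narrow vector and
   adds it to c.  */
static inline uint16x8_t
shift_add_highpart (uint16x8_t a, uint16x8_t b, uint16x8_t c)
{
  uint8x8_t hi = vaddhn_u16 (a, b);   /* (a + b) >> 8, narrowed to bytes  */
  return vaddw_u8 (c, hi);            /* c + zero-extended high halves    */
}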

gcc/config/aarch64/aarch64-sve2.md

Lines changed: 16 additions & 41 deletions

@@ -71,7 +71,6 @@
 ;; ---- [INT] Reciprocal approximation
 ;; ---- [INT<-FP] Base-2 logarithm
 ;; ---- [INT] Polynomial multiplication
-;; ---- [INT] Misc optab implementations
 ;;
 ;; == Permutation
 ;; ---- [INT,FP] General permutes
@@ -1600,6 +1599,22 @@
   "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
 )
 
+;; Optimize ((a + b) >> n) where n is half the bitsize of the vector
+(define_insn "*bitmask_shift_plus<mode>"
+  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
+        (unspec:SVE_FULL_HSDI
+          [(match_operand:<VPRED> 1)
+           (lshiftrt:SVE_FULL_HSDI
+             (plus:SVE_FULL_HSDI
+               (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+               (match_operand:SVE_FULL_HSDI 3 "register_operand" "w"))
+             (match_operand:SVE_FULL_HSDI 4
+                "aarch64_simd_shift_imm_vec_exact_top" ""))]
+          UNSPEC_PRED_X))]
+  "TARGET_SVE2"
+  "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Narrowing right shifts
 ;; -------------------------------------------------------------------------
@@ -2313,46 +2328,6 @@
   "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
 )
 
-;; -------------------------------------------------------------------------
-;; ---- [INT] Misc optab implementations
-;; -------------------------------------------------------------------------
-;; Includes:
-;; - aarch64_bitmask_udiv
-;; -------------------------------------------------------------------------
-
-;; div optimizations using narrowings
-;; we can do the division e.g. shorts by 255 faster by calculating it as
-;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
-;; double the precision of x.
-;;
-;; See aarch64-simd.md for bigger explanation.
-(define_expand "@aarch64_bitmask_udiv<mode>3"
-  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
-   (match_operand:SVE_FULL_HSDI 1 "register_operand")
-   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
-  "TARGET_SVE2"
-{
-  unsigned HOST_WIDE_INT size
-    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
-  rtx elt = unwrap_const_vec_duplicate (operands[2]);
-  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
-    FAIL;
-
-  rtx addend = gen_reg_rtx (<MODE>mode);
-  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
-  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
-  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
-  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
-  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
-                              addend));
-  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
-                              lowpart_subreg (<MODE>mode, tmp1,
-                                              <VNARROW>mode)));
-  emit_move_insn (operands[0],
-                  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
-  DONE;
-})
-
 ;; =========================================================================
 ;; == Permutation
 ;; =========================================================================

gcc/config/aarch64/aarch64.cc

Lines changed: 17 additions & 44 deletions

@@ -3847,6 +3847,19 @@ aarch64_vectorize_related_mode (machine_mode vector_mode,
   return default_vectorize_related_mode (vector_mode, element_mode, nunits);
 }
 
+/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+static bool
+aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
+{
+  machine_mode mode = TYPE_MODE (type);
+  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+  bool sve_p = (vec_flags & VEC_ANY_SVE);
+  bool simd_p = (vec_flags & VEC_ADVSIMD);
+
+  return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
+}
+
 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
    prefer to use the first arithmetic operand as the else value if
    the else value doesn't matter, since that exactly matches the SVE
@@ -24361,46 +24374,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 
   return ret;
 }
-
-/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
-
-bool
-aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
-                                               tree vectype, wide_int cst,
-                                               rtx *output, rtx in0, rtx in1)
-{
-  if (code != TRUNC_DIV_EXPR
-      || !TYPE_UNSIGNED (vectype))
-    return false;
-
-  machine_mode mode = TYPE_MODE (vectype);
-  unsigned int flags = aarch64_classify_vector_mode (mode);
-  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
-    return false;
-
-  int pow = wi::exact_log2 (cst + 1);
-  auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
-  /* SVE actually has a div operator, we may have gotten here through
-     that route.  */
-  if (pow != (int) (element_precision (vectype) / 2)
-      || insn_code == CODE_FOR_nothing)
-    return false;
-
-  /* We can use the optimized pattern.  */
-  if (in0 == NULL_RTX && in1 == NULL_RTX)
-    return true;
-
-  gcc_assert (output);
-
-  expand_operand ops[3];
-  create_output_operand (&ops[0], *output, mode);
-  create_input_operand (&ops[1], in0, mode);
-  create_fixed_operand (&ops[2], in1);
-  expand_insn (insn_code, 3, ops);
-  *output = ops[0].value;
-  return true;
-}
-
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
@@ -27902,13 +27875,13 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_MAX_ANCHOR_OFFSET
 #define TARGET_MAX_ANCHOR_OFFSET 4095
 
+#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
+#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
+  aarch64_vectorize_preferred_div_as_shifts_over_mult
+
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
 
-#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
-#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
-  aarch64_vectorize_can_special_div_by_constant
-
 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
   aarch64_vectorize_preferred_vector_alignment
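For context, the new hook is what lets the vectorizer pick the shift/narrowing-add expansion for loops of the shape below, modelled on the existing codegen tests and the assembly quoted in the commit message (the function itself is an illustrative sketch, not a verbatim testsuite file):

#include <stdint.h>

/* Scale each 8-bit pixel by an 8-bit level and renormalise by 0xff.
   The vectorizer widens the product to 16 bits; with the hook above,
   the division by 0xff is expanded with shifts and narrowing adds
   instead of a multiply-highpart sequence.  */
void
draw_bitmap (uint8_t *restrict pixel, uint8_t level, int n)
{
  for (int i = 0; i < n; i++)
    pixel[i] = (pixel[i] * level) / 0xff;
}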
