aarch64: Avoid redundant MOVs in AdvSIMD F32 logs
Since the last operation is destructive, the first argument to the FMA
also has to be the first argument to the special-case in order to
avoid unnecessary MOVs. Reorder arguments and adjust special-case
bounds to facilitate this.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
joeramsay authored and Wilco1 committed Sep 9, 2024
1 parent 43669fc commit 8b09af5
Showing 3 changed files with 72 additions and 45 deletions.
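
The bound adjustment used in all three files can be illustrated with a minimal scalar sketch. It is not part of the commit: the asuint32/asfloat helpers, the function names and the test values are illustrative, and only the constants 0x3f2aaaab, 0x00800000 and 0x7f00 are taken from the diffs below. The point is that subtracting the offset from both the input and the lower bound leaves the unsigned comparison unchanged modulo 2^32, so the special-case test can run on u_off directly, and x is recovered exactly by adding the offset back:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define OFF      0x3f2aaaabu /* asuint32 (0.666667f), the log offset.  */
#define MIN_NORM 0x00800000u /* Smallest positive normal float.  */

static uint32_t
asuint32 (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

static float
asfloat (uint32_t u)
{
  float x;
  memcpy (&x, &u, sizeof x);
  return x;
}

/* Old check: top 16 bits of (asuint32 (x) - MIN_NORM) >= 0x7f00, i.e. x is
   subnormal, zero, negative, infinite or NaN.  */
static int
special_old (float x)
{
  return (uint16_t) ((asuint32 (x) - MIN_NORM) >> 16) >= 0x7f00;
}

/* New check: the offset has already been subtracted (u_off is needed for the
   exponent/mantissa split anyway), so the lower bound is shifted by the same
   amount; modulo 2^32 the two offsets cancel and the result is identical.  */
static int
special_new (uint32_t u_off)
{
  return (uint16_t) ((u_off - (MIN_NORM - OFF)) >> 16) >= 0x7f00;
}

int
main (void)
{
  float tests[] = { 1.0f, 0x1p-130f, 0.0f, -1.0f, 3.5f, 0x1p127f };
  for (int i = 0; i < 6; i++)
    {
      uint32_t u_off = asuint32 (tests[i]) - OFF;
      assert (special_old (tests[i]) == special_new (u_off));
      /* x is recovered exactly by adding the offset back on.  */
      assert (asfloat (u_off + OFF) == tests[i]);
    }
  return 0;
}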
sysdeps/aarch64/fpu/log10f_advsimd.c (24 additions, 14 deletions)

@@ -22,11 +22,11 @@
 
 static const struct data
 {
-  uint32x4_t min_norm;
+  uint32x4_t off, offset_lower_bound;
   uint16x8_t special_bound;
+  uint32x4_t mantissa_mask;
   float32x4_t poly[8];
   float32x4_t inv_ln10, ln2;
-  uint32x4_t off, mantissa_mask;
 } data = {
   /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
      [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
@@ -35,18 +35,22 @@ static const struct data
             V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
   .ln2 = V4 (0x1.62e43p-1f),
   .inv_ln10 = V4 (0x1.bcb7b2p-2f),
-  .min_norm = V4 (0x00800000),
-  .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
   .off = V4 (0x3f2aaaab), /* 0.666667. */
   .mantissa_mask = V4 (0x007fffff),
 };
 
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
-              uint16x4_t cmp)
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+              uint16x4_t cmp, const struct data *d)
 {
   /* Fall back to scalar code. */
-  return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
+  return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+                     vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
 }
 
 /* Fast implementation of AdvSIMD log10f,
@@ -58,15 +62,21 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
-  uint32x4_t u = vreinterpretq_u32_f32 (x);
-  uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
-                                 vget_low_u16 (d->special_bound));
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler. */
+  uint32x4_t u_off = vreinterpretq_u32_f32 (x);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
-  u = vsubq_u32 (u, d->off);
+  u_off = vsubq_u32 (u_off, d->off);
   float32x4_t n = vcvtq_f32_s32 (
-      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
-  u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+  uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+                                 vget_low_u16 (d->special_bound));
+
+  uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
   float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
 
   /* y = log10(1+r) + n * log10(2). */
@@ -77,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
   y = vmulq_f32 (y, d->inv_ln10);
 
   if (__glibc_unlikely (v_any_u16h (special)))
-    return special_case (x, y, poly, r2, special);
+    return special_case (y, u_off, poly, r2, special, d);
   return vfmaq_f32 (y, poly, r2);
 }
 libmvec_hidden_def (V_NAME_F1 (log10))
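
For reference, the reason u_off alone is enough for the rest of the computation is that the 2^n * (1+r) split in the comment above can be taken directly from the offset representation: an arithmetic shift recovers n, and masking the low 23 bits before adding the offset back recovers 1+r. A minimal scalar sketch of this decomposition, again not part of the commit (helpers and test values are illustrative; the constants come from the diff):

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

#define OFF           0x3f2aaaabu /* asuint32 (0.666667f).  */
#define MANTISSA_MASK 0x007fffffu

static uint32_t
asuint32 (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

static float
asfloat (uint32_t u)
{
  float x;
  memcpy (&x, &u, sizeof x);
  return x;
}

int
main (void)
{
  float tests[] = { 0x1p-126f, 0.5f, 0.666667f, 1.0f, 3.0f, 0x1.fffffep127f };
  for (int i = 0; i < 6; i++)
    {
      uint32_t u_off = asuint32 (tests[i]) - OFF;
      /* Exponent: arithmetic shift of the offset representation (the
         "signextend" step in the diff; relies on >> of a negative value
         being arithmetic, as vshrq_n_s32 is).  */
      int n = (int32_t) u_off >> 23;
      /* Mantissa: mask the low 23 bits and add the offset back on, giving
         1+r in roughly (2/3, 4/3].  */
      float onepr = asfloat ((u_off & MANTISSA_MASK) + OFF);
      assert (onepr > 0.66f && onepr < 1.34f);
      /* Exact reconstruction for positive normal x.  */
      assert (ldexpf (onepr, n) == tests[i]);
    }
  return 0;
}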
sysdeps/aarch64/fpu/log2f_advsimd.c (24 additions, 14 deletions)

@@ -22,9 +22,9 @@
 
 static const struct data
 {
-  uint32x4_t min_norm;
+  uint32x4_t off, offset_lower_bound;
   uint16x8_t special_bound;
-  uint32x4_t off, mantissa_mask;
+  uint32x4_t mantissa_mask;
   float32x4_t poly[9];
 } data = {
   /* Coefficients generated using Remez algorithm approximate
@@ -34,18 +34,22 @@ static const struct data
             V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
             V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
             V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
-  .min_norm = V4 (0x00800000),
-  .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
   .off = V4 (0x3f2aaaab), /* 0.666667. */
   .mantissa_mask = V4 (0x007fffff),
 };
 
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
-              uint16x4_t cmp)
+special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
+              uint16x4_t cmp, const struct data *d)
 {
   /* Fall back to scalar code. */
-  return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
+  return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+                     vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
 }
 
 /* Fast implementation for single precision AdvSIMD log2,
@@ -56,23 +60,29 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
-  uint32x4_t u = vreinterpretq_u32_f32 (x);
-  uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
-                                 vget_low_u16 (d->special_bound));
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler. */
+  uint32x4_t u_off = vreinterpretq_u32_f32 (x);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
-  u = vsubq_u32 (u, d->off);
+  u_off = vsubq_u32 (u_off, d->off);
   float32x4_t n = vcvtq_f32_s32 (
-      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
-  u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+  uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+                                 vget_low_u16 (d->special_bound));
+
+  uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
   float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
 
   /* y = log2(1+r) + n. */
   float32x4_t r2 = vmulq_f32 (r, r);
   float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
 
   if (__glibc_unlikely (v_any_u16h (special)))
-    return special_case (x, n, p, r, special);
+    return special_case (n, u_off, p, r, special, d);
   return vfmaq_f32 (n, p, r);
 }
 libmvec_hidden_def (V_NAME_F1 (log2))
sysdeps/aarch64/fpu/logf_advsimd.c (24 additions, 17 deletions)

@@ -21,53 +21,60 @@
 
 static const struct data
 {
-  uint32x4_t min_norm;
+  uint32x4_t off, offset_lower_bound;
   uint16x8_t special_bound;
+  uint32x4_t mantissa_mask;
   float32x4_t poly[7];
-  float32x4_t ln2, tiny_bound;
-  uint32x4_t off, mantissa_mask;
+  float32x4_t ln2;
 } data = {
   /* 3.34 ulp error. */
   .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
             V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
             V4 (-0x1.ffffc8p-2f) },
   .ln2 = V4 (0x1.62e43p-1f),
-  .tiny_bound = V4 (0x1p-126),
-  .min_norm = V4 (0x00800000),
-  .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
   .off = V4 (0x3f2aaaab), /* 0.666667. */
   .mantissa_mask = V4 (0x007fffff)
 };
 
 #define P(i) d->poly[7 - i]
 
 static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
-              uint16x4_t cmp)
+special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
+              uint16x4_t cmp, const struct data *d)
 {
   /* Fall back to scalar code. */
-  return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+  return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+                     vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
 }
 
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
   float32x4_t n, p, q, r, r2, y;
-  uint32x4_t u;
+  uint32x4_t u, u_off;
   uint16x4_t cmp;
 
-  u = vreinterpretq_u32_f32 (x);
-  cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
-                  vget_low_u16 (d->special_bound));
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler. */
+  u_off = vreinterpretq_u32_f32 (x);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
-  u = vsubq_u32 (u, d->off);
+  u_off = vsubq_u32 (u_off, d->off);
   n = vcvtq_f32_s32 (
-      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
-  u = vandq_u32 (u, d->mantissa_mask);
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+  u = vandq_u32 (u_off, d->mantissa_mask);
   u = vaddq_u32 (u, d->off);
   r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
 
+  cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+                  vget_low_u16 (d->special_bound));
+
   /* y = log(1+r) + n*ln2. */
   r2 = vmulq_f32 (r, r);
   /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
@@ -80,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
   p = vfmaq_f32 (r, d->ln2, n);
 
   if (__glibc_unlikely (v_any_u16h (cmp)))
-    return special_case (x, y, r2, p, cmp);
+    return special_case (p, u_off, y, r2, cmp, d);
   return vfmaq_f32 (p, y, r2);
 }
 libmvec_hidden_def (V_NAME_F1 (log))
