Skip to content

[vectorization] GCC generates better code for a loop body with control flow #64292

Open
@zhongsir1

Description

@zhongsir1

test: https://godbolt.org/z/7Y6a4EvW9

/*
 * Reproducer: a loop whose body picks one of two stores with goto-based
 * control flow.
 *
 * For each i in [0, LEN_1D-1):
 *   - if c[i] <  0:  b[i] = a[i] + d[i] * d[i]   (label L20)
 *   - otherwise:     a[i] = c[i] + d[i] * e[i]
 *
 * a, b, c, d, e, LEN_1D and real_t are declared outside this snippet.
 * The goto shape is intentional: it is the control flow the vectorizer is
 * being asked to handle, so it must not be rewritten as if/else.
 */
void s1161(void)
{
        /* Note the bound: the last element (index LEN_1D-1) is not visited. */
        for (int i = 0; i < LEN_1D-1; ++i) {
            /* Negative c[i] selects the b[] update at L20. */
            if (c[i] < (real_t)0.) {
                goto L20;
            }
            /* Non-negative path (also taken when the comparison is false,
               e.g. for NaN): update a[] and skip the L20 store. */
            a[i] = c[i] + d[i] * e[i];
            goto L10;
L20:
            /* Reads a[i] but writes b[i]; a[i] is left unchanged on this path. */
            b[i] = a[i] + d[i] * d[i];
L10:
            /* Empty statement: common join point for both paths. */
            ;
        }
}
  • gcc: generates an SVE loop
// GCC output: predicated SVE loop for s1161. Both branches of the C loop body
// are if-converted into masked FMLA + masked stores, so there is no branch
// inside the body.
// Register roles below are inferred by matching the instructions against the
// C source (setup code is not shown): x0 = element index, p7 = loop predicate
// on entry, x9/x8/x7/x5/x6 = bases of c/d/a/e/b respectively.
.L2:
  lsl x1, x0, 3                        // x1 = index * 8 (byte offset of a double)
  ld1d z31.d, p7/z, [x9, x0, lsl 3]    // z31 = c[i..]  (value compared against 0 below)
  ld1d z29.d, p7/z, [x8, x0, lsl 3]    // z29 = d[i..]  (multiplicand in both FMLAs)
  add x2, x7, x1                       // x2 = &a[i]    (loaded below, stored under p6)
  fcmlt p6.d, p7/z, z31.d, #0.0        // p6 = active lanes where c < 0
  ld1d z30.d, p7/z, [x2]               // z30 = a[i..]
  not p6.b, p7/z, p6.b                 // p6 = active lanes where !(c < 0)
  add x4, x5, x1                       // x4 = &e[i]
  add x1, x6, x1                       // x1 = &b[i]    (x1 no longer holds the byte offset)
  ld1d z28.d, p7/z, [x4]               // z28 = e[i..]
  fcmlt p7.d, p7/z, z31.d, #0.0        // p7 = active lanes where c < 0; overwrites the loop
                                       // predicate, which whilelo regenerates below
  fmla z31.d, p6/m, z29.d, z28.d       // lanes !(c<0): z31 = c + d*e (other lanes unchanged)
  fmla z30.d, p7/m, z29.d, z29.d       // lanes  (c<0): z30 = a + d*d (other lanes unchanged)
  st1d z31.d, p6, [x2]                 // a[i] = c[i] + d[i]*e[i]  where !(c < 0)
  st1d z30.d, p7, [x1]                 // b[i] = a[i] + d[i]*d[i]  where   c < 0
  add x0, x0, x10                      // advance index by x10 (presumably elements per vector; set outside this excerpt)
  whilelo p7.d, w0, w3                 // p7 = lanes whose index is still below the bound in w3 (unsigned)
  b.any .L2                            // loop while at least one lane is active
  • llvm: fails to vectorize the loop
// LLVM output: scalar loop for s1161, one double per iteration with a real
// branch on the sign of c[i] (not vectorized).
// Register roles below are inferred by matching the instructions against the
// C source (setup code is not shown): x8 = byte offset, x9 = end byte offset,
// x10/x11/x12/x13/x14 = bases of c/a/d/b/e respectively.
.LBB0_1: // in Loop: Header=BB0_2 Depth=1
  // c[i] < 0 path (label L20 in the C source)
  ldr d0, [x11, x8]         // d0 = a[i]
  mov x15, x13              // x15 = b base; its consumer is not visible in this excerpt
  ldr d1, [x12, x8]         // d1 = d[i]
  fmadd d0, d1, d1, d0      // d0 = d[i]*d[i] + a[i]
  str d0, [x13, x8]         // b[i] = d0
  add x8, x8, #8            // next element (8 bytes per double)
  cmp x8, x9
  b.eq .LBB0_4              // reached the end offset: leave the loop (.LBB0_4 is outside this excerpt)
.LBB0_2: // =>This Inner Loop Header: Depth=1
  ldr d0, [x10, x8]         // d0 = c[i]
  fcmp d0, #0.0
  b.mi .LBB0_1              // c[i] < 0 -> take the b[] update path
// %bb.3: // in Loop: Header=BB0_2 Depth=1
  // !(c[i] < 0) path; d0 still holds c[i]
  ldr d1, [x12, x8]         // d1 = d[i]
  mov x15, x11              // x15 = a base; its consumer is not visible in this excerpt
  ldr d2, [x14, x8]         // d2 = e[i]
  fmadd d0, d1, d2, d0      // d0 = d[i]*e[i] + c[i]
  str d0, [x11, x8]         // a[i] = d0
  add x8, x8, #8            // next element (8 bytes per double)
  cmp x8, x9
  b.ne .LBB0_2              // not at the end offset yet: continue with the loop header

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions