Skip to content

[X86] Unnecessary sequences of 8 GPR movs back and forth #136574

Open
@dzaima

Description

@dzaima

This C code (heavily reduced from real-world code implementing SIMD transpose), compiled via -O3 -march=haswell:

#include <immintrin.h>
#include <stdint.h>
#define load(x) _mm256_loadu_si256((void *)(x))
#define store(x, v) _mm256_storeu_si256((void *)(x), v)
/*
 * Reduced reproducer (cut down from a SIMD transpose routine); it is not
 * meant to compute anything useful. The outer loop has no exit, so the
 * function never returns: each time the inner loop finishes (i >= x) it is
 * simply restarted with i = 0.
 *
 *   p1      - destination base pointer for the two 32-byte stores
 *   p2, p3  - source base pointers for the 32-byte loads
 *   x       - inner-loop trip count; also the fallback value of j
 *   y       - byte stride used to form the load offsets
 *   z       - byte stride used to form the store offsets
 *
 * NOTE(review): the exact statement order and expression shapes presumably
 * matter for reproducing the reported codegen - avoid restructuring.
 */
void f(char *p1, char *p2, char *p3, uint64_t x, uint64_t y, uint64_t z) {
  while (1) {
    uint64_t i = 0;
    while (1) {
      if (i >= x)
        break;
      /* j is 4*i unless that product is zero, in which case it is x. */
      uint64_t j = 4 * i ? 4 * i : x;
      /* All loads/stores go through the unaligned 256-bit load/store macros. */
      __m256i a = load(p2 + y * 5);

      /* b, c and d are each the sum of two loaded vectors (`+` applied
         directly to __m256i relies on the compiler's vector extension). */
      __m256i l0 = load(p2);
      __m256i l1 = load(p2 + j + 3 * y);
      __m256i b = l0 + l1;

      __m256i l2 = load(p2 + y);
      __m256i l3 = load(p3 + y + j);
      __m256i c = l2 + l3;

      __m256i l4 = load(p3 + 6 * y);
      __m256i l5 = load(p2 + j + 7 * y);
      __m256i d = l4 + l5;

      /* Control 49 (0x31) picks the upper 128-bit half of the first operand
         for the low half of the result and the upper half of the second
         operand for the high half. */
      store(p1 + j * z + 16 * z, _mm256_permute2x128_si256(a, b, 49));
      store(p1 + j * z, _mm256_permute2x128_si256(c, d, 49));

      i++;
    }
  }
}

results in this segment of assembly:

        ...
        mov     rcx, rdi
        mov     rdi, r8
        mov     r8, r15
        mov     r15, r14
        mov     r14, r11
        mov     r11, r9
        mov     r9, rdx
        mov     rdx, r10
        mov     r10, qword ptr [rsp - 8]
        vmovdqu ymm1, ymmword ptr [r10 + rbp]
        mov     r10, rdx
        mov     rdx, r9
        mov     r9, r11
        mov     r11, r14
        mov     r14, r15
        mov     r15, r8
        mov     r8, rdi
        mov     rdi, rcx
        ...

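The eight register-to-register moves before the load are exactly undone by the eight moves after it, so the whole sequence could be just: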

        mov     rcx, qword ptr [rsp - 8]
        vmovdqu ymm1, ymmword ptr [rcx + rbp]

https://godbolt.org/z/P66GdGzaM

Similar to #81391, but for GPRs, not SIMD registers (though SIMD is still involved).

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions