Skip to content

[AArch64][X86] Shift by a multiple of 8 on a i128 could turn into a vector shift #107289

Open
@Validark

Description

@Validark

https://zig.godbolt.org/z/cjGMW4W64

export fn foo(v: @Vector(16, u8)) @TypeOf(v) {
    return @bitCast(@as(u128, @bitCast(v)) << 8);
}

Znver4:

foo:
        vpextrq rax, xmm0, 1
        vmovq   rcx, xmm0
        mov     rdx, rcx
        shr     rdx, 56
        shl     rax, 8
        shl     rcx, 8
        or      rax, rdx
        vmovq   xmm1, rcx
        vmovq   xmm0, rax
        vpunpcklqdq     xmm0, xmm1, xmm0
        ret

Apple-m3:

foo:
        mov     x8, v0.d[1]
        fmov    x9, d0
        extr    x8, x8, x9, #56
        lsl     x9, x9, #8
        fmov    d0, x9
        mov     v0.d[1], x8
        ret

Optimized LLVM:

define dso_local <16 x i8> @foo(<16 x i8> %0) local_unnamed_addr {
Entry:
  %1 = bitcast <16 x i8> %0 to i128
  %2 = shl i128 %1, 8
  %3 = bitcast i128 %2 to <16 x i8>
  ret <16 x i8> %3
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1
Unoptimized LLVM dump

via zig build-obj ./src/llvm_code.zig -O ReleaseFast -target aarch64-linux -mcpu apple_latest --verbose-llvm-ir -fstrip >llvm_code.ll 2>&1

; ModuleID = 'llvm_code'
source_filename = "llvm_code"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-musl"

%Target.Cpu.Feature.Set = type { [5 x i64] }
%Target.Cpu.Model = type { { ptr, i64 }, { ptr, i64 }, %Target.Cpu.Feature.Set }
%Target.Cpu = type { ptr, %Target.Cpu.Feature.Set, i6, [7 x i8] }

@builtin.zig_backend = internal unnamed_addr constant i64 2, align 8
@Target.Cpu.Feature.Set.empty = internal unnamed_addr constant %Target.Cpu.Feature.Set zeroinitializer, align 8
@Target.aarch64.cpu.apple_latest = internal unnamed_addr constant %Target.Cpu.Model { { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_227, i64 0), i64 12 }, { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_230, i64 0), i64 12 }, %Target.Cpu.Feature.Set { [5 x i64] [i64 158329674598400, i64 2251799830972420, i64 1125900041060352, i64 12885032960, i64 0] } }, align 8
@__anon_227 = internal unnamed_addr constant [13 x i8] c"apple_latest\00", align 1
@__anon_230 = internal unnamed_addr constant [13 x i8] c"apple-latest\00", align 1
@builtin.cpu = internal unnamed_addr constant %Target.Cpu { ptr getelementptr inbounds (i8, ptr @Target.aarch64.cpu.apple_latest, i64 0), %Target.Cpu.Feature.Set { [5 x i64] [i64 -6882457295353816576, i64 3831332528523300484, i64 4612917471702155264, i64 47783866528, i64 0] }, i6 6, [7 x i8] undef }, align 8
@start.simplified_logic = internal unnamed_addr constant i1 false, align 1
@builtin.output_mode = internal unnamed_addr constant i2 -2, align 1

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local <16 x i8> @foo(<16 x i8> %0) #0 {
1:
  %2 = bitcast <16 x i8> %0 to i128
  %3 = zext i7 8 to i128
  %4 = shl i128 %2, %3
  %5 = bitcast i128 %4 to <16 x i8>
  ret <16 x i8> %5
}

attributes #0 = { nounwind uwtable nosanitize_coverage skipprofile "frame-pointer"="none" "target-cpu"="apple-latest" "target-features"="-a510,-a520,-a65,-a710,-a720,-a76,-a78,-a78c,-addr-lsl-fast,+aes,-aggressive-fma,+alternate-sextload-cvt-f32-pattern,+altnzcv,-alu-lsl-fast,+am,+amvs,+arith-bcc-fusion,+arith-cbz-fusion,-ascend-store-address,-b16b16,-balance-fp-ops,+bf16,-brbe,+bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,+ccdp,+ccidx,+ccpp,-chk,-clrbhb,-cmp-bcc-fusion,+complxnum,+CONTEXTIDREL2,-cortex-r82,-cpa,+crc,+crypto,-cssc,-d128,+disable-latency-sched-heuristic,-disable-ldp,-disable-stp,+dit,+dotprod,+ecv,+el2vmsa,+el3,-enable-select-opt,-ete,-exynos-cheap-as-move,-f32mm,-f64mm,-faminmax,+fgt,-fix-cortex-a53-835769,+flagm,-fmv,-force-32bit-jump-tables,+fp16fml,-fp8,-fp8dot2,-fp8dot4,-fp8fma,+fp-armv8,-fpmr,+fptoint,+fullfp16,+fuse-address,-fuse-addsub-2reg-const1,-fuse-adrp-add,+fuse-aes,+fuse-arith-logic,+fuse-crypto-eor,+fuse-csel,+fuse-literals,-gcs,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,+hcx,+i8mm,-ite,+jsconv,-ldp-aligned-only,+lor,-ls64,+lse,-lse128,+lse2,-lut,-mec,-mops,+mpam,-mte,+neon,-nmi,-no-bti-at-return-twice,-no-neg-immediates,-no-sve-fp-ld1r,-no-zcz-fp,+nv,-outline-atomics,+pan,+pan-rwv,+pauth,-pauth-lr,+perfmon,-predictable-select-expensive,+predres,-prfm-slc-target,-rand,+ras,-rasv2,+rcpc,-rcpc3,+rcpc-immo,+rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,+sb,+sel2,+sha2,+sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme2,-sme2p1,-sme-f16f16,-sme-f64f64,-sme-f8f16,-sme-f8f32,-sme-fa64,-sme-i16i64,-sme-lutv2,-spe,-spe-eef,-specres2,+specrestrict,+ssbs,-ssve-fp8dot2,-ssve-fp8dot4,-ssve-fp8fma,+store-pair-suppress,-stp-aligned-only,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-sve2p1,-tagged-globals,-the,+tlb-rmi,-tlbiw,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tpidrro-el0,+tracev8.4,-trbe,+uaops,-use-experimental-zeroing-pseudos,-use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,-v8.7a,-v8.8a,-v8.9a,+v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9.4a,-v9.5a,-v9a,+vh,-wfxt,-xs,+zcm,+zcz,-zcz-fp-workaround,+zcz-gp" }

!llvm.module.flags = !{}

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions