Skip to content

sub-optimal codegen for llvm.experimental.vector.reduce of <N x i1> #38188

Closed
@gnzlbg

Description

@gnzlbg
mannequin
Bugzilla Link 38840
Resolution FIXED
Resolved on Apr 28, 2019 04:01
Version trunk
OS All
CC @topperc,@RKSimon,@rotateright
Fixed by commit(s) r359385,r359396

Extended Description

The llvm.experimental.vector.reduce.{and,or,xor} intrinsics are lowered to very sub-optimal machine code by the x86 backend. See it live: https://gcc.godbolt.org/z/qIHi6D

LLVM-IR:

declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>);
declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>);
declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>);
declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>);

define i1 @and128_x2(<2 x i64>) {
%a = trunc <2 x i64> %0 to <2 x i1>
%b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a)
ret i1 %b
}
define i1 @and_x4(<4 x i32>) {
%a = trunc <4 x i32> %0 to <4 x i1>
%b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
ret i1 %b
}
define i1 @and128_x8(<8 x i8>) {
%a = trunc <8 x i8> %0 to <8 x i1>
%b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
ret i1 %b
}
define i1 @and256_x4(<4 x i64>) {
%a = trunc <4 x i64> %0 to <4 x i1>
%b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
ret i1 %b
}
define i1 @and_x8(<8 x i32>) {
%a = trunc <8 x i32> %0 to <8 x i1>
%b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
ret i1 %b
}
define i1 @and256_x32(<32 x i8>) {
%a = trunc <32 x i8> %0 to <32 x i1>
%b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a)
ret i1 %b
}

produces

and128_x2: # @​and128_x2
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
movd %xmm1, %eax
retq
and_x4: # @​and_x4
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movd %xmm0, %eax
retq
and128_x8: # @​and128_x8
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrld $16, %xmm1
pand %xmm0, %xmm1
movd %xmm1, %eax
retq
and256_x4: # @​and256_x4
shufps $136, %xmm1, %xmm0 # xmm0 = xmm0[0,2],xmm1[0,2]
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movd %xmm0, %eax
retq
and256_x8: # @​and_x8
pshuflw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3,4,5,6,7]
pshufhw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2,3,4,6,6,7]
pshufd $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3]
pshuflw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3,4,5,6,7]
pshufhw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2,3,4,6,6,7]
pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
punpcklqdq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrld $16, %xmm1
pand %xmm0, %xmm1
movd %xmm1, %eax
retq
and256_x32: # @​and256_x32
pand %xmm1, %xmm0
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
pand %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
pand %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrld $16, %xmm1
pand %xmm0, %xmm1
movdqa %xmm1, %xmm0
psrlw $8, %xmm0
pand %xmm1, %xmm0
movd %xmm0, %eax
retq

but these should all lower to a single movmsk instruction:

and128_x2:
movmskpd %xmm0, %eax
retq
and128_x4:
movmskps %xmm0, %eax
retq
and128_x8:
pmovmskb %xmm0, %eax
retq
and256_x4:
vmovmskpd %ymm0, %eax
vzeroupper
retq
and256_x8:
vmovmskps %ymm0, %eax
vzeroupper
retq
and256_x32:
vpmovmskb %ymm0, %eax
vzeroupper
retq

The llvm.experimental.vector.reduce.and intrinsic for <8 x i16>, <16 x i16>, <1 x i128>, <2 x i128>, etc. probably produces very sub-optimal machine code for i1 vectors as well.

The llvm.experimental.vector.reduce.or and llvm.experimental.vector.reduce.xor probably produce very sub-optimal machine code for all these i1 vectors too.

These llvm intrinsics are critical for efficiently performing coherent control flow.

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions