sub-optimal codegen for llvm.experimental.vector.reduce of <N x i1>

|  |  |
| --- | --- |
| Bugzilla Link | [38840](https://llvm.org/bz38840) |
| Resolution | FIXED |
| Resolved on | Apr 28, 2019 04:01 |
| Version | trunk |
| OS | All |
| CC | @topperc,@RKSimon,@rotateright |
| Fixed by commit(s) | r359385,r359396 |

## Extended Description 
The llvm.experimental.vector.reduce.{and,or,xor} intrinsics are lowered to very sub-optimal machine code by the x86 backend. See it live: https://gcc.godbolt.org/z/qIHi6D

LLVM-IR:

```llvm
declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>);
declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>);
declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>);
declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>);

define i1 @and128_x2(<2 x i64>) {
    %a = trunc <2 x i64> %0 to <2 x i1>
    %b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a)
    ret i1 %b
}
define i1 @and_x4(<4 x i32>) {
    %a = trunc <4 x i32> %0 to <4 x i1>
    %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
    ret i1 %b
}
define i1 @and128_x8(<8 x i8>) {
    %a = trunc <8 x i8> %0 to <8 x i1>
    %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
    ret i1 %b
}
define i1 @and256_x4(<4 x i64>) {
    %a = trunc <4 x i64> %0 to <4 x i1>
    %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)
    ret i1 %b
}
define i1 @and_x8(<8 x i32>) {
    %a = trunc <8 x i32> %0 to <8 x i1>
    %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)
    ret i1 %b
}
define i1 @and256_x32(<32 x i8>) {
    %a = trunc <32 x i8> %0 to <32 x i1>
    %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a)
    ret i1 %b
}
```

produces

```asm
and128_x2: # @and128_x2
  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
  pand %xmm0, %xmm1
  movd %xmm1, %eax
  retq
and_x4: # @and_x4
  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
  pand %xmm0, %xmm1
  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
  pand %xmm1, %xmm0
  movd %xmm0, %eax
  retq
and128_x8: # @and128_x8
  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
  pand %xmm0, %xmm1
  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
  pand %xmm1, %xmm0
  movdqa %xmm0, %xmm1
  psrld $16, %xmm1
  pand %xmm0, %xmm1
  movd %xmm1, %eax
  retq
and256_x4: # @and256_x4
  shufps $136, %xmm1, %xmm0 # xmm0 = xmm0[0,2],xmm1[0,2]
  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
  pand %xmm0, %xmm1
  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
  pand %xmm1, %xmm0
  movd %xmm0, %eax
  retq
and_x8: # @and_x8
  pshuflw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3,4,5,6,7]
  pshufhw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2,3,4,6,6,7]
  pshufd $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3]
  pshuflw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3,4,5,6,7]
  pshufhw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2,3,4,6,6,7]
  pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
  punpcklqdq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
  pand %xmm0, %xmm1
  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
  pand %xmm1, %xmm0
  movdqa %xmm0, %xmm1
  psrld $16, %xmm1
  pand %xmm0, %xmm1
  movd %xmm1, %eax
  retq
and256_x32: # @and256_x32
  pand %xmm1, %xmm0
  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
  pand %xmm0, %xmm1
  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
  pand %xmm1, %xmm0
  movdqa %xmm0, %xmm1
  psrld $16, %xmm1
  pand %xmm0, %xmm1
  movdqa %xmm1, %xmm0
  psrlw $8, %xmm0
  pand %xmm1, %xmm0
  movd %xmm0, %eax
  retq
```

but these should all lower to a single movmsk instruction:

```asm
and128_x2:
  movmskpd %xmm0, %eax
  retq
and128_x4:
  movmskps %xmm0, %eax
  retq
and128_x8:
  pmovmskb %xmm0, %eax
  retq
and256_x4:
  vmovmskpd %ymm0, %eax
  vzeroupper
  retq
and256_x8:
  vmovmskps %ymm0, %eax
  vzeroupper
  retq
and256_x32:
  vpmovmskb %ymm0, %eax
  vzeroupper
  retq
```

The llvm.experimental.vector.reduce.and intrinsic for <8 x i16>, <16 x i16>, <1 x i128>, <2 x i128>, etc. probably produces very sub-optimal machine code for i1 vectors as well.

The llvm.experimental.vector.reduce.or and llvm.experimental.vector.reduce.xor intrinsics probably produce very sub-optimal machine code for all these i1 vectors too.

These LLVM intrinsics are critical for efficiently performing coherent control flow.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

sub-optimal codegen for llvm.experimental.vector.reduce of <N x i1> #38188

Extended Description

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development


Bugzilla Link	38840
Resolution	FIXED
Resolved on	Apr 28, 2019 04:01
Version	trunk
OS	All
CC	@topperc,@RKSimon,@rotateright
Fixed by commit(s)	r359385,r359396

sub-optimal codegen for llvm.experimental.vector.reduce of <N x i1> #38188

Description

Extended Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions