Skip to content

Commit 5705dc7

Browse files
committed
Recommit "[X86] Add TuningPreferShiftShuffle for when Shifts are preferable to shuffles." (2nd Try)
Move the opcode checks so that they run only after we have verified that a valid shift instruction was found (`0 < ShiftAmt`) in `matchUnaryPermuteShuffle` and `lowerShuffleAsShift`.
Reviewed By: pengfei, RKSimon
Differential Revision: https://reviews.llvm.org/D143786
1 parent 6b5afda commit 5705dc7

File tree

7 files changed

+826
-266
lines changed

7 files changed

+826
-266
lines changed

llvm/lib/Target/X86/X86.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,12 @@ def TuningNoDomainDelayShuffle : SubtargetFeature<"no-bypass-delay-shuffle",
550550
"NoDomainDelayShuffle","true",
551551
"Has no bypass delay when using the 'wrong' shuffle type">;
552552

553+
// Prefer lowering shuffles on AVX512 targets (e.g. Skylake Server) to
554+
// imm shifts/rotate if they can use more ports than regular shuffles.
555+
def TuningPreferShiftShuffle : SubtargetFeature<"faster-shift-than-shuffle",
556+
"PreferLowerShuffleAsShift", "true",
557+
"Shifts are faster (or as fast) as shuffle">;
558+
553559
// On some X86 processors, a vzeroupper instruction should be inserted after
554560
// using ymm/zmm registers before executing code that may use SSE instructions.
555561
def TuningInsertVZEROUPPER
@@ -922,6 +928,7 @@ def ProcessorFeatures {
922928
TuningPOPCNTFalseDeps,
923929
TuningInsertVZEROUPPER,
924930
TuningAllowLight256Bit,
931+
TuningPreferShiftShuffle,
925932
TuningNoDomainDelayMov,
926933
TuningNoDomainDelayShuffle,
927934
TuningNoDomainDelayBlend];

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 195 additions & 99 deletions
Large diffs are not rendered by default.

llvm/lib/Target/X86/X86TargetTransformInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
9292
X86::TuningNoDomainDelayMov,
9393
X86::TuningNoDomainDelayShuffle,
9494
X86::TuningNoDomainDelayBlend,
95+
X86::TuningPreferShiftShuffle,
9596

9697
// Perf-tuning flags.
9798
X86::TuningFastGather,

llvm/test/CodeGen/X86/avx512-hadd-hsub.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
1616
; SKX: # %bb.0:
1717
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1818
; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
19-
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
19+
; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
2020
; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2121
; SKX-NEXT: vmovd %xmm0, %eax
2222
; SKX-NEXT: vzeroupper
@@ -43,7 +43,7 @@ define i32 @hsub_16(<16 x i32> %x225) {
4343
; SKX: # %bb.0:
4444
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4545
; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
46-
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
46+
; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
4747
; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
4848
; SKX-NEXT: vmovd %xmm0, %eax
4949
; SKX-NEXT: vzeroupper

0 commit comments

Comments
 (0)