Skip to content

Commit 3a622a1

Browse files
committed
[AVX512] Don't switch unmasked subvector insert/extract instructions when AVX512DQI is enabled.
There's no reason to switch instructions with and without DQI. It just creates extra isel patterns and test divergences. There is however value in enabling the masked version of the instructions with DQI. This required introducing some new multiclasses to enable this splitting. Differential Revision: https://reviews.llvm.org/D36661 llvm-svn: 311091
1 parent 5960848 commit 3a622a1

15 files changed

+846
-1691
lines changed

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 91 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,28 @@ multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _,
282282
MaskingConstraint, itin, IsCommutable,
283283
IsKCommutable>;
284284

285+
// This multiclass generates the unconditional/non-masking, the masking and
286+
// the zero-masking variant of the vector instruction. In the masking case, the
287+
// preserved vector elements come from a new dummy input operand tied to $dst.
288+
// This version uses a separate dag for non-masking and masking.
289+
multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
290+
dag Outs, dag Ins, string OpcodeStr,
291+
string AttSrcAsm, string IntelSrcAsm,
292+
dag RHS, dag MaskRHS,
293+
InstrItinClass itin = NoItinerary,
294+
bit IsCommutable = 0, bit IsKCommutable = 0,
295+
SDNode Select = vselect> :
296+
AVX512_maskable_custom<O, F, Outs, Ins,
297+
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
298+
!con((ins _.KRCWM:$mask), Ins),
299+
OpcodeStr, AttSrcAsm, IntelSrcAsm,
300+
[(set _.RC:$dst, RHS)],
301+
[(set _.RC:$dst,
302+
(Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
303+
[(set _.RC:$dst,
304+
(Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
305+
"$src0 = $dst", itin, IsCommutable, IsKCommutable>;
306+
285307
// This multiclass generates the unconditional/non-masking, the masking and
286308
// the zero-masking variant of the vector instruction. In the masking case, the
287309
// preserved vector elements come from a new dummy input operand tied to $dst.
@@ -512,28 +534,45 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
512534
//===----------------------------------------------------------------------===//
513535
// AVX-512 - VECTOR INSERT
514536
//
515-
multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
516-
PatFrag vinsert_insert> {
537+
538+
// Supports two different pattern operators for mask and unmasked ops. Allows
539+
// null_frag to be passed for one.
540+
multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
541+
X86VectorVTInfo To,
542+
SDPatternOperator vinsert_insert,
543+
SDPatternOperator vinsert_for_mask> {
517544
let ExeDomain = To.ExeDomain in {
518-
defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
545+
defm rr : AVX512_maskable_split<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
519546
(ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
520547
"vinsert" # From.EltTypeName # "x" # From.NumElts,
521548
"$src3, $src2, $src1", "$src1, $src2, $src3",
522549
(vinsert_insert:$src3 (To.VT To.RC:$src1),
523550
(From.VT From.RC:$src2),
524-
(iPTR imm))>, AVX512AIi8Base, EVEX_4V;
551+
(iPTR imm)),
552+
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
553+
(From.VT From.RC:$src2),
554+
(iPTR imm))>, AVX512AIi8Base, EVEX_4V;
525555

526-
defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
556+
defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
527557
(ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
528558
"vinsert" # From.EltTypeName # "x" # From.NumElts,
529559
"$src3, $src2, $src1", "$src1, $src2, $src3",
530560
(vinsert_insert:$src3 (To.VT To.RC:$src1),
561+
(From.VT (bitconvert (From.LdFrag addr:$src2))),
562+
(iPTR imm)),
563+
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
531564
(From.VT (bitconvert (From.LdFrag addr:$src2))),
532565
(iPTR imm))>, AVX512AIi8Base, EVEX_4V,
533566
EVEX_CD8<From.EltSize, From.CD8TupleForm>;
534567
}
535568
}
536569

570+
// Passes the same pattern operator for masked and unmasked ops.
571+
multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From,
572+
X86VectorVTInfo To,
573+
SDPatternOperator vinsert_insert> :
574+
vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert>;
575+
537576
multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
538577
X86VectorVTInfo To, PatFrag vinsert_insert,
539578
SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
@@ -573,44 +612,46 @@ multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
573612
X86VectorVTInfo< 8, EltVT64, VR512>,
574613
vinsert256_insert>, VEX_W, EVEX_V512;
575614

615+
// Even with DQI we'd like to only use these instructions for masking.
576616
let Predicates = [HasVLX, HasDQI] in
577-
defm NAME # "64x2Z256" : vinsert_for_size<Opcode128,
617+
defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128,
578618
X86VectorVTInfo< 2, EltVT64, VR128X>,
579619
X86VectorVTInfo< 4, EltVT64, VR256X>,
580-
vinsert128_insert>, VEX_W, EVEX_V256;
620+
null_frag, vinsert128_insert>, VEX_W, EVEX_V256;
581621

622+
// Even with DQI we'd like to only use these instructions for masking.
582623
let Predicates = [HasDQI] in {
583-
defm NAME # "64x2Z" : vinsert_for_size<Opcode128,
624+
defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128,
584625
X86VectorVTInfo< 2, EltVT64, VR128X>,
585626
X86VectorVTInfo< 8, EltVT64, VR512>,
586-
vinsert128_insert>, VEX_W, EVEX_V512;
627+
null_frag, vinsert128_insert>, VEX_W, EVEX_V512;
587628

588-
defm NAME # "32x8Z" : vinsert_for_size<Opcode256,
629+
defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256,
589630
X86VectorVTInfo< 8, EltVT32, VR256X>,
590631
X86VectorVTInfo<16, EltVT32, VR512>,
591-
vinsert256_insert>, EVEX_V512;
632+
null_frag, vinsert256_insert>, EVEX_V512;
592633
}
593634
}
594635

595636
defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>;
596637
defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
597638

598639
// Codegen pattern with the alternative types,
599-
// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
640+
// Even with AVX512DQ we'll still use these for unmasked operations.
600641
defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
601-
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
642+
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
602643
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
603-
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
644+
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
604645

605646
defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
606-
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
647+
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
607648
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
608-
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
649+
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
609650

610651
defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
611-
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
652+
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
612653
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
613-
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
654+
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
614655

615656
// Codegen pattern with the alternative types insert VEC128 into VEC256
616657
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
@@ -647,16 +688,20 @@ def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
647688
// AVX-512 VECTOR EXTRACT
648689
//---
649690

650-
multiclass vextract_for_size<int Opcode,
651-
X86VectorVTInfo From, X86VectorVTInfo To,
652-
PatFrag vextract_extract> {
691+
// Supports two different pattern operators for mask and unmasked ops. Allows
692+
// null_frag to be passed for one.
693+
multiclass vextract_for_size_split<int Opcode,
694+
X86VectorVTInfo From, X86VectorVTInfo To,
695+
SDPatternOperator vextract_extract,
696+
SDPatternOperator vextract_for_mask> {
653697

654698
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
655-
defm rr : AVX512_maskable<Opcode, MRMDestReg, To, (outs To.RC:$dst),
699+
defm rr : AVX512_maskable_split<Opcode, MRMDestReg, To, (outs To.RC:$dst),
656700
(ins From.RC:$src1, u8imm:$idx),
657701
"vextract" # To.EltTypeName # "x" # To.NumElts,
658702
"$idx, $src1", "$src1, $idx",
659-
(vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm))>,
703+
(vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)),
704+
(vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm))>,
660705
AVX512AIi8Base, EVEX;
661706
def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
662707
(ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
@@ -677,6 +722,12 @@ multiclass vextract_for_size<int Opcode,
677722
}
678723
}
679724

725+
// Passes the same pattern operator for masked and unmasked ops.
726+
multiclass vextract_for_size<int Opcode, X86VectorVTInfo From,
727+
X86VectorVTInfo To,
728+
SDPatternOperator vextract_extract> :
729+
vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract>;
730+
680731
// Codegen pattern for the alternative types
681732
multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
682733
X86VectorVTInfo To, PatFrag vextract_extract,
@@ -713,22 +764,26 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
713764
X86VectorVTInfo< 4, EltVT32, VR128X>,
714765
vextract128_extract>,
715766
EVEX_V256, EVEX_CD8<32, CD8VT4>;
767+
768+
// Even with DQI we'd like to only use these instructions for masking.
716769
let Predicates = [HasVLX, HasDQI] in
717-
defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
770+
defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128,
718771
X86VectorVTInfo< 4, EltVT64, VR256X>,
719772
X86VectorVTInfo< 2, EltVT64, VR128X>,
720-
vextract128_extract>,
773+
null_frag, vextract128_extract>,
721774
VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
775+
776+
// Even with DQI we'd like to only use these instructions for masking.
722777
let Predicates = [HasDQI] in {
723-
defm NAME # "64x2Z" : vextract_for_size<Opcode128,
778+
defm NAME # "64x2Z" : vextract_for_size_split<Opcode128,
724779
X86VectorVTInfo< 8, EltVT64, VR512>,
725780
X86VectorVTInfo< 2, EltVT64, VR128X>,
726-
vextract128_extract>,
781+
null_frag, vextract128_extract>,
727782
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
728-
defm NAME # "32x8Z" : vextract_for_size<Opcode256,
783+
defm NAME # "32x8Z" : vextract_for_size_split<Opcode256,
729784
X86VectorVTInfo<16, EltVT32, VR512>,
730785
X86VectorVTInfo< 8, EltVT32, VR256X>,
731-
vextract256_extract>,
786+
null_frag, vextract256_extract>,
732787
EVEX_V512, EVEX_CD8<32, CD8VT8>;
733788
}
734789
}
@@ -737,21 +792,21 @@ defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>;
737792
defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
738793

739794
// extract_subvector codegen patterns with the alternative types.
740-
// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
795+
// Even with AVX512DQ we'll still use these for unmasked operations.
741796
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
742-
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
797+
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
743798
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
744-
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
799+
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
745800

746801
defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
747-
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
802+
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
748803
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
749-
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
804+
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
750805

751806
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
752-
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
807+
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
753808
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
754-
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
809+
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
755810

756811
// Codegen pattern with the alternative types extract VEC128 from VEC256
757812
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,

llvm/test/CodeGen/X86/avx512-cvt.ll

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -642,19 +642,12 @@ define <4 x i32> @fptosi03(<4 x double> %a) {
642642
}
643643

644644
define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
645-
; NODQ-LABEL: fptrunc00:
646-
; NODQ: # BB#0:
647-
; NODQ-NEXT: vcvtpd2ps %zmm0, %ymm0
648-
; NODQ-NEXT: vcvtpd2ps %zmm1, %ymm1
649-
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
650-
; NODQ-NEXT: retq
651-
;
652-
; DQ-LABEL: fptrunc00:
653-
; DQ: # BB#0:
654-
; DQ-NEXT: vcvtpd2ps %zmm0, %ymm0
655-
; DQ-NEXT: vcvtpd2ps %zmm1, %ymm1
656-
; DQ-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
657-
; DQ-NEXT: retq
645+
; ALL-LABEL: fptrunc00:
646+
; ALL: # BB#0:
647+
; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0
648+
; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1
649+
; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
650+
; ALL-NEXT: retq
658651
%a = fptrunc <16 x double> %b to <16 x float>
659652
ret <16 x float> %a
660653
}
@@ -876,21 +869,13 @@ define i32 @float_to_int(float %x) {
876869
}
877870

878871
define <16 x double> @uitof64(<16 x i32> %a) nounwind {
879-
; NODQ-LABEL: uitof64:
880-
; NODQ: # BB#0:
881-
; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm2
882-
; NODQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
883-
; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm1
884-
; NODQ-NEXT: vmovaps %zmm2, %zmm0
885-
; NODQ-NEXT: retq
886-
;
887-
; DQ-LABEL: uitof64:
888-
; DQ: # BB#0:
889-
; DQ-NEXT: vcvtudq2pd %ymm0, %zmm2
890-
; DQ-NEXT: vextractf32x8 $1, %zmm0, %ymm0
891-
; DQ-NEXT: vcvtudq2pd %ymm0, %zmm1
892-
; DQ-NEXT: vmovaps %zmm2, %zmm0
893-
; DQ-NEXT: retq
872+
; ALL-LABEL: uitof64:
873+
; ALL: # BB#0:
874+
; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2
875+
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
876+
; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1
877+
; ALL-NEXT: vmovaps %zmm2, %zmm0
878+
; ALL-NEXT: retq
894879
%b = uitofp <16 x i32> %a to <16 x double>
895880
ret <16 x double> %b
896881
}

0 commit comments

Comments
 (0)