Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit a5ab9ba

Browse files
committed
[X86][SchedModel] SSE reciprocal square root instruction latencies.
The SSE rsqrt instruction (a fast reciprocal square root estimate) was grouped in the same scheduling IIC_SSE_SQRT* class as the accurate (but very slow) SSE sqrt instruction. For code which uses rsqrt (possibly with Newton-Raphson iterations) this poor scheduling was hurting performance. This patch splits off the rsqrt instruction from the sqrt instruction scheduling classes and creates new IIC_SSE_RSQRT* classes with latency values based on Agner Fog's instruction tables. Differential Revision: http://reviews.llvm.org/D5370 Patch by Simon Pilgrim. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218517 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent a0d5d7a commit a5ab9ba

File tree

7 files changed

+39
-15
lines changed

7 files changed

+39
-15
lines changed

lib/Target/X86/X86InstrSSE.td

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3344,6 +3344,16 @@ def SSE_SQRTSD : OpndItins<
33443344
>;
33453345
}
33463346

3347+
let Sched = WriteFRsqrt in {
3348+
def SSE_RSQRTPS : OpndItins<
3349+
IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
3350+
>;
3351+
3352+
def SSE_RSQRTSS : OpndItins<
3353+
IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
3354+
>;
3355+
}
3356+
33473357
let Sched = WriteFRcp in {
33483358
def SSE_RCPP : OpndItins<
33493359
IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
@@ -3622,10 +3632,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
36223632

36233633
// Reciprocal approximations. Note that these typically require refinement
36243634
// in order to obtain suitable precision.
3625-
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
3626-
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
3635+
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
3636+
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
36273637
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
3628-
int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
3638+
int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
36293639
defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
36303640
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
36313641
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,

lib/Target/X86/X86SchedHaswell.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
129129
defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
130130
defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
131131
defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
132+
defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
132133
defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
133134
defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
134135
defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;

lib/Target/X86/X86SchedSandyBridge.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
117117
defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
118118
defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
119119
defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
120+
defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
120121
defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
121122
defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
122123
defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;

lib/Target/X86/X86Schedule.td

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,13 @@ def WriteZero : SchedWrite;
6363
defm WriteJump : X86SchedWritePair;
6464

6565
// Floating point. This covers both scalar and vector operations.
66-
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
67-
defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
68-
defm WriteFDiv : X86SchedWritePair; // Floating point division.
69-
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
70-
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal.
71-
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
66+
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
67+
defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
68+
defm WriteFDiv : X86SchedWritePair; // Floating point division.
69+
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
70+
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
71+
defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
72+
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
7273
defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
7374
defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
7475
defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
@@ -314,6 +315,11 @@ def IIC_SSE_SQRTPD_RM : InstrItinClass;
314315
def IIC_SSE_SQRTSD_RR : InstrItinClass;
315316
def IIC_SSE_SQRTSD_RM : InstrItinClass;
316317

318+
def IIC_SSE_RSQRTPS_RR : InstrItinClass;
319+
def IIC_SSE_RSQRTPS_RM : InstrItinClass;
320+
def IIC_SSE_RSQRTSS_RR : InstrItinClass;
321+
def IIC_SSE_RSQRTSS_RM : InstrItinClass;
322+
317323
def IIC_SSE_RCPP_RR : InstrItinClass;
318324
def IIC_SSE_RCPP_RM : InstrItinClass;
319325
def IIC_SSE_RCPS_RR : InstrItinClass;

lib/Target/X86/X86ScheduleAtom.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,11 @@ def AtomItineraries : ProcessorItineraries<
224224
InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
225225
InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
226226

227+
InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
228+
InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
229+
InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
230+
InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
231+
227232
InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
228233
InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
229234
InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,

lib/Target/X86/X86ScheduleBtVer2.td

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,15 +163,15 @@ defm : JWriteResIntPair<WriteJump, JALU01, 1>;
163163
// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
164164
// FIXME: Double precision latencies
165165
// FIXME: SS vs PS latencies
166-
// FIXME: RSQRT latencies
167166
// FIXME: ymm latencies
168167
////////////////////////////////////////////////////////////////////////////////
169168

170-
defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
171-
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
172-
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
173-
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
174-
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
169+
defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
170+
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
171+
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
172+
defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
173+
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
174+
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
175175
defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
176176

177177
def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {

lib/Target/X86/X86ScheduleSLM.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
101101
// Scalar and vector floating point.
102102
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
103103
defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
104+
defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
104105
defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
105106
defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
106107
defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;

0 commit comments

Comments
 (0)