Skip to content

Commit e2125a6

Browse files
author
Jan Hubicka
committed
Zen5 tuning part 3: scheduler tweaks
This patch adds support for new fusion in znver5 documented in the optimization manual: The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions with certain ALU instructions. The following conditions need to be met for fusion to happen: - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B - The MOV is followed by an ALU instruction where the MOV and ALU destination register match. - The ALU instruction may source only registers or immediate data. There cannot be any memory source. - The ALU instruction sources either the source or dest of MOV instruction. - If ALU instruction has 2 reg sources, they should be different. - The following ALU instructions can fuse with an older qualified MOV instruction: ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR (I assume OP is OR) I also increased issue rate from 4 to 6. Theoretically znver5 can do more, but with our model we can't really use it. Increasing issue rate to 8 leads to infinite loop in scheduler. Finally, I also enabled fuse_alu_and_branch since it is supported by znver5 (I think by earlier zens too). New fusion pattern moves quite a few instructions around in common code: @@ -2210,13 +2210,13 @@ .cfi_offset 3, -32 leaq 63(%rsi), %rbx movq %rbx, %rbp + shrq $6, %rbp + salq $3, %rbp subq $16, %rsp .cfi_def_cfa_offset 48 movq %rdi, %r12 - shrq $6, %rbp - movq %rsi, 8(%rsp) - salq $3, %rbp movq %rbp, %rdi + movq %rsi, 8(%rsp) call _Znwm movq 8(%rsp), %rsi movl $0, 8(%r12) @@ -2224,8 +2224,8 @@ movq %rax, (%r12) movq %rbp, 32(%r12) testq %rsi, %rsi - movq %rsi, %rdx cmovns %rsi, %rbx + movq %rsi, %rdx sarq $63, %rdx shrq $58, %rdx sarq $6, %rbx which should help decoder bandwidth and perhaps also cache, though I was not able to measure off-noise effect on SPEC. gcc/ChangeLog: * config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune. * config/i386/x86-tune-sched.cc (ix86_issue_rate): Update for znver5. (ix86_adjust_cost): Add TODO about znver5 memory latency. (ix86_fuse_mov_alu_p): New. 
(ix86_macro_fusion_pair_p): Use it. * config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add ZNVER5. (X86_TUNE_FUSE_MOV_AND_ALU): New tune;
1 parent dee3c5c commit e2125a6

File tree

3 files changed

+77
-3
lines changed

3 files changed

+77
-3
lines changed

gcc/config/i386/i386.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
430430
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
431431
#define TARGET_FUSE_ALU_AND_BRANCH \
432432
ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
433+
#define TARGET_FUSE_MOV_AND_ALU \
434+
ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
433435
#define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
434436
#define TARGET_AVOID_LEA_FOR_ADDR \
435437
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]

gcc/config/i386/x86-tune-sched.cc

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ ix86_issue_rate (void)
6767
case PROCESSOR_ZNVER2:
6868
case PROCESSOR_ZNVER3:
6969
case PROCESSOR_ZNVER4:
70-
case PROCESSOR_ZNVER5:
7170
case PROCESSOR_CORE2:
7271
case PROCESSOR_NEHALEM:
7372
case PROCESSOR_SANDYBRIDGE:
@@ -91,6 +90,13 @@ ix86_issue_rate (void)
9190
return 5;
9291

9392
case PROCESSOR_SAPPHIRERAPIDS:
93+
/* For znver5 decoder can handle 4 or 8 instructions per cycle,
94+
op cache 12 instruction/cycle, dispatch 8 instructions
95+
integer rename 8 instructions and Fp 6 instructions.
96+
97+
The scheduler, without understanding out of order nature of the CPU
98+
is unlikely to be able to fill all of these.  */
99+
case PROCESSOR_ZNVER5:
94100
return 6;
95101

96102
default:
@@ -434,6 +440,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
434440
enum attr_unit unit = get_attr_unit (insn);
435441
int loadcost;
436442

443+
/* TODO: On znver5 complex addressing modes have
444+
greater latency. */
437445
if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
438446
loadcost = 4;
439447
else
@@ -563,13 +571,70 @@ ix86_macro_fusion_p ()
563571
return TARGET_FUSE_CMP_AND_BRANCH;
564572
}
565573

574+
static bool
575+
ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
576+
{
577+
/* Validate mov:
578+
- It should be reg-reg move with opcode 0x89 or 0x8B. */
579+
rtx set1 = PATTERN (mov);
580+
if (GET_CODE (set1) != SET
581+
|| !GENERAL_REG_P (SET_SRC (set1))
582+
|| !GENERAL_REG_P (SET_DEST (set1)))
583+
return false;
584+
rtx reg = SET_DEST (set1);
585+
/* - it should have 0x89 or 0x8B opcode. */
586+
if (!INTEGRAL_MODE_P (GET_MODE (reg))
587+
|| GET_MODE_SIZE (GET_MODE (reg)) < 2
588+
|| GET_MODE_SIZE (GET_MODE (reg)) > 8)
589+
return false;
590+
/* Validate ALU. */
591+
if (GET_CODE (PATTERN (alu)) != PARALLEL)
592+
return false;
593+
rtx set2 = XVECEXP (PATTERN (alu), 0, 0);
594+
if (GET_CODE (set2) != SET)
595+
return false;
596+
/* Match one of:
597+
ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR
598+
We also may add insn attribute to handle some of sporadic
599+
case we output those with different RTX expressions. */
600+
601+
if (GET_CODE (SET_SRC (set2)) != PLUS
602+
&& GET_CODE (SET_SRC (set2)) != MINUS
603+
&& GET_CODE (SET_SRC (set2)) != XOR
604+
&& GET_CODE (SET_SRC (set2)) != AND
605+
&& GET_CODE (SET_SRC (set2)) != IOR
606+
&& GET_CODE (SET_SRC (set2)) != NOT
607+
&& GET_CODE (SET_SRC (set2)) != ASHIFT
608+
&& GET_CODE (SET_SRC (set2)) != ASHIFTRT
609+
&& GET_CODE (SET_SRC (set2)) != LSHIFTRT)
610+
return false;
611+
rtx op0 = XEXP (SET_SRC (set2), 0);
612+
rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL;
613+
/* One of operands should be register. */
614+
if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
615+
std::swap (op0, op1);
616+
if (!REG_P (op0) || REGNO (op1) != REGNO (reg))
617+
return false;
618+
if (op1
619+
&& !REG_P (op1)
620+
&& !x86_64_immediate_operand (op1, VOIDmode))
621+
return false;
622+
/* Only one of two paramters must be move destination. */
623+
if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg))
624+
return false;
625+
return true;
626+
}
627+
566628
/* Check whether current microarchitecture supports macro fusion
567629
for insn pair "CONDGEN + CONDJMP". Refer to
568630
"Intel Architectures Optimization Reference Manual". */
569631

570632
bool
571633
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
572634
{
635+
if (TARGET_FUSE_MOV_AND_ALU
636+
&& ix86_fuse_mov_alu_p (condgen, condjmp))
637+
return true;
573638
rtx src, dest;
574639
enum rtx_code ccode;
575640
rtx compare_set = NULL_RTX, test_if, cond;

gcc/config/i386/x86-tune.def

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,17 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
143143

144144
/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
145145
jump instruction when the alu instruction produces the CCFLAG consumed by
146-
the conditional jump instruction. */
146+
the conditional jump instruction.
147+
148+
TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND.
149+
There are also limitations on the immediates and displacements supported.  */
147150
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
148-
m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC)
151+
m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5)
149152

153+
/* X86_TUNE_FUSE_MOV_AND_ALU: Fuse mov and alu in case mov is reg-reg mov
154+
and the destination is used by alu. alu must be one of
155+
ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */
156+
DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5)
150157

151158
/*****************************************************************************/
152159
/* Function prologue, epilogue and function calling sequences. */

0 commit comments

Comments
 (0)