Skip to content
This repository has been archived by the owner on Apr 23, 2020. It is now read-only.

Commit

Permalink
AMDGPU/SI: Avoid moving PHIs to VALU when phi values are defined in s…
Browse files Browse the repository at this point in the history
…calar branches

Reviewers: arsenm

Subscribers: arsenm, llvm-commits, kzhuravl

Differential Revision: https://reviews.llvm.org/D23417

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288095 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
tstellarAMD committed Nov 29, 2016
1 parent bde207a commit d39b310
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 61 deletions.
46 changes: 38 additions & 8 deletions lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
Expand All @@ -82,6 +83,9 @@ using namespace llvm;
namespace {

class SIFixSGPRCopies : public MachineFunctionPass {

MachineDominatorTree *MDT;

public:
static char ID;

Expand All @@ -92,15 +96,21 @@ class SIFixSGPRCopies : public MachineFunctionPass {
StringRef getPassName() const override { return "SI Fix SGPR copies"; }

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};

} // End anonymous namespace

INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)


char SIFixSGPRCopies::ID = 0;

Expand Down Expand Up @@ -274,11 +284,22 @@ static bool phiHasBreakDef(const MachineInstr &PHI,
return false;
}

static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
const TargetRegisterInfo &TRI) {
for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
E = MBB.end(); I != E; ++I) {
if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
return true;
}
return false;
}

bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();

SmallVector<MachineInstr *, 16> Worklist;

Expand Down Expand Up @@ -309,11 +330,23 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
case AMDGPU::PHI: {
DEBUG(dbgs() << "Fixing PHI: " << MI);
unsigned Reg = MI.getOperand(0).getReg();
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
break;

// We don't need to fix the PHI if the common dominator of the
// two incoming blocks terminates with a uniform branch.
if (MI.getNumExplicitOperands() == 5) {
MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();

MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
break;
}
}

// If a PHI node defines an SGPR and any of its operands are VGPRs,
// then we need to move it to the VALU.
//
Expand All @@ -340,10 +373,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
// ...
// use sgpr2
//
// FIXME: This is OK if the branching decision is made based on an
// SGPR value.
bool SGPRBranch = false;

// The one exception to this rule is when one of the operands
// is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
// instruction. In this case, there we know the program will
Expand All @@ -353,7 +382,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {

SmallSet<unsigned, 8> Visited;
if (phiHasVGPROperands(MI, MRI, TRI, TII) ||
(!SGPRBranch && !phiHasBreakDef(MI, MRI, Visited))) {
!phiHasBreakDef(MI, MRI, Visited)) {
DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI);
}
break;
Expand Down
10 changes: 4 additions & 6 deletions test/CodeGen/AMDGPU/branch-relaxation.ll
Original file line number Diff line number Diff line change
Expand Up @@ -163,23 +163,21 @@ bb3:
ret void
}

; FIXME: Should be able to use s_cbranch_scc0
; GCN-LABEL: {{^}}long_backward_sbranch:
; GCN: v_mov_b32_e32 [[LOOPIDX:v[0-9]+]], 0{{$}}
; GCN: s_mov_b32 [[LOOPIDX:s[0-9]+]], 0{{$}}

; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_add_i32_e32 [[INC:v[0-9]+]], vcc, 1, [[LOOPIDX]]
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 10, [[INC]]
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_add_i32 [[INC:s[0-9]+]], [[LOOPIDX]], 1
; GCN-NEXT: s_cmp_lt_i32 [[INC]], 10

; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_scc0 [[ENDBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1
Expand Down
3 changes: 3 additions & 0 deletions test/CodeGen/AMDGPU/cf-loop-on-constant.ll
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ for.body:
; GCN: v_cmp_eq_u32_e32 vcc, 1,

; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80
; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 4

; GCN: s_cbranch_vccnz [[LOOPBB]]
; GCN-NEXT: ; BB#2
; GCN-NEXT: s_endpgm
Expand Down
2 changes: 1 addition & 1 deletion test/CodeGen/AMDGPU/coalescer_remat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; It's probably OK if this is slightly higher:
; CHECK: ; NumVgprs: 9
; CHECK: ; NumVgprs: 8
define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
entry:
%cmpflag = icmp eq i32 %flag, 1
Expand Down
2 changes: 1 addition & 1 deletion test/CodeGen/AMDGPU/hoist-cond.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
; At the same time condition shall not be serialized into a VGPR and deserialized later
; using another v_cmp + v_cndmask, but used directly in s_and_saveexec_b64.

; CHECK: v_cmp_{{..}}_u32_e64 [[COND:s\[[0-9]+:[0-9]+\]]]
; CHECK: v_cmp_{{..}}_u32_e{{32|64}} [[COND:s\[[0-9]+:[0-9]+\]|vcc]]
; CHECK: BB0_1:
; CHECK-NOT: v_cmp
; CHECK_NOT: v_cndmask
Expand Down
5 changes: 4 additions & 1 deletion test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; FIXME: Enabling critical edge splitting will fix this.
; XFAIL: *

; Make sure that m0 is not reinitialized in the loop.

; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init:
Expand All @@ -12,7 +15,7 @@
; GCN: ds_read_b32
; GCN: buffer_store_dword

; GCN: s_cbranch_vccz BB0_2
; GCN: s_cbranch_scc0 BB0_2

; GCN: BB0_3:
; GCN-NEXT: s_endpgm
Expand Down
5 changes: 2 additions & 3 deletions test/CodeGen/AMDGPU/loop_break.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@

; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
; GCN: v_cmp_lt_i32_e32 vcc,
; GCN: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
; GCN: s_cmp_gt_i32 s{{[0-9]+}}, -1
; GCN-NEXT: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]]

; GCN: ; BB#2: ; %bb4
; GCN: buffer_load_dword
Expand Down
71 changes: 36 additions & 35 deletions test/CodeGen/AMDGPU/uniform-cfg.ll
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=verde -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}uniform_if_scc:
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
; GCN: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]

; Fall-through to the else
; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
; GCN: s_mov_b32 [[S_VAL]], 1

; GCN: [[IF_LABEL]]:
; GCN: buffer_store_dword [[STORE_VAL]]
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp eq i32 %cond, 0
Expand All @@ -29,17 +30,16 @@ done:
}

; GCN-LABEL: {{^}}uniform_if_vcc:
; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
; also scheduled the write first.
; GCN-DAG: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]

; Fall-through to the else
; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
; GCN: s_mov_b32 [[S_VAL]], 1

; GCN: [[IF_LABEL]]:
; GCN: buffer_store_dword [[STORE_VAL]]
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = fcmp oeq float %cond, 0.0
Expand All @@ -59,14 +59,15 @@ done:

; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc:
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
; GCN: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]

; Fall-through to the else
; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
; GCN: s_mov_b32 [[S_VAL]], 1

; GCN: [[IF_LABEL]]:
; GCN: buffer_store_dword [[STORE_VAL]]
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp eq i32 %cond, 0
Expand All @@ -85,17 +86,16 @@ done:
}

; GCN-LABEL: {{^}}uniform_if_swap_br_targets_vcc:
; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
; also scheduled the write first.
; GCN-DAG: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]

; Fall-through to the else
; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
; GCN: s_mov_b32 [[S_VAL]], 1

; GCN: [[IF_LABEL]]:
; GCN: buffer_store_dword [[STORE_VAL]]
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = fcmp oeq float %cond, 0.0
Expand Down Expand Up @@ -276,15 +276,12 @@ bb9: ; preds = %bb8, %bb4
ret void
}

; GCN-LABEL: {{^}}uniform_loop:
; GCN: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
; FIXME: We need to teach GCNFixSGPRCopies about uniform branches so we
; get s_add_i32 here.
; GCN: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}}
; GCN: v_cmp_ne_u32_e32 vcc, 0, [[I]]
; GCN: s_and_b64 vcc, exec, vcc
; GCN: s_cbranch_vccnz [[LOOP_LABEL]]
; GCN: s_endpgm
; SI-LABEL: {{^}}uniform_loop:
; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
; SI: s_add_i32 [[I:s[0-9]+]], s{{[0-9]+}}, -1
; SI: s_cmp_lg_u32 [[I]], 0
; SI: s_cbranch_scc1 [[LOOP_LABEL]]
; SI: s_endpgm
define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
entry:
br label %loop
Expand Down Expand Up @@ -433,18 +430,19 @@ bb9: ; preds = %bb8, %bb4

; GCN-LABEL: {{^}}uniform_if_scc_i64_eq:
; VI-DAG: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 0
; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0

; SI: v_cmp_eq_u64_e64
; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]

; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]

; Fall-through to the else
; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
; GCN: s_mov_b32 [[S_VAL]], 1

; GCN: [[IF_LABEL]]:
; GCN: buffer_store_dword [[STORE_VAL]]
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp eq i64 %cond, 0
Expand All @@ -464,18 +462,19 @@ done:

; GCN-LABEL: {{^}}uniform_if_scc_i64_ne:
; VI-DAG: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 0
; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0

; SI: v_cmp_ne_u64_e64
; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]

; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]

; Fall-through to the else
; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
; GCN: s_mov_b32 [[S_VAL]], 1

; GCN: [[IF_LABEL]]:
; GCN: buffer_store_dword [[STORE_VAL]]
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp ne i64 %cond, 0
Expand All @@ -494,14 +493,16 @@ done:
}

; GCN-LABEL: {{^}}uniform_if_scc_i64_sgt:
; GCN: s_mov_b32 [[S_VAL:s[0-9]+]], 0
; GCN: v_cmp_gt_i64_e64
; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]

; Fall-through to the else
; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
; GCN: s_mov_b32 [[S_VAL]], 1

; GCN: [[IF_LABEL]]:
; GCN: buffer_store_dword [[STORE_VAL]]
; GCN: v_mov_b32_e32 [[V_VAL]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp sgt i64 %cond, 0
Expand Down
4 changes: 1 addition & 3 deletions test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,11 @@
; CHECK: s_and_saveexec_b64
; CHECK-NEXT: s_xor_b64
; CHECK-NEXT: ; mask branch
; CHECK-NEXT: s_cbranch_execz

; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader

; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
; CHECK: s_and_b64 vcc, exec, vcc
; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]]
; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]]

; CHECK: s_endpgm
define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
Expand Down
5 changes: 2 additions & 3 deletions test/CodeGen/AMDGPU/valu-i1.ll
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,8 @@ exit:
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: v_cmp_eq_u32_e32 vcc,
; SI-DAG: s_and_b64 vcc, exec, vcc
; SI: s_cbranch_vccz [[LABEL_LOOP]]
; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
; SI: s_cbranch_scc0 [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm

Expand Down

0 comments on commit d39b310

Please sign in to comment.