From b6c8ff93debe76f89b1252e3475f2086680caea1 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak
Date: Mon, 10 Feb 2020 15:25:09 +0100
Subject: [PATCH] Revert "AMDGPU: Fix handling of infinite loops in fragment
 shaders"

This reverts commit 87d98c149504f9b0751189744472d7cc94883960.
---
 .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp  | 79 ++-----------------
 .../test/CodeGen/AMDGPU/kill-infinite-loop.ll | 68 ----------------
 2 files changed, 6 insertions(+), 141 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 01bb60f07f2ecf..191f603a66d6af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -34,7 +34,6 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -118,58 +117,24 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
-      }
-    }
-  }
-}
-
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
-                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
-  IRBuilder<> B(NewRetBlock);
-
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
-
-    Value *Undef = UndefValue::get(B.getFloatTy());
-    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
-                      {
-                        B.getInt32(9), // target, SQ_EXP_NULL
-                        B.getInt32(0), // enabled channels
-                        Undef, Undef, Undef, Undef, // values
-                        B.getTrue(), // done
-                        B.getTrue(), // valid mask
-                      });
-  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    B.CreateRetVoid();
+    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
-                     "UnifiedRetVal");
-    assert(!InsertExport);
-    B.CreateRet(PN);
+    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+                         "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
@@ -208,8 +173,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
-
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -225,36 +188,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                            "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
-        // For pixel shaders, the producer guarantees that an export is
-        // executed before each return instruction. However, if there is an
-        // infinite loop and we insert a return ourselves, we need to uphold
-        // that guarantee by inserting a null export. This can happen e.g. in
-        // an infinite loop with kill instructions, which is supposed to
-        // terminate. However, we don't need to do this if there is a non-void
-        // return value, since then there is an epilog afterwards which will
-        // still export.
-        //
-        // Note: In the case where only some threads enter the infinite loop,
-        // this can result in the null export happening redundantly after the
-        // original exports. However, The last "real" export happens after all
-        // the threads that didn't enter an infinite loop converged, which
-        // means that the only extra threads to execute the null export are
-        // threads that entered the infinite loop, and they only could've
-        // exited through being killed which sets their exec bit to 0.
-        // Therefore, unless there's an actual infinite loop, which can have
-        // invalid results, or there's a kill after the last export, which we
-        // assume the frontend won't do, this export will have the same exec
-        // mask as the last "real" export, and therefore the valid mask will be
-        // overwritten with the same value and will still be correct. Also,
-        // even though this forces an extra unnecessary export wait, we assume
-        // that this happens rare enough in practice to that we don't have to
-        // worry about performance.
-        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
-            RetTy->isVoidTy()) {
-          InsertExport = true;
-        }
-
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
@@ -327,6 +260,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI =
       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
deleted file mode 100644
index 30280b967ad8d7..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
-; Although it's modeled without any control flow in order to get better code
-; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
-; it with "true". In case it's called in a provably infinite loop, we still
-; need to successfully exit and export something, even if we can't know where
-; to jump to in the LLVM IR. Therefore we insert a null export ourselves in
-; this case right before the s_endpgm to avoid GPU hangs, which is what this
-; tests.
-
-; CHECK-LABEL: return_void
-; Make sure that we remove the done bit from the original export
-; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @return_void(float %0) #0 {
-main_body:
-  %cmp = fcmp olt float %0, 1.000000e+01
-  br i1 %cmp, label %end, label %loop
-
-loop:
-  call void @llvm.amdgcn.kill(i1 false) #3
-  br label %loop
-
-end:
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0., float 0., float 0., float 1., i1 true, i1 true) #3
-  ret void
-}
-
-; Check that we also remove the done bit from compressed exports correctly.
-; CHECK-LABEL: return_void_compr
-; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off compr vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @return_void_compr(float %0) #0 {
-main_body:
-  %cmp = fcmp olt float %0, 1.000000e+01
-  br i1 %cmp, label %end, label %loop
-
-loop:
-  call void @llvm.amdgcn.kill(i1 false) #3
-  br label %loop
-
-end:
-  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 5, <2 x i16> < i16 0, i16 0 >, <2 x i16> < i16 0, i16 0 >, i1 true, i1 true) #3
-  ret void
-}
-
-; In case there's an epilog, we shouldn't have to do this.
-; CHECK-LABEL: return_nonvoid
-; CHECK-NOT: exp null off, off, off, off done vm
-define amdgpu_ps float @return_nonvoid(float %0) #0 {
-main_body:
-  %cmp = fcmp olt float %0, 1.000000e+01
-  br i1 %cmp, label %end, label %loop
-
-loop:
-  call void @llvm.amdgcn.kill(i1 false) #3
-  br label %loop
-
-end:
-  ret float 0.
-}
-
-declare void @llvm.amdgcn.kill(i1) #0
-declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
-declare void @llvm.amdgcn.exp.compr.v2i16(i32 immarg, i32 immarg, <2 x i16>, <2 x i16>, i1 immarg, i1 immarg) #0
-
-attributes #0 = { nounwind }