[NVPTX] Improve copy avoidance during lowering. #106423
@llvm/pr-subscribers-backend-nvptx @llvm/pr-subscribers-llvm-analysis

Author: Artem Belevich (Artem-B)

Changes

On newer GPUs, where the cvta.param instruction is available, we can avoid making copies of byval arguments when their pointers are used in a few more cases, even when __grid_constant__ is not specified:
- phi
- select
- memcpy from the parameter

Switched pointer traversal from a DIY implementation to PtrUseVisitor. Note: includes PtrUseVisitor patch #106308, which will land separately.

Patch is 77.94 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/106423.diff

6 Files Affected:
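Before the diff, a rough editorial sketch, in plain C++ with invented names, of the three-way decision the updated handleByValParam makes for each byval argument. This summarizes the logic visible in the diff below; it is not code from the patch:

// Simplified mirror of the decision logic in handleByValParam.
// All names here are illustrative, not from the patch itself.
enum class ByvalLowering {
  DirectParamAccess, // read-only, no phi/select: rewrite uses into param AS
  CvtaParamPointer,  // sm_70+/ptx77: take a generic pointer via cvta.param
  LocalCopy          // fall back to an alloca and a copy-in
};

ByvalLowering classify(bool Escaped, bool Aborted, bool HasConditionals,
                       bool HasCvtaParam, bool IsGridConstant) {
  const bool ReadOnly = !Escaped && !Aborted;
  if (ReadOnly && !HasConditionals)
    return ByvalLowering::DirectParamAccess;
  if (HasCvtaParam && (ReadOnly || IsGridConstant))
    return ByvalLowering::CvtaParamPointer;
  return ByvalLowering::LocalCopy;
}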
diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index b6cc14d2077af0..539d302bb70a1b 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -157,7 +157,7 @@ class PtrUseVisitorBase {
///
/// This will visit the users with the same offset of the current visit
/// (including an unknown offset if that is the current state).
- void enqueueUsers(Instruction &I);
+ void enqueueUsers(Value &I);
/// Walk the operands of a GEP and adjust the offset as appropriate.
///
@@ -208,11 +208,14 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
/// Recursively visit the uses of the given pointer.
/// \returns An info struct about the pointer. See \c PtrInfo for details.
- PtrInfo visitPtr(Instruction &I) {
+  /// We may also need to process Argument pointers, so the input is the
+  /// common Value type.
+ PtrInfo visitPtr(Value &I) {
// This must be a pointer type. Get an integer type suitable to hold
// offsets on this pointer.
// FIXME: Support a vector of pointers.
assert(I.getType()->isPointerTy());
+ assert(isa<Instruction>(I) || isa<Argument>(I));
IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(I.getType()));
IsOffsetKnown = true;
Offset = APInt(IntIdxTy->getBitWidth(), 0);
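With visitPtr relaxed to accept a Value &, a visitor can start the traversal at a function Argument instead of only at an Instruction. A minimal sketch of that pattern (hypothetical EscapeFinder name; assumes LLVM headers; modeled on the ArgUseChecker added further down):

#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"

// Illustrative only: flags an argument whose pointer value is itself
// stored somewhere (storing *through* the pointer is a different case).
struct EscapeFinder : public llvm::PtrUseVisitor<EscapeFinder> {
  explicit EscapeFinder(const llvm::DataLayout &DL) : PtrUseVisitor(DL) {}
  void visitStoreInst(llvm::StoreInst &SI) {
    if (U->get() == SI.getValueOperand())
      PI.setEscapedAndAborted(&SI);
  }
};

bool argEscapes(llvm::Argument &A, const llvm::DataLayout &DL) {
  EscapeFinder EF(DL); // A must have a pointer type
  return EF.visitPtr(A).isEscaped(); // now legal to pass an Argument
}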
diff --git a/llvm/lib/Analysis/PtrUseVisitor.cpp b/llvm/lib/Analysis/PtrUseVisitor.cpp
index 49304818d7efed..9c79546f491eff 100644
--- a/llvm/lib/Analysis/PtrUseVisitor.cpp
+++ b/llvm/lib/Analysis/PtrUseVisitor.cpp
@@ -17,7 +17,7 @@
using namespace llvm;
-void detail::PtrUseVisitorBase::enqueueUsers(Instruction &I) {
+void detail::PtrUseVisitorBase::enqueueUsers(Value &I) {
for (Use &U : I.uses()) {
if (VisitedUses.insert(&U).second) {
UseToVisit NewU = {
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 1205ad4c6b008f..243f39d8a16719 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -79,15 +79,15 @@
//
// define void @foo({i32*, i32*}* byval %input) {
-// %b_param = addrspacecast ptr %input to ptr addrspace(101)
-// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
-// %b = load ptr, ptr addrspace(101) %b_ptr
-// %b_global = addrspacecast ptr %b to ptr addrspace(1)
-// ; use %b_generic
+// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0,
+// i32 1 %b = load ptr, ptr addrspace(101) %b_ptr %b_global = addrspacecast
+// ptr %b to ptr addrspace(1) ; use %b_generic
// }
//
-// Create a local copy of kernel byval parameters used in a way that *might* mutate
-// the parameter, by storing it in an alloca. Mutations to "grid_constant" parameters
-// are undefined behaviour, and don't require local copies.
+// Create a local copy of kernel byval parameters used in a way that *might*
+// mutate the parameter, by storing it in an alloca. Mutations to
+// "grid_constant" parameters are undefined behaviour, and don't require
+// local copies.
//
// define void @foo(ptr byval(%struct.s) align 4 %input) {
// store i32 42, ptr %input
@@ -124,11 +124,11 @@
//
// define void @foo(ptr byval(%struct.s) %input) {
// %input1 = addrspacecast ptr %input to ptr addrspace(101)
-// ; the following intrinsic converts pointer to generic. We don't use an addrspacecast
-// ; to prevent generic -> param -> generic from getting cancelled out
-// %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
-// %call = call i32 @escape(ptr %input1.gen)
-// ret void
+// ; the following intrinsic converts pointer to generic. We don't use an
+// addrspacecast ; to prevent generic -> param -> generic from getting
+// cancelled out %input1.gen = call ptr
+// @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1) %call =
+// call i32 @escape(ptr %input1.gen) ret void
// }
//
// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
@@ -139,16 +139,21 @@
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include <numeric>
#include <queue>
@@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
// pointer in parameter AS.
// For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
// generic using cvta.param.
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
+ bool IsGridConstant) {
Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
assert(I && "OldUse must be in an instruction");
struct IP {
@@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
SmallVector<Instruction *> InstructionsToDelete;
- auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
+ auto CloneInstInParamAS = [HasCvtaParam,
+ IsGridConstant](const IP &I) -> Value * {
if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
LI->setOperand(0, I.NewParam);
return LI;
@@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
// Just pass through the argument, the old ASC is no longer needed.
return I.NewParam;
}
+ if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
+ if (MI->getRawSource() == I.OldUse->get()) {
+ // convert to memcpy/memmove from param space.
+ IRBuilder<> Builder(I.OldInstruction);
+ Intrinsic::ID ID = MI->getIntrinsicID();
+
+ CallInst *B = Builder.CreateMemTransferInst(
+ ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
+ MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+ for (unsigned I : {0, 1})
+ if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
+ B->addDereferenceableParamAttr(I, Bytes);
+ return B;
+ }
+ // We may be able to handle other cases if the argument is
+ // __grid_constant__
+ }
- if (GridConstant) {
+ if (HasCvtaParam) {
auto GetParamAddrCastToGeneric =
[](Value *Addr, Instruction *OriginalUser) -> Value * {
PointerType *ReturnTy =
@@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
OriginalUser->getIterator());
return CvtToGenCall;
};
-
- if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
- I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
- return CI;
+ auto *ParamInGenericAS =
+ GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
+
+ // phi/select could use generic arg pointers w/o __grid_constant__
+ if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
+ for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+ if (V.get() == I.OldUse->get())
+ PHI->setIncomingValue(Idx, ParamInGenericAS);
+ }
}
- if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
- // byval address is being stored, cast it to generic
- if (SI->getValueOperand() == I.OldUse->get())
- SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
- return SI;
+ if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
+ if (SI->getTrueValue() == I.OldUse->get())
+ SI->setTrueValue(ParamInGenericAS);
+ if (SI->getFalseValue() == I.OldUse->get())
+ SI->setFalseValue(ParamInGenericAS);
}
- if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
- if (PI->getPointerOperand() == I.OldUse->get())
- PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
- return PI;
+
+ // Escapes or writes can only use generic param pointers if
+ // __grid_constant__ is in effect.
+ if (IsGridConstant) {
+ if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+ I.OldUse->set(ParamInGenericAS);
+ return CI;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+ // byval address is being stored, cast it to generic
+ if (SI->getValueOperand() == I.OldUse->get())
+ SI->setOperand(0, ParamInGenericAS);
+ return SI;
+ }
+ if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+ if (PI->getPointerOperand() == I.OldUse->get())
+ PI->setOperand(0, ParamInGenericAS);
+ return PI;
+ }
+      // TODO: If we allow stores, we should allow memcpy/memset to
+ // parameter, too.
}
- llvm_unreachable(
- "Instruction unsupported even for grid_constant argument");
}
llvm_unreachable("Unsupported instruction");
@@ -409,49 +453,121 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
}
}
+namespace {
+struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
+ using Base = PtrUseVisitor<ArgUseChecker>;
+
+ bool IsGridConstant;
+ SmallPtrSet<Value *, 16> AllArgUsers;
+ // Set of phi/select instructions using the Arg
+ SmallPtrSet<Instruction *, 4> Conditionals;
+
+ ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
+ : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
+
+ PtrInfo visitArgPtr(Argument &A) {
+ assert(A.getType()->isPointerTy());
+ IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
+ IsOffsetKnown = false;
+ Offset = APInt(IntIdxTy->getBitWidth(), 0);
+ PI.reset();
+ AllArgUsers.clear();
+ Conditionals.clear();
+
+ LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
+ // Enqueue the uses of this pointer.
+ enqueueUsers(A);
+ AllArgUsers.insert(&A);
+
+ // Visit all the uses off the worklist until it is empty.
+    // Note that unlike PtrUseVisitor, we intentionally don't track offsets.
+ // We're only interested in how we use the pointer.
+ while (!(Worklist.empty() || PI.isAborted())) {
+ UseToVisit ToVisit = Worklist.pop_back_val();
+ U = ToVisit.UseAndIsOffsetKnown.getPointer();
+ Instruction *I = cast<Instruction>(U->getUser());
+ AllArgUsers.insert(I);
+ if (isa<PHINode>(I) || isa<SelectInst>(I))
+ Conditionals.insert(I);
+ LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
+ Base::visit(I);
+ }
+ if (PI.isEscaped())
+ LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
+ << "\n");
+ else if (PI.isAborted())
+ LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Traversed " << AllArgUsers.size() << " with "
+ << Conditionals.size() << " conditionals\n");
+ return PI;
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ // Storing the pointer escapes it.
+ if (U->get() == SI.getValueOperand())
+ return PI.setEscapedAndAborted(&SI);
+    // Writes to the pointer are UB w/ __grid_constant__, but do not force a
+ // copy.
+ if (!IsGridConstant)
+ return PI.setAborted(&SI);
+ }
+
+ void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+ // ASC to param space are no-ops and do not need a copy
+ if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
+ return PI.setEscapedAndAborted(&ASC);
+ Base::visitAddrSpaceCastInst(ASC);
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &I) {
+ if (IsGridConstant)
+ return;
+ Base::visitPtrToIntInst(I);
+ }
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ }
+ // PHI and select just pass through the pointers.
+ void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
+ void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ if (*U == II.getRawDest() && !IsGridConstant)
+ PI.setAborted(&II);
+
+ // TODO: memcpy from arg is OK as it can get unrolled into ld.param.
+ // However, memcpys are currently expected to be unrolled before we
+ // get here, so we never see them in practice, and we do not currently
+ // handle them when we convert IR to access param space directly. So,
+ // we'll mark it as an escape for now. It would still force a copy on
+ // pre-sm_70 GPUs where we can't take address of a parameter w/o a copy.
+ //
+ // PI.setEscaped(&II);
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ if (*U == II.getRawDest() && !IsGridConstant)
+ PI.setAborted(&II);
+ }
+ // debug only helper.
+ auto &getVisitedUses() { return VisitedUses; }
+};
+} // namespace
void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Argument *Arg) {
- bool IsGridConstant = isParamGridConstant(*Arg);
Function *Func = Arg->getParent();
+ bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+ bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+ const DataLayout &DL = Func->getDataLayout();
BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
Type *StructType = Arg->getParamByValType();
assert(StructType && "Missing byval type");
- auto AreSupportedUsers = [&](Value *Start) {
- SmallVector<Value *, 16> ValuesToCheck = {Start};
- auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
- if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
- return true;
- // ASC to param space are OK, too -- we'll just strip them.
- if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
- if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
- return true;
- }
- // Simple calls and stores are supported for grid_constants
- // writes to these pointers are undefined behaviour
- if (IsGridConstant &&
- (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
- return true;
- return false;
- };
-
- while (!ValuesToCheck.empty()) {
- Value *V = ValuesToCheck.pop_back_val();
- if (!IsSupportedUse(V)) {
- LLVM_DEBUG(dbgs() << "Need a "
- << (isParamGridConstant(*Arg) ? "cast " : "copy ")
- << "of " << *Arg << " because of " << *V << "\n");
- (void)Arg;
- return false;
- }
- if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
- !isa<PtrToIntInst>(V))
- llvm::append_range(ValuesToCheck, V->users());
- }
- return true;
- };
-
- if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
+ ArgUseChecker AUC(DL, IsGridConstant);
+ ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+ // Easy case, accessing parameter directly is fine.
+ if (!(PI.isEscaped() || PI.isAborted()) && AUC.Conditionals.empty()) {
// Convert all loads and intermediate operations to use parameter AS and
// skip creation of a local copy of the argument.
SmallVector<Use *, 16> UsesToUpdate;
@@ -462,7 +578,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
FirstInst);
for (Use *U : UsesToUpdate)
- convertToParamAS(U, ArgInParamAS, IsGridConstant);
+ convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
const auto *TLI =
@@ -473,13 +589,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
return;
}
- const DataLayout &DL = Func->getDataLayout();
+  // We can't access the byval arg directly and need a pointer. On sm_70+ we
+  // can take a pointer to the argument without making a local copy.
+ // However, we're still not allowed to write to it. If the user specified
+ // `__grid_constant__` for the argument, we'll consider escaped pointer as
+ // read-only.
unsigned AS = DL.getAllocaAddrSpace();
- if (isParamGridConstant(*Arg)) {
- // Writes to a grid constant are undefined behaviour. We do not need a
- // temporary copy. When a pointer might have escaped, conservatively replace
- // all of its uses (which might include a device function call) with a cast
- // to the generic address space.
+ if (HasCvtaParam && (!(PI.isEscaped() || PI.isAborted()) || IsGridConstant)) {
+ LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
+ // Replace all argument pointer uses (which might include a device function
+ // call) with a cast to the generic address space using cvta.param
+ // instruction, which avoids a local copy.
IRBuilder<> IRB(&Func->getEntryBlock().front());
// Cast argument to param address space
@@ -500,6 +620,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
// Do not replace Arg in the cast to param space
CastToParam->setOperand(0, Arg);
} else {
+ LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
// Otherwise we have to create a temporary copy.
AllocaInst *AllocA =
new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
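One of the new cases handled above is a memcpy/memmove whose source is the byval argument: the transfer is recreated to read straight from param space. A condensed sketch of that step (drops the dereferenceable-attribute copying shown in the hunk; rewriteTransferFromParam is an invented name):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"

// Rebuild the transfer with the param-space pointer as its source.
llvm::CallInst *rewriteTransferFromParam(llvm::MemTransferInst *MI,
                                         llvm::Value *NewParam) {
  llvm::IRBuilder<> Builder(MI);
  return Builder.CreateMemTransferInst(
      MI->getIntrinsicID(), MI->getRawDest(), MI->getDestAlign(), NewParam,
      MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
}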
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e47050734aae1e..38b5ee63f4b2b0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -93,6 +93,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
+ bool hasCvtaParam() const {
+ return SmVersion >= 70 && PTXVersion >= 77;
+ }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// GPUs with "a" suffix include architecture-accelerated features that
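This predicate is what gates the no-copy paths in handleByValParam above. A sketch of the query (assumes NVPTX backend internals are in scope; canUseCvtaParam is an invented name):

#include "NVPTXTargetMachine.h"
#include "llvm/IR/Function.h"

// cvta.param needs sm_70+ and PTX 7.7+; without it, an escaping byval
// pointer still forces a local copy.
static bool canUseCvtaParam(const NVPTXTargetMachine &TM,
                            const llvm::Function &F) {
  return TM.getSubtargetImpl(F)->hasCvtaParam();
}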
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index f6db9c429dba57..176dfee11cfb09 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -1,18 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
-; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
+; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
; PTX-LABEL: grid_const_int(
-; PTX-NOT: ld.u32
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0];
-;
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u64 %rd1, [grid_const_int_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_int_param_1];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_int_param_0];
+; PTX-NEXT: add.s32 %r3, %r2, %r1;
+; PTX-NEXT: st.global.u32 [%rd2], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_int(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) {
-; OPT-NOT: alloca
-; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPU...
[truncated]
You can test this locally with the following command:

git-clang-format --diff f00c946c2da0caf6da4a49e87ac905a8b1d2e8b6 909970a7222f23e15d416271e4304309d7f1b3a8 --extensions h,cpp -- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp llvm/lib/Target/NVPTX/NVPTXSubtarget.h

View the diff from clang-format here:

diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 130075f22f..7b47190e93 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -554,7 +554,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
ArgUseChecker AUC(DL, IsGridConstant);
ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
- bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
+ bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
// Easy case, accessing parameter directly is fine.
if (ArgUseIsReadOnly && AUC.Conditionals.empty()) {
// Convert all loads and intermediate operations to use parameter AS and
I will be adding more test cases with select/phi. Stay tuned.
Adding @fiigii, as I am not sure about the handling of Phi/Select.
Force-pushed from 552e5c3 to d207ec4
@fiigii:
I was under the impression that it's the conversion that requires address boundary checks and, possibly, some math to shift the address into the generic address space, but then the SASS-level …
Force-pushed from 94e43e0 to 9277c3f
On newer GPUs, where the cvta.param instruction is available, we can avoid making copies of byval arguments when their pointers are used in a few more cases, even when __grid_constant__ is not specified:
- phi
- select
- memcpy from the parameter
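A condensed sketch of how the phi/select cases get rewritten, mirroring the convertToParamAS hunk in the diff (names and includes assumed; not a drop-in copy of the patch):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"

// Swap the old param-pointer operand(s) for the generic-AS equivalent
// produced via cvta.param; illustrative condensation of the patch logic.
void rewritePhiOrSelect(llvm::Instruction *I, llvm::Value *OldPtr,
                        llvm::Value *ParamInGenericAS) {
  if (auto *PHI = llvm::dyn_cast<llvm::PHINode>(I)) {
    for (auto [Idx, V] : llvm::enumerate(PHI->incoming_values()))
      if (V.get() == OldPtr)
        PHI->setIncomingValue(Idx, ParamInGenericAS);
  } else if (auto *SI = llvm::dyn_cast<llvm::SelectInst>(I)) {
    if (SI->getTrueValue() == OldPtr)
      SI->setTrueValue(ParamInGenericAS);
    if (SI->getFalseValue() == OldPtr)
      SI->setFalseValue(ParamInGenericAS);
  }
}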
Force-pushed from 9277c3f to 909970a
The patch appears to work fine on our tests. So, unless there are further comments/objections, I'll land it tomorrow.
lgtm