Skip to content

Allow fasttail calls for enregisterable structs #6

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions src/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -8259,13 +8259,20 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
bool compPublishStubParam : 1; // EAX captured in prolog will be available through an intrinsic
bool compRetBuffDefStack : 1; // The ret buff argument definitely points into the stack.

var_types compRetType; // Return type of the method as declared in IL
var_types compRetNativeType; // Normalized return type as per target arch ABI
unsigned compILargsCount; // Number of arguments (incl. implicit but not hidden)
unsigned compArgsCount; // Number of arguments (incl. implicit and hidden)
unsigned compRetBuffArg; // position of hidden return param var (0, 1) (BAD_VAR_NUM means not present);
int compTypeCtxtArg; // position of hidden param for type context for generic code (CORINFO_CALLCONV_PARAMTYPE)
unsigned compThisArg; // position of implicit this pointer param (not to be confused with lvaArg0Var)
var_types compRetType; // Return type of the method as declared in IL
var_types compRetNativeType; // Normalized return type as per target arch ABI
unsigned compILargsCount; // Number of arguments (incl. implicit but not hidden)
unsigned compArgsCount; // Number of arguments (incl. implicit and hidden)

#if defined(FEATURE_MULTIREG_ARGS) && defined(FEATURE_FASTTAILCALL)
unsigned compArgRegCount; // Number of incoming integer args
unsigned compFloatArgRegCount; // Number of incoming floating point args
size_t compStackSize; // Incoming stack size
#endif // defined(FEATURE_MULTIREG_ARGS) && defined(FEATURE_FASTTAILCALL)

unsigned compRetBuffArg; // position of hidden return param var (0, 1) (BAD_VAR_NUM means not present);
int compTypeCtxtArg; // position of hidden param for type context for generic code (CORINFO_CALLCONV_PARAMTYPE)
unsigned compThisArg; // position of implicit this pointer param (not to be confused with lvaArg0Var)
unsigned compILlocalsCount; // Number of vars : args + locals (incl. implicit but not hidden)
unsigned compLocalsCount; // Number of vars : args + locals (incl. implicit and hidden)
unsigned compMaxStack;
Expand Down
73 changes: 69 additions & 4 deletions src/jit/lclvars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,53 @@ void Compiler::lvaInitTypeRef()

lvaInitArgs(&varDscInfo);

#if FEATURE_FASTTAILCALL

//-------------------------------------------------------------------------
// Calculate the argument register usage.
//
// This will later be used for fastTailCall determination
//-------------------------------------------------------------------------

unsigned argRegCount = 0;
unsigned floatingRegCount = 0;
unsigned stackArgCount = 0;
size_t stackSize = 0;
unsigned compArgCount = info.compArgsCount;

auto incrementRegCount = [&floatingRegCount, &argRegCount](LclVarDsc* varDsc)
{
varDsc->IsFloatRegType() ? ++floatingRegCount : ++argRegCount;
};

unsigned argNum;
LclVarDsc* curDsc;

for (curDsc = lvaTable, argNum = 0; argNum < varDscInfo.varNum; argNum++, curDsc++)
{
if (curDsc->lvIsRegArg)
{
incrementRegCount(curDsc);

#if defined(FEATURE_MULTIREG_ARGS) && defined(UNIX_AMD64_ABI)
if (curDsc->lvOtherArgReg != REG_NA)
{
incrementRegCount(curDsc);
}
#endif // defined(FEATURE_MULTIREG_ARGS) && defined(UNIX_AMD64_ABI)
}
else if (varTypeIsStruct(curDsc))
{
stackSize += curDsc->lvSize();
}
else
{
stackSize += TARGET_POINTER_SIZE;
}
}

#endif // FEATURE_FASTTAILCALL

//-------------------------------------------------------------------------
// Finally the local variables
//-------------------------------------------------------------------------
Expand All @@ -247,21 +294,35 @@ void Compiler::lvaInitTypeRef()
i++, varNum++, varDsc++, localsSig = info.compCompHnd->getArgNext(localsSig))
{
CORINFO_CLASS_HANDLE typeHnd;
CorInfoTypeWithMod corInfoType =
CorInfoTypeWithMod corInfoTypeWithMod =
info.compCompHnd->getArgType(&info.compMethodInfo->locals, localsSig, &typeHnd);
CorInfoType corInfoType = strip(corInfoTypeWithMod);

lvaInitVarDsc(varDsc, varNum, strip(corInfoType), typeHnd, localsSig, &info.compMethodInfo->locals);
lvaInitVarDsc(varDsc, varNum, corInfoType, typeHnd, localsSig, &info.compMethodInfo->locals);

varDsc->lvPinned = ((corInfoType & CORINFO_TYPE_MOD_PINNED) != 0);
varDsc->lvPinned = ((corInfoTypeWithMod & CORINFO_TYPE_MOD_PINNED) != 0);
varDsc->lvOnFrame = true; // The final home for this local variable might be our local stack frame

if (strip(corInfoType) == CORINFO_TYPE_CLASS)
if (corInfoType == CORINFO_TYPE_CLASS)
{
CORINFO_CLASS_HANDLE clsHnd = info.compCompHnd->getArgClass(&info.compMethodInfo->locals, localsSig);
lvaSetClass(varNum, clsHnd);
}
}

#if FEATURE_FASTTAILCALL
//-------------------------------------------------------------------------
// Save the register usage information and stack size.
//-------------------------------------------------------------------------

stackSize += stackArgCount * REGSIZE_BYTES;

info.compArgRegCount = argRegCount;
info.compFloatArgRegCount = floatingRegCount;
info.compStackSize = stackSize;

#endif //FEATURE_FASTTAILCALL

if ( // If there already exist unsafe buffers, don't mark more structs as unsafe
// as that will cause them to be placed along with the real unsafe buffers,
// unnecessarily exposing them to overruns. This can affect GS tests which
Expand Down Expand Up @@ -1253,6 +1314,10 @@ void Compiler::lvaInitVarDsc(LclVarDsc* varDsc,
#ifdef DEBUG
varDsc->lvStkOffs = BAD_STK_OFFS;
#endif

#if defined(FEATURE_MULTIREG_ARGS) && !defined(WINDOWS_AMD64_ABI)
varDsc->lvOtherArgReg = REG_NA;
#endif // defined(FEATURE_MULTIREG_ARGS) && !defined(WINDOWS_AMD64_ABI)
}

/*****************************************************************************
Expand Down
196 changes: 174 additions & 22 deletions src/jit/morph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6934,6 +6934,8 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
}
#endif

unsigned nCallerArgs = info.compArgsCount;

// Note on vararg methods:
// If the caller is vararg method, we don't know the number of arguments passed by caller's caller.
// But we can be sure that in-coming arg area of vararg caller would be sufficient to hold its
Expand All @@ -6943,20 +6945,34 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
// Note that callee being a vararg method is not a problem since we can account the params being passed.

// Count of caller args including implicit and hidden (i.e. thisPtr, RetBuf, GenericContext, VarargCookie)
unsigned nCallerArgs = info.compArgsCount;
size_t callerArgRegCount = info.compArgRegCount;
size_t callerFloatArgRegCount = info.compFloatArgRegCount;

// TODO-Linux-x64
// TODO-ARM64
//
// Currently we can track the caller's inbound stack size; however, we cannot
// easily determine the caller's outbound stack size (the callee's inbound stack
// size). This information is computed in fgMorphArgs which currently is
// dependent on the canFastTailCall decision.
//
// Note that we can get around this by excluding all structs which cannot
// be enregistered.

// Count the callee args including implicit and hidden.
// Note that GenericContext and VarargCookie are added by importer while
// importing the call to gtCallArgs list along with explicit user args.
unsigned nCalleeArgs = 0;
size_t calleeArgRegCount = 0;
size_t calleeFloatArgRegCount = 0;

if (callee->gtCallObjp) // thisPtr
{
nCalleeArgs++;
++calleeArgRegCount;
}

if (callee->HasRetBufArg()) // RetBuf
{
nCalleeArgs++;
++calleeArgRegCount;

// If callee has RetBuf param, caller too must have it.
// Otherwise go the slow route.
Expand All @@ -6971,10 +6987,11 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
// non-standard and secret params passed in registers (e.g. R10, R11) since
// these won't contribute to out-going arg size.
bool hasMultiByteArgs = false;
bool hasTwoSlotSizedStruct = false;
size_t nCalleeArgs = calleeArgRegCount; // Keep track of how many args we have.
for (GenTreePtr args = callee->gtCallArgs; (args != nullptr) && !hasMultiByteArgs; args = args->gtOp.gtOp2)
{
nCalleeArgs++;

++nCalleeArgs;
assert(args->OperIsList());
GenTreePtr argx = args->gtOp.gtOp1;

Expand All @@ -7001,24 +7018,85 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
{
#if defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_)

// hasMultiByteArgs will determine if the struct can be passed
// in registers. If it cannot we will break the loop and not
// fastTailCall.
unsigned typeSize = 0;
hasMultiByteArgs = !VarTypeIsMultiByteAndCanEnreg(argx->TypeGet(), objClass, &typeSize, false);

#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) || defined(_TARGET_ARM64_)
// On System V/arm64 the args could be a 2 eightbyte struct that is passed in two registers.
// Account for the second eightbyte in the nCalleeArgs.
// https://github.com/dotnet/coreclr/issues/2666
// TODO-CQ-Amd64-Unix/arm64: Structs of size between 9 to 16 bytes are conservatively estimated
// as two args, since they need two registers whereas nCallerArgs is
// counting such an arg as one. This would mean we will not be optimizing
// certain calls though technically possible.
#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;

assert(objClass != nullptr);
eeGetSystemVAmd64PassStructInRegisterDescriptor(objClass, &structDesc);

if (typeSize > TARGET_POINTER_SIZE)
// TODO. Here we have made the assumption that multibyte struct
// arguments will cause a no fastTailCall decision.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// TODO-AMD64-Linux

if (!structDesc.passedInRegisters)
{
unsigned extraArgRegsToAdd = (typeSize / TARGET_POINTER_SIZE);
nCalleeArgs += extraArgRegsToAdd;
// TODO do not approx callee stack size.
noway_assert(hasMultiByteArgs);
}
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING || _TARGET_ARM64_
else
{
if (structDesc.eightByteCount > 1)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This reads better as:

if (structDesc.eightByteCount == 2)

{
hasTwoSlotSizedStruct = true;
}

for (unsigned int i = 0; i < structDesc.eightByteCount; i++)
{
if (structDesc.IsIntegralSlot(i))
{
++calleeArgRegCount;
}
else if (structDesc.IsSseSlot(i))
{
++calleeFloatArgRegCount;
}
else
{
assert(false && "Invalid eightbyte classification type.");
break;
}
}
}

#elif defined(_TARGET_ARM64_) // ARM64
var_types hfaType = GetHfaType(argx);
bool isHfaArg = varTypeIsFloating(hfaType);
size_t size = 1;

if (isHfaArg)
{
size = GetHfaCount(argx);
}
else
{
// Structs are either passed in 1 or 2 (64-bit) slots
size_t roundupSize = roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd),
TARGET_POINTER_SIZE);
size = roundupSize / TARGET_POINTER_SIZE;

if (size > 2)
{
// TODO do not approx callee stack size.
noway_assert(hasMultiByteArgs);
}

else if (size == 2)
{
hasTwoSlotSizedStruct = true;
}
}

calleeArgRegCount += size;

#elif defined(WINDOWS_AMD64_ABI)

++calleeArgRegCount;

#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING

#else
assert(!"Target platform ABI rules regarding passing struct type args in registers");
Expand All @@ -7030,28 +7108,102 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
hasMultiByteArgs = true;
}
}
else
{
varTypeIsFloating(argx) ? ++calleeFloatArgRegCount : ++calleeArgRegCount;
}
}

// Go the slow route, if it has multi-byte params
if (hasMultiByteArgs)
{
JITDUMP("Will not fastTailCall hasMultiByteArgs");
return false;
}

const unsigned maxRegArgs = MAX_REG_ARG;

// If we reached here means that callee has only those argument types which can be passed in
// a register and if passed on stack will occupy exactly one stack slot in out-going arg area.
// If we are passing args on stack for callee and it has more args passed on stack than
// caller, then fast tail call cannot be performed.
// If we are passing args on stack for the callee and it has more args passed on stack than
// the caller, then fast tail call cannot be performed.
//
// Note that the GC'ness of on stack args need not match since the arg setup area is marked
// as non-interruptible for fast tail calls.
if ((nCalleeArgs > MAX_REG_ARG) && (nCallerArgs < nCalleeArgs))

#ifdef WINDOWS_AMD64_ABI
size_t calleeStackSlots = ((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) ? (calleeArgRegCount + calleeFloatArgRegCount) - maxRegArgs : 0;
size_t calleeStackSize = calleeStackSlots * TARGET_POINTER_SIZE;
size_t callerStackSize = info.compStackSize;

// x64 Windows: If we have more callee registers used than MAX_REG_ARG, then
// make sure the callee's incoming arguments is less than the caller's
if ((calleeStackSlots > 0) && (calleeStackSize > callerStackSize))
{
JITDUMP("Will not fastTailCall (calleeStackSlots > 0) && (calleeStackSize > callerStackSize)");
return false;
}

#elif (defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI) || defined(_TARGET_ARM64_))

// For *nix Amd64 and Arm64 check to see if all arguments for the callee
// and caller are passing in registers. If not, ensure that the outgoing argument stack size
// requirement for the callee is less than or equal to the caller's entire stack frame usage.
//
// Also, in the case that we have to pass arguments on the stack make sure
// that we are not dealing with structs that are >8 bytes.

bool hasStackArgs = false;
size_t maxFloatRegArgs = MAX_FLOAT_REG_ARG;

size_t calleeIntStackArgCount = calleeArgRegCount > maxRegArgs ? calleeArgRegCount - maxRegArgs : 0;
size_t calleeFloatStackArgCount = calleeFloatArgRegCount > maxFloatRegArgs ? calleeFloatArgRegCount - maxFloatRegArgs : 0;

size_t calleeStackArgCount = calleeIntStackArgCount + calleeFloatStackArgCount;
size_t callerStackSize = info.compStackSize;
size_t calleeStackSize = calleeStackArgCount * TARGET_POINTER_SIZE;

if (callerStackSize > 0 || calleeStackSize > 0)
{
hasStackArgs = true;
}

// We have a >8 byte struct in the callee and arguments that have to go
// on the stack. Do not fastTailCall.
if (hasStackArgs && hasTwoSlotSizedStruct)
{
JITDUMP("Will not fastTailCall hasStackArgs && hasTwoSlotSizedStruct");
return false;
}

// TODO-Linux-x64
// TODO-ARM64
//
// LowerFastTailCall currently assumes nCalleeArgs == nCallerArgs. This is
// not true in many cases on x64 linux, remove this pessimization when
// LowerFastTailCall is fixed. See https://github.com/dotnet/coreclr/issues/12468
// for more information.
if (hasStackArgs && (nCalleeArgs != nCallerArgs))
{
JITDUMP("Will not fastTailCall hasStackArgs && (nCalleeArgs != nCallerArgs)");
return false;
}

if (calleeStackSize > callerStackSize)
{
JITDUMP("Will not fastTailCall calleeStackArgCount > callerStackArgCount");
return false;
}

return true;
#else

NYI("fastTailCall not supported on this Architecture.");

#endif // WINDOWS_AMD64_ABI

JITDUMP("Will fastTailCall");
return true;
#else // FEATURE_FASTTAILCALL
return false;
#endif
}
Expand Down
Loading