Skip to content

Commit

Permalink
[RISC-V] Initial patch to fix RISCV64 interpreter (#94548)
Browse files Browse the repository at this point in the history
* [RISC-V] Initial patch to fix RISCV64 interpreter

* Code review feedback
  • Loading branch information
JongHeonChoi authored Nov 10, 2023
1 parent 5127e07 commit e7ab2f6
Show file tree
Hide file tree
Showing 5 changed files with 208 additions and 11 deletions.
62 changes: 53 additions & 9 deletions src/coreclr/vm/interpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -911,6 +911,9 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp,
// x8 through x15 are scratch registers on ARM64.
IntReg x8 = IntReg(8);
IntReg x9 = IntReg(9);

#elif defined(HOST_RISCV64)
#else
#error unsupported platform
#endif
}
Expand Down Expand Up @@ -1073,15 +1076,15 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp,
argState.AddArg(vaSigCookieIndex);
}

#if defined(HOST_ARM) || defined(HOST_AMD64) || defined(HOST_ARM64)
#if defined(HOST_ARM) || defined(HOST_AMD64) || defined(HOST_ARM64) || defined(HOST_RISCV64)
// Generics context comes before args on ARM. Would be better if I factored this out as a call,
// to avoid large swatches of duplicate code.
if (hasGenericsContextArg)
{
argPerm[genericsContextArgIndex] = physArgIndex; physArgIndex++;
argState.AddArg(genericsContextArgIndex);
}
#endif // HOST_ARM || HOST_AMD64 || HOST_ARM64
#endif // HOST_ARM || HOST_AMD64 || HOST_ARM64 || HOST_RISCV64

CORINFO_ARG_LIST_HANDLE argPtr = info->args.args;
// Some arguments have been passed in registers, some in memory. We must generate code that
Expand Down Expand Up @@ -1432,7 +1435,7 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp,
sl.X86EmitPopReg(kEBP);
sl.X86EmitReturn(static_cast<WORD>(argState.callerArgStackSlots * sizeof(void*)));
#elif defined(UNIX_AMD64_ABI)
bool hasTowRetSlots = info->args.retType == CORINFO_TYPE_VALUECLASS &&
bool hasTwoRetSlots = info->args.retType == CORINFO_TYPE_VALUECLASS &&
getClassSize(info->args.retTypeClass) == 16;

int fixedTwoSlotSize = 16;
Expand Down Expand Up @@ -1484,7 +1487,7 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp,
sl.X86EmitRegLoad(ARGUMENT_kREG1, reinterpret_cast<UINT_PTR>(interpMethInfo));

sl.X86EmitCall(sl.NewExternalCodeLabel(interpretMethodFunc), 0);
if (hasTowRetSlots) {
if (hasTwoRetSlots) {
sl.X86EmitEspOffset(0x8b, kRAX, 0);
sl.X86EmitEspOffset(0x8b, kRDX, 8);
}
Expand Down Expand Up @@ -1635,7 +1638,40 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp,
#elif defined(HOST_LOONGARCH64)
assert(!"unimplemented on LOONGARCH yet");
#elif defined(HOST_RISCV64)
assert(!"unimplemented on RISCV64 yet");
bool hasTwoRetSlots = info->args.retType == CORINFO_TYPE_VALUECLASS &&
getClassSize(info->args.retTypeClass) == 16;

UINT stackFrameSize = argState.numFPRegArgSlots;

sl.EmitProlog(argState.numRegArgs, argState.numFPRegArgSlots, hasTwoRetSlots ? 2 * sizeof(void*) : 0);

#if INTERP_ILSTUBS
if (pMD->IsILStub())
{
// Third argument is stubcontext, in t2 (METHODDESC_REGISTER).
sl.EmitMovReg(IntReg(12), IntReg(7));
}
else
#endif
{
// For a non-ILStub method, push NULL as the third StubContext argument.
sl.EmitMovConstant(IntReg(12), 0);
}
// Second arg is pointer to the base of the ILargs arr -- i.e., the current stack value.
sl.EmitAddImm(IntReg(11), RegSp, sl.GetSavedRegArgsOffset());

// First arg is the pointer to the interpMethodInfo structure
sl.EmitMovConstant(IntReg(10), reinterpret_cast<UINT64>(interpMethInfo));

sl.EmitCallLabel(sl.NewExternalCodeLabel((LPVOID)interpretMethodFunc), FALSE, FALSE);
if (hasTwoRetSlots)
{
// TODO: handle return registers to use int or float registers
sl.EmitLoad(IntReg(10), RegSp, 0);
sl.EmitLoad(IntReg(11), RegSp, sizeof(void*));
}

sl.EmitEpilog();
#else
#error unsupported platform
#endif
Expand Down Expand Up @@ -2430,6 +2466,14 @@ void Interpreter::ExecuteMethod(ARG_SLOT* retVal, _Out_ bool* pDoJmpCall, _Out_
//The Fixed Two slot return buffer address
memcpy(m_ilArgs-16, OpStackGet<void*>(0), sz);
}
#elif defined(TARGET_RISCV64)
// Is it a struct contained in two slots?
else if (m_methInfo->m_returnType == CORINFO_TYPE_VALUECLASS
&& sz == 16)
{
//The Fixed Two slot return buffer address
memcpy(m_ilArgs-32, OpStackGet<void*>(0), sz);
}
#endif
else if (CorInfoTypeIsFloatingPoint(m_methInfo->m_returnType) &&
CorInfoTypeIsFloatingPoint(retValIt.ToCorInfoType()))
Expand Down Expand Up @@ -9448,7 +9492,7 @@ void Interpreter::DoCallWork(bool virtualCall, void* thisArg, CORINFO_RESOLVED_T
HFAReturnArgSlots = (HFAReturnArgSlots + sizeof(ARG_SLOT) - 1) / sizeof(ARG_SLOT);
}
}
#elif defined(UNIX_AMD64_ABI)
#elif defined(UNIX_AMD64_ABI) || defined(TARGET_RISCV64)
unsigned HasTwoSlotBuf = sigInfo.retType == CORINFO_TYPE_VALUECLASS &&
getClassSize(sigInfo.retTypeClass) == 16;
#endif
Expand Down Expand Up @@ -9689,7 +9733,7 @@ void Interpreter::DoCallWork(bool virtualCall, void* thisArg, CORINFO_RESOLVED_T
// This is the argument slot that will be used to hold the return value.
// In UNIX_AMD64_ABI, the return type may need two ARG_SLOTs.
ARG_SLOT retVals[2] = {0, 0};
#if !defined(HOST_ARM) && !defined(UNIX_AMD64_ABI)
#if !defined(HOST_ARM) && !defined(UNIX_AMD64_ABI) && !defined(TARGET_RISCV64)
_ASSERTE (NUMBER_RETURNVALUE_SLOTS == 1);
#endif

Expand Down Expand Up @@ -9968,7 +10012,7 @@ void Interpreter::DoCallWork(bool virtualCall, void* thisArg, CORINFO_RESOLVED_T
bool b = CycleTimer::GetThreadCyclesS(&startCycles); _ASSERTE(b);
#endif // INTERP_ILCYCLE_PROFILE

#if defined(UNIX_AMD64_ABI)
#if defined(UNIX_AMD64_ABI) || defined(TARGET_RISCV64)
mdcs.CallTargetWorker(args, retVals, HasTwoSlotBuf ? 16: 8);
#else
mdcs.CallTargetWorker(args, retVals, 8);
Expand Down Expand Up @@ -10114,7 +10158,7 @@ void Interpreter::DoCallWork(bool virtualCall, void* thisArg, CORINFO_RESOLVED_T
{
OpStackSet<INT64>(m_curStackHt, GetSmallStructValue(&smallStructRetVal, retTypeSz));
}
#if defined(UNIX_AMD64_ABI)
#if defined(UNIX_AMD64_ABI) || defined(TARGET_RISCV64)
else if (HasTwoSlotBuf)
{
void* dst = LargeStructOperandStackPush(16);
Expand Down
4 changes: 3 additions & 1 deletion src/coreclr/vm/riscv64/cgencpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,6 @@ class StubLinkerCPU : public StubLinker
void EmitComputedInstantiatingMethodStub(MethodDesc* pSharedMD, struct ShuffleEntry *pShuffleEntryArray, void* extraArg);
#endif // FEATURE_SHARE_GENERIC_CODE

private:
void EmitMovConstant(IntReg target, UINT64 constant);
void EmitJumpRegister(IntReg regTarget);
void EmitMovReg(IntReg dest, IntReg source);
Expand All @@ -380,6 +379,9 @@ class StubLinkerCPU : public StubLinker
void EmitLoad(FloatReg dest, IntReg srcAddr, int offset = 0);
void EmitStore(IntReg src, IntReg destAddr, int offset = 0);
void EmitStore(FloatReg src, IntReg destAddr, int offset = 0);

void EmitProlog(unsigned short cIntRegArgs, unsigned short cFpRegArgs, unsigned short cbStackSpace = 0);
void EmitEpilog();
};

extern "C" void SinglecastDelegateInvokeStub();
Expand Down
110 changes: 110 additions & 0 deletions src/coreclr/vm/riscv64/stubs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,116 @@ void StubLinkerCPU::EmitJumpRegister(IntReg regTarget)
Emit32(0x00000067 | (regTarget << 15));
}

// Emits the stub prolog: allocates the frame, saves FP/RA, sets the frame pointer and spills the
// incoming argument registers to the frame.
//
// Parameters:
//   cIntRegArgs  - number of integer argument registers to spill, starting at x10 (asserted <= 8)
//   cFpRegArgs   - number of floating point argument registers to spill, starting at f10 (asserted <= 8)
//   cbStackSpace - extra bytes to reserve at the low end of the frame; it is grown here by whatever
//                  padding is needed to make the whole frame a multiple of 16 bytes
//
// Throws kNotSupportedException (via COMPlusThrow) when the padded frame would exceed 504 bytes.
// All validation happens before any state is recorded or any instruction is emitted.
void StubLinkerCPU::EmitProlog(unsigned short cIntRegArgs, unsigned short cFpRegArgs, unsigned short cbStackSpace)
{
    _ASSERTE(!m_fProlog);
    _ASSERTE(cIntRegArgs <= 8);
    _ASSERTE(cFpRegArgs <= 8);

    // Do the sizing in size_t. Narrowing to unsigned short before the range check below would let an
    // oversized request wrap around and slip past the limit.
    const size_t numberOfEntriesOnStack = static_cast<size_t>(2) + cIntRegArgs + cFpRegArgs; // 2 for fp, ra
    const size_t cbSavedRegs = numberOfEntriesOnStack * sizeof(void*);

    // Stack needs to be 16 byte aligned. Compute the required padding before saving it
    const size_t cbPaddedFrame = ALIGN_UP(cbStackSpace + cbSavedRegs, 2 * sizeof(void*));

    // The frame is limited to 504 bytes so that SP can be adjusted in a single step and the prolog stays
    // cheaply expressible in unwind codes. Larger allocations would require moving SP in multiple strides,
    // which would complicate prolog, epilog and unwind info generation.
    // NOTE(review): the comment this replaces cited a 4KB range for the SP subtraction; confirm the real
    // immediate range of EmitSubImm on RISC-V before raising this limit.
    _ASSERTE((cbPaddedFrame <= 504) && "NYI:RISCV64 Implement StubLinker prologs with larger than 504 bytes of frame size");
    if (cbPaddedFrame > 504)
        COMPlusThrow(kNotSupportedException);

    // Safe to narrow now: both values are bounded by 504.
    const unsigned short totalPaddedFrameSize = static_cast<unsigned short>(cbPaddedFrame);
    // The padding is going to be applied to the local stack
    cbStackSpace = static_cast<unsigned short>(cbPaddedFrame - cbSavedRegs);

    // Record the parameters of this prolog so that we can generate a matching epilog and unwind info.
    DescribeProlog(cIntRegArgs, cFpRegArgs, cbStackSpace);

    // Here is how the stack would look like (lower addresses are drawn at the top)
    //            [Low Address]
    //            +------------+
    //      SP -> |            | <-+
    //            :            :   | Stack Frame, (i.e outgoing arguments) including padding
    //            |            | <-+
    //            +------------+
    //            |    FP      |
    //            +------------+
    //            |    RA      |
    //            +------------+
    //            |    F10     | <-+
    //            +------------+   |
    //            :            :   | Fp Args
    //            +------------+   |
    //            |    F17     | <-+
    //            +------------+
    //            |    X10     | <-+
    //            +------------+   |
    //            :            :   | Int Args
    //            +------------+   |
    //            |    X17     | <-+
    //            +------------+
    //  Old SP -> |[Stack Args]|
    //            [High Address]

    // Regarding the order of operations in the prolog and epilog:
    // If the prolog and the epilog match each other we can simplify emitting the unwind codes and save a few
    // bytes of unwind codes by making prolog and epilog share the same unwind codes.
    // In order to do that we need to make the epilog be the reverse of the prolog.
    // But we wouldn't want to add restoring of the argument registers as that's completely unnecessary.
    // Besides, saving argument registers cannot be expressed by the unwind code encodings.
    // So, we'll push saving the argument registers to the very last in the prolog, skip restoring it in epilog,
    // and also skip reporting it to the OS.
    //
    // Another bit that we can save is resetting the frame pointer.
    // This is not necessary when the SP doesn't get modified beyond prolog and epilog. (i.e no alloca/localloc)
    // And in that case we don't need to report setting up the FP either.

    // 1. Relocate SP
    EmitSubImm(RegSp, RegSp, totalPaddedFrameSize);

    unsigned cbOffset = 2 * sizeof(void*) + cbStackSpace; // 2 is for fp, ra

    // 2. Store FP/RA
    EmitStore(RegFp, RegSp, cbStackSpace);
    EmitStore(RegRa, RegSp, cbStackSpace + sizeof(void*));

    // 3. Set the frame pointer
    EmitMovReg(RegFp, RegSp);

    // 4. Store floating point argument registers
    for (unsigned short i = 0; i < cFpRegArgs; i++)
        EmitStore(FloatReg(i + 10), RegSp, cbOffset + i * sizeof(void*));

    // 5. Store int argument registers
    cbOffset += cFpRegArgs * sizeof(void*);
    for (unsigned short i = 0; i < cIntRegArgs; i++)
        EmitStore(IntReg(i + 10), RegSp, cbOffset + i * sizeof(void*));
}

// Emits the stub epilog that undoes EmitProlog: reloads FP and RA from the frame, releases the
// frame and jumps to the return address. Must only be called after a prolog has been described.
void StubLinkerCPU::EmitEpilog()
{
    _ASSERTE(m_fProlog);

    // The epilog is the prolog in reverse, minus the steps that do not need undoing:
    //   5/4. Int and floating point argument registers are not restored (they are scratch registers).
    //   3.   SP is not restored from FP; stublinker stubs are assumed not to do alloca.

    // Offsets of the saved FP/RA pair, which sits right above the extra stack space.
    const auto cbSavedFpOffset = m_cbStackSpace;
    const auto cbSavedRaOffset = m_cbStackSpace + sizeof(void*);
    const auto cbWholeFrame = GetStackFrameSize();

    // 2. Restore FP/RA
    EmitLoad(RegFp, RegSp, cbSavedFpOffset);
    EmitLoad(RegRa, RegSp, cbSavedRaOffset);

    // 1. Restore SP
    EmitAddImm(RegSp, RegSp, cbWholeFrame);

    // Return to the caller: jalr x0, 0(ra)
    EmitJumpRegister(RegRa);
}

// Instruction types as per RISC-V Spec, Chapter 24 RV32/64G Instruction Set Listings
static unsigned ITypeInstr(unsigned opcode, unsigned funct3, unsigned rd, unsigned rs1, int imm12)
{
Expand Down
31 changes: 30 additions & 1 deletion src/coreclr/vm/stublink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,12 @@ StubLinker::StubLinker()
m_cbStackFrame = 0;
m_fPushArgRegs = FALSE;
#endif
#ifdef TARGET_RISCV64
m_fProlog = FALSE;
m_cIntRegArgs = 0;
m_cFpRegArgs = 0;
m_cbStackSpace = 0;
#endif
#ifdef STUBLINKER_GENERATES_UNWIND_INFO
#ifdef _DEBUG
m_pUnwindInfoCheckLabel = NULL;
Expand Down Expand Up @@ -1891,7 +1897,30 @@ UINT StubLinker::GetStackFrameSize()
return m_cbStackSpace + (2 + m_cCalleeSavedRegs + m_cIntRegArgs + m_cVecRegArgs)*sizeof(void*);
}

#endif // ifdef TARGET_ARM, elif defined(TARGET_ARM64)
#elif defined(TARGET_RISCV64)
// Records the shape of the prolog being emitted so that a matching epilog can be generated later:
// the number of spilled int and FP argument registers and the extra stack space (including any
// alignment padding the caller folded in). Also marks the prolog as described.
void StubLinker::DescribeProlog(UINT cIntRegArgs, UINT cFpRegArgs, UINT cbStackSpace)
{
    m_cbStackSpace = cbStackSpace;
    m_cFpRegArgs   = cFpRegArgs;
    m_cIntRegArgs  = cIntRegArgs;
    m_fProlog      = TRUE;
}

// Returns the offset, relative to SP, of the area where the prolog saved the argument registers.
// That area is assumed to start right after the extra stack space and the saved FP/RA pair.
// Requires that DescribeProlog has been called.
UINT StubLinker::GetSavedRegArgsOffset()
{
    _ASSERTE(m_fProlog);

    // Two pointer-sized slots: one for FP, one for RA.
    const UINT cbFpRaPair = static_cast<UINT>(2 * sizeof(void*));
    return m_cbStackSpace + cbFpRaPair;
}

// Returns the total size in bytes of the frame described by DescribeProlog: the extra stack space
// plus one pointer-sized slot each for FP, RA and every saved int/FP argument register.
UINT StubLinker::GetStackFrameSize()
{
    _ASSERTE(m_fProlog);

    const UINT cSavedSlots = 2 + m_cIntRegArgs + m_cFpRegArgs; // 2 is for FP and RA
    return m_cbStackSpace + cSavedSlots * sizeof(void*);
}

#endif // ifdef TARGET_ARM, elif defined(TARGET_ARM64), elif defined(TARGET_RISCV64)

#endif // #ifndef DACCESS_COMPILE

Expand Down
12 changes: 12 additions & 0 deletions src/coreclr/vm/stublink.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,10 @@ class StubLinker
void DescribeProlog(UINT cIntRegArgs, UINT cVecRegArgs, UINT cCalleeSavedRegs, UINT cbStackFrame);
UINT GetSavedRegArgsOffset();
UINT GetStackFrameSize();
#elif defined(TARGET_RISCV64)
// Records the prolog parameters: the number of int and FP argument registers saved in the frame
// and the additional stack space reserved (return buffer and alignment padding).
void DescribeProlog(UINT cIntRegArgs, UINT cFpRegArgs, UINT cbStackSpace);
// Offset from SP to the saved argument register area. Valid only after DescribeProlog.
UINT GetSavedRegArgsOffset();
// Total frame size in bytes (extra stack space + FP/RA + saved argument registers). Valid only after DescribeProlog.
UINT GetStackFrameSize();
#endif

//===========================================================================
Expand Down Expand Up @@ -304,6 +308,14 @@ class StubLinker
UINT m_cbStackSpace; // Additional stack space for return buffer and stack alignment
#endif // TARGET_ARM64

#ifdef TARGET_RISCV64
protected:
BOOL m_fProlog; // True if DescribeProlog has been called
UINT m_cIntRegArgs; // Count of int register arguments (x10 - x17)
UINT m_cFpRegArgs; // Count of FP register arguments (f10 - f17)
UINT m_cbStackSpace; // Additional stack space for return buffer and stack alignment
#endif // TARGET_RISCV64

#ifdef STUBLINKER_GENERATES_UNWIND_INFO

#ifdef _DEBUG
Expand Down

0 comments on commit e7ab2f6

Please sign in to comment.