Skip to content

[RISC-V] Use zero register as argument for atomics #112693

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 32 additions & 36 deletions src/coreclr/jit/codegenriscv64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1091,31 +1091,23 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
double constValue = tree->AsDblCon()->DconValue();

assert(emitter::isFloatReg(targetReg));

// Make sure we use "fmv.w.x reg, zero" only for positive zero (0.0) and not for negative zero (-0.0)
if (FloatingPointUtils::isPositiveZero(constValue))
{
// A faster/smaller way to generate 0.0
// We will just zero out the entire register for both float and double
emit->emitIns_R_R(size == EA_4BYTE ? INS_fmv_w_x : INS_fmv_d_x, size, targetReg, REG_R0);
break;
}

int64_t bits =
(size == EA_4BYTE)
? (int32_t)BitOperations::SingleToUInt32Bits(FloatingPointUtils::convertToSingle(constValue))
: (int64_t)BitOperations::DoubleToUInt64Bits(constValue);
bool fitsInLui = ((bits & 0xfff) == 0) && emitter::isValidSimm20(bits >> 12);
if (fitsInLui || emitter::isValidSimm12(bits)) // can we synthesize bits with a single instruction?
int64_t bits;
if (emitter::isSingleInstructionFpImm(constValue, size, &bits))
{
regNumber temp = internalRegisters.GetSingle(tree);
if (fitsInLui)
regNumber temp = REG_ZERO;
if (bits != 0)
{
emit->emitIns_R_I(INS_lui, size, temp, bits >> 12);
}
else
{
emit->emitIns_R_R_I(INS_addi, size, temp, REG_ZERO, bits);
temp = internalRegisters.GetSingle(tree);
if (emitter::isValidSimm12(bits))
{
emit->emitIns_R_R_I(INS_addi, size, temp, REG_ZERO, bits);
}
else
{
int64_t upperBits = bits >> 12;
assert((upperBits << 12) == bits);
emit->emitIns_R_I(INS_lui, size, temp, upperBits);
}
}

emit->emitIns_R_R(size == EA_4BYTE ? INS_fmv_w_x : INS_fmv_d_x, size, targetReg, temp);
Expand Down Expand Up @@ -2371,12 +2363,12 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)

GenTree* data = treeNode->AsOp()->gtOp2;
GenTree* addr = treeNode->AsOp()->gtOp1;
regNumber dataReg = data->GetRegNum();
regNumber dataReg = !data->isContained() ? data->GetRegNum() : REG_ZERO;
regNumber addrReg = addr->GetRegNum();
regNumber targetReg = treeNode->GetRegNum();
if (targetReg == REG_NA)
{
targetReg = REG_R0;
targetReg = REG_ZERO;
}

genConsumeAddress(addr);
Expand All @@ -2385,8 +2377,6 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
emitAttr dataSize = emitActualTypeSize(data);
bool is4 = (dataSize == EA_4BYTE);

assert(!data->isContainedIntOrIImmed());

instruction ins = INS_none;
switch (treeNode->gtOper)
{
Expand All @@ -2407,7 +2397,7 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
}
GetEmitter()->emitIns_R_R_R(ins, dataSize, targetReg, addrReg, dataReg);

if (targetReg != REG_R0)
if (targetReg != REG_ZERO)
{
genProduceReg(treeNode);
}
Expand All @@ -2430,9 +2420,19 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode)

regNumber target = treeNode->GetRegNum();
regNumber loc = locOp->GetRegNum();
regNumber val = valOp->GetRegNum();
regNumber comparand = comparandOp->GetRegNum();
regNumber storeErr = internalRegisters.Extract(treeNode, RBM_ALLINT);
regNumber val = !valOp->isContained() ? valOp->GetRegNum() : REG_ZERO;
regNumber comparand = REG_ZERO;
if (!comparandOp->isContained())
{
comparand = comparandOp->GetRegNum();
if (comparandOp->TypeIs(TYP_INT, TYP_UINT))
{
regNumber signExtendedComparand = internalRegisters.Extract(treeNode);
GetEmitter()->emitIns_R_R(INS_sext_w, EA_4BYTE, signExtendedComparand, comparand);
comparand = signExtendedComparand;
}
}
regNumber storeErr = internalRegisters.GetSingle(treeNode);

// Register allocator should have extended the lifetimes of all input and internal registers
// They should all be different
Expand All @@ -2443,16 +2443,12 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode)
noway_assert(loc != val);
noway_assert(loc != comparand);
noway_assert(loc != storeErr);
noway_assert(val != comparand);
noway_assert((val != comparand) || (val == REG_ZERO));
noway_assert(val != storeErr);
noway_assert(comparand != storeErr);
noway_assert(target != REG_NA);
noway_assert(storeErr != REG_NA);

assert(locOp->isUsedFromReg());
assert(valOp->isUsedFromReg());
assert(!comparandOp->isUsedFromMemory());

genConsumeAddress(locOp);
genConsumeRegs(valOp);
genConsumeRegs(comparandOp);
Expand Down
6 changes: 5 additions & 1 deletion src/coreclr/jit/emitriscv64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -919,7 +919,11 @@ void emitter::emitIns_R_R_R(
{
code |= 0x7 << 12;
}
else if (INS_lr_w <= ins && ins <= INS_amomaxu_d)
else if (ins == INS_sc_w || ins == INS_sc_d)
{
code |= 0b10 << 25; // release ordering, it ends the lr-sc loop
}
else if ((ins == INS_lr_w || ins == INS_lr_d) || (INS_amoswap_w <= ins && ins <= INS_amomaxu_d))
{
// For now all atomics are seq. consistent as Interlocked.* APIs don't expose acquire/release ordering
code |= 0b11 << 25;
Expand Down
20 changes: 20 additions & 0 deletions src/coreclr/jit/emitriscv64.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,26 @@ static bool isValidSimm32(ssize_t value)
return (-(((ssize_t)1) << 31) - 0x800) <= value && value < (((ssize_t)1) << 31) - 0x800;
}

//------------------------------------------------------------------------
// isSingleInstructionFpImm: checks if the floating-point constant can be synthesized with one instruction
//
// Arguments:
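// value - the floating-point constant to materialize
// size - size of the target immediate (EA_4BYTE or EA_8BYTE)
// outBits - [out] the raw bits of the constant (sign-extended from 32 bits for EA_4BYTE); always written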
//
// Return Value:
// Whether the floating-point immediate can be synthesized with one instruction
//
static bool isSingleInstructionFpImm(double value, emitAttr size, int64_t* outBits)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be more useful to return an enum that indicates whether we're doing none, Simm12, or Simm20, instead of the bool?

There is significant coupling between this helper and its callers, which subsequently repeat the same isValidSimm12 and/or isValidSimm20 checks to emit the actual code. It might be really nice to return which one applies and not test the bits there too...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about it, but it complicated the signature of this simple helper. Besides, LSRA doesn't need to know which instruction will be used.

{
assert(size == EA_4BYTE || size == EA_8BYTE);
*outBits = (size == EA_4BYTE)
? (int32_t)BitOperations::SingleToUInt32Bits(FloatingPointUtils::convertToSingle(value))
: (int64_t)BitOperations::DoubleToUInt64Bits(value);
return isValidSimm12(*outBits) || (((*outBits & 0xfff) == 0) && isValidSimm20(*outBits >> 12));
}

// Returns the number of bits used by the given 'size'.
inline static unsigned getBitWidth(emitAttr size)
{
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -644,12 +644,14 @@ GenTree* Lowering::LowerNode(GenTree* node)

#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
case GT_CMPXCHG:
RISCV64_ONLY(CheckImmedAndMakeContained(node, node->AsCmpXchg()->Data()));
CheckImmedAndMakeContained(node, node->AsCmpXchg()->Comparand());
break;

case GT_XORR:
case GT_XAND:
case GT_XADD:
case GT_XCHG:
CheckImmedAndMakeContained(node, node->AsOp()->gtOp2);
break;
#elif defined(TARGET_XARCH)
Expand Down
5 changes: 5 additions & 0 deletions src/coreclr/jit/lowerriscv64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode) const
case GT_JCMP:
return true;

case GT_CMPXCHG:
case GT_XORR:
case GT_XAND:
case GT_XADD:
case GT_XCHG:
case GT_STORE_LCL_FLD:
case GT_STORE_LCL_VAR:
if (immVal == 0)
Expand Down
70 changes: 49 additions & 21 deletions src/coreclr/jit/lsrariscv64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,20 +144,11 @@ int LinearScan::BuildNode(GenTree* tree)
case GT_CNS_DBL:
{
emitAttr size = emitActualTypeSize(tree);

double constValue = tree->AsDblCon()->DconValue();
if (!FloatingPointUtils::isPositiveZero(constValue))
int64_t bits;
if (emitter::isSingleInstructionFpImm(tree->AsDblCon()->DconValue(), size, &bits) && bits != 0)
{
int64_t bits =
(size == EA_4BYTE)
? (int32_t)BitOperations::SingleToUInt32Bits(FloatingPointUtils::convertToSingle(constValue))
: (int64_t)BitOperations::DoubleToUInt64Bits(constValue);
bool fitsInLui = ((bits & 0xfff) == 0) && emitter::isValidSimm20(bits >> 12);
if (fitsInLui || emitter::isValidSimm12(bits)) // can we synthesize bits with a single instruction?
{
buildInternalIntRegisterDefForNode(tree);
buildInternalRegisterUses();
}
buildInternalIntRegisterDefForNode(tree);
buildInternalRegisterUses();
}
}
FALLTHROUGH;
Expand Down Expand Up @@ -455,16 +446,44 @@ int LinearScan::BuildNode(GenTree* tree)
case GT_CMPXCHG:
{
GenTreeCmpXchg* cas = tree->AsCmpXchg();
assert(!cas->Comparand()->isContained());
srcCount = 3;
assert(dstCount == 1);

buildInternalIntRegisterDefForNode(tree); // temp reg for store conditional error
srcCount = 1;
// Extend lifetimes of argument regs because they may be reused during retries
assert(!cas->Addr()->isContained());
setDelayFree(BuildUse(cas->Addr()));
setDelayFree(BuildUse(cas->Data()));
setDelayFree(BuildUse(cas->Comparand()));

GenTree* data = cas->Data();
if (!data->isContained())
{
srcCount++;
setDelayFree(BuildUse(data));
}
else
{
assert(data->IsIntegralConst(0));
}

GenTree* comparand = cas->Comparand();
if (!comparand->isContained())
{
srcCount++;
RefPosition* use = BuildUse(comparand);
if (comparand->TypeIs(TYP_INT, TYP_UINT))
{
buildInternalIntRegisterDefForNode(tree); // temp reg for sign-extended comparand
}
else
{
setDelayFree(use);
}
}
else
{
assert(comparand->IsIntegralConst(0));
}

buildInternalIntRegisterDefForNode(tree); // temp reg for store conditional error
// Internals may not collide with target
setInternalRegsDelayFree = true;
buildInternalRegisterUses();
Expand All @@ -484,11 +503,20 @@ int LinearScan::BuildNode(GenTree* tree)
assert(dstCount == (tree->TypeIs(TYP_VOID) ? 0 : 1));
GenTree* addr = tree->gtGetOp1();
GenTree* data = tree->gtGetOp2();
assert(!addr->isContained() && !data->isContained());
srcCount = 2;
assert(!addr->isContained());

srcCount = 1;
BuildUse(addr);
BuildUse(data);
if (!data->isContained())
{
srcCount++;
BuildUse(data);
}
else
{
assert(data->IsIntegralConst(0));
}

if (dstCount == 1)
{
BuildDef(tree);
Expand Down
28 changes: 28 additions & 0 deletions src/tests/JIT/Intrinsics/Interlocked.cs
Original file line number Diff line number Diff line change
Expand Up @@ -300,5 +300,33 @@ private static void ThrowsNRE(Action action, [CallerLineNumber] int line = 0, [C
Console.WriteLine($"Line {line}: test failed (expected: NullReferenceException)");
_errors++;
}

// A struct with one float field followed by one uint field.
// It is the parameter type of DoTestCompareExchangeUnextended, so the uint comparand
// reaches that method as a field of a struct argument rather than as a plain uint.
public struct FloatUint
{
public float f; // not read by DoTestCompareExchangeUnextended
public uint u; // used as the comparand of Interlocked.CompareExchange
}

// Calls Interlocked.CompareExchange on a local uint initialized to 1, asking it to store 0
// when the local equals comparand.u.
//
// Returns:
//   100 - the local is 0 afterwards and the returned old value is 1
//   101 - the local is not 0 after the call
//   102 - the value returned by CompareExchange is not 1
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public static int DoTestCompareExchangeUnextended(FloatUint comparand)
{
uint val = 1;
// The comparand is read from the struct field, not from a standalone uint argument.
uint old = Interlocked.CompareExchange(ref val, 0, comparand.u);
if (val != 0)
return 101;
if (old != 1)
return 102;
return 100;
}

// Invokes DoTestCompareExchangeUnextended through reflection with FloatUint{f = 0f, u = 1}
// and returns its result cast to int.
[Fact]
public static int TestCompareExchangeUnextended()
{
// RISC-V comparisons are always full-register so the comparand reg must be extended.
// The integer field of a struct passed according to the floating-point calling convention is not ABI-extended.
// The reflection call poisons its remaining bits in runtime debug mode, making it a better repro.
return (int)typeof(Program).GetMethod("DoTestCompareExchangeUnextended").Invoke(
null, new object[] { new FloatUint{f = 0f, u = 1} });
}
}
}
Loading