Skip to content

Improving the SIMD codegen for SIMD12 load/store #80083

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jan 6, 2023
12 changes: 6 additions & 6 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1086,13 +1086,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
// values through an indirection. Note that Vector3 locals allocated on stack would have
// their size rounded to TARGET_POINTER_SIZE (which is 8 bytes on 64-bit targets) and hence
// Vector3 locals could be treated as TYP_SIMD16 while reading/writing.
void genStoreIndTypeSIMD12(GenTree* treeNode);
void genLoadIndTypeSIMD12(GenTree* treeNode);
void genStoreLclTypeSIMD12(GenTree* treeNode);
void genLoadLclTypeSIMD12(GenTree* treeNode);
void genStoreIndTypeSimd12(GenTreeStoreInd* treeNode);
void genLoadIndTypeSimd12(GenTreeIndir* treeNode);
void genStoreLclTypeSimd12(GenTreeLclVarCommon* treeNode);
void genLoadLclTypeSimd12(GenTreeLclVarCommon* treeNode);
#ifdef TARGET_X86
void genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg);
void genPutArgStkSIMD12(GenTree* treeNode);
void genStoreSimd12ToStack(regNumber dataReg, regNumber tmpReg);
void genPutArgStkSimd12(GenTreePutArgStk* treeNode);
#endif // TARGET_X86
#endif // FEATURE_SIMD

Expand Down
102 changes: 50 additions & 52 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2798,19 +2798,19 @@ void CodeGen::genCodeForLclVar(GenTreeLclVar* tree)
void CodeGen::genCodeForStoreLclFld(GenTreeLclFld* tree)
{
var_types targetType = tree->TypeGet();
regNumber targetReg = tree->GetRegNum();
emitter* emit = GetEmitter();
noway_assert(targetType != TYP_STRUCT);

#ifdef FEATURE_SIMD
// storing of TYP_SIMD12 (i.e. Vector3) field
if (tree->TypeGet() == TYP_SIMD12)
if (targetType == TYP_SIMD12)
{
genStoreLclTypeSIMD12(tree);
genStoreLclTypeSimd12(tree);
return;
}
#endif // FEATURE_SIMD

regNumber targetReg = tree->GetRegNum();
emitter* emit = GetEmitter();
noway_assert(targetType != TYP_STRUCT);

// record the offset
unsigned offset = tree->GetLclOffs();

Expand Down Expand Up @@ -2909,7 +2909,7 @@ void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* lclNode)
// storing of TYP_SIMD12 (i.e. Vector3) field
if (targetType == TYP_SIMD12)
{
genStoreLclTypeSIMD12(lclNode);
genStoreLclTypeSimd12(lclNode);
return;
}
#endif // FEATURE_SIMD
Expand Down Expand Up @@ -4172,7 +4172,7 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
// Storing Vector3 of size 12 bytes through indirection
if (tree->TypeGet() == TYP_SIMD12)
{
genStoreIndTypeSIMD12(tree);
genStoreIndTypeSimd12(tree);
return;
}
#endif // FEATURE_SIMD
Expand Down Expand Up @@ -5160,7 +5160,7 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node)
}

//-----------------------------------------------------------------------------
// genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
// genStoreIndTypeSimd12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
// Since Vector3 is not a hardware-supported write size, it is performed
// as two writes: 8-byte followed by 4-byte.
//
Expand All @@ -5171,41 +5171,39 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node)
// Return Value:
// None.
//
void CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode)
void CodeGen::genStoreIndTypeSimd12(GenTreeStoreInd* treeNode)
{
assert(treeNode->OperGet() == GT_STOREIND);
assert(treeNode->OperIs(GT_STOREIND));

GenTree* addr = treeNode->AsOp()->gtOp1;
GenTree* data = treeNode->AsOp()->gtOp2;
// Should not require a write barrier
assert(gcInfo.gcIsWriteBarrierCandidate(treeNode) == GCInfo::WBF_NoBarrier);

// addr and data should not be contained.
assert(!data->isContained());

GenTree* addr = treeNode->Addr();
assert(!addr->isContained());

#ifdef DEBUG
// Should not require a write barrier
GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode->AsStoreInd());
assert(writeBarrierForm == GCInfo::WBF_NoBarrier);
#endif
GenTree* data = treeNode->Data();
assert(!data->isContained());

genConsumeOperands(treeNode->AsOp());
regNumber addrReg = genConsumeReg(addr);
regNumber dataReg = genConsumeReg(data);

// Need an additional integer register to extract upper 4 bytes from data.
regNumber tmpReg = treeNode->GetSingleTempReg();
assert(tmpReg != addr->GetRegNum());

// 8-byte write
GetEmitter()->emitIns_R_R(INS_str, EA_8BYTE, data->GetRegNum(), addr->GetRegNum());
GetEmitter()->emitIns_R_R(INS_str, EA_8BYTE, dataReg, addrReg);

// Extract upper 4-bytes from data
GetEmitter()->emitIns_R_R_I(INS_mov, EA_4BYTE, tmpReg, data->GetRegNum(), 2);
GetEmitter()->emitIns_R_R_I(INS_mov, EA_4BYTE, tmpReg, dataReg, 2);

// 4-byte write
GetEmitter()->emitIns_R_R_I(INS_str, EA_4BYTE, tmpReg, addr->GetRegNum(), 8);
GetEmitter()->emitIns_R_R_I(INS_str, EA_4BYTE, tmpReg, addrReg, 8);
}

//-----------------------------------------------------------------------------
// genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value.
// genLoadIndTypeSimd12: load indirect a TYP_SIMD12 (i.e. Vector3) value.
// Since Vector3 is not a hardware-supported load size, it is performed
// as two loads: 8-byte followed by 4-byte.
//
Expand All @@ -5216,34 +5214,33 @@ void CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode)
// Return Value:
// None.
//
void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
void CodeGen::genLoadIndTypeSimd12(GenTreeIndir* treeNode)
{
assert(treeNode->OperGet() == GT_IND);

GenTree* addr = treeNode->AsOp()->gtOp1;
regNumber targetReg = treeNode->GetRegNum();
assert(treeNode->OperIs(GT_IND));

GenTree* addr = treeNode->Addr();
assert(!addr->isContained());

regNumber operandReg = genConsumeReg(addr);
regNumber tgtReg = treeNode->GetRegNum();
regNumber addrReg = genConsumeReg(addr);

// Need an additional int register to read the upper 4 bytes; it must be different from the target register
regNumber tmpReg = treeNode->GetSingleTempReg();

// 8-byte read
GetEmitter()->emitIns_R_R(INS_ldr, EA_8BYTE, targetReg, addr->GetRegNum());
GetEmitter()->emitIns_R_R(INS_ldr, EA_8BYTE, tgtReg, addrReg);

// 4-byte read
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, tmpReg, addr->GetRegNum(), 8);
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, tmpReg, addrReg, 8);

// Insert upper 4-bytes into data
GetEmitter()->emitIns_R_R_I(INS_mov, EA_4BYTE, targetReg, tmpReg, 2);
GetEmitter()->emitIns_R_R_I(INS_mov, EA_4BYTE, tgtReg, tmpReg, 2);

genProduceReg(treeNode);
}

//-----------------------------------------------------------------------------
// genStoreLclTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
// genStoreLclTypeSimd12: store a TYP_SIMD12 (i.e. Vector3) type field.
// Since Vector3 is not a hardware-supported write size, it is performed
// as two stores: 8-byte followed by 4-byte.
//
Expand All @@ -5253,23 +5250,20 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
// Return Value:
// None.
//
void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode)
void CodeGen::genStoreLclTypeSimd12(GenTreeLclVarCommon* treeNode)
{
assert(treeNode->OperIs(GT_STORE_LCL_FLD, GT_STORE_LCL_VAR));

GenTreeLclVarCommon* lclVar = treeNode->AsLclVarCommon();

unsigned offs = lclVar->GetLclOffs();
unsigned varNum = lclVar->GetLclNum();
LclVarDsc* varDsc = compiler->lvaGetDesc(varNum);
unsigned offs = treeNode->GetLclOffs();
unsigned varNum = treeNode->GetLclNum();
assert(varNum < compiler->lvaCount);

GenTree* op1 = lclVar->gtGetOp1();
GenTree* data = treeNode->gtGetOp1();

if (op1->isContained())
if (data->isContained())
{
// This is only possible for a zero-init.
assert(op1->IsIntegralConst(0) || op1->IsVectorZero());
assert(data->IsIntegralConst(0) || data->IsVectorZero());

// store lower 8 bytes
GetEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, REG_ZR, varNum, offs);
Expand All @@ -5279,30 +5273,34 @@ void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode)

// Update life after instruction emitted
genUpdateLife(treeNode);

LclVarDsc* varDsc = compiler->lvaGetDesc(varNum);
varDsc->SetRegNum(REG_STK);

return;
}

regNumber targetReg = treeNode->GetRegNum();
regNumber operandReg = genConsumeReg(op1);
regNumber tgtReg = treeNode->GetRegNum();
regNumber dataReg = genConsumeReg(data);

if (targetReg != REG_NA)
if (tgtReg != REG_NA)
{
assert(GetEmitter()->isVectorRegister(targetReg));

// Simply use mov if we move a SIMD12 reg to another SIMD12 reg
inst_Mov(treeNode->TypeGet(), targetReg, operandReg, /* canSkip */ true);
assert(GetEmitter()->isVectorRegister(tgtReg));

inst_Mov(treeNode->TypeGet(), tgtReg, dataReg, /* canSkip */ true);
genProduceReg(treeNode);
}
else
{
// Need an additional integer register to extract upper 4 bytes from data.
regNumber tmpReg = lclVar->GetSingleTempReg();
GetEmitter()->emitStoreSIMD12ToLclOffset(varNum, offs, operandReg, tmpReg);
regNumber tmpReg = treeNode->GetSingleTempReg();
GetEmitter()->emitStoreSimd12ToLclOffset(varNum, offs, dataReg, tmpReg);

// Update life after instruction emitted
genUpdateLife(treeNode);

LclVarDsc* varDsc = compiler->lvaGetDesc(varNum);
varDsc->SetRegNum(REG_STK);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -807,7 +807,7 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode)
if (compMacOsArm64Abi() && (treeNode->GetStackByteSize() == 12))
{
regNumber tmpReg = treeNode->GetSingleTempReg();
GetEmitter()->emitStoreSIMD12ToLclOffset(varNumOut, argOffsetOut, srcReg, tmpReg);
GetEmitter()->emitStoreSimd12ToLclOffset(varNumOut, argOffsetOut, srcReg, tmpReg);
argOffsetOut += 12;
}
else
Expand Down Expand Up @@ -1827,7 +1827,7 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree)
// Handling of Vector3 type values loaded through indirection.
if (tree->TypeGet() == TYP_SIMD12)
{
genLoadIndTypeSIMD12(tree);
genLoadIndTypeSimd12(tree);
return;
}
#endif // FEATURE_SIMD
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/codegenlinear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1880,7 +1880,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk, unsigned outArg
{
// Need an additional integer register to extract upper 4 bytes from data.
regNumber tmpReg = nextArgNode->GetSingleTempReg();
GetEmitter()->emitStoreSIMD12ToLclOffset(outArgVarNum, thisFieldOffset, reg, tmpReg);
GetEmitter()->emitStoreSimd12ToLclOffset(outArgVarNum, thisFieldOffset, reg, tmpReg);
}
else
#endif // FEATURE_SIMD
Expand Down
28 changes: 17 additions & 11 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4768,19 +4768,19 @@ void CodeGen::genCodeForLclFld(GenTreeLclFld* tree)
assert(tree->OperIs(GT_LCL_FLD));

var_types targetType = tree->TypeGet();
regNumber targetReg = tree->GetRegNum();

noway_assert(targetReg != REG_NA);

#ifdef FEATURE_SIMD
// Loading of TYP_SIMD12 (i.e. Vector3) field
if (targetType == TYP_SIMD12)
{
genLoadLclTypeSIMD12(tree);
genLoadLclTypeSimd12(tree);
return;
}
#endif

regNumber targetReg = tree->GetRegNum();
noway_assert(targetReg != REG_NA);

noway_assert(targetType != TYP_STRUCT);

emitAttr size = emitTypeSize(targetType);
Expand Down Expand Up @@ -4819,7 +4819,7 @@ void CodeGen::genCodeForLclVar(GenTreeLclVar* tree)
// Loading of TYP_SIMD12 (i.e. Vector3) variable
if (tree->TypeGet() == TYP_SIMD12)
{
genLoadLclTypeSIMD12(tree);
genLoadLclTypeSimd12(tree);
return;
}
#endif // defined(FEATURE_SIMD) && defined(TARGET_X86)
Expand Down Expand Up @@ -4848,7 +4848,7 @@ void CodeGen::genCodeForStoreLclFld(GenTreeLclFld* tree)
// storing of TYP_SIMD12 (i.e. Vector3) field
if (targetType == TYP_SIMD12)
{
genStoreLclTypeSIMD12(tree);
genStoreLclTypeSimd12(tree);
return;
}
#endif // FEATURE_SIMD
Expand Down Expand Up @@ -4949,7 +4949,7 @@ void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* lclNode)
// storing of TYP_SIMD12 (i.e. Vector3) field
if (targetType == TYP_SIMD12)
{
genStoreLclTypeSIMD12(lclNode);
genStoreLclTypeSimd12(lclNode);
return;
}
#endif // FEATURE_SIMD
Expand Down Expand Up @@ -5131,7 +5131,7 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree)
// Handling of Vector3 type values loaded through indirection.
if (tree->TypeGet() == TYP_SIMD12)
{
genLoadIndTypeSIMD12(tree);
genLoadIndTypeSimd12(tree);
return;
}
#endif // FEATURE_SIMD
Expand Down Expand Up @@ -5170,7 +5170,7 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
// Storing Vector3 of size 12 bytes through indirection
if (tree->TypeGet() == TYP_SIMD12)
{
genStoreIndTypeSIMD12(tree);
genStoreIndTypeSimd12(tree);
return;
}
#endif // FEATURE_SIMD
Expand Down Expand Up @@ -5336,6 +5336,12 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
break;
}

case NI_Vector128_GetElement:
{
assert(baseType == TYP_FLOAT);
FALLTHROUGH;
}

case NI_SSE2_Extract:
case NI_SSE41_Extract:
case NI_SSE41_X64_Extract:
Expand Down Expand Up @@ -8123,7 +8129,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
if (fieldType == TYP_SIMD12)
{
assert(genIsValidFloatReg(simdTmpReg));
genStoreSIMD12ToStack(argReg, simdTmpReg);
genStoreSimd12ToStack(argReg, simdTmpReg);
}
else
#endif // defined(FEATURE_SIMD)
Expand Down Expand Up @@ -8418,7 +8424,7 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
#if defined(TARGET_X86) && defined(FEATURE_SIMD)
if (putArgStk->isSIMD12())
{
genPutArgStkSIMD12(putArgStk);
genPutArgStkSimd12(putArgStk);
return;
}
#endif // defined(TARGET_X86) && defined(FEATURE_SIMD)
Expand Down
Loading