Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit f72025c

Browse files
Add support for BSWAP intrinsic (#18398)
With this change, the JIT will recognize a call to BinaryPrimitives.ReverseEndianness and will emit a bswap instruction. This logic is currently only hooked up for x86 and x64; ARM still uses fallback logic. If the JIT can't emit a bswap instruction (for example, trying to emit a 64-bit bswap in a 32-bit process), it will fall back to a software implementation, so the APIs will work across all architectures.
1 parent 2841758 commit f72025c

15 files changed

+373
-10
lines changed

src/System.Private.CoreLib/shared/System/Buffers/Binary/Reader.cs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,21 +30,21 @@ public static sbyte ReverseEndianness(sbyte value)
3030
/// <summary>
3131
/// Reverses a primitive value - performs an endianness swap
3232
/// </summary>
33+
[Intrinsic]
3334
[MethodImpl(MethodImplOptions.AggressiveInlining)]
34-
public static short ReverseEndianness(short value)
35-
{
36-
return (short)((value & 0x00FF) << 8 | (value & 0xFF00) >> 8);
37-
}
35+
public static short ReverseEndianness(short value) => (short)ReverseEndianness((ushort)value);
3836

3937
/// <summary>
4038
/// Reverses a primitive value - performs an endianness swap
4139
/// </summary>
40+
[Intrinsic]
4241
[MethodImpl(MethodImplOptions.AggressiveInlining)]
4342
public static int ReverseEndianness(int value) => (int)ReverseEndianness((uint)value);
4443

4544
/// <summary>
4645
/// Reverses a primitive value - performs an endianness swap
4746
/// </summary>
47+
[Intrinsic]
4848
[MethodImpl(MethodImplOptions.AggressiveInlining)]
4949
public static long ReverseEndianness(long value) => (long)ReverseEndianness((ulong)value);
5050

@@ -54,7 +54,7 @@ public static short ReverseEndianness(short value)
5454
/// rather than having to skip byte fields.
5555
/// </summary>
5656
[MethodImpl(MethodImplOptions.AggressiveInlining)]
57-
public static byte ReverseEndianness(byte value)
57+
public static byte ReverseEndianness(byte value)
5858
{
5959
return value;
6060
}
@@ -63,6 +63,7 @@ public static byte ReverseEndianness(byte value)
6363
/// Reverses a primitive value - performs an endianness swap
6464
/// </summary>
6565
[CLSCompliant(false)]
66+
[Intrinsic]
6667
[MethodImpl(MethodImplOptions.AggressiveInlining)]
6768
public static ushort ReverseEndianness(ushort value)
6869
{
@@ -80,6 +81,7 @@ public static ushort ReverseEndianness(ushort value)
8081
/// Reverses a primitive value - performs an endianness swap
8182
/// </summary>
8283
[CLSCompliant(false)]
84+
[Intrinsic]
8385
[MethodImpl(MethodImplOptions.AggressiveInlining)]
8486
public static uint ReverseEndianness(uint value)
8587
{
@@ -113,6 +115,7 @@ public static uint ReverseEndianness(uint value)
113115
/// Reverses a primitive value - performs an endianness swap
114116
/// </summary>
115117
[CLSCompliant(false)]
118+
[Intrinsic]
116119
[MethodImpl(MethodImplOptions.AggressiveInlining)]
117120
public static ulong ReverseEndianness(ulong value)
118121
{

src/jit/codegen.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,6 +1035,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
10351035
void genCodeForIndexAddr(GenTreeIndexAddr* tree);
10361036
void genCodeForIndir(GenTreeIndir* tree);
10371037
void genCodeForNegNot(GenTree* tree);
1038+
void genCodeForBswap(GenTree* tree);
10381039
void genCodeForLclVar(GenTreeLclVar* tree);
10391040
void genCodeForLclFld(GenTreeLclFld* tree);
10401041
void genCodeForStoreLclFld(GenTreeLclFld* tree);

src/jit/codegenxarch.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,46 @@ void CodeGen::genCodeForNegNot(GenTree* tree)
520520
genProduceReg(tree);
521521
}
522522

523+
//------------------------------------------------------------------------
524+
// genCodeForBswap: Produce code for a GT_BSWAP / GT_BSWAP16 node.
525+
//
526+
// Arguments:
527+
// tree - the GT_BSWAP or GT_BSWAP16 node; its operand must be used from a register
528+
//
// Notes:
// The operand is copied into the node's target register (unless it is already
// there) and the byte swap is then performed in place on that register.
// NOTE(review): the 16-bit form rotates with a 2-byte operand size, so it
// presumably leaves the bits above the low 16 of the target register exactly as
// they were in the operand - confirm that consumers do not rely on those bits.
//
529+
void CodeGen::genCodeForBswap(GenTree* tree)
530+
{
531+
// TODO: If we're swapping immediately after a read from memory or immediately before
532+
// a write to memory, use the MOVBE instruction instead of the BSWAP instruction if
533+
// the platform supports it.
534+
535+
assert(tree->OperIs(GT_BSWAP, GT_BSWAP16));
536+
537+
regNumber targetReg = tree->gtRegNum;
538+
var_types targetType = tree->TypeGet();
539+
540+
GenTree* operand = tree->gtGetOp1();
541+
assert(operand->isUsedFromReg());
542+
regNumber operandReg = genConsumeReg(operand);
543+
544+
// Both instructions emitted below operate on targetReg alone, so the value
// to be swapped has to be in targetReg before they run.
if (operandReg != targetReg)
545+
{
546+
inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
547+
}
548+
549+
if (tree->OperIs(GT_BSWAP))
550+
{
551+
// 32-bit and 64-bit byte swaps use "bswap reg"; the width comes from targetType
552+
inst_RV(INS_bswap, targetReg, targetType);
553+
}
554+
else
555+
{
556+
// 16-bit byte swaps use "ror reg.16, 8": rotating a 16-bit value by 8 exchanges its two bytes
557+
inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE);
558+
}
559+
560+
genProduceReg(tree);
561+
}
562+
523563
// Generate code to get the high N bits of a N*N=2N bit multiplication result
524564
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
525565
{
@@ -1562,6 +1602,11 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
15621602
genCodeForNegNot(treeNode);
15631603
break;
15641604

1605+
case GT_BSWAP:
1606+
case GT_BSWAP16:
1607+
genCodeForBswap(treeNode);
1608+
break;
1609+
15651610
case GT_DIV:
15661611
if (varTypeIsFloating(treeNode->TypeGet()))
15671612
{

src/jit/compiler.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9741,6 +9741,8 @@ class GenTreeVisitor
97419741
// Standard unary operators
97429742
case GT_NOT:
97439743
case GT_NEG:
9744+
case GT_BSWAP:
9745+
case GT_BSWAP16:
97449746
case GT_COPY:
97459747
case GT_RELOAD:
97469748
case GT_ARR_LENGTH:

src/jit/compiler.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4312,6 +4312,8 @@ void GenTree::VisitOperands(TVisitor visitor)
43124312
case GT_STORE_LCL_FLD:
43134313
case GT_NOT:
43144314
case GT_NEG:
4315+
case GT_BSWAP:
4316+
case GT_BSWAP16:
43154317
case GT_COPY:
43164318
case GT_RELOAD:
43174319
case GT_ARR_LENGTH:

src/jit/emitxarch.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11018,6 +11018,32 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
1101811018
dst += emitOutputByte(dst, code);
1101911019
break;
1102011020

11021+
case INS_bswap:
11022+
{
11023+
assert(size >= EA_4BYTE && size <= EA_PTRSIZE); // 16-bit BSWAP is undefined
11024+
11025+
// The Intel instruction set reference for BSWAP states that extended registers
11026+
// should be enabled via REX.R, but per Vol. 2A, Sec. 2.2.1.2 (see also Figure 2-7),
11027+
// REX.B should instead be used if the register is encoded in the opcode byte itself.
11028+
// Therefore the default logic of insEncodeReg012 is correct for this case.
11029+
11030+
code = insCodeRR(ins);
11031+
11032+
if (TakesRexWPrefix(ins, size))
11033+
{
11034+
code = AddRexWPrefix(ins, code);
11035+
}
11036+
11037+
// Register...
11038+
unsigned regcode = insEncodeReg012(ins, reg, size, &code);
11039+
11040+
// Output the REX prefix
11041+
dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
11042+
11043+
dst += emitOutputWord(dst, code | (regcode << 8));
11044+
break;
11045+
}
11046+
1102111047
case INS_seto:
1102211048
case INS_setno:
1102311049
case INS_setb:

src/jit/gentree.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4748,6 +4748,8 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use)
47484748
case GT_NOP:
47494749
case GT_RETURN:
47504750
case GT_RETFILT:
4751+
case GT_BSWAP:
4752+
case GT_BSWAP16:
47514753
if (def == this->AsUnOp()->gtOp1)
47524754
{
47534755
*use = &this->AsUnOp()->gtOp1;
@@ -8348,6 +8350,8 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node)
83488350
case GT_NULLCHECK:
83498351
case GT_PUTARG_REG:
83508352
case GT_PUTARG_STK:
8353+
case GT_BSWAP:
8354+
case GT_BSWAP16:
83518355
#if FEATURE_ARG_SPLIT
83528356
case GT_PUTARG_SPLIT:
83538357
#endif // FEATURE_ARG_SPLIT
@@ -12906,6 +12910,15 @@ GenTree* Compiler::gtFoldExprConst(GenTree* tree)
1290612910
i1 = -i1;
1290712911
break;
1290812912

12913+
case GT_BSWAP:
12914+
i1 = ((i1 >> 24) & 0xFF) | ((i1 >> 8) & 0xFF00) | ((i1 << 8) & 0xFF0000) |
12915+
((i1 << 24) & 0xFF000000);
12916+
break;
12917+
12918+
case GT_BSWAP16:
12919+
i1 = ((i1 >> 8) & 0xFF) | ((i1 << 8) & 0xFF00);
12920+
break;
12921+
1290912922
case GT_CAST:
1291012923
// assert (genActualType(tree->CastToType()) == tree->gtType);
1291112924
switch (tree->CastToType())
@@ -13041,6 +13054,13 @@ GenTree* Compiler::gtFoldExprConst(GenTree* tree)
1304113054
lval1 = -lval1;
1304213055
break;
1304313056

13057+
case GT_BSWAP:
13058+
lval1 = ((lval1 >> 56) & 0xFF) | ((lval1 >> 40) & 0xFF00) | ((lval1 >> 24) & 0xFF0000) |
13059+
((lval1 >> 8) & 0xFF000000) | ((lval1 << 8) & 0xFF00000000) |
13060+
((lval1 << 24) & 0xFF0000000000) | ((lval1 << 40) & 0xFF000000000000) |
13061+
((lval1 << 56) & 0xFF00000000000000);
13062+
break;
13063+
1304413064
case GT_CAST:
1304513065
assert(genActualType(tree->CastToType()) == tree->gtType);
1304613066
switch (tree->CastToType())

src/jit/gtlist.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ GTNODE(INIT_VAL , GenTreeOp ,0,GTK_UNOP) // Initi
101101

102102
GTNODE(RUNTIMELOOKUP , GenTreeRuntimeLookup, 0,GTK_UNOP|GTK_EXOP) // Runtime handle lookup
103103

104+
GTNODE(BSWAP , GenTreeOp ,0,GTK_UNOP) // Byte swap (32-bit or 64-bit)
105+
GTNODE(BSWAP16 , GenTreeOp ,0,GTK_UNOP) // Byte swap (16-bit)
106+
104107
//-----------------------------------------------------------------------------
105108
// Binary operators (2 operands):
106109
//-----------------------------------------------------------------------------

src/jit/importer.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3934,6 +3934,45 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
39343934
break;
39353935
}
39363936

3937+
case NI_System_Buffers_Binary_BinaryPrimitives_ReverseEndianness:
3938+
{
3939+
assert(sig->numArgs == 1);
3940+
3941+
// We expect the return type of the ReverseEndianness routine to match the type of the
3942+
// one and only argument to the method. We use a special instruction for 16-bit
3943+
// BSWAPs since on x86 processors this is implemented as ROR <16-bit reg>, 8. Additionally,
3944+
// we only emit 64-bit BSWAP instructions on 64-bit archs; if we're asked to perform a
3945+
// 64-bit byte swap on a 32-bit arch, we'll fall to the default case in the switch block below.
3946+
3947+
switch (sig->retType)
3948+
{
3949+
case CorInfoType::CORINFO_TYPE_SHORT:
3950+
case CorInfoType::CORINFO_TYPE_USHORT:
3951+
retNode = gtNewOperNode(GT_BSWAP16, callType, impPopStack().val);
3952+
break;
3953+
3954+
case CorInfoType::CORINFO_TYPE_INT:
3955+
case CorInfoType::CORINFO_TYPE_UINT:
3956+
#ifdef _TARGET_64BIT_
3957+
case CorInfoType::CORINFO_TYPE_LONG:
3958+
case CorInfoType::CORINFO_TYPE_ULONG:
3959+
#endif // _TARGET_64BIT_
3960+
retNode = gtNewOperNode(GT_BSWAP, callType, impPopStack().val);
3961+
break;
3962+
3963+
default:
3964+
// This default case gets hit on 32-bit archs when a call to a 64-bit overload
3965+
// of ReverseEndianness is encountered. In that case we'll let the JIT treat this as a standard
3966+
// method call, where the implementation decomposes the operation into two 32-bit
3967+
// bswap routines. If the input to the 64-bit function is a constant, then we rely
3968+
// on inlining + constant folding of 32-bit bswaps to effectively constant fold
3969+
// the 64-bit call site.
3970+
break;
3971+
}
3972+
3973+
break;
3974+
}
3975+
39373976
default:
39383977
break;
39393978
}
@@ -4072,6 +4111,15 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)
40724111
result = NI_Math_Round;
40734112
}
40744113
}
4114+
#if defined(_TARGET_XARCH_) // We currently only support BSWAP on xarch (x86 and x64)
4115+
else if (strcmp(namespaceName, "System.Buffers.Binary") == 0)
4116+
{
4117+
if ((strcmp(className, "BinaryPrimitives") == 0) && (strcmp(methodName, "ReverseEndianness") == 0))
4118+
{
4119+
result = NI_System_Buffers_Binary_BinaryPrimitives_ReverseEndianness;
4120+
}
4121+
}
4122+
#endif // defined(_TARGET_XARCH_)
40754123
else if (strcmp(namespaceName, "System.Collections.Generic") == 0)
40764124
{
40774125
if ((strcmp(className, "EqualityComparer`1") == 0) && (strcmp(methodName, "get_Default") == 0))

src/jit/instrsxarch.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE,
6363
INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_FLAGS_WritesFlags)
6464
INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_FLAGS_WritesFlags)
6565

66+
// Multi-byte opcodes without modrm are represented in mixed endian fashion.
67+
// See the comment about a quarter of the way through this file for more information.
68+
INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_FLAGS_None)
69+
6670
// id nm um mr mi rm a4 flags
6771
INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_FLAGS_WritesFlags)
6872
INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_FLAGS_WritesFlags)

0 commit comments

Comments
 (0)