ARM64-SVE: Add Not, InsertIntoShiftedVector #103725

Merged 4 commits on Jun 21, 2024
17 changes: 17 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -2114,6 +2114,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op2Reg, opt);
                break;
            }

            case NI_Sve_Compute8BitAddresses:
            case NI_Sve_Compute16BitAddresses:
            case NI_Sve_Compute32BitAddresses:

@@ -2127,6 +2128,22 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                                          INS_SCALABLE_OPTS_LSL_N);
                break;
            }

            case NI_Sve_InsertIntoShiftedVector:
            {
                // INSR has RMW semantics: the result is built in place on top of op1,
                // so copy op1 into the destination register first (skippable when the
                // registers already match), then insert op2 at element 0.
                assert(isRMW);
                assert(emitter::isFloatReg(op2Reg) == varTypeIsFloating(intrin.baseType));
                if (targetReg != op1Reg)
                {
                    assert(targetReg != op2Reg);
                    GetEmitter()->emitIns_Mov(INS_mov, emitTypeSize(node), targetReg, op1Reg,
                                              /* canSkip */ true);
                }

                GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op2Reg, opt);
                break;
            }
Member: Does this work with both op1Reg being gpr or SIMD/FP?

Member Author (@amanasifkhalid, Jun 20, 2024): I think so, though I'm struggling to get the JIT to use a gpr for op1Reg under the stress modes -- I'm only ever getting the "easy" case, where, when op1Reg and targetReg differ, op1Reg is already a vector register, so we're moving from a vector reg to a vector reg. Since the first argument in Sve.InsertIntoShiftedVector<T> is of type Vector<T>, we'd expect op1Reg to always be a vector register, right? I could add an assert here to clarify this. (Though looking at emitIns_Mov, it does have a path for emitting mov instructions from a gpr to a vector register.)

Member: Actually I meant that for op2Reg, sorry. So ideally, where you have double or float as the 2nd argument, we should have op2Reg as a SIMD/floating-point register, and otherwise it should be a gpr. Can you verify that please? For op1Reg it should always be a scalable register, and we already assert that in the emitter.

Member Author: No worries, I've added an assert to check this. Stress tests for unoptimized and optimized tests are still passing.

Member: Do you mind sharing a small section of disassembly for both the categories?

Member Author: Not at all. Here's a snippet from the double tests:

            ldr     q16, [x0]
            ldr     d17, [fp, #0x30]    // [V01 loc0]
            insr    z16.d, d17

And from the uint tests:

            ldr     q16, [x0]
            ldr     w0, [fp, #0x34]     // [V01 loc0]
            insr    z16.s, w0

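For reference, here is a hedged C# sketch (not part of the PR) of the kinds of calls that exercise both register classes discussed above: a floating-point second argument keeps op2 in a SIMD/FP register (insr z16.d, d17), while an integer second argument keeps it in a general-purpose register (insr z16.s, w0). It assumes an SVE-capable machine and uses only the overloads added by this PR; the class and method names are illustrative.

    using System.Numerics;
    using System.Runtime.Intrinsics.Arm;

    class InsrRegisterClasses
    {
        // double goes through the SIMD&FP form of INSR.
        static Vector<double> ShiftInDouble(Vector<double> v, double x)
            => Sve.InsertIntoShiftedVector(v, x);

        // uint goes through the general-register form of INSR.
        static Vector<uint> ShiftInUInt(Vector<uint> v, uint x)
            => Sve.InsertIntoShiftedVector(v, x);
    }
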

            default:
                unreached();
        }
2 changes: 2 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
@@ -88,6 +88,7 @@ HARDWARE_INTRINSIC(Sve, GatherVectorUInt32WithByteOffsetsZeroExtend,
HARDWARE_INTRINSIC(Sve, GatherVectorUInt32ZeroExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorWithByteOffsets, -1, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GetActiveElementCount, -1, 2, true, {INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_ExplicitMaskedOperation)
HARDWARE_INTRINSIC(Sve, InsertIntoShiftedVector, -1, 2, true, {INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, LeadingSignCount, -1, -1, false, {INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LeadingZeroCount, -1, -1, false, {INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVector, -1, 2, true, {INS_sve_ld1b, INS_sve_ld1b, INS_sve_ld1h, INS_sve_ld1h, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
@@ -159,6 +160,7 @@ HARDWARE_INTRINSIC(Sve, MultiplyBySelectedScalar,
HARDWARE_INTRINSIC(Sve, MultiplyExtended, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmulx, INS_sve_fmulx}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MultiplySubtract, -1, -1, false, {INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, Negate, -1, -1, false, {INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_fneg, INS_sve_fneg}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, Not, -1, -1, false, {INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, Or, -1, -1, false, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, OrAcross, -1, -1, false, {INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, PopCount, -1, -1, false, {INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
@@ -2365,6 +2365,79 @@ internal Arm64() { }
public static unsafe ulong GetActiveElementCount(Vector<ulong> mask, Vector<ulong> from) { throw new PlatformNotSupportedException(); }


/// Insert scalar into shifted vector

/// <summary>
/// svuint8_t svinsr[_n_u8](svuint8_t op1, uint8_t op2)
/// INSR Ztied1.B, Wop2
/// INSR Ztied1.B, Bop2
/// </summary>
public static unsafe Vector<byte> InsertIntoShiftedVector(Vector<byte> left, byte right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svfloat64_t svinsr[_n_f64](svfloat64_t op1, float64_t op2)
/// INSR Ztied1.D, Xop2
/// INSR Ztied1.D, Dop2
/// </summary>
public static unsafe Vector<double> InsertIntoShiftedVector(Vector<double> left, double right) { throw new PlatformNotSupportedException(); }
Member: This API, as per the docs, implements both INSR (SIMD&FP) and INSR (scalar), but it is not clear to me how we decide which one to pick. @a74nh - any idea?


/// <summary>
/// svint16_t svinsr[_n_s16](svint16_t op1, int16_t op2)
/// INSR Ztied1.H, Wop2
/// INSR Ztied1.H, Hop2
/// </summary>
public static unsafe Vector<short> InsertIntoShiftedVector(Vector<short> left, short right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint32_t svinsr[_n_s32](svint32_t op1, int32_t op2)
/// INSR Ztied1.S, Wop2
/// INSR Ztied1.S, Sop2
/// </summary>
public static unsafe Vector<int> InsertIntoShiftedVector(Vector<int> left, int right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint64_t svinsr[_n_s64](svint64_t op1, int64_t op2)
/// INSR Ztied1.D, Xop2
/// INSR Ztied1.D, Dop2
/// </summary>
public static unsafe Vector<long> InsertIntoShiftedVector(Vector<long> left, long right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint8_t svinsr[_n_s8](svint8_t op1, int8_t op2)
/// INSR Ztied1.B, Wop2
/// INSR Ztied1.B, Bop2
/// </summary>
public static unsafe Vector<sbyte> InsertIntoShiftedVector(Vector<sbyte> left, sbyte right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svfloat32_t svinsr[_n_f32](svfloat32_t op1, float32_t op2)
/// INSR Ztied1.S, Wop2
/// INSR Ztied1.S, Sop2
/// </summary>
public static unsafe Vector<float> InsertIntoShiftedVector(Vector<float> left, float right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint16_t svinsr[_n_u16](svuint16_t op1, uint16_t op2)
/// INSR Ztied1.H, Wop2
/// INSR Ztied1.H, Hop2
/// </summary>
public static unsafe Vector<ushort> InsertIntoShiftedVector(Vector<ushort> left, ushort right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint32_t svinsr[_n_u32](svuint32_t op1, uint32_t op2)
/// INSR Ztied1.S, Wop2
/// INSR Ztied1.S, Sop2
/// </summary>
public static unsafe Vector<uint> InsertIntoShiftedVector(Vector<uint> left, uint right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint64_t svinsr[_n_u64](svuint64_t op1, uint64_t op2)
/// INSR Ztied1.D, Xop2
/// INSR Ztied1.D, Dop2
/// </summary>
public static unsafe Vector<ulong> InsertIntoShiftedVector(Vector<ulong> left, ulong right) { throw new PlatformNotSupportedException(); }

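As a usage note: INSR shifts every element of the source vector up by one lane and places the scalar in lane 0, so the value in the highest lane falls off the end. A minimal hedged sketch, assuming an SVE-capable machine (the API shape matches the declarations above; the expected lane values follow from INSR's shift-and-insert semantics):

    using System.Numerics;
    using System.Runtime.Intrinsics.Arm;

    class InsrExample
    {
        static void Main()
        {
            if (Sve.IsSupported)
            {
                Vector<int> ones = Vector<int>.One;                       // { 1, 1, 1, ... }
                Vector<int> r    = Sve.InsertIntoShiftedVector(ones, 42); // { 42, 1, 1, ... }
            }
        }
    }
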

/// Count leading sign bits

/// <summary>
@@ -4034,6 +4107,96 @@ internal Arm64() { }
/// </summary>
public static unsafe Vector<float> Negate(Vector<float> value) { throw new PlatformNotSupportedException(); }

/// Bitwise invert

/// <summary>
/// svuint8_t svnot[_u8]_m(svuint8_t inactive, svbool_t pg, svuint8_t op)
/// NOT Ztied.B, Pg/M, Zop.B
/// svuint8_t svnot[_u8]_x(svbool_t pg, svuint8_t op)
/// NOT Ztied.B, Pg/M, Ztied.B
/// svuint8_t svnot[_u8]_z(svbool_t pg, svuint8_t op)
/// svbool_t svnot[_b]_z(svbool_t pg, svbool_t op)
/// EOR Presult.B, Pg/Z, Pop.B, Pg.B
/// </summary>
public static unsafe Vector<byte> Not(Vector<byte> value) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint16_t svnot[_s16]_m(svint16_t inactive, svbool_t pg, svint16_t op)
/// NOT Ztied.H, Pg/M, Zop.H
/// svint16_t svnot[_s16]_x(svbool_t pg, svint16_t op)
/// NOT Ztied.H, Pg/M, Ztied.H
/// svint16_t svnot[_s16]_z(svbool_t pg, svint16_t op)
/// svbool_t svnot[_b]_z(svbool_t pg, svbool_t op)
/// EOR Presult.B, Pg/Z, Pop.B, Pg.B
/// </summary>
public static unsafe Vector<short> Not(Vector<short> value) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint32_t svnot[_s32]_m(svint32_t inactive, svbool_t pg, svint32_t op)
/// NOT Ztied.S, Pg/M, Zop.S
/// svint32_t svnot[_s32]_x(svbool_t pg, svint32_t op)
/// NOT Ztied.S, Pg/M, Ztied.S
/// svint32_t svnot[_s32]_z(svbool_t pg, svint32_t op)
/// svbool_t svnot[_b]_z(svbool_t pg, svbool_t op)
/// EOR Presult.B, Pg/Z, Pop.B, Pg.B
/// </summary>
public static unsafe Vector<int> Not(Vector<int> value) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint64_t svnot[_s64]_m(svint64_t inactive, svbool_t pg, svint64_t op)
/// NOT Ztied.D, Pg/M, Zop.D
/// svint64_t svnot[_s64]_x(svbool_t pg, svint64_t op)
/// NOT Ztied.D, Pg/M, Ztied.D
/// svint64_t svnot[_s64]_z(svbool_t pg, svint64_t op)
/// svbool_t svnot[_b]_z(svbool_t pg, svbool_t op)
/// EOR Presult.B, Pg/Z, Pop.B, Pg.B
/// </summary>
public static unsafe Vector<long> Not(Vector<long> value) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint8_t svnot[_s8]_m(svint8_t inactive, svbool_t pg, svint8_t op)
/// NOT Ztied.B, Pg/M, Zop.B
/// svint8_t svnot[_s8]_x(svbool_t pg, svint8_t op)
/// NOT Ztied.B, Pg/M, Ztied.B
/// svint8_t svnot[_s8]_z(svbool_t pg, svint8_t op)
/// svbool_t svnot[_b]_z(svbool_t pg, svbool_t op)
/// EOR Presult.B, Pg/Z, Pop.B, Pg.B
/// </summary>
public static unsafe Vector<sbyte> Not(Vector<sbyte> value) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint16_t svnot[_u16]_m(svuint16_t inactive, svbool_t pg, svuint16_t op)
/// NOT Ztied.H, Pg/M, Zop.H
/// svuint16_t svnot[_u16]_x(svbool_t pg, svuint16_t op)
/// NOT Ztied.H, Pg/M, Ztied.H
/// svuint16_t svnot[_u16]_z(svbool_t pg, svuint16_t op)
/// svbool_t svnot[_b]_z(svbool_t pg, svbool_t op)
/// EOR Presult.B, Pg/Z, Pop.B, Pg.B
/// </summary>
public static unsafe Vector<ushort> Not(Vector<ushort> value) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint32_t svnot[_u32]_m(svuint32_t inactive, svbool_t pg, svuint32_t op)
/// NOT Ztied.S, Pg/M, Zop.S
/// svuint32_t svnot[_u32]_x(svbool_t pg, svuint32_t op)
/// NOT Ztied.S, Pg/M, Ztied.S
/// svuint32_t svnot[_u32]_z(svbool_t pg, svuint32_t op)
/// svbool_t svnot[_b]_z(svbool_t pg, svbool_t op)
/// EOR Presult.B, Pg/Z, Pop.B, Pg.B
/// </summary>
public static unsafe Vector<uint> Not(Vector<uint> value) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint64_t svnot[_u64]_m(svuint64_t inactive, svbool_t pg, svuint64_t op)
/// NOT Ztied.D, Pg/M, Zop.D
/// svuint64_t svnot[_u64]_x(svbool_t pg, svuint64_t op)
/// NOT Ztied.D, Pg/M, Ztied.D
/// svuint64_t svnot[_u64]_z(svbool_t pg, svuint64_t op)
/// svbool_t svnot[_b]_z(svbool_t pg, svbool_t op)
/// EOR Presult.B, Pg/Z, Pop.B, Pg.B
/// </summary>
public static unsafe Vector<ulong> Not(Vector<ulong> value) { throw new PlatformNotSupportedException(); }

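A short hedged usage sketch for Not (again assuming an SVE-capable machine): each lane is replaced by its bitwise complement, matching the unpredicated use of the NOT instruction listed in the summaries above.

    using System.Numerics;
    using System.Runtime.Intrinsics.Arm;

    class NotExample
    {
        static void Main()
        {
            if (Sve.IsSupported)
            {
                Vector<uint> zeros = Vector<uint>.Zero;  // { 0, 0, ... }
                Vector<uint> inv   = Sve.Not(zeros);     // { 0xFFFFFFFF, 0xFFFFFFFF, ... }
            }
        }
    }
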
/// Or : Bitwise inclusive OR

/// <summary>
Expand Down