Description
Background and motivation
It seems that #87097 lacks intrinsics for compress
instructions with merge-masking.
Merge-masking for vpcompress*
and vcompressp*
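instructions works differently from the merge-masking of any other instruction. The elements selected by the mask bits set to 1 are packed contiguously into the low end of the destination, and once those mask bits run out, every remaining (upper) element passes through the value of the merge source (the destination's previous contents). Ordinary merge-masking instead passes through the merge-source value at each position whose corresponding mask bit is 0.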
For example:
vmovdqu16 zmm0, zmmword ptr [rip - 64] ; [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
vpternlogd zmm1, zmm1, zmm1, 0xff ; set every bit of zmm1, so each 16-bit element is 65535
mov eax, 0xA08A28AC ; [0b1010_0000_1000_1010_0010_1000_1010_1100]
kmovd k1, eax ; k1 = element mask; its set bits are at indices 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31
vpcompressw zmm1{k1}, zmm0 ; merge-masked compress: pack the selected words of zmm0 into the low words of zmm1
This code produces 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535
in zmm1
.
This behavior cannot be correctly described by something like Avx512BW.BlendVariable(zmm1, Avx512Vbmi2.Compress(zmm0, k1), k1)
which I assume would be compiled to:
vpcompressw zmm0{k1}{z}, zmm0 ; zero-masked compress: words past the packed elements become 0
vpblendmw zmm1{k1}, zmm1, zmm0 ; per word: take zmm0 where the k1 bit is 1, otherwise keep zmm1
If we used that instead of vpcompressw zmm1{k1}, zmm0
, it would produce 65535, 65535, 5, 7, 65535, 13, 65535, 19, 65535, 65535, 65535, 0, 65535, 0, 65535, 65535, 65535, 0, 65535, 0, 65535, 65535, 65535, 0, 65535, 65535, 65535, 65535, 65535, 0, 65535, 0
.
API Proposal
namespace System.Runtime.Intrinsics.X86
{
    // Proposed merge-masked compress overloads.
    //
    // Every CompressMerge overload packs the elements of `value` that are selected by
    // `mask` contiguously into the low elements of the result. Every remaining (upper)
    // element of the result is taken from the same position of `mergeSource`.

    // 32-bit and 64-bit element overloads.
    // Expected instructions: vcompresspd (double), vpcompressq (long/ulong),
    // vcompressps (float), vpcompressd (int/uint).
    //
    // Declared `abstract partial` rather than `static partial`: the existing Avx512F
    // type is an abstract class, and a static partial declaration cannot be merged
    // with it.
    public abstract partial class Avx512F
    {
        // 512-bit vectors.
        public static Vector512<double> CompressMerge(Vector512<double> mergeSource, Vector512<double> value, Vector512<double> mask);
        public static Vector512<ulong> CompressMerge(Vector512<ulong> mergeSource, Vector512<ulong> value, Vector512<ulong> mask);
        public static Vector512<long> CompressMerge(Vector512<long> mergeSource, Vector512<long> value, Vector512<long> mask);
        public static Vector512<float> CompressMerge(Vector512<float> mergeSource, Vector512<float> value, Vector512<float> mask);
        public static Vector512<uint> CompressMerge(Vector512<uint> mergeSource, Vector512<uint> value, Vector512<uint> mask);
        public static Vector512<int> CompressMerge(Vector512<int> mergeSource, Vector512<int> value, Vector512<int> mask);

        // Vector-length (VL) variants of the same operations.
        public abstract partial class VL
        {
            // 256-bit vectors.
            public static Vector256<double> CompressMerge(Vector256<double> mergeSource, Vector256<double> value, Vector256<double> mask);
            public static Vector256<ulong> CompressMerge(Vector256<ulong> mergeSource, Vector256<ulong> value, Vector256<ulong> mask);
            public static Vector256<long> CompressMerge(Vector256<long> mergeSource, Vector256<long> value, Vector256<long> mask);
            public static Vector256<float> CompressMerge(Vector256<float> mergeSource, Vector256<float> value, Vector256<float> mask);
            public static Vector256<uint> CompressMerge(Vector256<uint> mergeSource, Vector256<uint> value, Vector256<uint> mask);
            public static Vector256<int> CompressMerge(Vector256<int> mergeSource, Vector256<int> value, Vector256<int> mask);

            // 128-bit vectors.
            public static Vector128<double> CompressMerge(Vector128<double> mergeSource, Vector128<double> value, Vector128<double> mask);
            public static Vector128<ulong> CompressMerge(Vector128<ulong> mergeSource, Vector128<ulong> value, Vector128<ulong> mask);
            public static Vector128<long> CompressMerge(Vector128<long> mergeSource, Vector128<long> value, Vector128<long> mask);
            public static Vector128<float> CompressMerge(Vector128<float> mergeSource, Vector128<float> value, Vector128<float> mask);
            public static Vector128<uint> CompressMerge(Vector128<uint> mergeSource, Vector128<uint> value, Vector128<uint> mask);
            public static Vector128<int> CompressMerge(Vector128<int> mergeSource, Vector128<int> value, Vector128<int> mask);
        }
    }

    // 8-bit and 16-bit element overloads.
    // Expected instructions: vpcompressw (short/ushort), vpcompressb (byte/sbyte).
    public abstract class Avx512Vbmi2 : Avx512BW
    {
        // 512-bit vectors.
        public static Vector512<ushort> CompressMerge(Vector512<ushort> mergeSource, Vector512<ushort> value, Vector512<ushort> mask);
        public static Vector512<short> CompressMerge(Vector512<short> mergeSource, Vector512<short> value, Vector512<short> mask);
        public static Vector512<byte> CompressMerge(Vector512<byte> mergeSource, Vector512<byte> value, Vector512<byte> mask);
        public static Vector512<sbyte> CompressMerge(Vector512<sbyte> mergeSource, Vector512<sbyte> value, Vector512<sbyte> mask);

        // Vector-length (VL) variants of the same operations.
        public abstract class VL : Avx512BW.VL
        {
            // Hides the inherited IsSupported so this class reports its own support status.
            public static new bool IsSupported { get; }

            // 256-bit vectors.
            public static Vector256<ushort> CompressMerge(Vector256<ushort> mergeSource, Vector256<ushort> value, Vector256<ushort> mask);
            public static Vector256<short> CompressMerge(Vector256<short> mergeSource, Vector256<short> value, Vector256<short> mask);
            public static Vector256<byte> CompressMerge(Vector256<byte> mergeSource, Vector256<byte> value, Vector256<byte> mask);
            public static Vector256<sbyte> CompressMerge(Vector256<sbyte> mergeSource, Vector256<sbyte> value, Vector256<sbyte> mask);

            // 128-bit vectors.
            public static Vector128<ushort> CompressMerge(Vector128<ushort> mergeSource, Vector128<ushort> value, Vector128<ushort> mask);
            public static Vector128<short> CompressMerge(Vector128<short> mergeSource, Vector128<short> value, Vector128<short> mask);
            public static Vector128<byte> CompressMerge(Vector128<byte> mergeSource, Vector128<byte> value, Vector128<byte> mask);
            public static Vector128<sbyte> CompressMerge(Vector128<sbyte> mergeSource, Vector128<sbyte> value, Vector128<sbyte> mask);
        }
    }
}
API Usage
/// <summary>
/// Example: packs the non-NaN elements of <paramref name="v"/> into the low elements
/// of the result and fills every remaining upper element with π.
/// </summary>
/// <param name="v">The vector whose non-NaN elements are compressed.</param>
/// <returns>The compressed elements followed by π in the unused upper elements.</returns>
public static Vector512<float> A(Vector512<float> v)
{
    // An element compares as "ordered" with itself exactly when it is not NaN,
    // so this mask selects the non-NaN elements of v.
    Vector512<float> notNaN = Avx512F.Compare(v, v, FloatComparisonMode.OrderedNonSignaling);

    // Merge source: supplies the value of every element past the compressed ones.
    Vector512<float> filler = Vector512.Create(MathF.PI);

    // Intended codegen: vcompressps zmm1{k1}, zmm0
    return Avx512F.CompressMerge(filler, v, notNaN);
}
Alternative Designs
I wonder if it's better to overload Compress
with an additional mergeSource
argument.
Risks
None