Skip to content

Port all relevant AVX hardware intrinsics C# APIs from SIMD native algorithms #691

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Aug 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
0dab6d1
Implemented AVX intrinsics
briancylui Aug 10, 2018
3d76fb1
Implemented performance tests for AVX intrinsics, with some fixes to …
briancylui Aug 15, 2018
6a51bd8
Changes to perf tests in response to feedback
briancylui Aug 16, 2018
1b2cea9
Fixes across multiple files to make unit tests and perf tests work fo…
briancylui Aug 17, 2018
f471726
Implemented new AVX intrinsics that do not involve matrix operations,…
briancylui Aug 17, 2018
41d65f5
Implemented perf tests for AVX via CpuMathUtils class
briancylui Aug 17, 2018
8c34e87
Implemented switching logic for Vector128/256Alignment between SSE an…
briancylui Aug 17, 2018
ddeb655
Changed perf tests to reveal SSE and AVX intrinsics perf separately
briancylui Aug 17, 2018
c776fb0
Fixed access modifiers of private fields
briancylui Aug 17, 2018
df09fe3
Implemented all unit tests for AVX intrinsics that do not involve mat…
briancylui Aug 17, 2018
a9c481f
Implemented unit tests for AVX intrinsics
briancylui Aug 18, 2018
c692a6f
Fixed errors on the RffTransform.CfltAlign const-expression requirement
briancylui Aug 18, 2018
40528e4
Fixed Debug errors by making RffTransform.CfltAlign read-only
briancylui Aug 18, 2018
4d7d8ef
Fixed errors by making CfltAlign static (and read-only)
briancylui Aug 18, 2018
75e4cde
Developed two unit tests for netcoreapp and netstandard to deal with …
briancylui Aug 19, 2018
a763059
Kept only the most recent unit tests which are sufficient for both ne…
briancylui Aug 19, 2018
8bc8cc8
Respond to PR feedback: Style changes
briancylui Aug 20, 2018
f1664fa
Respond to PR feedback: More style changes
briancylui Aug 20, 2018
26ed884
Implemented class inheritance in perf tests to reduce overlapping code
briancylui Aug 20, 2018
31de895
Respond to PR feedback: Changed Sse/AvxIntrinsics from public to inte…
briancylui Aug 21, 2018
f07afb2
Respond to PR feedback: Used env vars to determine whether to use AVX…
briancylui Aug 21, 2018
9a9d272
Respond to PR feedback: Included 0 into consideration for parsing env…
briancylui Aug 21, 2018
c249d88
Respond to PR feedback: env vars, InternalsVisibleTo, and abstract
briancylui Aug 21, 2018
f606432
Respond to PR feedback: Added new comparer class specifically for MatMul
briancylui Aug 21, 2018
27ad829
Respond to PR feedback: Changes to intrinsics
briancylui Aug 23, 2018
0fd78a6
Respond to PR comment: Makes alignment checking consistent in externa…
briancylui Aug 23, 2018
3380ded
Respond to PR feedback: Refactored Sse/AvxIntrinsics helper functions
briancylui Aug 23, 2018
b8d63cc
Made two Sse/AvxIntrinsics helper functions about AlignedArray inline…
briancylui Aug 23, 2018
32a3704
Respond to PR feedback: styles for Vector256Alignment and Avx.GetLowe…
briancylui Aug 29, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,495 changes: 1,495 additions & 0 deletions src/Microsoft.ML.CpuMath/AvxIntrinsics.cs

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public static void AssertCompatible(ICpuFullMatrix values)
#if DEBUG
var mat = values as TMatrix;
Contracts.AssertValue(mat);
Contracts.Assert(mat.Items.CbAlign == CpuMathUtils.Vector128Alignment);
Contracts.Assert((mat.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0);
#endif
}

Expand All @@ -29,7 +29,7 @@ public static void AssertCompatible(ICpuVector values)
#if DEBUG
CpuAlignedVector vec = values as CpuAlignedVector;
Contracts.AssertValue(vec);
Contracts.Assert(vec.Items.CbAlign == CpuMathUtils.Vector128Alignment);
Contracts.Assert((vec.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0);
#endif
}

Expand Down
176 changes: 151 additions & 25 deletions src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System;

Expand All @@ -10,14 +11,38 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
public static partial class CpuMathUtils
{
// The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
public const int Vector128Alignment = 16;
private const int Vector128Alignment = 16;

// The count of bytes in Vector256<T>, corresponding to _cbAlign in AlignedArray
private const int Vector256Alignment = 32;

// The count of bytes in a 32-bit float, corresponding to _cbAlign in AlignedArray
private const int FloatAlignment = 4;

// Returns the byte alignment to use for vectorized operations on the current hardware:
// Vector256Alignment when AVX is supported, otherwise Vector128Alignment when SSE is supported.
// If neither AVX nor SSE is supported, return basic alignment for a 4-byte float.
[MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
public static int GetVectorAlignment()
=> Avx.IsSupported ? Vector256Alignment : (Sse.IsSupported ? Vector128Alignment : FloatAlignment);

public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
{
Contracts.Assert(mat.Size == dst.Size * src.Size);
Contracts.Assert(crun >= 0);

if (Sse.IsSupported)
if (Avx.IsSupported)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should ensure that using 256-bit operations is a net win for the entire application before "enabling" it by default. For certain workloads, it can downclock the processor, and while this generally results in higher throughput for the specific 256-bit workload, it can potentially cause a perf hit for any non-256-bit workloads that get executed before the processor goes back to its "highest" frequency level.

See section 15.26, "Skylake Server Power Management", of the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
image

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not always easy to predict whether a program's performance will improve from building it to target
Intel AVX-512 instructions. Programs that enjoy high performance gains from the use of xmm or ymm
registers may expect performance improvement by moving to the use of zmm registers. However, some
programs that use zmm registers may not gain as much, or may even lose performance. It is recommended
to try multiple build options and measure the performance of the program.

The same logic applies, but to a lesser extent, to AVX-256 instructions.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@briancylui - there is one end-to-end benchmark in the repo today - https://github.com/dotnet/machinelearning/tree/master/test/Microsoft.ML.Benchmarks. Can you try running it with/without your changes and post the results?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition to @tannergooding's concern, which end-to-end scenarios or applications will drive these optimizations? https://github.com/dotnet/machinelearning/tree/master/test/Microsoft.ML.Benchmarks lists one end-to-end benchmark, but iris.txt seems to be a small data set. Do we have more end-to-end scenarios to test?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we are currently formulating more benchmarks to add to the repo. See #711.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eerhardt: Feel free to check out the perf results for end-to-end perf scenarios on briancylui#7

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tannergooding suggested earlier that, in addition to checking Avx.IsSupported, we check whether the input array length is greater than a certain AvxLengthLimit. I could write some perf tests that take the input array length as a Param to determine what value AvxLengthLimit should take. The same applies to SseLengthLimit.

I think one of the problems with AVX that impacts perf right now is that we haven't implemented the "double-compute" approach for handling alignment that @tannergooding mentioned in briancylui#2 (bottom item). I look forward to implementing it after this PR, since it would significantly change the layout of every function.

{
if (!tran)
{
Contracts.Assert(crun <= dst.Size);
AvxIntrinsics.MatMulX(add, mat, src, dst, crun, src.Size);
}
else
{
Contracts.Assert(crun <= src.Size);
AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun);
}
}
else if (Sse.IsSupported)
{
if (!tran)
{
Expand Down Expand Up @@ -96,7 +121,20 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
Contracts.AssertNonEmpty(rgposSrc);
Contracts.Assert(crun >= 0);

if (Sse.IsSupported)
if (Avx.IsSupported)
{
if (!tran)
{
Contracts.Assert(crun <= dst.Size);
AvxIntrinsics.MatMulPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size);
}
else
{
Contracts.Assert(crun <= srcValues.Size);
AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size);
}
}
else if (Sse.IsSupported)
{
if (!tran)
{
Expand Down Expand Up @@ -170,7 +208,11 @@ public static void Add(float a, float[] dst, int count)

private static void Add(float a, Span<float> dst)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.AddScalarU(a, dst);
}
else if (Sse.IsSupported)
{
SseIntrinsics.AddScalarU(a, dst);
}
Expand Down Expand Up @@ -204,7 +246,11 @@ public static void Scale(float a, float[] dst, int offset, int count)

private static void Scale(float a, Span<float> dst)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.ScaleU(a, dst);
}
else if (Sse.IsSupported)
{
SseIntrinsics.ScaleU(a, dst);
}
Expand All @@ -231,7 +277,11 @@ public static void Scale(float a, float[] src, float[] dst, int count)

private static void Scale(float a, Span<float> src, Span<float> dst)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.ScaleSrcU(a, src, dst);
}
else if (Sse.IsSupported)
{
SseIntrinsics.ScaleSrcU(a, src, dst);
}
Expand All @@ -256,7 +306,11 @@ public static void ScaleAdd(float a, float b, float[] dst, int count)

private static void ScaleAdd(float a, float b, Span<float> dst)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.ScaleAddU(a, b, dst);
}
else if (Sse.IsSupported)
{
SseIntrinsics.ScaleAddU(a, b, dst);
}
Expand Down Expand Up @@ -295,7 +349,11 @@ public static void AddScale(float a, float[] src, float[] dst, int dstOffset, in

private static void AddScale(float a, Span<float> src, Span<float> dst)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.AddScaleU(a, src, dst);
}
else if (Sse.IsSupported)
{
SseIntrinsics.AddScaleU(a, src, dst);
}
Expand Down Expand Up @@ -339,7 +397,11 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in

private static void AddScale(float a, Span<float> src, Span<int> indices, Span<float> dst)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.AddScaleSU(a, src, indices, dst);
}
else if (Sse.IsSupported)
{
SseIntrinsics.AddScaleSU(a, src, indices, dst);
}
Expand Down Expand Up @@ -368,7 +430,11 @@ public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res,

private static void AddScaleCopy(float a, Span<float> src, Span<float> dst, Span<float> res)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.AddScaleCopyU(a, src, dst, res);
}
else if (Sse.IsSupported)
{
SseIntrinsics.AddScaleCopyU(a, src, dst, res);
}
Expand All @@ -394,7 +460,11 @@ public static void Add(float[] src, float[] dst, int count)

private static void Add(Span<float> src, Span<float> dst)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.AddU(src, dst);
}
else if (Sse.IsSupported)
{
SseIntrinsics.AddU(src, dst);
}
Expand Down Expand Up @@ -438,7 +508,11 @@ public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, i

private static void Add(Span<float> src, Span<int> indices, Span<float> dst)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.AddSU(src, indices, dst);
}
else if (Sse.IsSupported)
{
SseIntrinsics.AddSU(src, indices, dst);
}
Expand Down Expand Up @@ -467,7 +541,11 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c

private static void MulElementWise(Span<float> src1, Span<float> src2, Span<float> dst)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.MulElementWiseU(src1, src2, dst);
}
else if (Sse.IsSupported)
{
SseIntrinsics.MulElementWiseU(src1, src2, dst);
}
Expand Down Expand Up @@ -501,7 +579,11 @@ public static float Sum(float[] src, int offset, int count)

private static float Sum(Span<float> src)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return AvxIntrinsics.SumU(src);
}
else if (Sse.IsSupported)
{
return SseIntrinsics.SumU(src);
}
Expand Down Expand Up @@ -537,7 +619,11 @@ public static float SumSq(float[] src, int offset, int count)

private static float SumSq(Span<float> src)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return AvxIntrinsics.SumSqU(src);
}
else if (Sse.IsSupported)
{
return SseIntrinsics.SumSqU(src);
}
Expand All @@ -564,7 +650,11 @@ public static float SumSq(float mean, float[] src, int offset, int count)

private static float SumSq(float mean, Span<float> src)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return (mean == 0) ? AvxIntrinsics.SumSqU(src) : AvxIntrinsics.SumSqDiffU(mean, src);
}
else if (Sse.IsSupported)
{
return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src);
}
Expand Down Expand Up @@ -600,7 +690,11 @@ public static float SumAbs(float[] src, int offset, int count)

private static float SumAbs(Span<float> src)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return AvxIntrinsics.SumAbsU(src);
}
else if (Sse.IsSupported)
{
return SseIntrinsics.SumAbsU(src);
}
Expand All @@ -627,7 +721,11 @@ public static float SumAbs(float mean, float[] src, int offset, int count)

private static float SumAbs(float mean, Span<float> src)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return (mean == 0) ? AvxIntrinsics.SumAbsU(src) : AvxIntrinsics.SumAbsDiffU(mean, src);
}
else if (Sse.IsSupported)
{
return (mean == 0) ? SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src);
}
Expand Down Expand Up @@ -663,7 +761,11 @@ public static float MaxAbs(float[] src, int offset, int count)

private static float MaxAbs(Span<float> src)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return AvxIntrinsics.MaxAbsU(src);
}
else if (Sse.IsSupported)
{
return SseIntrinsics.MaxAbsU(src);
}
Expand Down Expand Up @@ -693,7 +795,11 @@ public static float MaxAbsDiff(float mean, float[] src, int count)

private static float MaxAbsDiff(float mean, Span<float> src)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return AvxIntrinsics.MaxAbsDiffU(mean, src);
}
else if (Sse.IsSupported)
{
return SseIntrinsics.MaxAbsDiffU(mean, src);
}
Expand Down Expand Up @@ -737,7 +843,11 @@ public static float DotProductDense(float[] a, int offset, float[] b, int count)

private static float DotProductDense(Span<float> a, Span<float> b)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return AvxIntrinsics.DotU(a, b);
}
else if (Sse.IsSupported)
{
return SseIntrinsics.DotU(a, b);
}
Expand Down Expand Up @@ -784,7 +894,11 @@ public static float DotProductSparse(float[] a, int offset, float[] b, int[] ind

private static float DotProductSparse(Span<float> a, Span<float> b, Span<int> indices)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return AvxIntrinsics.DotSU(a, b, indices);
}
else if (Sse.IsSupported)
{
return SseIntrinsics.DotSU(a, b, indices);
}
Expand Down Expand Up @@ -813,7 +927,11 @@ public static float L2DistSquared(float[] a, float[] b, int count)

private static float L2DistSquared(Span<float> a, Span<float> b)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
return AvxIntrinsics.Dist2(a, b);
}
else if (Sse.IsSupported)
{
return SseIntrinsics.Dist2(a, b);
}
Expand Down Expand Up @@ -909,7 +1027,11 @@ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src

private static void SdcaL1UpdateDense(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
}
else if (Sse.IsSupported)
{
SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
}
Expand Down Expand Up @@ -943,7 +1065,11 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr

private static void SdcaL1UpdateSparse(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
{
if (Sse.IsSupported)
if (Avx.IsSupported)
{
AvxIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
}
else if (Sse.IsSupported)
{
SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
}
Expand Down
8 changes: 7 additions & 1 deletion src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Runtime.CompilerServices;

namespace Microsoft.ML.Runtime.Internal.CpuMath
{
public static partial class CpuMathUtils
{
// The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
public const int Vector128Alignment = 16;
private const int Vector128Alignment = 16;

// Returns the byte alignment to use for vectorized operations.
// In this netstandard file it is always Vector128Alignment; no hardware check is performed here.
[MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
public static int GetVectorAlignment()
=> Vector128Alignment;

// Forwards all arguments unchanged to SseUtils.MatTimesSrc.
public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun);

Expand Down
Loading