dotnet · safern · Aug 29, 2018 · Aug 10, 2018 · Aug 15, 2018 · Aug 16, 2018
diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
diff --git a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs
@@ -16,7 +16,7 @@ public static void AssertCompatible(ICpuFullMatrix values)
 #if DEBUG
             var mat = values as TMatrix;
             Contracts.AssertValue(mat);
-            Contracts.Assert(mat.Items.CbAlign == CpuMathUtils.Vector128Alignment);
+            Contracts.Assert((mat.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0);
 #endif
         }
 
@@ -29,7 +29,7 @@ public static void AssertCompatible(ICpuVector values)
 #if DEBUG
             CpuAlignedVector vec = values as CpuAlignedVector;
             Contracts.AssertValue(vec);
-            Contracts.Assert(vec.Items.CbAlign == CpuMathUtils.Vector128Alignment);
+            Contracts.Assert((vec.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0);
 #endif
         }
 

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics.X86;
 using System;
 
@@ -10,14 +11,38 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
     public static partial class CpuMathUtils
     {
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
-        public const int Vector128Alignment = 16;
+        private const int Vector128Alignment = 16;
+
+        // The count of bytes in Vector256<T>, corresponding to _cbAlign in AlignedArray
+        private const int Vector256Alignment = 32;
+
+        // The count of bytes in a 32-bit float, corresponding to _cbAlign in AlignedArray
+        private const int FloatAlignment = 4;
+
+        // Alignment in bytes for the widest supported instruction set: 32 (AVX), 16 (SSE), otherwise 4 (one float).
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int GetVectorAlignment()
+            => Avx.IsSupported ? Vector256Alignment : Sse.IsSupported ? Vector128Alignment : FloatAlignment;
 
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
         {
             Contracts.Assert(mat.Size == dst.Size * src.Size);
             Contracts.Assert(crun >= 0);
 
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                if (!tran)
+                {
+                    Contracts.Assert(crun <= dst.Size);
+                    AvxIntrinsics.MatMulX(add, mat, src, dst, crun, src.Size);
+                }
+                else
+                {
+                    Contracts.Assert(crun <= src.Size);
+                    AvxIntrinsics.MatMulTranX(add, mat, src, dst, dst.Size, crun);
+                }
+            }
+            else if (Sse.IsSupported)
             {
                 if (!tran)
                 {
@@ -96,7 +121,20 @@ public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgpo
             Contracts.AssertNonEmpty(rgposSrc);
             Contracts.Assert(crun >= 0);
 
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                if (!tran)
+                {
+                    Contracts.Assert(crun <= dst.Size);
+                    AvxIntrinsics.MatMulPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size);
+                }
+                else
+                {
+                    Contracts.Assert(crun <= srcValues.Size);
+                    AvxIntrinsics.MatMulTranPX(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size);
+                }
+            }
+            else if (Sse.IsSupported)
             {
                 if (!tran)
                 {
@@ -170,7 +208,11 @@ public static void Add(float a, float[] dst, int count)
 
         private static void Add(float a, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddScalarU(a, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScalarU(a, dst);
             }
@@ -204,7 +246,11 @@ public static void Scale(float a, float[] dst, int offset, int count)
 
         private static void Scale(float a, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.ScaleU(a, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.ScaleU(a, dst);
             }
@@ -231,7 +277,11 @@ public static void Scale(float a, float[] src, float[] dst, int count)
 
         private static void Scale(float a, Span<float> src, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.ScaleSrcU(a, src, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.ScaleSrcU(a, src, dst);
             }
@@ -256,7 +306,11 @@ public static void ScaleAdd(float a, float b, float[] dst, int count)
 
         private static void ScaleAdd(float a, float b, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.ScaleAddU(a, b, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.ScaleAddU(a, b, dst);
             }
@@ -295,7 +349,11 @@ public static void AddScale(float a, float[] src, float[] dst, int dstOffset, in
 
         private static void AddScale(float a, Span<float> src, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddScaleU(a, src, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScaleU(a, src, dst);
             }
@@ -339,7 +397,11 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in
 
         private static void AddScale(float a, Span<float> src, Span<int> indices, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddScaleSU(a, src, indices, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScaleSU(a, src, indices, dst);
             }
@@ -368,7 +430,11 @@ public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res,
 
         private static void AddScaleCopy(float a, Span<float> src, Span<float> dst, Span<float> res)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddScaleCopyU(a, src, dst, res);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddScaleCopyU(a, src, dst, res);
             }
@@ -394,7 +460,11 @@ public static void Add(float[] src, float[] dst, int count)
 
         private static void Add(Span<float> src, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddU(src, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddU(src, dst);
             }
@@ -438,7 +508,11 @@ public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, i
 
         private static void Add(Span<float> src, Span<int> indices, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.AddSU(src, indices, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.AddSU(src, indices, dst);
             }
@@ -467,7 +541,11 @@ public static void MulElementWise(float[] src1, float[] src2, float[] dst, int c
 
         private static void MulElementWise(Span<float> src1, Span<float> src2, Span<float> dst)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.MulElementWiseU(src1, src2, dst);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.MulElementWiseU(src1, src2, dst);
             }
@@ -501,7 +579,11 @@ public static float Sum(float[] src, int offset, int count)
 
         private static float Sum(Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.SumU(src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.SumU(src);
             }
@@ -537,7 +619,11 @@ public static float SumSq(float[] src, int offset, int count)
 
         private static float SumSq(Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.SumSqU(src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.SumSqU(src);
             }
@@ -564,7 +650,11 @@ public static float SumSq(float mean, float[] src, int offset, int count)
 
         private static float SumSq(float mean, Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return (mean == 0) ? AvxIntrinsics.SumSqU(src) : AvxIntrinsics.SumSqDiffU(mean, src);
+            }
+            else if (Sse.IsSupported)
             {
                 return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src);
             }
@@ -600,7 +690,11 @@ public static float SumAbs(float[] src, int offset, int count)
 
         private static float SumAbs(Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.SumAbsU(src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.SumAbsU(src);
             }
@@ -627,7 +721,11 @@ public static float SumAbs(float mean, float[] src, int offset, int count)
 
         private static float SumAbs(float mean, Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return (mean == 0) ? AvxIntrinsics.SumAbsU(src) : AvxIntrinsics.SumAbsDiffU(mean, src);
+            }
+            else if (Sse.IsSupported)
             {
                 return (mean == 0) ? SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src);
             }
@@ -663,7 +761,11 @@ public static float MaxAbs(float[] src, int offset, int count)
 
         private static float MaxAbs(Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.MaxAbsU(src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.MaxAbsU(src);
             }
@@ -693,7 +795,11 @@ public static float MaxAbsDiff(float mean, float[] src, int count)
 
         private static float MaxAbsDiff(float mean, Span<float> src)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.MaxAbsDiffU(mean, src);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.MaxAbsDiffU(mean, src);
             }
@@ -737,7 +843,11 @@ public static float DotProductDense(float[] a, int offset, float[] b, int count)
 
         private static float DotProductDense(Span<float> a, Span<float> b)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.DotU(a, b);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.DotU(a, b);
             }
@@ -784,7 +894,11 @@ public static float DotProductSparse(float[] a, int offset, float[] b, int[] ind
 
         private static float DotProductSparse(Span<float> a, Span<float> b, Span<int> indices)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.DotSU(a, b, indices);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.DotSU(a, b, indices);
             }
@@ -813,7 +927,11 @@ public static float L2DistSquared(float[] a, float[] b, int count)
 
         private static float L2DistSquared(Span<float> a, Span<float> b)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                return AvxIntrinsics.Dist2(a, b);
+            }
+            else if (Sse.IsSupported)
             {
                 return SseIntrinsics.Dist2(a, b);
             }
@@ -909,7 +1027,11 @@ public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src
 
         private static void SdcaL1UpdateDense(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
             }
@@ -943,7 +1065,11 @@ public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] sr
 
         private static void SdcaL1UpdateSparse(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
         {
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
+            {
+                AvxIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
+            }
+            else if (Sse.IsSupported)
             {
                 SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
             }

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -2,12 +2,18 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+using System.Runtime.CompilerServices;
+
 namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
-        public const int Vector128Alignment = 16;
+        private const int Vector128Alignment = 16;
+
+        // Always the Vector128<T> alignment (16 bytes) in this build; no runtime instruction-set check is made here.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int GetVectorAlignment() => Vector128Alignment;
 
         public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun);