dotnet · pkumar07 · May 28, 2019 · May 30, 2019 · wschin · May 30, 2019
diff --git a/build/Dependencies.props b/build/Dependencies.props
@@ -7,6 +7,7 @@
     <SystemCollectionsImmutableVersion>1.5.0</SystemCollectionsImmutableVersion>
     <SystemMemoryVersion>4.5.1</SystemMemoryVersion>
     <SystemReflectionEmitLightweightPackageVersion>4.3.0</SystemReflectionEmitLightweightPackageVersion>
+    <SystemRuntimeCompilerServices>4.5.2</SystemRuntimeCompilerServices>
     <SystemThreadingTasksDataflowPackageVersion>4.8.0</SystemThreadingTasksDataflowPackageVersion>
   </PropertyGroup>
 

diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -9,6 +9,7 @@
 
 namespace Microsoft.ML.Internal.CpuMath
 {
+    [BestFriend]
     internal static partial class CpuMathUtils
     {
         // The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray

diff --git a/...soft.ML.StandardTrainers/FactorizationMachine/FactorizationMachineInterface.netcoreapp.cs b/...soft.ML.StandardTrainers/FactorizationMachine/FactorizationMachineInterface.netcoreapp.cs
@@ -0,0 +1,125 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Security;
+using Microsoft.ML.Internal.CpuMath;
+using Microsoft.ML.Runtime;
+
+namespace Microsoft.ML.Trainers
+{
+    internal static unsafe class FieldAwareFactorizationMachineInterface
+    {
+        internal const string NativePath = "FactorizationMachineNative";
+        public const int CbAlign = 16;
+
+        // Runs the AssertValue and positive-Size contract assertions on the array, then reports
+        // whether its CbAlign equals this class's CbAlign constant.
+        private static bool Compat(AlignedArray a)
+        {
+            Contracts.AssertValue(a);
+            Contracts.Assert(0 < a.Size);
+
+            bool alignmentMatches = CbAlign == a.CbAlign;
+            return alignmentMatches;
+        }
+
+        // Advances the pinned pointer p by a.GetBase(<address of p>) float elements and asserts
+        // that the resulting address is a multiple of CbAlign before returning it.
+        private static unsafe float* Ptr(AlignedArray a, float* p)
+        {
+            Contracts.AssertValue(a);
+
+            const int alignmentMask = CbAlign - 1;
+            long pinnedAddress = (long)p;
+            float* aligned = p + a.GetBase(pinnedAddress);
+
+            // The low bits selected by the mask must all be zero for an aligned address.
+            Contracts.Assert(((long)aligned & alignmentMask) == 0);
+            return aligned;
+        }
+
+        [DllImport(NativePath), SuppressUnmanagedCodeSecurity] // Bound to the library named by NativePath; CalculateIntermediateVariables calls this one when neither Fma nor Avx reports support.
+        public static extern void CalculateIntermediateVariablesNativeSSE(int fieldCount, int latentDim, int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices,
+            float* /*const*/ featureValues, float* /*const*/ linearWeights, float* /*const*/ latentWeights, float* latentSum, float* response); // latentSum and response are the only pointers without a /*const*/ marker.
+
+        [DllImport(NativePath), SuppressUnmanagedCodeSecurity] // Same parameter list as the SSE entry point; called when Fma.IsSupported is false and Avx.IsSupported is true.
+        public static extern void CalculateIntermediateVariablesNativeAVX(int fieldCount, int latentDim, int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices,
+            float* /*const*/ featureValues, float* /*const*/ linearWeights, float* /*const*/ latentWeights, float* latentSum, float* response);
+
+        [DllImport(NativePath), SuppressUnmanagedCodeSecurity] // Same parameter list as the SSE entry point; called when Fma.IsSupported is true.
+        public static extern void CalculateIntermediateVariablesNativeFMA(int fieldCount, int latentDim, int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices,
+            float* /*const*/ featureValues, float* /*const*/ linearWeights, float* /*const*/ latentWeights, float* latentSum, float* response);
+
+        [DllImport(NativePath), SuppressUnmanagedCodeSecurity] // Bound to the library named by NativePath; CalculateGradientAndUpdate calls this one when neither Fma nor Avx reports support.
+        public static extern void CalculateGradientAndUpdateNativeSSE(float lambdaLinear, float lambdaLatent, float learningRate, int fieldCount, int latentDim, float weight,
+            int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices, float* /*const*/ featureValues, float* /*const*/ latentSum, float slope,
+            float* linearWeights, float* latentWeights, float* linearAccumulatedSquaredGrads, float* latentAccumulatedSquaredGrads); // The four pointers on this line carry no /*const*/ marker.
+
+        [DllImport(NativePath), SuppressUnmanagedCodeSecurity] // Same parameter list as the SSE entry point; called when Fma.IsSupported is false and Avx.IsSupported is true.
+        public static extern void CalculateGradientAndUpdateNativeAVX(float lambdaLinear, float lambdaLatent, float learningRate, int fieldCount, int latentDim, float weight,
+            int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices, float* /*const*/ featureValues, float* /*const*/ latentSum, float slope,
+            float* linearWeights, float* latentWeights, float* linearAccumulatedSquaredGrads, float* latentAccumulatedSquaredGrads);
+
+        [DllImport(NativePath), SuppressUnmanagedCodeSecurity] // Same parameter list as the SSE entry point; called when Fma.IsSupported is true.
+        public static extern void CalculateGradientAndUpdateNativeFMA(float lambdaLinear, float lambdaLatent, float learningRate, int fieldCount, int latentDim, float weight,
+            int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices, float* /*const*/ featureValues, float* /*const*/ latentSum, float slope,
+            float* linearWeights, float* latentWeights, float* linearAccumulatedSquaredGrads, float* latentAccumulatedSquaredGrads);
+
+        // Pins the managed buffers and forwards them to one of the native
+        // CalculateIntermediateVariablesNative* entry points, picked by hardware support:
+        // FMA when Fma.IsSupported, else AVX when Avx.IsSupported, else SSE.
+        // latentWeights and latentSum are passed as the addresses returned by Ptr();
+        // response is passed by address.
+        public static void CalculateIntermediateVariables(int fieldCount, int latentDim, int count, int[] fieldIndices, int[] featureIndices, float[] featureValues,
+            float[] linearWeights, AlignedArray latentWeights, AlignedArray latentSum, ref float response)
+        {
+            Contracts.AssertNonEmpty(fieldIndices);
+            Contracts.AssertNonEmpty(featureValues);
+            Contracts.AssertNonEmpty(featureIndices);
+            Contracts.AssertNonEmpty(linearWeights);
+            Contracts.Assert(Compat(latentWeights));
+            Contracts.Assert(Compat(latentSum));
+
+            // The enclosing class is declared unsafe, so pointers can be used here directly.
+            fixed (int* fieldPtr = &fieldIndices[0])
+            fixed (int* featureIndexPtr = &featureIndices[0])
+            fixed (float* featureValuePtr = &featureValues[0])
+            fixed (float* linearWeightPtr = &linearWeights[0])
+            fixed (float* latentWeightBase = &latentWeights.Items[0])
+            fixed (float* latentSumBase = &latentSum.Items[0])
+            fixed (float* responsePtr = &response)
+            {
+                // Resolved once, in the same order the native call arguments use them.
+                float* alignedLatentWeights = Ptr(latentWeights, latentWeightBase);
+                float* alignedLatentSum = Ptr(latentSum, latentSumBase);
+
+                if (Fma.IsSupported)
+                {
+                    CalculateIntermediateVariablesNativeFMA(fieldCount, latentDim, count, fieldPtr, featureIndexPtr, featureValuePtr, linearWeightPtr,
+                        alignedLatentWeights, alignedLatentSum, responsePtr);
+                }
+                else if (Avx.IsSupported)
+                {
+                    CalculateIntermediateVariablesNativeAVX(fieldCount, latentDim, count, fieldPtr, featureIndexPtr, featureValuePtr, linearWeightPtr,
+                        alignedLatentWeights, alignedLatentSum, responsePtr);
+                }
+                else
+                {
+                    CalculateIntermediateVariablesNativeSSE(fieldCount, latentDim, count, fieldPtr, featureIndexPtr, featureValuePtr, linearWeightPtr,
+                        alignedLatentWeights, alignedLatentSum, responsePtr);
+                }
+            }
+        }
+
+        // Pins the managed buffers and forwards them to one of the native
+        // CalculateGradientAndUpdateNative* entry points, picked by hardware support:
+        // FMA when Fma.IsSupported, else AVX when Avx.IsSupported, else SSE.
+        // latentSum, latentWeights and latentAccumulatedSquaredGrads are passed as the
+        // addresses returned by Ptr(); the plain float[] buffers are passed from element 0.
+        public static void CalculateGradientAndUpdate(float lambdaLinear, float lambdaLatent, float learningRate, int fieldCount, int latentDim,
+            float weight, int count, int[] fieldIndices, int[] featureIndices, float[] featureValues, AlignedArray latentSum, float slope,
+            float[] linearWeights, AlignedArray latentWeights, float[] linearAccumulatedSquaredGrads, AlignedArray latentAccumulatedSquaredGrads)
+        {
+            Contracts.AssertNonEmpty(fieldIndices);
+            Contracts.AssertNonEmpty(featureIndices);
+            Contracts.AssertNonEmpty(featureValues);
+            Contracts.Assert(Compat(latentSum));
+            Contracts.AssertNonEmpty(linearWeights);
+            Contracts.Assert(Compat(latentWeights));
+            Contracts.AssertNonEmpty(linearAccumulatedSquaredGrads);
+            Contracts.Assert(Compat(latentAccumulatedSquaredGrads));
+
+            // The enclosing class is declared unsafe, so pointers can be used here directly.
+            fixed (int* fieldPtr = &fieldIndices[0])
+            fixed (int* featureIndexPtr = &featureIndices[0])
+            fixed (float* featureValuePtr = &featureValues[0])
+            fixed (float* latentSumBase = &latentSum.Items[0])
+            fixed (float* linearWeightPtr = &linearWeights[0])
+            fixed (float* latentWeightBase = &latentWeights.Items[0])
+            fixed (float* linearGradPtr = &linearAccumulatedSquaredGrads[0])
+            fixed (float* latentGradBase = &latentAccumulatedSquaredGrads.Items[0])
+            {
+                // Resolved once, in the same order the native call arguments use them.
+                float* alignedLatentSum = Ptr(latentSum, latentSumBase);
+                float* alignedLatentWeights = Ptr(latentWeights, latentWeightBase);
+                float* alignedLatentGrads = Ptr(latentAccumulatedSquaredGrads, latentGradBase);
+
+                if (Fma.IsSupported)
+                {
+                    CalculateGradientAndUpdateNativeFMA(lambdaLinear, lambdaLatent, learningRate, fieldCount, latentDim, weight, count,
+                        fieldPtr, featureIndexPtr, featureValuePtr, alignedLatentSum, slope,
+                        linearWeightPtr, alignedLatentWeights, linearGradPtr, alignedLatentGrads);
+                }
+                else if (Avx.IsSupported)
+                {
+                    CalculateGradientAndUpdateNativeAVX(lambdaLinear, lambdaLatent, learningRate, fieldCount, latentDim, weight, count,
+                        fieldPtr, featureIndexPtr, featureValuePtr, alignedLatentSum, slope,
+                        linearWeightPtr, alignedLatentWeights, linearGradPtr, alignedLatentGrads);
+                }
+                else
+                {
+                    CalculateGradientAndUpdateNativeSSE(lambdaLinear, lambdaLatent, learningRate, fieldCount, latentDim, weight, count,
+                        fieldPtr, featureIndexPtr, featureValuePtr, alignedLatentSum, slope,
+                        linearWeightPtr, alignedLatentWeights, linearGradPtr, alignedLatentGrads);
+                }
+            }
+        }
+    }
+}
diff --git a/...nMachine/FactorizationMachineInterface.cs → ...torizationMachineInterface.netstandard.cs b/...nMachine/FactorizationMachineInterface.cs → ...torizationMachineInterface.netstandard.cs
@@ -30,11 +30,11 @@ private static bool Compat(AlignedArray a)
         }
 
         [DllImport(NativePath), SuppressUnmanagedCodeSecurity]
-        public static extern void CalculateIntermediateVariablesNative(int fieldCount, int latentDim, int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices,
+        public static extern void CalculateIntermediateVariablesNativeSSE(int fieldCount, int latentDim, int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices,
             float* /*const*/ featureValues, float* /*const*/ linearWeights, float* /*const*/ latentWeights, float* latentSum, float* response);
 
         [DllImport(NativePath), SuppressUnmanagedCodeSecurity]
-        public static extern void CalculateGradientAndUpdateNative(float lambdaLinear, float lambdaLatent, float learningRate, int fieldCount, int latentDim, float weight,
+        public static extern void CalculateGradientAndUpdateNativeSSE(float lambdaLinear, float lambdaLatent, float learningRate, int fieldCount, int latentDim, float weight,
             int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices, float* /*const*/ featureValues, float* /*const*/ latentSum, float slope,
             float* linearWeights, float* latentWeights, float* linearAccumulatedSquaredGrads, float* latentAccumulatedSquaredGrads);
 
@@ -57,7 +57,7 @@ public static void CalculateIntermediateVariables(int fieldCount, int latentDim,
                 fixed (float* pv = &latentWeights.Items[0])
                 fixed (float* pq = &latentSum.Items[0])
                 fixed (float* pr = &response)
-                    CalculateIntermediateVariablesNative(fieldCount, latentDim, count, pf, pi, px, pw, Ptr(latentWeights, pv), Ptr(latentSum, pq), pr);
+                    CalculateIntermediateVariablesNativeSSE(fieldCount, latentDim, count, pf, pi, px, pw, Ptr(latentWeights, pv), Ptr(latentSum, pq), pr);
             }
         }
 
@@ -84,10 +84,10 @@ public static void CalculateGradientAndUpdate(float lambdaLinear, float lambdaLa
                 fixed (float* pv = &latentWeights.Items[0])
                 fixed (float* phw = &linearAccumulatedSquaredGrads[0])
                 fixed (float* phv = &latentAccumulatedSquaredGrads.Items[0])
-                    CalculateGradientAndUpdateNative(lambdaLinear, lambdaLatent, learningRate, fieldCount, latentDim, weight, count, pf, pi, px,
+                    CalculateGradientAndUpdateNativeSSE(lambdaLinear, lambdaLatent, learningRate, fieldCount, latentDim, weight, count, pf, pi, px,
                         Ptr(latentSum, pq), slope, pw, Ptr(latentWeights, pv), phw, Ptr(latentAccumulatedSquaredGrads, phv));
             }
 
         }
     }
-}
+}
diff --git a/src/Microsoft.ML.StandardTrainers/Microsoft.ML.StandardTrainers.csproj b/src/Microsoft.ML.StandardTrainers/Microsoft.ML.StandardTrainers.csproj
@@ -1,7 +1,8 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-    <TargetFramework>netstandard2.0</TargetFramework>
+    <TargetFramework Condition="'$(UseIntrinsics)' != 'true'">netstandard2.0</TargetFramework>
+    <TargetFrameworks Condition="'$(UseIntrinsics)' == 'true'">netstandard2.0;netcoreapp3.0</TargetFrameworks>
     <IncludeInPackage>Microsoft.ML</IncludeInPackage>
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   </PropertyGroup>
@@ -10,6 +11,17 @@
     <ProjectReference Include="..\Microsoft.ML.Core\Microsoft.ML.Core.csproj" />
     <ProjectReference Include="..\Microsoft.ML.CpuMath\Microsoft.ML.CpuMath.csproj" />
     <ProjectReference Include="..\Microsoft.ML.Data\Microsoft.ML.Data.csproj" />
+    <!-- Workaround https://github.com/dotnet/project-system/issues/935 -->
+    <None Include="**/*.cs" />
+  </ItemGroup>
+
+  <ItemGroup Condition="'$(TargetFramework)' == 'netcoreapp3.0'"> <!-- netcoreapp3.0 build: compile the .netcoreapp.cs interface, so drop the .netstandard.cs one. -->
+    <Compile Remove="FactorizationMachine/FactorizationMachineInterface.netstandard.cs" />
+    <PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="$(SystemRuntimeCompilerServices)" /> <!-- Version comes from the SystemRuntimeCompilerServices property in build/Dependencies.props. NOTE(review): neighbouring properties there end in "Version"/"PackageVersion"; consider renaming for consistency. -->
+  </ItemGroup>
+
+  <ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'"> <!-- netstandard2.0 build: compile the .netstandard.cs interface, so drop the .netcoreapp.cs one. -->
+    <Compile Remove="FactorizationMachine/FactorizationMachineInterface.netcoreapp.cs" />
   </ItemGroup>
 
 </Project>
diff --git a/src/Native/FactorizationMachineNative/CMakeLists.txt b/src/Native/FactorizationMachineNative/CMakeLists.txt
@@ -1,12 +1,16 @@
 project (FactorizationMachineNative)
 
 set(SOURCES
-    FactorizationMachineCore.cpp
+    FactorizationMachineCoreSSE.cpp
+    FactorizationMachineCoreAVX.cpp
+    FactorizationMachineCoreFMA.cpp
 )
 
 if(WIN32)
 else()
     list(APPEND SOURCES ${VERSION_FILE_PATH})
+    set_property(SOURCE FactorizationMachineCoreAVX.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -mavx") # Per-file flag: only this source file gets -mavx, and only in the non-WIN32 branch.
+    set_property(SOURCE FactorizationMachineCoreFMA.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -mfma") # Per-file flag: only this source file gets -mfma; the WIN32 branch above sets no equivalent option.
 endif()
 
 add_library(FactorizationMachineNative SHARED ${SOURCES} ${RESOURCES})