
Add AVX and FMA intrinsics in Factorization Machine #3940


Merged 1 commit on Aug 6, 2019

13 changes: 6 additions & 7 deletions docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
@@ -1,5 +1,5 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netcoreapp2.1</TargetFramework>
<OutputType>Exe</OutputType>
@@ -8,7 +8,7 @@
<PublicSign>false</PublicSign>
<RootNamespace>Samples</RootNamespace>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\src\Microsoft.ML.LightGbm\Microsoft.ML.LightGbm.csproj" />
<ProjectReference Include="..\..\..\src\Microsoft.ML.Mkl.Components\Microsoft.ML.Mkl.Components.csproj" />
@@ -26,7 +26,6 @@
<NativeAssemblyReference Include="CpuMathNative" />
<NativeAssemblyReference Include="FastTreeNative" />
<NativeAssemblyReference Include="MatrixFactorizationNative" />
<NativeAssemblyReference Include="FactorizationMachineNative" />
<NativeAssemblyReference Include="LdaNative" />
<NativeAssemblyReference Include="SymSgdNative" />
<NativeAssemblyReference Include="MklProxyNative" />
@@ -71,7 +70,7 @@
<DependentUpon>LbfgsLogisticRegressionWithOptions.tt</DependentUpon>
</None>
<PackageReference Include="Microsoft.ML.TensorFlow.Redist" Version="0.10.0" />

</ItemGroup>

<ItemGroup>
@@ -949,19 +948,19 @@
<ItemGroup>
<PackageReference Include="Microsoft.ML.Onnx.TestModels" Version="$(MicrosoftMLOnnxTestModelsVersion)" />
</ItemGroup>

<ItemGroup>
<Content Include="$(ObjDir)DnnImageModels\ResNet18Onnx\ResNet18.onnx">
<Link>DnnImageModels\ResNet18Onnx\ResNet18.onnx</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>

<ItemGroup>
<Content Include="$(ObjDir)DnnImageModels\ResNetPrepOnnx\ResNetPreprocess.onnx">
<Link>DnnImageModels\ResNetPrepOnnx\ResNetPreprocess.onnx</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
5 changes: 2 additions & 3 deletions src/Microsoft.ML.Console/Microsoft.ML.Console.csproj
@@ -3,7 +3,7 @@
<PropertyGroup>
<TargetFramework>netcoreapp2.1</TargetFramework>
<OutputType>Exe</OutputType>
<AssemblyName>MML</AssemblyName>
<AssemblyName>MML</AssemblyName>
<StartupObject>Microsoft.ML.Tools.Console.Console</StartupObject>
</PropertyGroup>

@@ -30,12 +30,11 @@

<NativeAssemblyReference Include="FastTreeNative" />
<NativeAssemblyReference Include="CpuMathNative" />
<NativeAssemblyReference Include="FactorizationMachineNative" />
<NativeAssemblyReference Include="MatrixFactorizationNative" />
<NativeAssemblyReference Include="LdaNative" />
<NativeAssemblyReference Include="SymSgdNative"/>
<NativeAssemblyReference Include="MklImports"/>
<NativeAssemblyReference Condition="'$(OS)' == 'Windows_NT'" Include="libiomp5md"/>
</ItemGroup>

</Project>
@@ -146,7 +146,6 @@ private static bool ShouldSkipPath(string path)
case "cpumathnative.dll":
case "cqo.dll":
case "fasttreenative.dll":
case "factorizationmachinenative.dll":
case "libiomp5md.dll":
case "ldanative.dll":
case "libvw.dll":
1 change: 1 addition & 0 deletions src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -9,6 +9,7 @@

namespace Microsoft.ML.Internal.CpuMath
{
[BestFriend]
internal static partial class CpuMathUtils
{
// The count of bytes in Vector128<T>, corresponding to _cbAlign in AlignedArray
224 changes: 224 additions & 0 deletions src/Microsoft.ML.CpuMath/FactorizationMachine/AvxIntrinsics.cs
@@ -0,0 +1,224 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using Microsoft.ML.Internal.CpuMath.Core;

namespace Microsoft.ML.Internal.CpuMath.FactorizationMachine
{
internal static class AvxIntrinsics
{
private static readonly Vector256<float> _point5 = Vector256.Create(0.5f);

[MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
private static Vector256<float> MultiplyAdd(Vector256<float> src1, Vector256<float> src2, Vector256<float> src3)
{
if (Fma.IsSupported)
{
return Fma.MultiplyAdd(src1, src2, src3);
}
else
{
Vector256<float> product = Avx.Multiply(src1, src2);
return Avx.Add(product, src3);
}
}

[MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
private static Vector256<float> MultiplyAddNegated(Vector256<float> src1, Vector256<float> src2, Vector256<float> src3)
{
if (Fma.IsSupported)
{
return Fma.MultiplyAddNegated(src1, src2, src3);
}
else
{
Vector256<float> product = Avx.Multiply(src1, src2);
return Avx.Subtract(src3, product);
}
}

// This function implements Algorithm 1 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf.
// Compute the output value of the field-aware factorization, as the sum of the linear part and the latent part.
// The linear part is the inner product of linearWeights and featureValues.
// The latent part is the sum of all intra-field interactions in one field f, for all fields possible
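// Concretely, with q_{f,f'} = sum over features j in field f of v_{j,f'} * x_j (accumulated below), the latent part is
//     sum_{f < f'} <q_{f,f'}, q_{f',f}> + 0.5 * (sum_f <q_{f,f}, q_{f,f}> - sum_j x_j^2 * <v_{j,f_j}, v_{j,f_j}>),
// where f_j is feature j's own field; the last term removes the self-interactions that <q_{f,f}, q_{f,f}> would otherwise include.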
public static unsafe void CalculateIntermediateVariables(int* fieldIndices, int* featureIndices, float* featureValues,
float* linearWeights, float* latentWeights, float* latentSum, float* response, int fieldCount, int latentDim, int count)
{
Review comment (Member): Might be nice to assert that AVX is supported, given the function is dependent on it.

Contracts.Assert(Avx.IsSupported);

// The number of all possible fields.
int m = fieldCount;
int d = latentDim;
int c = count;
int* pf = fieldIndices;
int* pi = featureIndices;
float* px = featureValues;
float* pw = linearWeights;
float* pv = latentWeights;
float* pq = latentSum;
float linearResponse = 0;
float latentResponse = 0;

Unsafe.InitBlock(pq, 0, (uint)(m*m*d*sizeof(float)));

Vector256<float> y = Vector256<float>.Zero;
Vector256<float> tmp = Vector256<float>.Zero;

for (int i = 0; i < c; i++)
{
int f = pf[i];
int j = pi[i];
linearResponse += pw[j] * px[i];

Vector256<float> x = Avx.BroadcastScalarToVector256(px + i);
Vector256<float> xx = Avx.Multiply(x, x);

// tmp -= <v_j,f, v_j,f> * x * x
int vBias = j * m * d + f * d;

// j-th feature's latent vector in the f-th field hidden space.
float* vjf = pv + vBias;

for (int k = 0; k + 8 <= d; k += 8)
{
Vector256<float> vjfBuffer = Avx.LoadVector256(vjf + k);
tmp = MultiplyAddNegated(Avx.Multiply(vjfBuffer, vjfBuffer), xx, tmp);
}

for (int fprime = 0; fprime < m; fprime++)
{
vBias = j * m * d + fprime * d;
int qBias = f * m * d + fprime * d;
float* vjfprime = pv + vBias;
float* qffprime = pq + qBias;

// q_f,f' += v_j,f' * x
for (int k = 0; k + 8 <= d; k += 8)
{
Vector256<float> vjfprimeBuffer = Avx.LoadVector256(vjfprime + k);
Vector256<float> q = Avx.LoadVector256(qffprime + k);
q = MultiplyAdd(vjfprimeBuffer, x, q);
Avx.Store(qffprime + k, q);
Review comment (Member): What's the codegen for this sequence... I would hope it folds both loads, but I've seen some bad codegen here sometimes.

Reply (Author): Codegen:

movsxd rbx, eax
vmovups ymm3, ymmword ptr [r8+rbx*4]
movsxd rbx, eax
vmovups ymm4, ymmword ptr [r10+rbx*4]
vmulps ymm3, ymm3, ymm2
vaddps ymm4, ymm4, ymm3
movsxd rbx, eax
vmovups ymmword ptr [r10+rbx*4], ymm4

Reply (Author): @tannergooding Should this be created as an issue in coreclr?

Reply (Member): I already logged one for it a while back: https://github.com/dotnet/coreclr/issues/25008

}
}
}

for (int f = 0; f < m; f++)
{
// tmp += <q_f,f, q_f,f>
float* qff = pq + f * m * d + f * d;
for (int k = 0; k + 8 <= d; k += 8)
{
Vector256<float> qffBuffer = Avx.LoadVector256(qff + k);

// Intra-field interactions.
tmp = MultiplyAdd(qffBuffer, qffBuffer, tmp);
}

// y += <q_f,f', q_f',f>, f != f'
// This loop handles inter-field interactions because f != f'.
for (int fprime = f + 1; fprime < m; fprime++)
{
float* qffprime = pq + f * m * d + fprime * d;
float* qfprimef = pq + fprime * m * d + f * d;
for (int k = 0; k + 8 <= d; k += 8)
{
// Inter-field interaction.
Vector256<float> qffprimeBuffer = Avx.LoadVector256(qffprime + k);
Vector256<float> qfprimefBuffer = Avx.LoadVector256(qfprimef + k);
y = MultiplyAdd(qffprimeBuffer, qfprimefBuffer, y);
}
}
}

y = MultiplyAdd(_point5, tmp, y);
tmp = Avx.Add(y, Avx.Permute2x128(y, y, 1));
tmp = Avx.HorizontalAdd(tmp, tmp);
y = Avx.HorizontalAdd(tmp, tmp);
Review comment (Member): Why the difference here from the C++ code?

In C++ we have:

_tmp = _mm_add_ps(_y, _mm_movehl_ps(_y, _y));
_y = _mm_add_ps(_tmp, _mm_shuffle_ps(_tmp, _tmp, 1)); // The lowest slot is the response value.

But here, we have:

tmp = Avx.Add(y, Avx.Permute2x128(y, y, 1));
tmp = Avx.HorizontalAdd(tmp, tmp);
y = Avx.HorizontalAdd(tmp, tmp);

Every other line in this method reads line-for-line identical to the C++ code (besides the SSE vs AVX differences).

Reply (Author): There is no AVX equivalent instruction for _mm_movehl_ps(_y, _y). Permuting the 128-bit halves and doing the horizontal adds on the result (tmp) performs the same reduction.
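// Lane by lane: Permute2x128(y, y, 1) swaps the two 128-bit halves of y, so the Add above leaves y[0..3] + y[4..7]
// in the low half of tmp; the two HorizontalAdd calls then fold those four partial sums together, leaving the full
// sum in element 0, which is what StoreScalar writes out below.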

Sse.StoreScalar(&latentResponse, y.GetLower()); // The lowest slot is the response value.
*response = linearResponse + latentResponse;
}

// This function implements Algorithm 2 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
// Calculate the stochastic gradient and update the model.
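// For the linear term the ADAGRAD step is: g = weight * (lambdaLinear * w_j + slope * x_j), h_w[j] += g^2,
// w_j -= learningRate / sqrt(h_w[j]) * g. Each latent slot below follows the same pattern, with the loss part of the
// gradient built from slope * x_j * q_{f',f} (dropping the feature's own contribution to q when f' == f) plus the
// L2 term lambdaLatent * v_{j,f'}.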
public static unsafe void CalculateGradientAndUpdate(int* fieldIndices, int* featureIndices, float* featureValues, float* latentSum, float* linearWeights,
float* latentWeights, float* linearAccumulatedSquaredGrads, float* latentAccumulatedSquaredGrads, float lambdaLinear, float lambdaLatent, float learningRate,
int fieldCount, int latentDim, float weight, int count, float slope)
{
Contracts.Assert(Avx.IsSupported);

int m = fieldCount;
int d = latentDim;
int c = count;
int* pf = fieldIndices;
int* pi = featureIndices;
float* px = featureValues;
float* pq = latentSum;
float* pw = linearWeights;
float* pv = latentWeights;
float* phw = linearAccumulatedSquaredGrads;
float* phv = latentAccumulatedSquaredGrads;

Vector256<float> wei = Vector256.Create(weight);
Vector256<float> s = Vector256.Create(slope);
Vector256<float> lr = Vector256.Create(learningRate);
Vector256<float> lambdav = Vector256.Create(lambdaLatent);

for (int i = 0; i < count; i++)
{
int f = pf[i];
int j = pi[i];

// Calculate gradient of linear term w_j.
float g = weight * (lambdaLinear * pw[j] + slope * px[i]);

// Accumulate the gradient of the linear term.
phw[j] += g * g;

// Perform ADAGRAD update rule to adjust linear term.
pw[j] -= learningRate / MathF.Sqrt(phw[j]) * g;

// Update latent term, v_j,f', f'=1,...,m.
Vector256<float> x = Avx.BroadcastScalarToVector256(px + i);

for (int fprime = 0; fprime < m; fprime++)
{
float* vjfprime = pv + j * m * d + fprime * d;
float* hvjfprime = phv + j * m * d + fprime * d;
float* qfprimef = pq + fprime * m * d + f * d;
Vector256<float> sx = Avx.Multiply(s, x);

for (int k = 0; k + 8 <= d; k += 8)
{
Vector256<float> v = Avx.LoadVector256(vjfprime + k);
Vector256<float> q = Avx.LoadVector256(qfprimef + k);

// Calculate L2-norm regularization's gradient.
Vector256<float> gLatent = Avx.Multiply(lambdav, v);

Vector256<float> tmp = q;

// Calculate loss function's gradient.
if (fprime == f)
tmp = MultiplyAddNegated(v, x, q);
gLatent = MultiplyAdd(sx, tmp, gLatent);
gLatent = Avx.Multiply(wei, gLatent);

// Accumulate the gradient of latent vectors.
Vector256<float> h = MultiplyAdd(gLatent, gLatent, Avx.LoadVector256(hvjfprime + k));

// Perform ADAGRAD update rule to adjust latent vector.
v = MultiplyAddNegated(lr, Avx.Multiply(Avx.ReciprocalSqrt(h), gLatent), v);
Avx.Store(vjfprime + k, v);
Avx.Store(hvjfprime + k, h);
}
}
}
}
}
}
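For readers checking the vectorized loops against the paper, the following scalar sketch (illustrative only, not part of this diff; the method name is made up) computes the same response value. It visits every latent slot, so it matches the AVX code exactly only when latentDim is a multiple of 8, since the AVX loops above skip any tail.

internal static unsafe void CalculateIntermediateVariablesScalar(int* fieldIndices, int* featureIndices, float* featureValues,
    float* linearWeights, float* latentWeights, float* latentSum, float* response, int fieldCount, int latentDim, int count)
{
    int m = fieldCount, d = latentDim;
    float linear = 0, latent = 0;

    // q_{f,f'}[k] = sum over features j in field f of v_{j,f'}[k] * x_j
    new Span<float>(latentSum, m * m * d).Clear();
    for (int i = 0; i < count; i++)
    {
        int f = fieldIndices[i], j = featureIndices[i];
        float x = featureValues[i];
        linear += linearWeights[j] * x;

        for (int fprime = 0; fprime < m; fprime++)
            for (int k = 0; k < d; k++)
                latentSum[f * m * d + fprime * d + k] += latentWeights[j * m * d + fprime * d + k] * x;

        // Subtract the self-interaction of feature j, which <q_{f,f}, q_{f,f}> would otherwise include.
        for (int k = 0; k < d; k++)
        {
            float v = latentWeights[j * m * d + f * d + k];
            latent -= 0.5f * v * v * x * x;
        }
    }

    // 0.5 * <q_{f,f}, q_{f,f}> covers intra-field pairs; <q_{f,f'}, q_{f',f}> with f < f' covers inter-field pairs.
    for (int f = 0; f < m; f++)
        for (int fprime = f; fprime < m; fprime++)
            for (int k = 0; k < d; k++)
            {
                float scale = f == fprime ? 0.5f : 1f;
                latent += scale * latentSum[f * m * d + fprime * d + k] * latentSum[fprime * m * d + f * d + k];
            }

    *response = linear + latent;
}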
@@ -0,0 +1,40 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Runtime.InteropServices;
Review comment (Member): All new code files need a copyright header. You can copy one from an existing .cs file in the repo.

using System.Security;
using Microsoft.ML.Internal.CpuMath.Core;

namespace Microsoft.ML.Internal.CpuMath.FactorizationMachine
{
internal static unsafe partial class FieldAwareFactorizationMachineInterface
{
private const string NativePath = "CpuMathNative";
private const int CbAlign = 16;

private static bool Compat(AlignedArray a)
{
Contracts.AssertValue(a);
Contracts.Assert(a.Size > 0);
return a.CbAlign == CbAlign;
}

private static unsafe float* Ptr(AlignedArray a, float* p)
{
Contracts.AssertValue(a);
float* q = p + a.GetBase((long)p);
Contracts.Assert(((long)q & (CbAlign - 1)) == 0);
return q;
}

[DllImport(NativePath), SuppressUnmanagedCodeSecurity]
private static extern void CalculateIntermediateVariablesNative(int fieldCount, int latentDim, int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices,
float* /*const*/ featureValues, float* /*const*/ linearWeights, float* /*const*/ latentWeights, float* latentSum, float* response);

[DllImport(NativePath), SuppressUnmanagedCodeSecurity]
private static extern void CalculateGradientAndUpdateNative(float lambdaLinear, float lambdaLatent, float learningRate, int fieldCount, int latentDim, float weight,
int count, int* /*const*/ fieldIndices, int* /*const*/ featureIndices, float* /*const*/ featureValues, float* /*const*/ latentSum, float slope,
float* linearWeights, float* latentWeights, float* linearAccumulatedSquaredGrads, float* latentAccumulatedSquaredGrads);
}
}
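For context, the other side of this partial class (not shown in this diff) would typically pick the managed AVX implementation when the hardware supports it and otherwise fall back to the native export declared above. A sketch of that dispatch, with an illustrative method shape rather than the actual one (assumes using System.Runtime.Intrinsics.X86):

public static unsafe void CalculateIntermediateVariables(int fieldCount, int latentDim, int count, int* fieldIndices,
    int* featureIndices, float* featureValues, float* linearWeights, float* latentWeights, float* latentSum, float* response)
{
    if (Avx.IsSupported)
    {
        // Managed AVX/FMA path added by this change.
        AvxIntrinsics.CalculateIntermediateVariables(fieldIndices, featureIndices, featureValues,
            linearWeights, latentWeights, latentSum, response, fieldCount, latentDim, count);
    }
    else
    {
        // Existing native implementation in CpuMathNative.
        CalculateIntermediateVariablesNative(fieldCount, latentDim, count, fieldIndices, featureIndices,
            featureValues, linearWeights, latentWeights, latentSum, response);
    }
}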