Skip to content

Commit b5975a3

Browse files
Merge pull request #1399 from SixLabors/js/avx2-premultiplication
Add Avx2 Vector4 Span Premultiplication and Reverse
2 parents b577d8e + f1959f3 commit b5975a3

File tree

6 files changed

+228
-10
lines changed

6 files changed

+228
-10
lines changed

src/ImageSharp/Common/Helpers/ImageMaths.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,12 @@ public static int LeastCommonMultiple(int a, int b)
132132
return (a / GreatestCommonDivisor(a, b)) * b;
133133
}
134134

135+
/// <summary>
/// Calculates <paramref name="x"/> % 2 using a bitwise AND with 1.
/// </summary>
/// <remarks>
/// The result matches the <c>%</c> operator for non-negative values only: a negative odd
/// <paramref name="x"/> yields 1 here, whereas <c>x % 2</c> yields -1.
/// </remarks>
/// <param name="x">The value to reduce.</param>
/// <returns>The lowest bit of <paramref name="x"/>, i.e. 0 or 1.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Modulo2(int x)
{
    return x & 1;
}
140+
135141
/// <summary>
136142
/// Calculates <paramref name="x"/> % 4
137143
/// </summary>

src/ImageSharp/Common/Helpers/Vector4Utilities.cs

Lines changed: 69 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
using System.Numerics;
66
using System.Runtime.CompilerServices;
77
using System.Runtime.InteropServices;
8+
#if SUPPORTS_RUNTIME_INTRINSICS
9+
using System.Runtime.Intrinsics;
10+
using System.Runtime.Intrinsics.X86;
11+
#endif
812

913
namespace SixLabors.ImageSharp
1014
{
@@ -13,6 +17,9 @@ namespace SixLabors.ImageSharp
1317
/// </summary>
1418
internal static class Vector4Utilities
1519
{
20+
// Blend mask: bits 3 and 7 are set, so Avx.Blend takes elements 3 and 7 (the W component of each packed Vector4) from its second operand.
private const int BlendAlphaControl = 0b_10_00_10_00;
21+
// Shuffle mask: every 2-bit selector is 3, so Avx.Shuffle replicates element 3 (W) across each 128-bit lane.
private const int ShuffleAlphaControl = 0b_11_11_11_11;
22+
1623
/// <summary>
1724
/// Restricts a vector between a minimum and a maximum value.
1825
/// 5x Faster than <see cref="Vector4.Clamp(Vector4, Vector4, Vector4)"/>.
@@ -56,13 +63,39 @@ public static void UnPremultiply(ref Vector4 source)
5663
[MethodImpl(InliningOptions.ShortMethod)]
public static void Premultiply(Span<Vector4> vectors)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    // Every intrinsic used below (Shuffle, Multiply, Blend on Vector256<float>) belongs to AVX,
    // so AVX support is the correct gate. Requiring AVX2 sent AVX-only CPUs down the scalar path.
    if (Avx.IsSupported && vectors.Length >= 2)
    {
        // Reinterpret the span so that each Vector256<float> covers two consecutive Vector4 values.
        ref Vector256<float> vectorsBase =
            ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));

        // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
        ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));

        while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
        {
            Vector256<float> source = vectorsBase;

            // Replicate W across all four lanes of each packed Vector4.
            Vector256<float> alpha = Avx.Shuffle(source, source, ShuffleAlphaControl);

            // Scale every component by alpha, then restore the original W lanes from the source.
            vectorsBase = Avx.Blend(Avx.Multiply(source, alpha), source, BlendAlphaControl);
            vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
        }

        if (ImageMaths.Modulo2(vectors.Length) != 0)
        {
            // Vector4 values are consumed in pairs, so an odd length leaves exactly one
            // trailing element; hand it to the single-vector overload.
            Premultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
        }
    }
    else
#endif
    {
        // Fallback: process each element with the single-vector overload.
        ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);

        for (int i = 0; i < vectors.Length; i++)
        {
            ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
            Premultiply(ref v);
        }
    }
}
68101

@@ -73,13 +106,39 @@ public static void Premultiply(Span<Vector4> vectors)
73106
[MethodImpl(InliningOptions.ShortMethod)]
public static void UnPremultiply(Span<Vector4> vectors)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    // Every intrinsic used below (Shuffle, Divide, Blend on Vector256<float>) belongs to AVX,
    // so AVX support is the correct gate. Requiring AVX2 sent AVX-only CPUs down the scalar path.
    if (Avx.IsSupported && vectors.Length >= 2)
    {
        // Reinterpret the span so that each Vector256<float> covers two consecutive Vector4 values.
        ref Vector256<float> vectorsBase =
            ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));

        // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
        ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));

        while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
        {
            Vector256<float> source = vectorsBase;

            // Replicate W across all four lanes of each packed Vector4.
            Vector256<float> alpha = Avx.Shuffle(source, source, ShuffleAlphaControl);

            // Divide every component by alpha, then restore the original W lanes from the source.
            // NOTE(review): a zero alpha divides by zero and leaves Inf/NaN in X, Y and Z;
            // presumably this matches the single-vector overload - confirm.
            vectorsBase = Avx.Blend(Avx.Divide(source, alpha), source, BlendAlphaControl);
            vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
        }

        if (ImageMaths.Modulo2(vectors.Length) != 0)
        {
            // Vector4 values are consumed in pairs, so an odd length leaves exactly one
            // trailing element; hand it to the single-vector overload.
            UnPremultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
        }
    }
    else
#endif
    {
        // Fallback: process each element with the single-vector overload.
        ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);

        for (int i = 0; i < vectors.Length; i++)
        {
            ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
            UnPremultiply(ref v);
        }
    }
}
85144

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Apache License, Version 2.0.
3+
4+
using System;
5+
using System.Numerics;
6+
using System.Runtime.CompilerServices;
7+
using System.Runtime.InteropServices;
8+
using BenchmarkDotNet.Attributes;
9+
10+
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
11+
{
12+
/// <summary>
/// Benchmarks a per-element scalar premultiplication loop against
/// <c>Vector4Utilities.Premultiply</c> over the same 2048 pseudo-random vectors.
/// </summary>
/// <remarks>
/// Both benchmarks write into the shared <see cref="Vectors"/> array in place, so its
/// contents change from one invocation to the next.
/// </remarks>
[Config(typeof(Config.ShortCore31))]
public class PremultiplyVector4
{
    private static readonly Vector4[] Vectors = CreateVectors();

    /// <summary>
    /// Baseline: premultiplies each vector individually through the local scalar helper.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void PremultiplyBaseline()
    {
        ref Vector4 first = ref MemoryMarshal.GetReference<Vector4>(Vectors);
        int count = Vectors.Length;

        for (int index = 0; index < count; index++)
        {
            Premultiply(ref Unsafe.Add(ref first, index));
        }
    }

    /// <summary>
    /// Candidate: premultiplies the whole array through the bulk span overload.
    /// </summary>
    [Benchmark]
    public void Premultiply()
    {
        Vector4Utilities.Premultiply(Vectors);
    }

    /// <summary>
    /// Multiplies every component of <paramref name="source"/> by its W, then restores W.
    /// </summary>
    [MethodImpl(InliningOptions.ShortMethod)]
    private static void Premultiply(ref Vector4 source)
    {
        float alpha = source.W;
        source *= alpha;
        source.W = alpha;
    }

    /// <summary>
    /// Builds the benchmark input from a fixed seed so that every run sees the same data.
    /// </summary>
    private static Vector4[] CreateVectors()
    {
        return GenerateRandomVectorArray(new Random(42), 2048, 0, 1);
    }

    /// <summary>
    /// Creates <paramref name="length"/> vectors whose components are drawn from
    /// <paramref name="rnd"/> and scaled into [<paramref name="minVal"/>, <paramref name="maxVal"/>).
    /// </summary>
    private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
    {
        var result = new Vector4[length];

        for (int index = 0; index < result.Length; index++)
        {
            // Draw in X, Y, Z, W order so the generated sequence stays stable for the seed.
            float x = GetRandomFloat(rnd, minVal, maxVal);
            float y = GetRandomFloat(rnd, minVal, maxVal);
            float z = GetRandomFloat(rnd, minVal, maxVal);
            float w = GetRandomFloat(rnd, minVal, maxVal);

            result[index] = new Vector4(x, y, z, w);
        }

        return result;
    }

    /// <summary>
    /// Maps the next sample from <paramref name="rnd"/> onto the requested range.
    /// </summary>
    private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
    {
        float sample = (float)rnd.NextDouble();
        return (sample * (maxVal - minVal)) + minVal;
    }
}
68+
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Apache License, Version 2.0.
3+
4+
using System;
5+
using System.Numerics;
6+
using System.Runtime.CompilerServices;
7+
using System.Runtime.InteropServices;
8+
using BenchmarkDotNet.Attributes;
9+
10+
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
11+
{
12+
/// <summary>
/// Benchmarks a per-element scalar un-premultiplication loop against
/// <c>Vector4Utilities.UnPremultiply</c> over the same 2048 pseudo-random vectors.
/// </summary>
/// <remarks>
/// Both benchmarks write into the shared <see cref="Vectors"/> array in place, so its
/// contents change from one invocation to the next.
/// </remarks>
[Config(typeof(Config.ShortCore31))]
public class UnPremultiplyVector4
{
    private static readonly Vector4[] Vectors = CreateVectors();

    /// <summary>
    /// Baseline: un-premultiplies each vector individually through the local scalar helper.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void UnPremultiplyBaseline()
    {
        ref Vector4 first = ref MemoryMarshal.GetReference<Vector4>(Vectors);
        int count = Vectors.Length;

        for (int index = 0; index < count; index++)
        {
            UnPremultiply(ref Unsafe.Add(ref first, index));
        }
    }

    /// <summary>
    /// Candidate: un-premultiplies the whole array through the bulk span overload.
    /// </summary>
    [Benchmark]
    public void UnPremultiply()
    {
        Vector4Utilities.UnPremultiply(Vectors);
    }

    /// <summary>
    /// Divides every component of <paramref name="source"/> by its W, then restores W.
    /// </summary>
    [MethodImpl(InliningOptions.ShortMethod)]
    private static void UnPremultiply(ref Vector4 source)
    {
        float alpha = source.W;
        source /= alpha;
        source.W = alpha;
    }

    /// <summary>
    /// Builds the benchmark input from a fixed seed so that every run sees the same data.
    /// </summary>
    private static Vector4[] CreateVectors()
    {
        return GenerateRandomVectorArray(new Random(42), 2048, 0, 1);
    }

    /// <summary>
    /// Creates <paramref name="length"/> vectors whose components are drawn from
    /// <paramref name="rnd"/> and scaled into [<paramref name="minVal"/>, <paramref name="maxVal"/>).
    /// </summary>
    private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
    {
        var result = new Vector4[length];

        for (int index = 0; index < result.Length; index++)
        {
            // Draw in X, Y, Z, W order so the generated sequence stays stable for the seed.
            float x = GetRandomFloat(rnd, minVal, maxVal);
            float y = GetRandomFloat(rnd, minVal, maxVal);
            float z = GetRandomFloat(rnd, minVal, maxVal);
            float w = GetRandomFloat(rnd, minVal, maxVal);

            result[index] = new Vector4(x, y, z, w);
        }

        return result;
    }

    /// <summary>
    /// Maps the next sample from <paramref name="rnd"/> onto the requested range.
    /// </summary>
    private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
    {
        float sample = (float)rnd.NextDouble();
        return (sample * (maxVal - minVal)) + minVal;
    }
}
68+
}

tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,21 @@ namespace SixLabors.ImageSharp.Tests.Helpers
1010
{
1111
public class ImageMathsTests
1212
{
13+
// Checks ImageMaths.Modulo2 against the % operator; all inputs are non-negative.
[Theory]
[InlineData(0)]
[InlineData(1)]
[InlineData(2)]
[InlineData(3)]
[InlineData(4)]
[InlineData(100)]
[InlineData(123)]
[InlineData(53436353)]
public void Modulo2(int x)
{
    int expected = x % 2;

    Assert.Equal(expected, ImageMaths.Modulo2(x));
}
27+
1328
[Theory]
1429
[InlineData(0)]
1530
[InlineData(1)]

tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ public class Vector4UtilsTests
1717
[InlineData(0)]
1818
[InlineData(1)]
1919
[InlineData(30)]
20+
[InlineData(63)]
2021
public void Premultiply_VectorSpan(int length)
2122
{
2223
var rnd = new Random(42);
@@ -36,6 +37,7 @@ public void Premultiply_VectorSpan(int length)
3637
[InlineData(0)]
3738
[InlineData(1)]
3839
[InlineData(30)]
40+
[InlineData(63)]
3941
public void UnPremultiply_VectorSpan(int length)
4042
{
4143
var rnd = new Random(42);

0 commit comments

Comments
 (0)