Skip to content

Commit 3042f83

Browse files
authored
Merge pull request #1143 from SixLabors/af/block-scale-optimization
Undo jpeg perf regression, add various optimizations
2 parents aed6b77 + 810d3bb commit 3042f83

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1238
-303
lines changed
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// Copyright (c) Six Labors and contributors.
2+
// Licensed under the Apache License, Version 2.0.
3+
4+
#if SUPPORTS_RUNTIME_INTRINSICS
5+
6+
using System;
7+
using System.Numerics;
8+
using System.Runtime.CompilerServices;
9+
using System.Runtime.InteropServices;
10+
using System.Runtime.Intrinsics;
11+
using System.Runtime.Intrinsics.X86;
12+
13+
namespace SixLabors.ImageSharp
14+
{
15+
internal static partial class SimdUtils
16+
{
17+
public static class Avx2Intrinsics
18+
{
19+
// Raw bytes of eight 32-bit little-endian indices: { 0, 4, 1, 5, 2, 6, 3, 7 }.
// NormalizedFloatToByteSaturate reinterprets this data as a Vector256<int> and passes it
// to Avx2.PermuteVar8x32 to reorder the 32-bit elements of the packed byte vector.
// NOTE(review): the "new byte[] { ... }" expression-bodied form is presumably kept so the
// compiler can serve the data without a per-call array allocation - confirm before changing its shape.
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
20+
21+
/// <summary>
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">
/// The source floats. On return it is sliced so that it only contains the elements that were NOT converted.
/// </param>
/// <param name="dest">
/// The destination bytes. On return it is sliced in the same way as <paramref name="source"/>.
/// </param>
/// <remarks>
/// Does nothing (both spans are left untouched) when AVX2 is not supported,
/// or when fewer than <see cref="Vector256{T}.Count"/> bytes worth of elements are available.
/// </remarks>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (Avx2.IsSupported)
    {
        // The bulk routine consumes whole Vector256<byte> blocks, so the remainder has to be
        // computed against that same width. Vector<byte>.Count is not guaranteed to match it:
        // if it were smaller, trailing elements would be sliced away here without ever being converted.
        int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
        int adjustedCount = source.Length - remainder;

        if (adjustedCount > 0)
        {
            NormalizedFloatToByteSaturate(
                source.Slice(0, adjustedCount),
                dest.Slice(0, adjustedCount));

            // Leave only the unprocessed tail for the caller's fallback path.
            source = source.Slice(adjustedCount);
            dest = dest.Slice(adjustedCount);
        }
    }
}
47+
48+
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// </summary>
/// <param name="source">The source floats; each one is multiplied by 255 before conversion.</param>
/// <param name="dest">The destination bytes, processed in blocks of <see cref="Vector256{T}.Count"/> bytes.</param>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
/// </remarks>
internal static void NormalizedFloatToByteSaturate(
    ReadOnlySpan<float> source,
    Span<byte> dest)
{
    VerifySpanInput(source, dest, Vector256<byte>.Count);

    // One iteration produces a full Vector256<byte>, consuming four Vector256<float>.
    int blockCount = dest.Length / Vector256<byte>.Count;

    ref Vector256<float> sourceVectors =
        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
    ref Vector256<byte> destVectors =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    Vector256<float> scale = Vector256.Create(255f);
    Vector256<int> permuteMask = Unsafe.As<byte, Vector256<int>>(
        ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32));

    for (int block = 0; block < blockCount; block++)
    {
        ref Vector256<float> chunk = ref Unsafe.Add(ref sourceVectors, block * 4);

        // Scale to the byte range and convert to 32-bit integers.
        Vector256<int> i0 = ConvertToInt32(chunk, scale);
        Vector256<int> i1 = ConvertToInt32(Unsafe.Add(ref chunk, 1), scale);
        Vector256<int> i2 = ConvertToInt32(Unsafe.Add(ref chunk, 2), scale);
        Vector256<int> i3 = ConvertToInt32(Unsafe.Add(ref chunk, 3), scale);

        // Narrow with saturation: int -> short -> byte.
        Vector256<short> lowerHalf = Avx2.PackSignedSaturate(i0, i1);
        Vector256<short> upperHalf = Avx2.PackSignedSaturate(i2, i3);
        Vector256<byte> packed = Avx2.PackUnsignedSaturate(lowerHalf, upperHalf);

        // Reorder the 32-bit groups with the deinterleave mask, then store the block.
        Unsafe.Add(ref destVectors, block) = Avx2.PermuteVar8x32(packed.AsInt32(), permuteMask).AsByte();
    }
}
93+
94+
/// <summary>
/// Multiplies <paramref name="vf"/> element-wise by <paramref name="scale"/>
/// and converts the products to 32-bit integers.
/// </summary>
/// <param name="vf">The values to scale and convert.</param>
/// <param name="scale">The per-element multiplier.</param>
/// <returns>The scaled values as a vector of 32-bit integers.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
    => Avx.ConvertToVector256Int32(Avx.Multiply(vf, scale));
100+
}
101+
}
102+
}
103+
#endif

src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@ internal static partial class SimdUtils
1717
/// </summary>
1818
public static class BasicIntrinsics256
1919
{
20-
public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture;
20+
public static bool IsAvailable { get; } = HasVector8;
2121

2222
#if !SUPPORTS_EXTENDED_INTRINSICS
2323
/// <summary>
24-
/// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
24+
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
2525
/// </summary>
2626
[MethodImpl(InliningOptions.ShortMethod)]
27-
internal static void BulkConvertByteToNormalizedFloatReduce(
27+
internal static void ByteToNormalizedFloatReduce(
2828
ref ReadOnlySpan<byte> source,
2929
ref Span<float> dest)
3030
{
@@ -40,7 +40,7 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
4040

4141
if (adjustedCount > 0)
4242
{
43-
BulkConvertByteToNormalizedFloat(
43+
ByteToNormalizedFloat(
4444
source.Slice(0, adjustedCount),
4545
dest.Slice(0, adjustedCount));
4646

@@ -50,10 +50,10 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
5050
}
5151

5252
/// <summary>
53-
/// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
53+
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
5454
/// </summary>
5555
[MethodImpl(InliningOptions.ShortMethod)]
56-
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
56+
internal static void NormalizedFloatToByteSaturateReduce(
5757
ref ReadOnlySpan<float> source,
5858
ref Span<byte> dest)
5959
{
@@ -69,7 +69,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
6969

7070
if (adjustedCount > 0)
7171
{
72-
BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
72+
NormalizedFloatToByteSaturate(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
7373

7474
source = source.Slice(adjustedCount);
7575
dest = dest.Slice(adjustedCount);
@@ -78,15 +78,15 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
7878
#endif
7979

8080
/// <summary>
81-
/// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
81+
/// SIMD optimized implementation for <see cref="SimdUtils.ByteToNormalizedFloat"/>.
8282
/// Works only with span Length divisible by 8.
8383
/// Implementation adapted from:
8484
/// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
8585
/// http://stackoverflow.com/a/536278
8686
/// </summary>
87-
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
87+
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
8888
{
89-
VerifyIsAvx2Compatible(nameof(BulkConvertByteToNormalizedFloat));
89+
VerifyHasVector8(nameof(ByteToNormalizedFloat));
9090
VerifySpanInput(source, dest, 8);
9191

9292
var bVec = new Vector<float>(256.0f / 255.0f);
@@ -124,11 +124,11 @@ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source,
124124
}
125125

126126
/// <summary>
127-
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> which is faster on older runtimes.
127+
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/> which is faster on older runtimes.
128128
/// </summary>
129-
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
129+
internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> dest)
130130
{
131-
VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
131+
VerifyHasVector8(nameof(NormalizedFloatToByteSaturate));
132132
VerifySpanInput(source, dest, 8);
133133

134134
if (source.Length == 0)
@@ -177,7 +177,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan
177177
/// </summary>
178178
internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
179179
{
180-
VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByte));
180+
VerifyHasVector8(nameof(BulkConvertNormalizedFloatToByte));
181181
VerifySpanInput(source, dest, 8);
182182

183183
if (source.Length == 0)

src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@ internal static void ConvertToSingle(
4343
}
4444

4545
/// <summary>
46-
/// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
46+
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
4747
/// </summary>
4848
[MethodImpl(InliningOptions.ShortMethod)]
49-
internal static void BulkConvertByteToNormalizedFloatReduce(
49+
internal static void ByteToNormalizedFloatReduce(
5050
ref ReadOnlySpan<byte> source,
5151
ref Span<float> dest)
5252
{
@@ -62,18 +62,18 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
6262

6363
if (adjustedCount > 0)
6464
{
65-
BulkConvertByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
65+
ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
6666

6767
source = source.Slice(adjustedCount);
6868
dest = dest.Slice(adjustedCount);
6969
}
7070
}
7171

7272
/// <summary>
73-
/// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
73+
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
7474
/// </summary>
7575
[MethodImpl(InliningOptions.ShortMethod)]
76-
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
76+
internal static void NormalizedFloatToByteSaturateReduce(
7777
ref ReadOnlySpan<float> source,
7878
ref Span<byte> dest)
7979
{
@@ -89,7 +89,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
8989

9090
if (adjustedCount > 0)
9191
{
92-
BulkConvertNormalizedFloatToByteClampOverflows(
92+
NormalizedFloatToByteSaturate(
9393
source.Slice(0, adjustedCount),
9494
dest.Slice(0, adjustedCount));
9595

@@ -99,9 +99,9 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
9999
}
100100

101101
/// <summary>
102-
/// Implementation <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
102+
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
103103
/// </summary>
104-
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
104+
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
105105
{
106106
VerifySpanInput(source, dest, Vector<byte>.Count);
107107

@@ -132,9 +132,9 @@ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source,
132132
}
133133

134134
/// <summary>
135-
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
135+
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
136136
/// </summary>
137-
internal static void BulkConvertNormalizedFloatToByteClampOverflows(
137+
internal static void NormalizedFloatToByteSaturate(
138138
ReadOnlySpan<float> source,
139139
Span<byte> dest)
140140
{

src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@ internal static partial class SimdUtils
1919
public static class FallbackIntrinsics128
2020
{
2121
/// <summary>
22-
/// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
22+
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
2323
/// </summary>
2424
[MethodImpl(InliningOptions.ShortMethod)]
25-
internal static void BulkConvertByteToNormalizedFloatReduce(
25+
internal static void ByteToNormalizedFloatReduce(
2626
ref ReadOnlySpan<byte> source,
2727
ref Span<float> dest)
2828
{
@@ -33,7 +33,7 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
3333

3434
if (adjustedCount > 0)
3535
{
36-
BulkConvertByteToNormalizedFloat(
36+
ByteToNormalizedFloat(
3737
source.Slice(0, adjustedCount),
3838
dest.Slice(0, adjustedCount));
3939

@@ -43,10 +43,10 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
4343
}
4444

4545
/// <summary>
46-
/// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
46+
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
4747
/// </summary>
4848
[MethodImpl(InliningOptions.ShortMethod)]
49-
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
49+
internal static void NormalizedFloatToByteSaturateReduce(
5050
ref ReadOnlySpan<float> source,
5151
ref Span<byte> dest)
5252
{
@@ -57,7 +57,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
5757

5858
if (adjustedCount > 0)
5959
{
60-
BulkConvertNormalizedFloatToByteClampOverflows(
60+
NormalizedFloatToByteSaturate(
6161
source.Slice(0, adjustedCount),
6262
dest.Slice(0, adjustedCount));
6363

@@ -67,10 +67,10 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
6767
}
6868

6969
/// <summary>
70-
/// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/> using <see cref="Vector4"/>.
70+
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/> using <see cref="Vector4"/>.
7171
/// </summary>
7272
[MethodImpl(InliningOptions.ColdPath)]
73-
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
73+
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
7474
{
7575
VerifySpanInput(source, dest, 4);
7676

@@ -99,10 +99,10 @@ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source,
9999
}
100100

101101
/// <summary>
102-
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> using <see cref="Vector4"/>.
102+
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/> using <see cref="Vector4"/>.
103103
/// </summary>
104104
[MethodImpl(InliningOptions.ColdPath)]
105-
internal static void BulkConvertNormalizedFloatToByteClampOverflows(
105+
internal static void NormalizedFloatToByteSaturate(
106106
ReadOnlySpan<float> source,
107107
Span<byte> dest)
108108
{
@@ -148,4 +148,4 @@ private struct ByteVector4
148148
}
149149
}
150150
}
151-
}
151+
}

0 commit comments

Comments
 (0)