-
-
Notifications
You must be signed in to change notification settings - Fork 888
Undo jpeg perf regression, add various optimizations #1143
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ba22e9e
219baad
d951fd9
7c364dc
48d3963
1f2894a
7b7e502
cc056a6
7a93ae4
10616c8
4afb31f
687b48a
c61c3d7
0f278e2
ddb73ea
c567762
16afcab
939c164
26ddbc0
810d3bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| // Copyright (c) Six Labors and contributors. | ||
| // Licensed under the Apache License, Version 2.0. | ||
|
|
||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
|
|
||
| using System; | ||
| using System.Numerics; | ||
| using System.Runtime.CompilerServices; | ||
| using System.Runtime.InteropServices; | ||
| using System.Runtime.Intrinsics; | ||
| using System.Runtime.Intrinsics.X86; | ||
|
|
||
| namespace SixLabors.ImageSharp | ||
| { | ||
| internal static partial class SimdUtils | ||
| { | ||
| public static class Avx2Intrinsics | ||
| { | ||
/// <summary>
/// Control mask passed to <c>Avx2.PermuteVar8x32</c> by <c>NormalizedFloatToByteSaturate</c>.
/// It holds the eight 32-bit element indices 0, 4, 1, 5, 2, 6, 3, 7,
/// each written as four little-endian bytes.
/// </summary>
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[]
{
    0, 0, 0, 0, // index 0
    4, 0, 0, 0, // index 4
    1, 0, 0, 0, // index 1
    5, 0, 0, 0, // index 5
    2, 0, 0, 0, // index 2
    6, 0, 0, 0, // index 6
    3, 0, 0, 0, // index 3
    7, 0, 0, 0, // index 7
};
|
|
||
/// <summary>
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">
/// The source floats. On return it is sliced to the trailing elements that were NOT converted.
/// </param>
/// <param name="dest">
/// The destination bytes. On return it is sliced to the trailing elements that were NOT written.
/// </param>
/// <remarks>
/// Does nothing when AVX2 is not supported or when fewer than
/// <c>Vector256&lt;byte&gt;.Count</c> elements are available; both spans are then left untouched.
/// </remarks>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (Avx2.IsSupported)
    {
        // The bulk routine validates and iterates in units of Vector256<byte>.Count,
        // so the remainder must be computed with the same unit. Using Vector<byte>.Count
        // is only correct while Vector<T> happens to be 256 bits wide.
        int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
        int adjustedCount = source.Length - remainder;

        if (adjustedCount > 0)
        {
            NormalizedFloatToByteSaturate(
                source.Slice(0, adjustedCount),
                dest.Slice(0, adjustedCount));

            // Hand the unprocessed tail back to the caller.
            source = source.Slice(adjustedCount);
            dest = dest.Slice(adjustedCount);
        }
    }
}
|
|
||
/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// </summary>
/// <param name="source">The source floats; each value is multiplied by 255 before conversion.</param>
/// <param name="dest">The destination bytes, processed in blocks of <c>Vector256&lt;byte&gt;.Count</c>.</param>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
/// </remarks>
internal static void NormalizedFloatToByteSaturate(
    ReadOnlySpan<float> source,
    Span<byte> dest)
{
    VerifySpanInput(source, dest, Vector256<byte>.Count);

    int blockCount = dest.Length / Vector256<byte>.Count;

    ref Vector256<float> srcVectors =
        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
    ref Vector256<byte> dstVectors =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    Vector256<float> scale = Vector256.Create(255f);
    Vector256<int> deinterleave =
        Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32));

    for (int block = 0; block < blockCount; block++)
    {
        // Each output vector of bytes consumes four consecutive vectors of floats.
        ref Vector256<float> quad = ref Unsafe.Add(ref srcVectors, block * 4);

        // int32 -> int16, two input vectors per output vector.
        Vector256<short> lower = Avx2.PackSignedSaturate(
            ConvertToInt32(quad, scale),
            ConvertToInt32(Unsafe.Add(ref quad, 1), scale));
        Vector256<short> upper = Avx2.PackSignedSaturate(
            ConvertToInt32(Unsafe.Add(ref quad, 2), scale),
            ConvertToInt32(Unsafe.Add(ref quad, 3), scale));

        // int16 -> byte. The pack instructions work per 128-bit lane, which leaves the
        // 32-bit groups interleaved; the permute puts them back into source order.
        Vector256<byte> packed = Avx2.PackUnsignedSaturate(lower, upper);
        Unsafe.Add(ref dstVectors, block) = Avx2.PermuteVar8x32(packed.AsInt32(), deinterleave).AsByte();
    }
}
|
|
||
/// <summary>
/// Multiplies every element of <paramref name="vf"/> by the matching element of
/// <paramref name="scale"/> and converts the products to 32-bit integers.
/// </summary>
/// <param name="vf">The float vector to convert.</param>
/// <param name="scale">The per-element multiplier applied before conversion.</param>
/// <returns>The scaled values as a vector of 32-bit integers.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
    => Avx.ConvertToVector256Int32(Avx.Multiply(vf, scale));
|
Comment on lines
+95
to
+99
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A couple of things I noticed. You're using We're not clamping like we do in the other implementations. I had a quick go. (I don't know how private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale, Vector256<float> offset)
{
vf = Avx2.Multiply(vf, scale);
vf = Avx2.Add(vf, offset);
vf = Avx2.Min(Avx2.Max(vf, Vector256<float>.Zero), scale);
return Avx2.ConvertToVector256Int32(vf);
}
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @JimBobSquarePants new x86 instructions sets are extensions to previous families of instructions. Despite the fact that these methods are static, designers of
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wish the docs were better. You have to look up each method on the intel docs which is just misdirection.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think it makes sense to duplicate the Intel content because the specification is owned and maintained by Intel. Some instructions do crazy complex stuff. (See: specification of Adding a link to related Intel docs would be nice though.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
In general AVX instructions operate on floats and AVX2 operate on integer types. If seeing them mixed in code bothers you, you can simply
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just realised VS actually suggests |
||
| } | ||
| } | ||
| } | ||
| #endif | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note for the future: we should add comments to this kind of stuff so I can understand what it actually does! 😆
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, I would be happy if I could place a meaningful comment here, but the truth is that I have no idea what it does exactly.
All I know is that it's a permutation mask to unshuffle the bytes returned by
`PackSignedSaturate`, which are in a meaningless order to my naive eyes for some reason I don't understand, and I haven't taken the time to research it any further. Maybe @saucecontrol has some more time to clarify the high-level concept... Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The basic idea is that nearly all AVX instructions operate independently on two 128-bit lanes rather than on the 256-bit register as a whole. So if you have 4
`Vector256<int>` that contain pixels 0,1 | 2,3 | 4,5 | 6,7, when you narrow and pack them, they end up in 2 registers as 0,2,1,3 | 4,6,5,7. Then you do that again, and you get 1 register with 0,2,4,6,1,3,5,7. Permute instructions essentially do a shuffle across lanes, so you give it the order 0,4,1,5,2,6,3,7 to undo the interleaving that happened in the previous steps. That ROS just has those 8 32-bit integers written in little-endian order.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Knowing this, the whole thing makes much more sense now.