SixLabors · antonfirsov · Mar 13, 2020 · Feb 29, 2020 · Feb 29, 2020 · Feb 29, 2020
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
@@ -0,0 +1,103 @@
+// Copyright (c) Six Labors and contributors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp
+{
+    internal static partial class SimdUtils
+    {
+        public static class Avx2Intrinsics
+        {
+            // Int32 indices { 0, 4, 1, 5, 2, 6, 3, 7 } as little-endian bytes: the PermuteVar8x32 control applied to the packed bytes.
+            private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
+
+            /// <summary>
+            /// Converts as many elements as possible with <see cref="NormalizedFloatToByteSaturate"/>, slicing both spans down to the remainder. Does nothing without AVX2.
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            internal static void NormalizedFloatToByteSaturateReduce(
+                ref ReadOnlySpan<float> source,
+                ref Span<byte> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+                if (Avx2.IsSupported)
+                {
+                    // Use Vector256<byte>.Count (not Vector<byte>.Count) so the block size always matches the kernel called below.
+                    int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
+                    int adjustedCount = source.Length - remainder;
+
+                    if (adjustedCount > 0)
+                    {
+                        NormalizedFloatToByteSaturate(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
+
+                        source = source.Slice(adjustedCount);
+                        dest = dest.Slice(adjustedCount);
+                    }
+                }
+            }
+
+            /// <summary>
+            /// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
+            /// </summary>
+            /// <remarks>
+            /// Converts whole blocks of <c>Vector256&lt;byte&gt;.Count</c> elements only; a trailing partial block is not touched.
+            /// Implementation is based on MagicScaler code:
+            /// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
+            /// </remarks>
+            internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> dest)
+            {
+                VerifySpanInput(source, dest, Vector256<byte>.Count);
+
+                int n = dest.Length / Vector256<byte>.Count;
+
+                ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+                ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
+
+                var maxBytes = Vector256.Create(255f);
+                ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
+                Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);
+
+                for (int i = 0; i < n; i++)
+                {
+                    // One iteration reads four Vector256<float> values and writes a single Vector256<byte>.
+                    ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
+
+                    Vector256<float> f0 = s;
+                    Vector256<float> f1 = Unsafe.Add(ref s, 1);
+                    Vector256<float> f2 = Unsafe.Add(ref s, 2);
+                    Vector256<float> f3 = Unsafe.Add(ref s, 3);
+
+                    Vector256<int> w0 = ConvertToInt32(f0, maxBytes);
+                    Vector256<int> w1 = ConvertToInt32(f1, maxBytes);
+                    Vector256<int> w2 = ConvertToInt32(f2, maxBytes);
+                    Vector256<int> w3 = ConvertToInt32(f3, maxBytes);
+
+                    Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
+                    Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
+                    Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);
+                    b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();
+
+                    Unsafe.Add(ref destBase, i) = b;
+                }
+            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
+            {
+                // Multiply by the scale, then convert float -> int32; narrowing to the byte range is left to the caller's pack steps.
+                vf = Avx.Multiply(vf, scale);
+                return Avx.ConvertToVector256Int32(vf);
+            }
+        }
+    }
+}
+#endif
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
@@ -17,14 +17,14 @@ internal static partial class SimdUtils
         /// </summary>
         public static class BasicIntrinsics256
         {
-            public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture;
+            public static bool IsAvailable { get; } = HasVector8;
 
 #if !SUPPORTS_EXTENDED_INTRINSICS
             /// <summary>
-            /// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
             /// </summary>
             [MethodImpl(InliningOptions.ShortMethod)]
-            internal static void BulkConvertByteToNormalizedFloatReduce(
+            internal static void ByteToNormalizedFloatReduce(
                 ref ReadOnlySpan<byte> source,
                 ref Span<float> dest)
             {
@@ -40,7 +40,7 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
 
                 if (adjustedCount > 0)
                 {
-                    BulkConvertByteToNormalizedFloat(
+                    ByteToNormalizedFloat(
                         source.Slice(0, adjustedCount),
                         dest.Slice(0, adjustedCount));
 
@@ -50,10 +50,10 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
             }
 
             /// <summary>
-            /// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
             /// </summary>
             [MethodImpl(InliningOptions.ShortMethod)]
-            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+            internal static void NormalizedFloatToByteSaturateReduce(
                 ref ReadOnlySpan<float> source,
                 ref Span<byte> dest)
             {
@@ -69,7 +69,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
 
                 if (adjustedCount > 0)
                 {
-                    BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
+                    NormalizedFloatToByteSaturate(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
 
                     source = source.Slice(adjustedCount);
                     dest = dest.Slice(adjustedCount);
@@ -78,15 +78,15 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
 #endif
 
             /// <summary>
-            /// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
+            /// SIMD optimized implementation for <see cref="SimdUtils.ByteToNormalizedFloat"/>.
             /// Works only with span Length divisible by 8.
             /// Implementation adapted from:
             /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
             /// http://stackoverflow.com/a/536278
             /// </summary>
-            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
             {
-                VerifyIsAvx2Compatible(nameof(BulkConvertByteToNormalizedFloat));
+                VerifyHasVector8(nameof(ByteToNormalizedFloat));
                 VerifySpanInput(source, dest, 8);
 
                 var bVec = new Vector<float>(256.0f / 255.0f);
@@ -124,11 +124,11 @@ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source,
             }
 
             /// <summary>
-            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> which is faster on older runtimes.
+            /// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/> which is faster on older runtimes.
             /// </summary>
-            internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
+            internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> dest)
             {
-                VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
+                VerifyHasVector8(nameof(NormalizedFloatToByteSaturate));
                 VerifySpanInput(source, dest, 8);
 
                 if (source.Length == 0)
@@ -177,7 +177,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan
             /// </summary>
             internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
             {
-                VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByte));
+                VerifyHasVector8(nameof(BulkConvertNormalizedFloatToByte));
                 VerifySpanInput(source, dest, 8);
 
                 if (source.Length == 0)

diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
@@ -43,10 +43,10 @@ internal static void ConvertToSingle(
             }
 
             /// <summary>
-            /// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
             /// </summary>
             [MethodImpl(InliningOptions.ShortMethod)]
-            internal static void BulkConvertByteToNormalizedFloatReduce(
+            internal static void ByteToNormalizedFloatReduce(
                 ref ReadOnlySpan<byte> source,
                 ref Span<float> dest)
             {
@@ -62,18 +62,18 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
 
                 if (adjustedCount > 0)
                 {
-                    BulkConvertByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
+                    ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
 
                     source = source.Slice(adjustedCount);
                     dest = dest.Slice(adjustedCount);
                 }
             }
 
             /// <summary>
-            /// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
             /// </summary>
             [MethodImpl(InliningOptions.ShortMethod)]
-            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+            internal static void NormalizedFloatToByteSaturateReduce(
                 ref ReadOnlySpan<float> source,
                 ref Span<byte> dest)
             {
@@ -89,7 +89,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
 
                 if (adjustedCount > 0)
                 {
-                    BulkConvertNormalizedFloatToByteClampOverflows(
+                    NormalizedFloatToByteSaturate(
                         source.Slice(0, adjustedCount),
                         dest.Slice(0, adjustedCount));
 
@@ -99,9 +99,9 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
             }
 
             /// <summary>
-            /// Implementation <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
+            /// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
             /// </summary>
-            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
             {
                 VerifySpanInput(source, dest, Vector<byte>.Count);
 
@@ -132,9 +132,9 @@ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source,
             }
 
             /// <summary>
-            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
+            /// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
             /// </summary>
-            internal static void BulkConvertNormalizedFloatToByteClampOverflows(
+            internal static void NormalizedFloatToByteSaturate(
                 ReadOnlySpan<float> source,
                 Span<byte> dest)
             {

diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
@@ -19,10 +19,10 @@ internal static partial class SimdUtils
         public static class FallbackIntrinsics128
         {
             /// <summary>
-            /// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
             /// </summary>
             [MethodImpl(InliningOptions.ShortMethod)]
-            internal static void BulkConvertByteToNormalizedFloatReduce(
+            internal static void ByteToNormalizedFloatReduce(
                 ref ReadOnlySpan<byte> source,
                 ref Span<float> dest)
             {
@@ -33,7 +33,7 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
 
                 if (adjustedCount > 0)
                 {
-                    BulkConvertByteToNormalizedFloat(
+                    ByteToNormalizedFloat(
                         source.Slice(0, adjustedCount),
                         dest.Slice(0, adjustedCount));
 
@@ -43,10 +43,10 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
             }
 
             /// <summary>
-            /// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
             /// </summary>
             [MethodImpl(InliningOptions.ShortMethod)]
-            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+            internal static void NormalizedFloatToByteSaturateReduce(
                 ref ReadOnlySpan<float> source,
                 ref Span<byte> dest)
             {
@@ -57,7 +57,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
 
                 if (adjustedCount > 0)
                 {
-                    BulkConvertNormalizedFloatToByteClampOverflows(
+                    NormalizedFloatToByteSaturate(
                         source.Slice(0, adjustedCount),
                         dest.Slice(0, adjustedCount));
 
@@ -67,10 +67,10 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
             }
 
             /// <summary>
-            /// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/> using <see cref="Vector4"/>.
+            /// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/> using <see cref="Vector4"/>.
             /// </summary>
             [MethodImpl(InliningOptions.ColdPath)]
-            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
             {
                 VerifySpanInput(source, dest, 4);
 
@@ -99,10 +99,10 @@ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source,
             }
 
             /// <summary>
-            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> using <see cref="Vector4"/>.
+            /// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/> using <see cref="Vector4"/>.
             /// </summary>
             [MethodImpl(InliningOptions.ColdPath)]
-            internal static void BulkConvertNormalizedFloatToByteClampOverflows(
+            internal static void NormalizedFloatToByteSaturate(
                 ReadOnlySpan<float> source,
                 Span<byte> dest)
             {
@@ -148,4 +148,4 @@ private struct ByteVector4
             }
         }
     }
-}
+}