Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Copyright (c) Six Labors and contributors.
// Licensed under the Apache License, Version 2.0.

#if SUPPORTS_RUNTIME_INTRINSICS

using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace SixLabors.ImageSharp
{
internal static partial class SimdUtils
{
public static class Avx2Intrinsics
{
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note for the future. We should add comments to this kind of stuff so I can understand what is actually does! 😆

Copy link
Member Author

@antonfirsov antonfirsov Mar 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I would be happy if I could place any meaningful comment here, but the truth is that I have no idea what does it do exactly.

All I know is that it's a permuatation mask to unshuffle the bytes returned by PackSignedSaturate which are in a meaningless order to my naive eyes for some reason I not understand, and haven't taken the time to research it any further. Maybe if @saucecontrol has some more time to clarify the high level concept..

Copy link
Contributor

@saucecontrol saucecontrol Mar 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The basic idea is that nearly all AVX instructions operate independently on 2 128-bit lanes rather than on the 256-bit register as a whole. So if you have 4 Vector256<int> that contain pixels 0,1 | 2,3 | 4,5 | 6,7, when you narrow and pack them, they end up in 2 registers as 0,2,1,3 | 4,6,5,7. Then you do that again, and you get 1 register with 0,2,4,6,1,3,5,7.

Permute instructions essentially do a shuffle across lanes, so you give it the order 0,4,1,5,2,6,3,7 to undo the interleaving that happened in the previous steps. That ROS just has those 8 32-bit integers written in little endian order.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nearly all AVX instructions operate independently on 2 128-bit lanes rather than on the 256-bit register as a whole

Knowing this, the whole thing makes much more sense now.


/// <summary>
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturateReduce(
ref ReadOnlySpan<float> source,
ref Span<byte> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

if (Avx2.IsSupported)
{
int remainder = ImageMaths.ModuloP2(source.Length, Vector<byte>.Count);
int adjustedCount = source.Length - remainder;

if (adjustedCount > 0)
{
NormalizedFloatToByteSaturate(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount));

source = source.Slice(adjustedCount);
dest = dest.Slice(adjustedCount);
}
}
}

/// <summary>
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// </summary>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
/// </remarks>
internal static void NormalizedFloatToByteSaturate(
ReadOnlySpan<float> source,
Span<byte> dest)
{
VerifySpanInput(source, dest, Vector256<byte>.Count);

int n = dest.Length / Vector256<byte>.Count;

ref Vector256<float> sourceBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

var maxBytes = Vector256.Create(255f);
ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);

for (int i = 0; i < n; i++)
{
ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);

Vector256<float> f0 = s;
Vector256<float> f1 = Unsafe.Add(ref s, 1);
Vector256<float> f2 = Unsafe.Add(ref s, 2);
Vector256<float> f3 = Unsafe.Add(ref s, 3);

Vector256<int> w0 = ConvertToInt32(f0, maxBytes);
Vector256<int> w1 = ConvertToInt32(f1, maxBytes);
Vector256<int> w2 = ConvertToInt32(f2, maxBytes);
Vector256<int> w3 = ConvertToInt32(f3, maxBytes);

Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);
b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();

Unsafe.Add(ref destBase, i) = b;
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
{
vf = Avx.Multiply(vf, scale);
return Avx.ConvertToVector256Int32(vf);
}
Comment on lines +95 to +99
Copy link
Member

@JimBobSquarePants JimBobSquarePants Mar 10, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A couple of things I noticed. You're using Avx over Avx2. I've noticed that in @saucecontrol -s original source also. Wouldn't Avx2 be faster?

We're not clamping like we do in the other implementations. I had a quick go. (I don't know how Vector256<float>.Zero is treated here, should it be passed as a param?). More tests pass but there's still minor differences.

private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale, Vector256<float> offset)
{
    vf = Avx2.Multiply(vf, scale);
    vf = Avx2.Add(vf, offset);
    vf = Avx2.Min(Avx2.Max(vf, Vector256<float>.Zero), scale);
    return Avx2.ConvertToVector256Int32(vf);
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avx2 inherits from Avx, and that Multiply overload is not part of the AVX2 instruction set, you're just accessing it through the Avx2 class here, but you're calling the same method. In fact if you use Re# it does suggest to simplify the name and just use Avx 👍

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JimBobSquarePants new x86 instructions sets are extensions to previous families of instructions. Despite the fact that these methods are static, designers of System.Runtime.Intrinsics have chosen to use nonstatic classes with a nice inheritance chain to model this. Multiply(Vector256<float>, Vector256<float>) is in fact defined in the base class Avx, because the backing VMULPS ymm, ymm, ymm/m256 instruction is already available in AVX. New x86 CPU families improve the performance of existing instructions instead of replacing them with new ones.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wish the docs were better. You have to look up each method on the intel docs which is just misdirection.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it makes sense to duplicate the Intel content because the specification is owned and maintained by Intel. Some instructions do crazy complex stuff. (See: specification of Avx2.PackUnsignedSaturate / _mm256_packus_epi16 under "Operation"). Other instructions are trivially simple (Add, Multiply), while still best specified by the original docs.

Adding a link to related Intel docs would be nice though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't Avx2 be faster

In general AVX instructions operate on floats and AVX2 operate on integer types. If seeing them mixed in code bothers you, you can simply using static System.Runtime.Intrinsics.X86.Avx2 and have access to everything without the class name. I personally find it useful to specify the exact ISA for each instruction because it makes it easier to spot when an IsSupported check is missing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just realised VS actually suggests Avx anyway, offering to simplify if you use Avx2. Pretty neat!

}
}
}
#endif
28 changes: 14 additions & 14 deletions src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ internal static partial class SimdUtils
/// </summary>
public static class BasicIntrinsics256
{
public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture;
public static bool IsAvailable { get; } = HasVector8;

#if !SUPPORTS_EXTENDED_INTRINSICS
/// <summary>
/// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertByteToNormalizedFloatReduce(
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
{
Expand All @@ -40,7 +40,7 @@ internal static void BulkConvertByteToNormalizedFloatReduce(

if (adjustedCount > 0)
{
BulkConvertByteToNormalizedFloat(
ByteToNormalizedFloat(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount));

Expand All @@ -50,10 +50,10 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
}

/// <summary>
/// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
internal static void NormalizedFloatToByteSaturateReduce(
ref ReadOnlySpan<float> source,
ref Span<byte> dest)
{
Expand All @@ -69,7 +69,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(

if (adjustedCount > 0)
{
BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
NormalizedFloatToByteSaturate(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));

source = source.Slice(adjustedCount);
dest = dest.Slice(adjustedCount);
Expand All @@ -78,15 +78,15 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
#endif

/// <summary>
/// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
/// SIMD optimized implementation for <see cref="SimdUtils.ByteToNormalizedFloat"/>.
/// Works only with span Length divisible by 8.
/// Implementation adapted from:
/// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
/// http://stackoverflow.com/a/536278
/// </summary>
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
VerifyIsAvx2Compatible(nameof(BulkConvertByteToNormalizedFloat));
VerifyHasVector8(nameof(ByteToNormalizedFloat));
VerifySpanInput(source, dest, 8);

var bVec = new Vector<float>(256.0f / 255.0f);
Expand Down Expand Up @@ -124,11 +124,11 @@ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source,
}

/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> which is faster on older runtimes.
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/> which is faster on older runtimes.
/// </summary>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> dest)
{
VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
VerifyHasVector8(nameof(NormalizedFloatToByteSaturate));
VerifySpanInput(source, dest, 8);

if (source.Length == 0)
Expand Down Expand Up @@ -177,7 +177,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan
/// </summary>
internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
{
VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByte));
VerifyHasVector8(nameof(BulkConvertNormalizedFloatToByte));
VerifySpanInput(source, dest, 8);

if (source.Length == 0)
Expand Down
20 changes: 10 additions & 10 deletions src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ internal static void ConvertToSingle(
}

/// <summary>
/// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertByteToNormalizedFloatReduce(
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
{
Expand All @@ -62,18 +62,18 @@ internal static void BulkConvertByteToNormalizedFloatReduce(

if (adjustedCount > 0)
{
BulkConvertByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));

source = source.Slice(adjustedCount);
dest = dest.Slice(adjustedCount);
}
}

/// <summary>
/// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
internal static void NormalizedFloatToByteSaturateReduce(
ref ReadOnlySpan<float> source,
ref Span<byte> dest)
{
Expand All @@ -89,7 +89,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(

if (adjustedCount > 0)
{
BulkConvertNormalizedFloatToByteClampOverflows(
NormalizedFloatToByteSaturate(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount));

Expand All @@ -99,9 +99,9 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
}

/// <summary>
/// Implementation <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// Implementation <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// </summary>
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
VerifySpanInput(source, dest, Vector<byte>.Count);

Expand Down Expand Up @@ -132,9 +132,9 @@ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source,
}

/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
/// </summary>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(
internal static void NormalizedFloatToByteSaturate(
ReadOnlySpan<float> source,
Span<byte> dest)
{
Expand Down
22 changes: 11 additions & 11 deletions src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ internal static partial class SimdUtils
public static class FallbackIntrinsics128
{
/// <summary>
/// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertByteToNormalizedFloatReduce(
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
{
Expand All @@ -33,7 +33,7 @@ internal static void BulkConvertByteToNormalizedFloatReduce(

if (adjustedCount > 0)
{
BulkConvertByteToNormalizedFloat(
ByteToNormalizedFloat(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount));

Expand All @@ -43,10 +43,10 @@ internal static void BulkConvertByteToNormalizedFloatReduce(
}

/// <summary>
/// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
internal static void NormalizedFloatToByteSaturateReduce(
ref ReadOnlySpan<float> source,
ref Span<byte> dest)
{
Expand All @@ -57,7 +57,7 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(

if (adjustedCount > 0)
{
BulkConvertNormalizedFloatToByteClampOverflows(
NormalizedFloatToByteSaturate(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount));

Expand All @@ -67,10 +67,10 @@ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
}

/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/> using <see cref="Vector4"/>.
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/> using <see cref="Vector4"/>.
/// </summary>
[MethodImpl(InliningOptions.ColdPath)]
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
VerifySpanInput(source, dest, 4);

Expand Down Expand Up @@ -99,10 +99,10 @@ internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source,
}

/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> using <see cref="Vector4"/>.
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/> using <see cref="Vector4"/>.
/// </summary>
[MethodImpl(InliningOptions.ColdPath)]
internal static void BulkConvertNormalizedFloatToByteClampOverflows(
internal static void NormalizedFloatToByteSaturate(
ReadOnlySpan<float> source,
Span<byte> dest)
{
Expand Down Expand Up @@ -148,4 +148,4 @@ private struct ByteVector4
}
}
}
}
}
Loading