Skip to content

Commit 5c89d0a

Browse files
committed
Initial vectorized cube root implementation
1 parent cb18c58 commit 5c89d0a

File tree

2 files changed

+129
-58
lines changed

2 files changed

+129
-58
lines changed

src/ImageSharp/Common/Helpers/Numerics.cs

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,5 +547,131 @@ public static void UnPremultiply(Span<Vector4> vectors)
547547
}
548548
}
549549
}
550+
551+
/// <summary>
552+
/// Calculates the cube pow of all the XYZ channels of the input vectors.
553+
/// </summary>
554+
/// <param name="vectors">The span of vectors</param>
555+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
556+
public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
557+
{
558+
ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
559+
int length = vectors.Length;
560+
561+
for (int x = 0; x < length; x++)
562+
{
563+
ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
564+
Vector4 v = pixel4;
565+
float a = v.W;
566+
567+
// Fast path for the default gamma exposure, which is 3. In this case we can skip
568+
// calling Math.Pow 3 times (one per component), as the method is an internal call and
569+
// introduces quite a bit of overhead. Instead, we can just manually multiply the whole
570+
// pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
571+
// back to the target index in the temporary span. The whole iteration will get completely
572+
// inlined and traslated into vectorized instructions, with much better performance.
573+
v = v * v * v;
574+
v.W = a;
575+
576+
pixel4 = v;
577+
}
578+
}
579+
580+
/// <summary>
581+
/// Calculates the cube root of all the XYZ channels of the input vectors.
582+
/// </summary>
583+
/// <param name="vectors">The span of vectors</param>
584+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
585+
public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
586+
{
587+
ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
588+
int length = vectors.Length;
589+
590+
#if SUPPORTS_RUNTIME_INTRINSICS
591+
if (Sse41.IsSupported)
592+
{
593+
var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF);
594+
var v128_0x3F8000000 = Vector128.Create(0x3F800000);
595+
var v128_341 = Vector128.Create(341);
596+
var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000));
597+
var v4_23rds = new Vector4(2 / 3f);
598+
var v4_13rds = new Vector4(1 / 3f);
599+
600+
for (int x = 0; x < length; x++)
601+
{
602+
ref Vector4 v4 = ref Unsafe.Add(ref vectorsRef, x);
603+
604+
Vector4 vx = v4;
605+
float a = vx.W;
606+
Vector128<int> veax = Unsafe.As<Vector4, Vector128<int>>(ref vx);
607+
Vector128<int> vecx = veax;
608+
609+
// If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
610+
// directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
611+
// https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
612+
// Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
613+
// using a different succession (the same used below), which should be less unstable due to not having cube pow.
614+
veax = Sse2.And(veax, v128_0x7FFFFFFF);
615+
veax = Sse2.Subtract(veax, v128_0x3F8000000);
616+
veax = Sse2.ShiftRightArithmetic(veax, 10);
617+
veax = Sse41.MultiplyLow(veax, v128_341);
618+
veax = Sse2.Add(veax, v128_0x3F8000000);
619+
veax = Sse2.And(veax, v128_0x7FFFFFFF);
620+
vecx = Sse2.And(vecx, v128_0x80000000);
621+
veax = Sse2.Or(veax, vecx);
622+
623+
Vector4 y4 = *(Vector4*)&veax;
624+
625+
y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4)));
626+
y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4)));
627+
y4.W = a;
628+
629+
v4 = y4;
630+
}
631+
632+
return;
633+
}
634+
#else
635+
for (int x = 0; x < length; x++)
636+
{
637+
ref Vector4 v = ref Unsafe.Add(ref vectorsRef, x);
638+
639+
double
640+
x64 = v.X,
641+
y64 = v.Y,
642+
z64 = v.Z;
643+
float a = v.W;
644+
645+
ulong
646+
xl = *(ulong*)&x64,
647+
yl = *(ulong*)&y64,
648+
zl = *(ulong*)&z64;
649+
650+
// Here we use a trick to compute the starting value x0 for the cube root. This is because doing
651+
// pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case,
652+
// this means what we actually want is to find the cube root of our clamped values.
653+
// For more info on the constant below, see:
654+
// https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
655+
// Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and
656+
// store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
657+
// register, and use it to accelerate two steps of the Newton approximation using SIMD.
658+
xl = 0x2a9f8a7be393b600 + (xl / 3);
659+
yl = 0x2a9f8a7be393b600 + (yl / 3);
660+
zl = 0x2a9f8a7be393b600 + (zl / 3);
661+
662+
Vector4 y4;
663+
y4.X = (float)*(double*)&xl;
664+
y4.Y = (float)*(double*)&yl;
665+
y4.Z = (float)*(double*)&zl;
666+
y4.W = 0;
667+
668+
y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
669+
y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
670+
y4.W = a;
671+
672+
v = y4;
673+
}
674+
#endif
675+
}
550676
}
551677
}

src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs

Lines changed: 3 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -331,27 +331,10 @@ public ApplyGamma3ExposureRowOperation(
331331
public void Invoke(int y, Span<Vector4> span)
332332
{
333333
Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
334+
334335
PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
335-
ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);
336-
int length = this.bounds.Width;
337336

338-
for (int x = 0; x < length; x++)
339-
{
340-
ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
341-
Vector4 v = pixel4;
342-
float a = v.W;
343-
344-
// Fast path for the default gamma exposure, which is 3. In this case we can skip
345-
// calling Math.Pow 3 times (one per component), as the method is an internal call and
346-
// introduces quite a bit of overhead. Instead, we can just manually multiply the whole
347-
// pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
348-
// back to the target index in the temporary span. The whole iteration will get completely
349-
// inlined and traslated into vectorized instructions, with much better performance.
350-
v = v * v * v;
351-
v.W = a;
352-
353-
pixel4 = v;
354-
}
337+
Numerics.CubePowOnXYZ(span);
355338

356339
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
357340
}
@@ -438,47 +421,9 @@ public unsafe void Invoke(int y)
438421
ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
439422

440423
Numerics.Clamp(MemoryMarshal.Cast<Vector4, float>(sourceRowSpan), 0, float.PositiveInfinity);
424+
Numerics.CubeRootOnXYZ(sourceRowSpan);
441425

442426
Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
443-
int length = this.bounds.Width;
444-
445-
for (int x = 0; x < length; x++)
446-
{
447-
ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
448-
449-
double
450-
x64 = v.X,
451-
y64 = v.Y,
452-
z64 = v.Z;
453-
float a = v.W;
454-
455-
ulong
456-
xl = *(ulong*)&x64,
457-
yl = *(ulong*)&y64,
458-
zl = *(ulong*)&z64;
459-
460-
// Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
461-
// of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
462-
// constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
463-
// the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set
464-
// these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
465-
// As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
466-
xl = 0x2a9f8a7be393b600 + (xl / 3);
467-
yl = 0x2a9f8a7be393b600 + (yl / 3);
468-
zl = 0x2a9f8a7be393b600 + (zl / 3);
469-
470-
Vector4 y4;
471-
y4.X = (float)*(double*)&xl;
472-
y4.Y = (float)*(double*)&yl;
473-
y4.Z = (float)*(double*)&zl;
474-
y4.W = 0;
475-
476-
y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
477-
y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
478-
y4.W = a;
479-
480-
v = y4;
481-
}
482427

483428
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
484429
}

0 commit comments

Comments
 (0)