44using System ;
55using System . Buffers . Binary ;
66using System . Runtime . CompilerServices ;
7+ using System . Runtime . InteropServices ;
8+ #if SUPPORTS_RUNTIME_INTRINSICS
9+ using System . Runtime . Intrinsics ;
10+ using System . Runtime . Intrinsics . X86 ;
11+ #endif
712
813// ReSharper disable InconsistentNaming
914namespace SixLabors . ImageSharp . Formats . Webp . Lossy
1015{
11- internal static class LossyUtils
16+ internal static unsafe class LossyUtils
1217 {
/// <summary>
/// Computes the sum of squared errors between two 16x16 pixel blocks.
/// </summary>
/// <param name="a">First block of pixel data.</param>
/// <param name="b">Second block of pixel data.</param>
/// <returns>The sum of squared differences over the 16x16 area.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b)
{
    return GetSse(a, b, 16, 16);
}
@@ -61,11 +66,12 @@ public static void Copy(Span<byte> src, Span<byte> dst, int w, int h)
/// <summary>
/// Accumulates the 4x4 distortion metric over a 16x16 macroblock by visiting
/// each of the sixteen 4x4 sub-blocks of <paramref name="a"/> and <paramref name="b"/>.
/// </summary>
/// <param name="a">First block of pixel data; rows are WebpConstants.Bps bytes apart.</param>
/// <param name="b">Second block of pixel data; rows are WebpConstants.Bps bytes apart.</param>
/// <param name="w">Row-major 4x4 symmetric weight matrix shared by all sub-blocks.</param>
/// <param name="scratch">Scratch buffer passed through to the 4x4 transform.</param>
/// <returns>The summed distortion of all sixteen 4x4 sub-blocks.</returns>
public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
{
    // A 4x4 sub-block spans four rows; the slice length covers from the
    // sub-block's first pixel up to the end of its last row.
    int sliceLength = (4 * WebpConstants.Bps) - 16;
    int rowStride = 4 * WebpConstants.Bps;
    int total = 0;

    for (int rowOffset = 0; rowOffset < 16 * WebpConstants.Bps; rowOffset += rowStride)
    {
        for (int col = 0; col < 16; col += 4)
        {
            int offset = rowOffset + col;
            total += Vp8Disto4X4(a.Slice(offset, sliceLength), b.Slice(offset, sliceLength), w, scratch);
        }
    }

    return total;
}
@@ -75,9 +81,19 @@ public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w, Span
/// <summary>
/// Computes the distortion between two 4x4 blocks as the scaled absolute
/// difference of their weighted Hadamard-transform coefficient sums.
/// Uses the SSE4.1 fused path when hardware intrinsics are available,
/// otherwise falls back to two scalar transforms.
/// </summary>
/// <param name="a">First 4x4 pixel block.</param>
/// <param name="b">Second 4x4 pixel block.</param>
/// <param name="w">Row-major 4x4 symmetric weight matrix.</param>
/// <param name="scratch">Scratch buffer used by the transform.</param>
/// <returns>The distortion value, right-shifted by 5.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse41.IsSupported)
    {
        // The SSE4.1 path transforms both blocks at once and returns the
        // difference of the weighted sums directly.
        return Math.Abs(TTransformSse41(a, b, w, scratch)) >> 5;
    }
#endif

    int weightedSumA = TTransform(a, w, scratch);
    int weightedSumB = TTransform(b, w, scratch);
    return Math.Abs(weightedSumB - weightedSumA) >> 5;
}
8298
8399 public static void DC16 ( Span < byte > dst , Span < byte > yuv , int offset )
@@ -589,6 +605,127 @@ public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch
589605 return sum ;
590606 }
591607
#if SUPPORTS_RUNTIME_INTRINSICS
/// <summary>
/// Hadamard transform (SSE4.1 path).
/// Transforms both 4x4 input blocks in one pass (two transforms in parallel,
/// A in the low 64 bits of each vector, B in the high 64 bits) and returns
/// the difference of the weighted sums of the absolute values of their
/// transformed coefficients.
/// w[] contains a row-major 4 by 4 symmetric matrix.
/// </summary>
/// <param name="inputA">First 4x4 pixel block; rows are WebpConstants.Bps bytes apart.</param>
/// <param name="inputB">Second 4x4 pixel block; rows are WebpConstants.Bps bytes apart.</param>
/// <param name="w">Row-major 4x4 symmetric weight matrix (16 entries).</param>
/// <param name="scratch">Scratch buffer; the first 4 ints are overwritten.</param>
/// <returns>sum(|T(A)| . w) - sum(|T(B)| . w).</returns>
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
{
    Span<int> sum = scratch.Slice(0, 4);
    sum.Clear();

    // Load the four rows of each block. Only the first 4 bytes of each 16-byte
    // load are meaningful; the rest is discarded by the unpack below.
    // NOTE(review): Unsafe.As performs a potentially unaligned vector read here;
    // Unsafe.ReadUnaligned would state that intent explicitly - confirm codegen.
    Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
    Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
    Vector128<byte> ina2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 2, 16)));
    Vector128<byte> ina3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 3, 16)));
    Vector128<byte> inb0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB));
    Vector128<byte> inb1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps, 16)));
    Vector128<byte> inb2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 2, 16)));
    Vector128<byte> inb3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16)));

    // Combine inA and inB (we'll do two transforms in parallel):
    // row r becomes [a(r,0..3) b(r,0..3) ...] and is widened to 8 shorts.
    Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
    Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
    Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
    Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
    Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
    Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
    Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
    Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());

    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent transpose.
    // Calculate a and b (two 4x4 at once).
    Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
    Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
    Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
    Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
    Vector128<short> b0 = Sse2.Add(a0, a1);
    Vector128<short> b1 = Sse2.Add(a3, a2);
    Vector128<short> b2 = Sse2.Subtract(a3, a2);
    Vector128<short> b3 = Sse2.Subtract(a0, a1);

    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    // Transpose the two 4x4.
    Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
    Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
    Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
    Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);

    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
    Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
    Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
    Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());

    // a00 a10 a20 a30   a01 a11 a21 a31
    // b00 b10 b20 b30   b01 b11 b21 b31
    // a02 a12 a22 a32   a03 a13 a23 a33
    // b02 b12 b22 b32   b03 b13 b23 b33
    Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
    Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
    Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
    Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());

    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
    // Horizontal pass and difference of weighted sums.
    Vector128<ushort> w0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w));
    Vector128<ushort> w8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w.Slice(8, 8)));

    // Calculate a and b (two 4x4 at once).
    a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
    a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
    a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
    a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
    b0 = Sse2.Add(a0, a1);
    b1 = Sse2.Add(a3, a2);
    b2 = Sse2.Subtract(a3, a2);
    b3 = Sse2.Subtract(a0, a1);

    // Separate the transforms of inA (low halves) and inB (high halves).
    Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
    Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
    Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
    Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());

    Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
    Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
    Vector128<ushort> bb0Abs = Ssse3.Abs(bb0.AsInt16());
    Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());

    // Weighted sums: pmaddwd multiplies adjacent short pairs and adds them into ints.
    Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
    Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
    Vector128<int> bb0mulw0 = Sse2.MultiplyAddAdjacent(bb0Abs.AsInt16(), w0.AsInt16());
    Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
    Vector128<int> aWeightedSum = Sse2.Add(ab0mulw0, ab2mulw8);
    Vector128<int> bWeightedSum = Sse2.Add(bb0mulw0, bb2mulw8);

    // Difference of weighted sums.
    Vector128<int> result = Sse2.Subtract(aWeightedSum, bWeightedSum);

    // Spill the four partial differences and reduce them on the scalar side.
    ref int outputRef = ref MemoryMarshal.GetReference(sum);
    Unsafe.As<int, Vector128<int>>(ref outputRef) = result;
    return sum[3] + sum[2] + sum[1] + sum[0];
}
#endif
728+
592729 public static void TransformTwo ( Span < short > src , Span < byte > dst , Span < int > scratch )
593730 {
594731 TransformOne ( src , dst , scratch ) ;
0 commit comments