33
44using System ;
55using System . Runtime . CompilerServices ;
6+ using System . Runtime . InteropServices ;
7+ #if SUPPORTS_RUNTIME_INTRINSICS
8+ using System . Runtime . Intrinsics ;
9+ using System . Runtime . Intrinsics . X86 ;
10+ #endif
611
712namespace SixLabors . ImageSharp . Formats . Webp . Lossy
813{
914 /// <summary>
1015 /// Quantization methods.
1116 /// </summary>
12- internal static class QuantEnc
17+ internal static unsafe class QuantEnc
1318 {
1419 private static readonly byte [ ] Zigzag = { 0 , 1 , 4 , 8 , 5 , 2 , 3 , 6 , 9 , 12 , 13 , 10 , 7 , 11 , 14 , 15 } ;
1520
1621 private static readonly ushort [ ] WeightY = { 38 , 32 , 20 , 9 , 32 , 28 , 17 , 7 , 20 , 17 , 10 , 4 , 9 , 7 , 4 , 2 } ;
1722
1823 private const int MaxLevel = 2047 ;
1924
25+ #if SUPPORTS_RUNTIME_INTRINSICS
26+ private static readonly Vector128 < short > MaxCoeff2047 = Vector128 . Create ( ( short ) MaxLevel ) ;
27+
28+ private static readonly Vector128 < byte > CstLo = Vector128 . Create ( 0 , 1 , 2 , 3 , 8 , 9 , 254 , 255 , 10 , 11 , 4 , 5 , 6 , 7 , 12 , 13 ) ;
29+
30+ private static readonly Vector128 < byte > Cst7 = Vector128 . Create ( 254 , 255 , 254 , 255 , 254 , 255 , 254 , 255 , 14 , 15 , 254 , 255 , 254 , 255 , 254 , 255 ) ;
31+
32+ private static readonly Vector128 < byte > CstHi = Vector128 . Create ( 2 , 3 , 8 , 9 , 10 , 11 , 4 , 5 , 254 , 255 , 6 , 7 , 12 , 13 , 14 , 15 ) ;
33+
34+ private static readonly Vector128 < byte > Cst8 = Vector128 . Create ( 254 , 255 , 254 , 255 , 254 , 255 , 0 , 1 , 254 , 255 , 254 , 255 , 254 , 255 , 254 , 255 ) ;
35+ #endif
36+
2037 // Diffusion weights. We under-correct a bit (15/16th of the error is actually
2138 // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
2239 private const int C1 = 7 ; // fraction of error sent to the 4x4 block below
@@ -298,14 +315,14 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M
298315 }
299316
300317 Vp8Encoding . FTransformWht ( tmp , dcTmp , scratch ) ;
301- nz |= QuantizeBlock ( dcTmp , rd . YDcLevels , dqm . Y2 ) << 24 ;
318+ nz |= QuantizeBlock ( dcTmp , rd . YDcLevels , ref dqm . Y2 ) << 24 ;
302319
303320 for ( n = 0 ; n < 16 ; n += 2 )
304321 {
305322 // Zero-out the first coeff, so that: a) nz is correct below, and
306323 // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
307324 tmp [ n * 16 ] = tmp [ ( n + 1 ) * 16 ] = 0 ;
308- nz |= Quantize2Blocks ( tmp . Slice ( n * 16 , 32 ) , rd . YAcLevels . AsSpan ( n * 16 , 32 ) , dqm . Y1 ) << n ;
325+ nz |= Quantize2Blocks ( tmp . Slice ( n * 16 , 32 ) , rd . YAcLevels . AsSpan ( n * 16 , 32 ) , ref dqm . Y1 ) << n ;
309326 }
310327
311328 // Transform back.
@@ -326,7 +343,7 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
326343 tmp . Clear ( ) ;
327344 scratch . Clear ( ) ;
328345 Vp8Encoding . FTransform ( src , reference , tmp , scratch ) ;
329- int nz = QuantizeBlock ( tmp , levels , dqm . Y1 ) ;
346+ int nz = QuantizeBlock ( tmp , levels , ref dqm . Y1 ) ;
330347 Vp8Encoding . ITransform ( reference , tmp , yuvOut , false , scratch ) ;
331348
332349 return nz ;
@@ -353,11 +370,11 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
353370 scratch ) ;
354371 }
355372
356- CorrectDcValues ( it , dqm . Uv , tmp , rd ) ;
373+ CorrectDcValues ( it , ref dqm . Uv , tmp , rd ) ;
357374
358375 for ( n = 0 ; n < 8 ; n += 2 )
359376 {
360- nz |= Quantize2Blocks ( tmp . Slice ( n * 16 , 32 ) , rd . UvLevels . AsSpan ( n * 16 , 32 ) , dqm . Uv ) << n ;
377+ nz |= Quantize2Blocks ( tmp . Slice ( n * 16 , 32 ) , rd . UvLevels . AsSpan ( n * 16 , 32 ) , ref dqm . Uv ) << n ;
361378 }
362379
363380 for ( n = 0 ; n < 8 ; n += 2 )
@@ -508,58 +525,155 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
508525 }
509526
/// <summary>
/// Quantizes two consecutive 16-coefficient blocks with the same matrix.
/// </summary>
/// <param name="input">Coefficients of both blocks (elements 0..15 and 16..31); passed on to <see cref="QuantizeBlock"/> one block at a time.</param>
/// <param name="output">Receives the levels of both blocks (elements 0..15 and 16..31).</param>
/// <param name="mtx">The quantization matrix, passed by reference.</param>
/// <returns>A two-bit mask: bit 0 is the result for the first block, bit 1 the result for the second block.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Quantize2Blocks(Span<short> input, Span<short> output, ref Vp8Matrix mtx)
{
    // Each block is handed over as an exact 16-element window.
    int firstBlock = QuantizeBlock(input.Slice(0, 16), output.Slice(0, 16), ref mtx);
    int secondBlock = QuantizeBlock(input.Slice(16, 16), output.Slice(16, 16), ref mtx);

    return firstBlock | (secondBlock << 1);
}
517534
/// <summary>
/// Quantizes one block of 16 coefficients.
/// The levels are written to <paramref name="output"/> in zigzag order and
/// <paramref name="input"/> is overwritten in place with the dequantized values (level * Q).
/// </summary>
/// <param name="input">The 16 coefficients to quantize; replaced by their dequantized values.</param>
/// <param name="output">Receives the 16 quantized levels in zigzag order.</param>
/// <param name="mtx">The quantization matrix (Q, IQ, Bias, ZThresh and Sharpen tables), passed by reference.</param>
/// <returns>1 if at least one quantized level is non-zero, otherwise 0.</returns>
/// <remarks>
/// The vectorized path stores 16 elements to <paramref name="input"/> and <paramref name="output"/>
/// through unchecked references, so both spans must hold at least 16 elements.
/// It does not consult ZThresh; presumably the biased division already yields zero for
/// coefficients at or below that threshold - confirm against the matrix setup.
/// </remarks>
public static int QuantizeBlock(Span<short> input, Span<short> output, ref Vp8Matrix mtx)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse41.IsSupported)
    {
        // Load all inputs.
        Vector128<short> input0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input));
        Vector128<short> input8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input.Slice(8, 8)));
        Vector128<ushort> iq0 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.IQ[0]);
        Vector128<ushort> iq8 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.IQ[8]);
        Vector128<ushort> q0 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.Q[0]);
        Vector128<ushort> q8 = Unsafe.As<ushort, Vector128<ushort>>(ref mtx.Q[8]);

        // coeff = abs(in)
        Vector128<ushort> coeff0 = Ssse3.Abs(input0);
        Vector128<ushort> coeff8 = Ssse3.Abs(input8);

        // coeff = abs(in) + sharpen
        // The sums must be assigned back: Sse2.Add returns a new vector and leaves its operands untouched.
        Vector128<short> sharpen0 = Unsafe.As<short, Vector128<short>>(ref mtx.Sharpen[0]);
        Vector128<short> sharpen8 = Unsafe.As<short, Vector128<short>>(ref mtx.Sharpen[8]);
        coeff0 = Sse2.Add(coeff0.AsInt16(), sharpen0).AsUInt16();
        coeff8 = Sse2.Add(coeff8.AsInt16(), sharpen8).AsUInt16();

        // out = (coeff * iQ + B) >> QFIX
        // doing calculations with 32b precision (QFIX=17)
        // out = (coeff * iQ)
        Vector128<ushort> coeffiQ0H = Sse2.MultiplyHigh(coeff0, iq0);
        Vector128<ushort> coeffiQ0L = Sse2.MultiplyLow(coeff0, iq0);
        Vector128<ushort> coeffiQ8H = Sse2.MultiplyHigh(coeff8, iq8);
        Vector128<ushort> coeffiQ8L = Sse2.MultiplyLow(coeff8, iq8);

        // Interleave the low and high 16-bit halves to form the full 32-bit products.
        Vector128<ushort> out00 = Sse2.UnpackLow(coeffiQ0L, coeffiQ0H);
        Vector128<ushort> out04 = Sse2.UnpackHigh(coeffiQ0L, coeffiQ0H);
        Vector128<ushort> out08 = Sse2.UnpackLow(coeffiQ8L, coeffiQ8H);
        Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);

        // out = (coeff * iQ + B)
        Vector128<uint> bias00 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[0]);
        Vector128<uint> bias04 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[4]);
        Vector128<uint> bias08 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[8]);
        Vector128<uint> bias12 = Unsafe.As<uint, Vector128<uint>>(ref mtx.Bias[12]);
        out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
        out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
        out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
        out12 = Sse2.Add(out12.AsInt32(), bias12.AsInt32()).AsUInt16();

        // out = QUANTDIV(coeff, iQ, B, QFIX)
        out00 = Sse2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16();
        out04 = Sse2.ShiftRightArithmetic(out04.AsInt32(), WebpConstants.QFix).AsUInt16();
        out08 = Sse2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16();
        out12 = Sse2.ShiftRightArithmetic(out12.AsInt32(), WebpConstants.QFix).AsUInt16();

        // pack result as 16b
        Vector128<short> out0 = Sse2.PackSignedSaturate(out00.AsInt32(), out04.AsInt32());
        Vector128<short> out8 = Sse2.PackSignedSaturate(out08.AsInt32(), out12.AsInt32());

        // if (coeff > 2047) coeff = 2047
        out0 = Sse2.Min(out0, MaxCoeff2047);
        out8 = Sse2.Min(out8, MaxCoeff2047);

        // put sign back
        out0 = Ssse3.Sign(out0, input0);
        out8 = Ssse3.Sign(out8, input8);

        // in = out * Q
        input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
        input8 = Sse2.MultiplyLow(out8, q8.AsInt16());

        // Store the dequantized coefficients back into the input span (unchecked 16-element write).
        ref short inputRef = ref MemoryMarshal.GetReference(input);
        Unsafe.As<short, Vector128<short>>(ref inputRef) = input0;
        Unsafe.As<short, Vector128<short>>(ref Unsafe.Add(ref inputRef, 8)) = input8;

        // zigzag the output before storing it. The re-ordering is:
        //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
        // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
        // There are only two misplaced entries ([8] and [7]) that cross the
        // register boundaries.
        // We use pshufb instead of pshuflo/pshufhi.
        Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), CstLo);
        Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Cst7);  // extract #7
        Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), CstHi);
        Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Cst8);  // extract #8
        Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
        Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);

        // Store the zigzag-ordered levels (unchecked 16-element write).
        ref short outputRef = ref MemoryMarshal.GetReference(output);
        Unsafe.As<short, Vector128<short>>(ref outputRef) = outZ0.AsInt16();
        Unsafe.As<short, Vector128<short>>(ref Unsafe.Add(ref outputRef, 8)) = outZ8.AsInt16();

        // Narrow all 16 levels to bytes so a single compare + movemask covers the whole block.
        Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());

        // Detect if all 'out' values are zeros or not.
        Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
        return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
    }
    else
#endif
    {
        // Index (in zigzag order) of the last non-zero level, or -1 when every level is zero.
        int last = -1;
        for (int n = 0; n < 16; ++n)
        {
            int j = Zigzag[n];
            int value = input[j];
            bool sign = value < 0;
            uint coeff = (uint)((sign ? -value : value) + mtx.Sharpen[j]);
            if (coeff > mtx.ZThresh[j])
            {
                uint q = mtx.Q[j];
                uint iQ = mtx.IQ[j];
                uint b = mtx.Bias[j];
                int level = QuantDiv(coeff, iQ, b);
                if (level > MaxLevel)
                {
                    level = MaxLevel;
                }

                if (sign)
                {
                    level = -level;
                }

                // The input keeps the dequantized value, the output the level itself.
                input[j] = (short)(level * (int)q);
                output[n] = (short)level;
                if (level != 0)
                {
                    last = n;
                }
            }
            else
            {
                output[n] = 0;
                input[j] = 0;
            }
        }

        return last >= 0 ? 1 : 0;
    }
}
559673
560674 // Quantize as usual, but also compute and return the quantization error.
561675 // Error is already divided by DSHIFT.
562- public static int QuantizeSingle ( Span < short > v , Vp8Matrix mtx )
676+ public static int QuantizeSingle ( Span < short > v , ref Vp8Matrix mtx )
563677 {
564678 int v0 = v [ 0 ] ;
565679 bool sign = v0 < 0 ;
@@ -580,7 +694,7 @@ public static int QuantizeSingle(Span<short> v, Vp8Matrix mtx)
580694 return ( sign ? - v0 : v0 ) >> DSCALE ;
581695 }
582696
583- public static void CorrectDcValues ( Vp8EncIterator it , Vp8Matrix mtx , Span < short > tmp , Vp8ModeScore rd )
697+ public static void CorrectDcValues ( Vp8EncIterator it , ref Vp8Matrix mtx , Span < short > tmp , Vp8ModeScore rd )
584698 {
585699#pragma warning disable SA1005 // Single line comments should begin with single space
586700 // | top[0] | top[1]
@@ -597,13 +711,13 @@ public static void CorrectDcValues(Vp8EncIterator it, Vp8Matrix mtx, Span<short>
597711 Span < sbyte > left = it . LeftDerr . AsSpan ( ch , 2 ) ;
598712 Span < short > c = tmp . Slice ( ch * 4 * 16 , 4 * 16 ) ;
599713 c [ 0 ] += ( short ) ( ( ( C1 * top [ 0 ] ) + ( C2 * left [ 0 ] ) ) >> ( DSHIFT - DSCALE ) ) ;
600- int err0 = QuantizeSingle ( c , mtx ) ;
714+ int err0 = QuantizeSingle ( c , ref mtx ) ;
601715 c [ 1 * 16 ] += ( short ) ( ( ( C1 * top [ 1 ] ) + ( C2 * err0 ) ) >> ( DSHIFT - DSCALE ) ) ;
602- int err1 = QuantizeSingle ( c . Slice ( 1 * 16 ) , mtx ) ;
716+ int err1 = QuantizeSingle ( c . Slice ( 1 * 16 ) , ref mtx ) ;
603717 c [ 2 * 16 ] += ( short ) ( ( ( C1 * err0 ) + ( C2 * left [ 1 ] ) ) >> ( DSHIFT - DSCALE ) ) ;
604- int err2 = QuantizeSingle ( c . Slice ( 2 * 16 ) , mtx ) ;
718+ int err2 = QuantizeSingle ( c . Slice ( 2 * 16 ) , ref mtx ) ;
605719 c [ 3 * 16 ] += ( short ) ( ( ( C1 * err1 ) + ( C2 * err2 ) ) >> ( DSHIFT - DSCALE ) ) ;
606- int err3 = QuantizeSingle ( c . Slice ( 3 * 16 ) , mtx ) ;
720+ int err3 = QuantizeSingle ( c . Slice ( 3 * 16 ) , ref mtx ) ;
607721
608722 rd . Derr [ ch , 0 ] = err1 ;
609723 rd . Derr [ ch , 1 ] = err2 ;
0 commit comments