@@ -15,97 +15,43 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
1515{
1616 internal static class RgbToYCbCrConverterVectorized
1717 {
18- private static ReadOnlySpan < byte > ExtractionMasks => new byte [ ]
19- {
20- 0x0 , 0xFF , 0xFF , 0xFF , 0x1 , 0xFF , 0xFF , 0xFF , 0x2 , 0xFF , 0xFF , 0xFF , 0x3 , 0xFF , 0xFF , 0xFF , 0x10 , 0xFF , 0xFF , 0xFF , 0x11 , 0xFF , 0xFF , 0xFF , 0x12 , 0xFF , 0xFF , 0xFF , 0x13 , 0xFF , 0xFF , 0xFF ,
21- 0x4 , 0xFF , 0xFF , 0xFF , 0x5 , 0xFF , 0xFF , 0xFF , 0x6 , 0xFF , 0xFF , 0xFF , 0x7 , 0xFF , 0xFF , 0xFF , 0x14 , 0xFF , 0xFF , 0xFF , 0x15 , 0xFF , 0xFF , 0xFF , 0x16 , 0xFF , 0xFF , 0xFF , 0x17 , 0xFF , 0xFF , 0xFF ,
22- 0x8 , 0xFF , 0xFF , 0xFF , 0x9 , 0xFF , 0xFF , 0xFF , 0xA , 0xFF , 0xFF , 0xFF , 0xB , 0xFF , 0xFF , 0xFF , 0x18 , 0xFF , 0xFF , 0xFF , 0x19 , 0xFF , 0xFF , 0xFF , 0x1A , 0xFF , 0xFF , 0xFF , 0x1B , 0xFF , 0xFF , 0xFF ,
23- 0xC , 0xFF , 0xFF , 0xFF , 0xD , 0xFF , 0xFF , 0xFF , 0xE , 0xFF , 0xFF , 0xFF , 0xF , 0xFF , 0xFF , 0xFF , 0x1C , 0xFF , 0xFF , 0xFF , 0x1D , 0xFF , 0xFF , 0xFF , 0x1E , 0xFF , 0xFF , 0xFF , 0x1F , 0xFF , 0xFF , 0xFF ,
24- } ;
25-
2618 public static bool IsSupported
2719 {
2820 get
2921 {
3022#if SUPPORTS_RUNTIME_INTRINSICS
31- return Avx2 . IsSupported && Fma . IsSupported ;
23+ return Avx2 . IsSupported ;
3224#else
3325 return false ;
3426#endif
3527 }
3628 }
3729
38- public static void Convert ( ReadOnlySpan < Rgb24 > rgbSpan , ref Block8x8F yBlock , ref Block8x8F cbBlock , ref Block8x8F crBlock )
39- {
40- Debug . Assert ( IsSupported , "AVX2 and FMA are required to run this converter" ) ;
41-
4230#if SUPPORTS_RUNTIME_INTRINSICS
43- SeparateRgb ( rgbSpan ) ;
44- ConvertInternal ( rgbSpan , ref yBlock , ref cbBlock , ref crBlock ) ;
45- #endif
46- }
47-
48- #if SUPPORTS_RUNTIME_INTRINSICS
49- /// <summary>
50- /// Rearranges the provided <paramref name="rgbSpan"/> in-place
51- /// from { r00, g00, b00, ..., r63, g63, b63 }
52- /// to { r00, ... r31, g00, ..., g31, b00, ..., b31,
53- /// r32, ... r63, g32, ..., g63, b31, ..., b63 }
54- /// </summary>
55- /// <remarks>
56- /// SSE is used for this operation as it is significantly faster than AVX in this specific case.
57- /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers.
58- /// </remarks>
59- [ MethodImpl ( InliningOptions . ShortMethod ) ]
60- private static void SeparateRgb ( ReadOnlySpan < Rgb24 > rgbSpan )
31+ private static ReadOnlySpan < byte > MoveFirst24BytesToSeparateLanes => new byte [ ]
6132 {
62- var selectRed0 = Vector128 . Create ( 0x00 , 0x03 , 0x06 , 0x09 , 0x0C , 0x0F , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
63- var selectRed1 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x02 , 0x05 , 0x08 , 0x0B , 0x0E , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
64- var selectRed2 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x01 , 0x04 , 0x07 , 0x0A , 0x0D ) ;
65-
66- var selectGreen0 = Vector128 . Create ( 0x01 , 0x04 , 0x07 , 0x0A , 0x0D , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
67- var selectGreen1 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x00 , 0x03 , 0x06 , 0x09 , 0x0C , 0x0F , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
68- var selectGreen2 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x02 , 0x05 , 0x08 , 0x0B , 0x0E ) ;
33+ 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 ,
34+ 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0
35+ } ;
6936
70- var selectBlue0 = Vector128 . Create ( 0x02 , 0x05 , 0x08 , 0x0B , 0x0E , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
71- var selectBlue1 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x01 , 0x04 , 0x07 , 0x0A , 0x0D , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
72- var selectBlue2 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x00 , 0x03 , 0x06 , 0x09 , 0x0C , 0x0F ) ;
37+ private static ReadOnlySpan < byte > MoveLast24BytesToSeparateLanes => new byte [ ]
38+ {
39+ 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
40+ 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 , 1 , 0 , 0 , 0
41+ } ;
7342
74- for ( int i = 0 ; i < 2 ; i ++ )
75- {
76- ref Vector128 < byte > inRef = ref Unsafe . Add ( ref Unsafe . As < Rgb24 , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( rgbSpan ) ) , i * 6 ) ;
77-
78- Vector128 < byte > in0 = inRef ;
79- Vector128 < byte > in1 = Unsafe . Add ( ref inRef , 1 ) ;
80- Vector128 < byte > in2 = Unsafe . Add ( ref inRef , 2 ) ;
81-
82- Vector128 < byte > r0 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectRed0 ) , Ssse3 . Shuffle ( in1 , selectRed1 ) ) , Ssse3 . Shuffle ( in2 , selectRed2 ) ) ;
83- Vector128 < byte > g0 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectGreen0 ) , Ssse3 . Shuffle ( in1 , selectGreen1 ) ) , Ssse3 . Shuffle ( in2 , selectGreen2 ) ) ;
84- Vector128 < byte > b0 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectBlue0 ) , Ssse3 . Shuffle ( in1 , selectBlue1 ) ) , Ssse3 . Shuffle ( in2 , selectBlue2 ) ) ;
85-
86- in0 = Unsafe . Add ( ref inRef , 3 ) ;
87- in1 = Unsafe . Add ( ref inRef , 4 ) ;
88- in2 = Unsafe . Add ( ref inRef , 5 ) ;
89-
90- Vector128 < byte > r1 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectRed0 ) , Ssse3 . Shuffle ( in1 , selectRed1 ) ) , Ssse3 . Shuffle ( in2 , selectRed2 ) ) ;
91- Vector128 < byte > g1 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectGreen0 ) , Ssse3 . Shuffle ( in1 , selectGreen1 ) ) , Ssse3 . Shuffle ( in2 , selectGreen2 ) ) ;
92- Vector128 < byte > b1 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectBlue0 ) , Ssse3 . Shuffle ( in1 , selectBlue1 ) ) , Ssse3 . Shuffle ( in2 , selectBlue2 ) ) ;
93-
94- inRef = r0 ;
95- Unsafe . Add ( ref inRef , 1 ) = r1;
96- Unsafe . Add ( ref inRef , 2 ) = g0;
97- Unsafe . Add ( ref inRef , 3 ) = g1;
98- Unsafe . Add ( ref inRef , 4 ) = b0;
99- Unsafe . Add ( ref inRef , 5 ) = b1;
100- }
101- }
43+ private static ReadOnlySpan < byte > ExtractRgb => new byte [ ]
44+ {
45+ 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF ,
46+ 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF
47+ } ;
48+ #endif
10249
103- /// <summary>
104- /// Converts the previously separated (see <see cref="SeparateRgb"/>) RGB values to YCbCr using AVX2 and FMA.
105- /// </summary>
106- [ MethodImpl ( InliningOptions . ShortMethod ) ]
107- private static void ConvertInternal ( ReadOnlySpan < Rgb24 > rgbSpan , ref Block8x8F yBlock , ref Block8x8F cbBlock , ref Block8x8F crBlock )
50+ public static void Convert ( ReadOnlySpan < Rgb24 > rgbSpan , ref Block8x8F yBlock , ref Block8x8F cbBlock , ref Block8x8F crBlock )
10851 {
52+ Debug . Assert ( IsSupported , "AVX2 is required to run this converter" ) ;
53+
54+ #if SUPPORTS_RUNTIME_INTRINSICS
10955 var f0299 = Vector256 . Create ( 0.299f ) ;
11056 var f0587 = Vector256 . Create ( 0.587f ) ;
11157 var f0114 = Vector256 . Create ( 0.114f ) ;
@@ -115,68 +61,60 @@ private static void ConvertInternal(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F y
11561 var fn0418688 = Vector256 . Create ( - 0.418688f ) ;
11662 var fn0081312F = Vector256 . Create ( - 0.081312F ) ;
11763 var f05 = Vector256 . Create ( 0.5f ) ;
64+ var zero = Vector256 . Create ( 0 ) . AsByte ( ) ;
11865
11966 ref Vector256 < byte > inRef = ref Unsafe . As < Rgb24 , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( rgbSpan ) ) ;
120-
121- for ( int i = 0 ; i < 2 ; i ++ )
67+ ref Vector256 < float > destYRef = ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref yBlock ) ;
68+ ref Vector256 < float > destCbRef = ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref cbBlock ) ;
69+ ref Vector256 < float > destCrRef = ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref crBlock ) ;
70+
71+ var extractToLanesMask = Unsafe . As < byte , Vector256 < uint > > ( ref MemoryMarshal . GetReference ( MoveFirst24BytesToSeparateLanes ) ) ;
72+ var extractRgbMask = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractRgb ) ) ;
73+ Vector256 < byte > rgb , rg , bx ;
74+ Vector256 < float > r , g , b ;
75+ for ( int i = 0 ; i < 7 ; i ++ )
12276 {
123- ref Vector256 < float > destYRef = ref Unsafe . Add ( ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref yBlock ) , i * 4 ) ;
124- ref Vector256 < float > destCbRef = ref Unsafe . Add ( ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref cbBlock ) , i * 4 ) ;
125- ref Vector256 < float > destCrRef = ref Unsafe . Add ( ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref crBlock ) , i * 4 ) ;
126-
127- Vector256 < byte > red = Unsafe . Add ( ref inRef , i * 3 ) ;
128- Vector256 < byte > green = Unsafe . Add ( ref inRef , ( i * 3 ) + 1 ) ;
129- Vector256 < byte > blue = Unsafe . Add ( ref inRef , ( i * 3 ) + 2 ) ;
77+ rgb = Avx2 . PermuteVar8x32 ( Unsafe . AddByteOffset ( ref inRef , ( IntPtr ) ( 24 * i ) ) . AsUInt32 ( ) , extractToLanesMask ) . AsByte ( ) ;
13078
131- for ( int j = 0 ; j < 2 ; j ++ )
132- {
133- // 1st part of unrolled loop
134- Vector256 < byte > mask = Unsafe . Add ( ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractionMasks ) ) , j * 2 ) ;
79+ rgb = Avx2 . Shuffle ( rgb , extractRgbMask ) ;
13580
136- Vector256 < float > r = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( red , mask ) . AsInt32 ( ) ) ;
137- Vector256 < float > g = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( green , mask ) . AsInt32 ( ) ) ;
138- Vector256 < float > b = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( blue , mask ) . AsInt32 ( ) ) ;
81+ rg = Avx2 . UnpackLow ( rgb , zero ) ;
82+ bx = Avx2 . UnpackHigh ( rgb , zero ) ;
13983
140- // (0.299F * r) + (0.587F * g) + (0.114F * b);
141- Vector256 < float > yy0 = Fma . MultiplyAdd ( f0299 , r , Fma . MultiplyAdd ( f0587 , g , Avx . Multiply ( f0114 , b ) ) ) ;
84+ r = Avx . ConvertToVector256Single ( Avx2 . UnpackLow ( rg , zero ) . AsInt32 ( ) ) ;
85+ g = Avx . ConvertToVector256Single ( Avx2 . UnpackHigh ( rg , zero ) . AsInt32 ( ) ) ;
86+ b = Avx . ConvertToVector256Single ( Avx2 . UnpackLow ( bx , zero ) . AsInt32 ( ) ) ;
14287
143- // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
144- Vector256 < float > cb0 = Avx . Add ( f128 , Fma . MultiplyAdd ( fn0168736 , r , Fma . MultiplyAdd ( fn0331264 , g , Avx . Multiply ( f05 , b ) ) ) ) ;
88+ // (0.299F * r) + (0.587F * g) + (0.114F * b);
89+ Unsafe . Add ( ref destYRef , i ) = SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( f0114 , b ) , f0587 , g ) , f0299 , r ) ;
14590
146- // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
147- Vector256 < float > cr0 = Avx . Add ( f128 , Fma . MultiplyAdd ( f05 , r , Fma . MultiplyAdd ( fn0418688 , g , Avx . Multiply ( fn0081312F , b ) ) ) ) ;
91+ // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
92+ Unsafe . Add ( ref destCbRef , i ) = Avx. Add ( f128 , SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( f05 , b ) , fn0331264 , g ) , fn0168736 , r ) ) ;
14893
149- // 2nd part of unrolled loop
150- mask = Unsafe . Add ( ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractionMasks ) ) , ( j * 2 ) + 1 ) ;
151-
152- r = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( red , mask ) . AsInt32 ( ) ) ;
153- g = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( green , mask ) . AsInt32 ( ) ) ;
154- b = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( blue , mask ) . AsInt32 ( ) ) ;
94+ // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
95+ Unsafe . Add ( ref destCrRef , i ) = Avx. Add ( f128 , SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( fn0081312F , b ) , fn0418688 , g ) , f05 , r ) ) ;
96+ }
15597
156- // (0.299F * r) + (0.587F * g) + (0.114F * b);
157- Vector256 < float > yy1 = Fma . MultiplyAdd ( f0299 , r , Fma . MultiplyAdd ( f0587 , g , Avx . Multiply ( f0114 , b ) ) ) ;
98+ extractToLanesMask = Unsafe . As < byte , Vector256 < uint > > ( ref MemoryMarshal . GetReference ( MoveLast24BytesToSeparateLanes ) ) ;
99+ rgb = Avx2 . PermuteVar8x32 ( Unsafe . AddByteOffset ( ref inRef , ( IntPtr ) 160 ) . AsUInt32 ( ) , extractToLanesMask ) . AsByte ( ) ;
100+ rgb = Avx2 . Shuffle ( rgb , extractRgbMask ) ;
158101
159- // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
160- Vector256 < float > cb1 = Avx . Add ( f128 , Fma . MultiplyAdd ( fn0168736 , r , Fma . MultiplyAdd ( fn0331264 , g , Avx . Multiply ( f05 , b ) ) ) ) ;
102+ rg = Avx2 . UnpackLow ( rgb , zero ) ;
103+ bx = Avx2 . UnpackHigh ( rgb , zero ) ;
161104
162- // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
163- Vector256 < float > cr1 = Avx . Add ( f128 , Fma . MultiplyAdd ( f05 , r , Fma . MultiplyAdd ( fn0418688 , g , Avx . Multiply ( fn0081312F , b ) ) ) ) ;
105+ r = Avx . ConvertToVector256Single ( Avx2 . UnpackLow ( rg , zero ) . AsInt32 ( ) ) ;
106+ g = Avx . ConvertToVector256Single ( Avx2 . UnpackHigh ( rg , zero ) . AsInt32 ( ) ) ;
107+ b = Avx . ConvertToVector256Single ( Avx2 . UnpackLow ( bx , zero ) . AsInt32 ( ) ) ;
164108
165- // store results from 1st and 2nd part
166- Vector256 < float > tmpY = Avx . Permute2x128 ( yy0 , yy1 , 0b0010_0001 ) ;
167- Unsafe . Add ( ref destYRef , j ) = Avx. Blend ( yy0 , tmpY , 0b1111_0000 ) ;
168- Unsafe . Add ( ref destYRef , j + 2 ) = Avx. Blend ( yy1 , tmpY , 0b0000_1111 ) ;
109+ // (0.299F * r) + (0.587F * g) + (0.114F * b);
110+ Unsafe . Add ( ref destYRef , 7 ) = SimdUtils. HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( f0114 , b ) , f0587 , g ) , f0299 , r ) ;
169111
170- Vector256 < float > tmpCb = Avx . Permute2x128 ( cb0 , cb1 , 0b0010_0001 ) ;
171- Unsafe . Add ( ref destCbRef , j ) = Avx. Blend ( cb0 , tmpCb , 0b1111_0000 ) ;
172- Unsafe . Add ( ref destCbRef , j + 2 ) = Avx. Blend ( cb1 , tmpCb , 0b0000_1111 ) ;
112+ // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
113+ Unsafe . Add ( ref destCbRef , 7 ) = Avx. Add ( f128 , SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( f05 , b ) , fn0331264 , g ) , fn0168736 , r ) ) ;
173114
174- Vector256 < float > tmpCr = Avx . Permute2x128 ( cr0 , cr1 , 0b0010_0001 ) ;
175- Unsafe . Add ( ref destCrRef , j ) = Avx. Blend ( cr0 , tmpCr , 0b1111_0000 ) ;
176- Unsafe . Add ( ref destCrRef , j + 2 ) = Avx. Blend ( cr1 , tmpCr , 0b0000_1111 ) ;
177- }
178- }
179- }
115+ // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
116+ Unsafe . Add ( ref destCrRef , 7 ) = Avx. Add ( f128 , SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( fn0081312F , b ) , fn0418688 , g ) , f05 , r ) ) ;
180117#endif
118+ }
181119 }
182120}
0 commit comments