77using System . Runtime . InteropServices ;
88using System . Runtime . Intrinsics ;
99using System . Runtime . Intrinsics . X86 ;
10+ using SixLabors . ImageSharp . PixelFormats ;
1011
1112namespace SixLabors . ImageSharp
1213{
@@ -22,6 +23,20 @@ public static class HwIntrinsics
2223
/// <summary>
/// Gets sixteen byte-shuffle control values: the indices 0-2, 4-6, 8-10 and 12-14
/// (every fourth index is skipped), followed by four <c>0x80</c> entries.
/// </summary>
/// <remarks>
/// NOTE(review): with a byte shuffle, a control byte of 0x80 presumably zeroes the output byte;
/// the consumer of this mask is not visible here — confirm at the use site.
/// </remarks>
private static ReadOnlySpan<byte> ShuffleMaskSlice4Nx16
    => new byte[]
    {
        0, 1, 2,
        4, 5, 6,
        8, 9, 10,
        12, 13, 14,
        0x80, 0x80, 0x80, 0x80
    };
2425
/// <summary>
/// Gets a 32-byte shuffle control made of two identical 16-byte halves.
/// Within each half the twelve indices 0-2, 4-6, 8-10, 12-14 come first and the
/// indices 3, 7, 11, 15 (every fourth byte) are moved to the end.
/// </summary>
private static ReadOnlySpan<byte> ShuffleMaskShiftAlpha => new byte[]
{
    // First 16-byte half: three-byte groups first, every fourth byte last.
    0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
    3, 7, 11, 15,

    // Second 16-byte half: same pattern.
    0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
    3, 7, 11, 15
};
32+
/// <summary>
/// Gets 32 bytes that, read as eight little-endian 32-bit values, form the index
/// sequence 0, 1, 2, 4, 5, 6, 3, 7. The AVX2 packing code in this class reinterprets
/// them as a <c>Vector256&lt;uint&gt;</c> permute control.
/// </summary>
public static ReadOnlySpan<byte> PermuteMaskShiftAlpha8x32 => new byte[]
{
    0, 0, 0, 0,
    1, 0, 0, 0,
    2, 0, 0, 0,
    4, 0, 0, 0,
    5, 0, 0, 0,
    6, 0, 0, 0,
    3, 0, 0, 0,
    7, 0, 0, 0
};
39+
2540 /// <summary>
2641 /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
2742 /// using the control and store the results in <paramref name="dest"/>.
@@ -789,6 +804,138 @@ internal static void NormalizedFloatToByteSaturate(
789804 }
790805 }
791806 }
807+
/// <summary>
/// Interleaves the three planar byte channels into packed <see cref="Rgb24"/> pixels using AVX2,
/// 32 pixels per iteration, for as many whole 32-byte groups as <paramref name="redChannel"/> holds.
/// On return all four spans are advanced past the processed elements so the caller can
/// deal with whatever is left.
/// </summary>
/// <param name="redChannel">Red plane; its length determines the number of iterations.</param>
/// <param name="greenChannel">Green plane, read at the same positions as <paramref name="redChannel"/>.</param>
/// <param name="blueChannel">Blue plane, read at the same positions as <paramref name="redChannel"/>.</param>
/// <param name="destination">Receives the packed pixels.</param>
/// <remarks>
/// Every iteration performs four 32-byte stores at byte offsets 0, 24, 48 and 72 of a 96-byte
/// output group, so the last store reaches 8 bytes beyond the group it belongs to.
/// NOTE(review): no length or hardware checks are made here; callers presumably guarantee AVX2
/// support, equally sized planes and spare room at the end of <paramref name="destination"/> — confirm.
/// </remarks>
internal static void PackFromRgbPlanesAvx2Reduce(
    ref ReadOnlySpan<byte> redChannel,
    ref ReadOnlySpan<byte> greenChannel,
    ref ReadOnlySpan<byte> blueChannel,
    ref Span<Rgb24> destination)
{
    ref Vector256<byte> redVectors = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(redChannel));
    ref Vector256<byte> greenVectors = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(greenChannel));
    ref Vector256<byte> blueVectors = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(blueChannel));
    ref byte destBytes = ref Unsafe.As<Rgb24, byte>(ref MemoryMarshal.GetReference(destination));

    int vectorCount = redChannel.Length / Vector256<byte>.Count;

    // Constant controls, reinterpreted straight from their byte tables.
    Vector256<uint> evenOddControl = Unsafe.As<byte, Vector256<uint>>(
        ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskEvenOdd8x32));
    Vector256<uint> compactControl = Unsafe.As<byte, Vector256<uint>>(
        ref MemoryMarshal.GetReference(PermuteMaskShiftAlpha8x32));
    Vector256<byte> fillerToTail = Unsafe.As<byte, Vector256<byte>>(
        ref MemoryMarshal.GetReference(ShuffleMaskShiftAlpha));

    // Fourth byte interleaved next to blue; it is squeezed out again before storing.
    Vector256<byte> filler = Vector256.Create((byte)255);

    for (int n = 0; n < vectorCount; n++)
    {
        // Load one vector per plane and reorder its 32-bit elements ahead of the unpacks.
        Vector256<byte> r = Avx2.PermuteVar8x32(Unsafe.Add(ref redVectors, n).AsUInt32(), evenOddControl).AsByte();
        Vector256<byte> g = Avx2.PermuteVar8x32(Unsafe.Add(ref greenVectors, n).AsUInt32(), evenOddControl).AsByte();
        Vector256<byte> b = Avx2.PermuteVar8x32(Unsafe.Add(ref blueVectors, n).AsUInt32(), evenOddControl).AsByte();

        // Byte-wise interleave: (r,g) pairs and (b,filler) pairs.
        Vector256<ushort> rgLow = Avx2.UnpackLow(r, g).AsUInt16();
        Vector256<ushort> bxLow = Avx2.UnpackLow(b, filler).AsUInt16();
        Vector256<ushort> rgHigh = Avx2.UnpackHigh(r, g).AsUInt16();
        Vector256<ushort> bxHigh = Avx2.UnpackHigh(b, filler).AsUInt16();

        // 16-bit interleave of the pairs yields four-byte (r,g,b,filler) groups.
        Vector256<byte> px0 = Avx2.UnpackLow(rgLow, bxLow).AsByte();
        Vector256<byte> px1 = Avx2.UnpackHigh(rgLow, bxLow).AsByte();
        Vector256<byte> px2 = Avx2.UnpackLow(rgHigh, bxHigh).AsByte();
        Vector256<byte> px3 = Avx2.UnpackHigh(rgHigh, bxHigh).AsByte();

        // The shuffle pushes the filler bytes to the end of each 128-bit half; the permute
        // then gathers the six colour dwords at the front, leaving 24 packed bytes + 8 filler.
        px0 = Avx2.PermuteVar8x32(Avx2.Shuffle(px0, fillerToTail).AsUInt32(), compactControl).AsByte();
        px1 = Avx2.PermuteVar8x32(Avx2.Shuffle(px1, fillerToTail).AsUInt32(), compactControl).AsByte();
        px2 = Avx2.PermuteVar8x32(Avx2.Shuffle(px2, fillerToTail).AsUInt32(), compactControl).AsByte();
        px3 = Avx2.PermuteVar8x32(Avx2.Shuffle(px3, fillerToTail).AsUInt32(), compactControl).AsByte();

        // Stores overlap by 8 bytes and must stay in ascending order: each later store
        // overwrites the trailing filler bytes written by the previous one.
        ref byte group = ref Unsafe.Add(ref destBytes, 24 * 4 * n);
        Unsafe.As<byte, Vector256<byte>>(ref group) = px0;
        Unsafe.As<byte, Vector256<byte>>(ref Unsafe.Add(ref group, 24)) = px1;
        Unsafe.As<byte, Vector256<byte>>(ref Unsafe.Add(ref group, 48)) = px2;
        Unsafe.As<byte, Vector256<byte>>(ref Unsafe.Add(ref group, 72)) = px3;
    }

    // Hand the unprocessed tail back to the caller through the ref spans.
    int processed = vectorCount * Vector256<byte>.Count;
    redChannel = redChannel.Slice(processed);
    greenChannel = greenChannel.Slice(processed);
    blueChannel = blueChannel.Slice(processed);
    destination = destination.Slice(processed);
}
880+
/// <summary>
/// Interleaves the three planar byte channels into packed <see cref="Rgba32"/> pixels using AVX2,
/// 32 pixels per iteration, for as many whole 32-byte groups as <paramref name="redChannel"/> holds.
/// The fourth byte of every pixel is set to 255. On return all four spans are advanced past the
/// processed elements so the caller can deal with whatever is left.
/// </summary>
/// <param name="redChannel">Red plane; its length determines the number of iterations.</param>
/// <param name="greenChannel">Green plane, read at the same positions as <paramref name="redChannel"/>.</param>
/// <param name="blueChannel">Blue plane, read at the same positions as <paramref name="redChannel"/>.</param>
/// <param name="destination">Receives the packed pixels; four 32-byte vectors are written per iteration.</param>
/// <remarks>
/// NOTE(review): no length or hardware checks are made here; callers presumably guarantee AVX2
/// support and that the green, blue and destination spans are at least as long as the red one — confirm.
/// </remarks>
internal static void PackFromRgbPlanesAvx2Reduce(
    ref ReadOnlySpan<byte> redChannel,
    ref ReadOnlySpan<byte> greenChannel,
    ref ReadOnlySpan<byte> blueChannel,
    ref Span<Rgba32> destination)
{
    ref Vector256<byte> rBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(redChannel));
    ref Vector256<byte> gBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(greenChannel));
    ref Vector256<byte> bBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(blueChannel));
    ref Vector256<byte> dBase = ref Unsafe.As<Rgba32, Vector256<byte>>(ref MemoryMarshal.GetReference(destination));

    int count = redChannel.Length / Vector256<byte>.Count;

    // Only the even/odd permute is needed here: the four-byte groups produced by the unpacks
    // are stored as-is, so the alpha-compaction masks used by the Rgb24 overload are not loaded.
    ref byte control1Bytes = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskEvenOdd8x32);
    Vector256<uint> control1 = Unsafe.As<byte, Vector256<uint>>(ref control1Bytes);

    // Constant fourth component interleaved next to blue.
    Vector256<byte> a = Vector256.Create((byte)255);

    for (int i = 0; i < count; i++)
    {
        Vector256<byte> r0 = Unsafe.Add(ref rBase, i);
        Vector256<byte> g0 = Unsafe.Add(ref gBase, i);
        Vector256<byte> b0 = Unsafe.Add(ref bBase, i);

        // Reorder the 32-bit elements of each plane ahead of the unpacks.
        r0 = Avx2.PermuteVar8x32(r0.AsUInt32(), control1).AsByte();
        g0 = Avx2.PermuteVar8x32(g0.AsUInt32(), control1).AsByte();
        b0 = Avx2.PermuteVar8x32(b0.AsUInt32(), control1).AsByte();

        // Byte-wise interleave into (r,g) and (b,a) pairs, then 16-bit interleave
        // of those pairs into four-byte (r,g,b,a) groups.
        Vector256<byte> rg = Avx2.UnpackLow(r0, g0);
        Vector256<byte> b1 = Avx2.UnpackLow(b0, a);

        Vector256<byte> rgb1 = Avx2.UnpackLow(rg.AsUInt16(), b1.AsUInt16()).AsByte();
        Vector256<byte> rgb2 = Avx2.UnpackHigh(rg.AsUInt16(), b1.AsUInt16()).AsByte();

        rg = Avx2.UnpackHigh(r0, g0);
        b1 = Avx2.UnpackHigh(b0, a);

        Vector256<byte> rgb3 = Avx2.UnpackLow(rg.AsUInt16(), b1.AsUInt16()).AsByte();
        Vector256<byte> rgb4 = Avx2.UnpackHigh(rg.AsUInt16(), b1.AsUInt16()).AsByte();

        // Four consecutive, non-overlapping 32-byte stores per iteration.
        ref Vector256<byte> d0 = ref Unsafe.Add(ref dBase, i * 4);
        d0 = rgb1;
        Unsafe.Add(ref d0, 1) = rgb2;
        Unsafe.Add(ref d0, 2) = rgb3;
        Unsafe.Add(ref d0, 3) = rgb4;
    }

    // Hand the unprocessed tail back to the caller through the ref spans.
    int slice = count * Vector256<byte>.Count;
    redChannel = redChannel.Slice(slice);
    greenChannel = greenChannel.Slice(slice);
    blueChannel = blueChannel.Slice(slice);
    destination = destination.Slice(slice);
}
792939 }
793940 }
794941}
0 commit comments