@@ -18,6 +18,10 @@ public static class HwIntrinsics
1818
// 32 mask bytes which, read as eight little-endian 32-bit integers, give the index sequence 0, 2, 4, 6, 1, 3, 5, 7 (even indices first, then odd).
// NOTE(review): presumably loaded as a 256-bit permute control for 8 x 32-bit lanes; the consumer is not visible in this hunk, confirm at the call site.
1919 public static ReadOnlySpan < byte > PermuteMaskEvenOdd8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
2020
// 16-byte control for Ssse3.Shuffle (PSHUFB) that spreads source bytes 0..11 into four groups of 3 bytes, each followed by one extra byte.
// Every fourth control byte is 0x80: the set high bit makes the shuffle write zero to that output byte instead of copying a source byte.
// Loaded as a Vector128<byte> ("vmask") by Shuffle3 and Pad3Shuffle4.
21+ private static ReadOnlySpan < byte > ShuffleMaskPad4Nx16 => new byte [ ] { 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 } ;
22+
// 16-byte control for Ssse3.Shuffle (PSHUFB) that packs source bytes 0-2, 4-6, 8-10 and 12-14 into output bytes 0..11, dropping every fourth source byte.
// The last four control bytes are 0x80 (high bit set), so output bytes 12..15 are written as zero.
// Loaded as a Vector128<byte> ("vmasko") by Shuffle3 and Shuffle4Slice3; both also rotate it with Ssse3.AlignRight(vmasko, vmasko, 12) to get "vmaske", which has the four zeroing bytes first.
23+ private static ReadOnlySpan < byte > ShuffleMaskSlice4Nx16 => new byte [ ] { 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 0x80 , 0x80 , 0x80 , 0x80 } ;
24+
2125 /// <summary>
2226 /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
2327 /// using the control and store the results in <paramref name="dest"/>.
@@ -352,10 +356,12 @@ private static void Shuffle3(
352356 {
353357 if ( Ssse3 . IsSupported )
354358 {
355- Vector128 < byte > vmask = Vector128 . Create ( 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 ) . AsByte ( ) ;
359+ ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
360+ Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
356361 Vector128 < byte > vfill = Vector128 . Create ( 0xff000000ff000000ul ) . AsByte ( ) ;
357- Vector128 < byte > vmasko = Vector128 . Create ( 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ) . AsByte ( ) ;
358- Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) . AsByte ( ) ;
362+ ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
363+ Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
364+ Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
359365
360366 Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
361367 Shuffle . MmShuffleSpan ( ref bytes , control ) ;
@@ -381,23 +387,19 @@ private static void Shuffle3(
381387 v2 = Ssse3 . AlignRight ( v2 , v1 , 8 ) ;
382388 v1 = Ssse3 . AlignRight ( v1 , v0 , 12 ) ;
383389
384- v0 = Ssse3 . Shuffle ( Sse2 . Or ( Ssse3 . Shuffle ( v0 , vmask ) , vfill ) , vshuffle ) ;
385- v1 = Ssse3 . Shuffle ( Sse2 . Or ( Ssse3 . Shuffle ( v1 , vmask ) , vfill ) , vshuffle ) ;
386- v2 = Ssse3 . Shuffle ( Sse2 . Or ( Ssse3 . Shuffle ( v2 , vmask ) , vfill ) , vshuffle ) ;
387- v3 = Ssse3 . Shuffle ( Sse2 . Or ( Ssse3 . Shuffle ( v3 , vmask ) , vfill ) , vshuffle ) ;
390+ v0 = Ssse3 . Shuffle ( Ssse3 . Shuffle ( v0 , vmask ) , vshuffle ) ;
391+ v1 = Ssse3 . Shuffle ( Ssse3 . Shuffle ( v1 , vmask ) , vshuffle ) ;
392+ v2 = Ssse3 . Shuffle ( Ssse3 . Shuffle ( v2 , vmask ) , vshuffle ) ;
393+ v3 = Ssse3 . Shuffle ( Ssse3 . Shuffle ( v3 , vmask ) , vshuffle ) ;
388394
389395 v0 = Ssse3 . Shuffle ( v0 , vmaske ) ;
390396 v1 = Ssse3 . Shuffle ( v1 , vmasko ) ;
391397 v2 = Ssse3 . Shuffle ( v2 , vmaske ) ;
392398 v3 = Ssse3 . Shuffle ( v3 , vmasko ) ;
393399
394400 v0 = Ssse3 . AlignRight ( v1 , v0 , 4 ) ;
395- v3 = Ssse3 . AlignRight ( v3 , v2 , 12 ) ;
396-
397- v1 = Sse2 . ShiftLeftLogical128BitLane ( v1 , 4 ) ;
398- v2 = Sse2 . ShiftRightLogical128BitLane ( v2 , 4 ) ;
399-
400- v1 = Ssse3 . AlignRight ( v2 , v1 , 8 ) ;
401+ v1 = Sse2 . Or ( Sse2 . ShiftRightLogical128BitLane ( v1 , 4 ) , Sse2 . ShiftLeftLogical128BitLane ( v2 , 4 ) ) ;
402+ v2 = Ssse3 . AlignRight ( v3 , v2 , 12 ) ;
401403
402404 ref Vector128 < byte > vd = ref Unsafe . Add ( ref destBase , i ) ;
403405
@@ -416,7 +418,8 @@ private static void Pad3Shuffle4(
416418 {
417419 if ( Ssse3 . IsSupported )
418420 {
419- Vector128 < byte > vmask = Vector128 . Create ( 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 ) . AsByte ( ) ;
421+ ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
422+ Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
420423 Vector128 < byte > vfill = Vector128 . Create ( 0xff000000ff000000ul ) . AsByte ( ) ;
421424
422425 Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -459,8 +462,9 @@ private static void Shuffle4Slice3(
459462 {
460463 if ( Ssse3 . IsSupported )
461464 {
462- Vector128 < byte > vmasko = Vector128 . Create ( 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ) . AsByte ( ) ;
463- Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) . AsByte ( ) ;
465+ ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
466+ Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
467+ Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
464468
465469 Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
466470 Shuffle . MmShuffleSpan ( ref bytes , control ) ;
0 commit comments