Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ChaCha and Salsa intrinsics. #128

Open
wants to merge 65 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
6ecc467
Initial integration of intrinsics
macaba Apr 6, 2020
24901e3
Update Program.cs
macaba Apr 6, 2020
6424c40
Removed 5.0 target to allow CI to build
macaba Apr 6, 2020
b675b83
Resolve conflicts
TimothyMakkison Oct 7, 2022
f1f3460
Remove treat warning as errors
TimothyMakkison Oct 8, 2022
b3d9aa6
Alter Dencrypt test
TimothyMakkison Oct 8, 2022
a6b8a4a
Add Salsa20 and XSalsa20 benchmarks
TimothyMakkison Oct 10, 2022
565ad2c
Added back .net 6 support
TimothyMakkison Oct 11, 2022
2184dba
Parameterise Salsa20TestVectors
TimothyMakkison Oct 11, 2022
e18036f
Added Intrinsics check
TimothyMakkison Oct 11, 2022
fdc1e1c
Added intrinsics support for Salsa20 (broken for inputs >= 64 bytes)
TimothyMakkison Oct 11, 2022
0b81a0d
Fix Salsa20 intrinsics for bytes >= 64
TimothyMakkison Oct 11, 2022
c134b8b
Merge benchmark updates
TimothyMakkison Oct 11, 2022
e149e3c
Fix benchmark
TimothyMakkison Oct 11, 2022
f6e1680
Use transpose rotate transpose trick - 10-20% faster
TimothyMakkison Oct 11, 2022
cbf3778
Add Transpose method
TimothyMakkison Oct 11, 2022
3ff01b1
Revert "Add Transpose method"
TimothyMakkison Oct 11, 2022
d8b341a
Add transpose method
TimothyMakkison Oct 11, 2022
c286d7b
Fix XSalsa benchmark
TimothyMakkison Oct 11, 2022
e770b25
Code cleanup
TimothyMakkison Oct 11, 2022
836ae77
Correct nonce size
TimothyMakkison Oct 11, 2022
e4d35c6
Inline pre processor variable
TimothyMakkison Oct 11, 2022
288ae8c
Add variable length test for ChaCha and Salsa
TimothyMakkison Oct 11, 2022
e090aa9
Refactor ChaCha20BaseIntrinsics
TimothyMakkison Oct 12, 2022
b2478ac
Refactor Salsa20BaseIntrinsics
TimothyMakkison Oct 12, 2022
27b544d
Added HChaCha and HSalsa
TimothyMakkison Oct 12, 2022
79ca4ce
Added little endian check
TimothyMakkison Oct 13, 2022
589912e
Added intrinsics process stream
TimothyMakkison Oct 14, 2022
5deec51
Added Salsa ProcessKeyStreamBlock test
TimothyMakkison Oct 14, 2022
37b41ab
Minor HSalsa & KeyStream api changes
TimothyMakkison Oct 14, 2022
e26c7fb
Remove pre processor per method system checks. Instead using either a…
TimothyMakkison Oct 15, 2022
58d6c25
Use ChaChaCore or ChaChaIntrinsics instead of pre processor checks
TimothyMakkison Oct 16, 2022
24b3357
Delete Snuffle pre processor checks
TimothyMakkison Oct 16, 2022
8c88ea5
Delete Snuffle pre processor checks
TimothyMakkison Oct 16, 2022
ae42b81
Resolve
TimothyMakkison Oct 16, 2022
9270136
Fix process stackoverflow bug
TimothyMakkison Oct 16, 2022
b46ca34
Fix incorrect system checks
TimothyMakkison Oct 17, 2022
b340bc4
Fix incorrect system checks
TimothyMakkison Oct 17, 2022
a583d02
Resolve
TimothyMakkison Oct 17, 2022
f3605ba
Rewrite HSalsa to use only Sse2
TimothyMakkison Oct 17, 2022
8db68e1
Minor refactoring, add comments
TimothyMakkison Oct 17, 2022
202ff15
Refactor BaseIntrinsics to use pointers
TimothyMakkison Oct 17, 2022
4d68791
Resolve conflicts
TimothyMakkison Oct 17, 2022
59797d5
Move Core files into folders
TimothyMakkison Oct 17, 2022
cd7cd48
Fix process methods
TimothyMakkison Oct 17, 2022
6b3298d
Added powershell test file, runs tests with various simd modes enable…
TimothyMakkison Oct 18, 2022
d1ede2a
Update core namespaces
TimothyMakkison Oct 18, 2022
2ba36af
Added Intrinsic/Scalar tests
TimothyMakkison Oct 18, 2022
d7d04d4
Updated Intrinsics test powershell script
TimothyMakkison Oct 18, 2022
e559677
Code cleanup and formatting changes
TimothyMakkison Oct 18, 2022
1613592
Remove unused code
TimothyMakkison Nov 8, 2022
6ca8d65
Rename internal protected to protected internal
TimothyMakkison Nov 8, 2022
6c2b54b
Remove unnecessary usings
TimothyMakkison Nov 8, 2022
f54e5c0
Simplified slices
TimothyMakkison Nov 8, 2022
6319ec1
Removed unnecessary using statements
TimothyMakkison Nov 8, 2022
8d0d596
Add explicit access modifiers
TimothyMakkison Nov 8, 2022
3a5051b
Edit error message, move namespace and change visibility to internal
TimothyMakkison Nov 8, 2022
c8ff6fc
Remove internal core code, replaced with pre processor functions
TimothyMakkison Nov 9, 2022
2c55739
Delete core scalar and intrinsics tests
TimothyMakkison Nov 9, 2022
23722b4
Code cleanup
TimothyMakkison Nov 9, 2022
3970397
Deleted powershell script and updated cake script
TimothyMakkison Nov 9, 2022
4cebc30
Correct visibility, remove unneeded InternalsVisibleTo, minor changes
TimothyMakkison Nov 9, 2022
dd6178c
Add Salsa64 SSE41 shuffle optimisation, updated guards/checks and add…
TimothyMakkison Nov 9, 2022
1efadaa
Add Salsa64 SSE41 shuffle optimisation, updated guards/checks, change…
TimothyMakkison Nov 9, 2022
a3a7bb6
Merge branch 'merge_intrinsics' of https://github.com/TimothyMakkison…
TimothyMakkison Nov 9, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactor BaseIntrinsics to use pointers
  • Loading branch information
TimothyMakkison committed Oct 17, 2022
commit 202ff15f1fc0fa74e842cbe384581e490f1f42c1
9 changes: 3 additions & 6 deletions src/NaCl.Core/Base/ChaCha20Base.cs
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,9 @@ public override unsafe void ProcessStream(ReadOnlySpan<byte> nonce, Span<byte> o
{
Span<uint> state = stackalloc uint[BLOCK_SIZE_IN_INTS];
SetInitialState(state, nonce, initialCounter);

fixed(uint* x = state)
fixed (byte* m = input, c = output.Slice(offset))
{
ChaCha20BaseIntrinsics.ChaCha20(x, m, c, (ulong)input.Length);
}
var c = output.Slice(offset);

ChaCha20BaseIntrinsics.ChaCha20(state, input, c, (ulong)input.Length);
}
#endif

Expand Down
49 changes: 32 additions & 17 deletions src/NaCl.Core/Base/ChaCha20BaseIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,33 @@ namespace NaCl.Core.Base;
public static class ChaCha20BaseIntrinsics
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes)
public static unsafe void ChaCha20(Span<uint> state, ReadOnlySpan<byte> input, Span<byte> output, ulong bytes)
{
if (!Sse3.IsSupported || !BitConverter.IsLittleEndian)
throw new Exception("Error this vectorisation is not supported on this CPU");
TimothyMakkison marked this conversation as resolved.
Show resolved Hide resolved

if (Avx2.IsSupported && bytes >= 512)
fixed (uint* x = state)
fixed (byte* m_p = input, c_p = output)
{
ChaCha512.Process(x, ref m, ref c, ref bytes);
}
if (bytes >= 256)
{
ChaCha256.Process(x, ref m, ref c, ref bytes);
}
while (bytes >= 64)
{
ChaCha64.Process64(x, ref m, ref c, ref bytes);
}
if (bytes > 0)
{
ChaCha64.ProcessVarLength(x, ref m, ref c, ref bytes);
var m = m_p;
var c = c_p;

if (Avx2.IsSupported && bytes >= 512)
{
ChaCha512.Process(x, ref m, ref c, ref bytes);
}
if (bytes >= 256)
{
ChaCha256.Process(x, ref m, ref c, ref bytes);
}
while (bytes >= 64)
{
ChaCha64.Process64(x, ref m, ref c, ref bytes);
}
if (bytes > 0)
{
ChaCha64.ProcessVarLength(x, ref m, ref c, ref bytes);
}
}
}

Expand All @@ -39,7 +46,11 @@ public static unsafe void HChaCha20(ReadOnlySpan<uint> state, Span<byte> subKey)
if (!Sse3.IsSupported || !BitConverter.IsLittleEndian)
throw new Exception("Error this vectorisation is not supported on this CPU");
TimothyMakkison marked this conversation as resolved.
Show resolved Hide resolved

ChaCha64.HChaCha20(state, subKey);
fixed (uint* x = state)
fixed (byte* sk = subKey)
{
ChaCha64.HChaCha20(x, sk);
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand All @@ -48,7 +59,11 @@ public static unsafe void ChaCha20KeyStream(ReadOnlySpan<uint> state, Span<byte>
if (!Sse3.IsSupported || !BitConverter.IsLittleEndian)
throw new Exception("Error this vectorisation is not supported on this CPU");
TimothyMakkison marked this conversation as resolved.
Show resolved Hide resolved

ChaCha64.KeyStream64(state, output);
fixed (byte* c = output)
fixed (uint* x = state)
{
ChaCha64.KeyStream64(x, c);
}
}
}
#endif
68 changes: 30 additions & 38 deletions src/NaCl.Core/Base/ChaChaIntrinsics/ChaCha64.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,51 +94,43 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void HChaCha20(ReadOnlySpan<uint> state, Span<byte> subKey)
public static unsafe void HChaCha20(uint* x, byte* sk)
{
fixed(uint* x = state)
fixed(byte* sk = subKey)
{
Vector128<uint> x_0 = Sse2.LoadVector128(x);
Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);
Vector128<uint> x_0 = Sse2.LoadVector128(x);
Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);

ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3);
ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3);

Sse2.Store(sk, Vector128.AsByte(x_0));
Sse2.Store(sk + 16, Vector128.AsByte(x_3));
}
Sse2.Store(sk, Vector128.AsByte(x_0));
Sse2.Store(sk + 16, Vector128.AsByte(x_3));
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void KeyStream64(ReadOnlySpan<uint> state, Span<byte> output)
public static unsafe void KeyStream64(uint* x, byte* c)
{
fixed (byte* k = output)
fixed (uint* x = state)
{
Vector128<uint> x_0 = Sse2.LoadVector128(x);
Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);

Vector128<uint> orig_0 = x_0;
Vector128<uint> orig_1 = x_1;
Vector128<uint> orig_2 = x_2;
Vector128<uint> orig_3 = x_3;

ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3);

x_0 = Sse2.Add(x_0, orig_0);
x_1 = Sse2.Add(x_1, orig_1);
x_2 = Sse2.Add(x_2, orig_2);
x_3 = Sse2.Add(x_3, orig_3);

Sse2.Store(k, x_0.AsByte());
Sse2.Store(k + 16, x_1.AsByte());
Sse2.Store(k + 32, x_2.AsByte());
Sse2.Store(k + 48, x_3.AsByte());
}
Vector128<uint> x_0 = Sse2.LoadVector128(x);
Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);

Vector128<uint> orig_0 = x_0;
Vector128<uint> orig_1 = x_1;
Vector128<uint> orig_2 = x_2;
Vector128<uint> orig_3 = x_3;

ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3);

x_0 = Sse2.Add(x_0, orig_0);
x_1 = Sse2.Add(x_1, orig_1);
x_2 = Sse2.Add(x_2, orig_2);
x_3 = Sse2.Add(x_3, orig_3);

Sse2.Store(c, x_0.AsByte());
Sse2.Store(c + 16, x_1.AsByte());
Sse2.Store(c + 32, x_2.AsByte());
Sse2.Store(c + 48, x_3.AsByte());
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand Down
8 changes: 3 additions & 5 deletions src/NaCl.Core/Base/Salsa20Base.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,9 @@ public override unsafe void ProcessStream(ReadOnlySpan<byte> nonce, Span<byte> o
{
Span<uint> state = stackalloc uint[BLOCK_SIZE_IN_INTS];
SetInitialState(state, nonce, initialCounter);
fixed (uint* x = state)
fixed (byte* m = input, c = output.Slice(offset))
{
Salsa20BaseIntrinsics.Salsa20(x, m, c, (ulong)input.Length);
}
var c = output.Slice(offset);

Salsa20BaseIntrinsics.Salsa20(state, input, c, (ulong)input.Length);
}
#endif

Expand Down
49 changes: 32 additions & 17 deletions src/NaCl.Core/Base/Salsa20BaseIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,33 @@ namespace NaCl.Core.Base;
public static class Salsa20BaseIntrinsics
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void Salsa20(uint* x, byte* m, byte* c, ulong bytes)
public static unsafe void Salsa20(Span<uint> state, ReadOnlySpan<byte> input, Span<byte> output, ulong bytes)
{
if (!Sse3.IsSupported || !BitConverter.IsLittleEndian)
throw new Exception("Error this vectorisation is not supported on this CPU");
TimothyMakkison marked this conversation as resolved.
Show resolved Hide resolved

if (Avx2.IsSupported && bytes >= 512)
fixed (uint* x = state)
fixed (byte* m_p = input, c_p = output)
{
Salsa512.Process(x, ref m, ref c, ref bytes);
}
if (bytes >= 256)
{
Salsa256.Process(x, ref m, ref c, ref bytes);
}
while (bytes >= 64)
{
Salsa64.Process64(x, ref m, ref c, ref bytes);
}
if (bytes > 0)
{
Salsa64.ProcessVarLength(x, ref m, ref c, ref bytes);
var m = m_p;
var c = c_p;

if (Avx2.IsSupported && bytes >= 512)
{
Salsa512.Process(x, ref m, ref c, ref bytes);
}
if (bytes >= 256)
{
Salsa256.Process(x, ref m, ref c, ref bytes);
}
while (bytes >= 64)
{
Salsa64.Process64(x, ref m, ref c, ref bytes);
}
if (bytes > 0)
{
Salsa64.ProcessVarLength(x, ref m, ref c, ref bytes);
}
}
}

Expand All @@ -39,7 +46,11 @@ public static unsafe void HSalsa20(ReadOnlySpan<uint> state, Span<byte> subKey)
if (!Sse3.IsSupported || !BitConverter.IsLittleEndian)
throw new Exception("Error this vectorisation is not supported on this CPU");
TimothyMakkison marked this conversation as resolved.
Show resolved Hide resolved

Salsa64.HSalsa20(state, subKey);
fixed (uint* x = state)
fixed (byte* sk = subKey)
{
Salsa64.HSalsa20(x, sk);
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand All @@ -48,7 +59,11 @@ public static unsafe void Salsa20KeyStream(ReadOnlySpan<uint> state, Span<byte>
if (!Sse3.IsSupported || !BitConverter.IsLittleEndian)
throw new Exception("Error this vectorisation is not supported on this CPU");
TimothyMakkison marked this conversation as resolved.
Show resolved Hide resolved

Salsa64.KeyStream64(state, output);
fixed (byte* c = output)
fixed (uint* x = state)
{
Salsa64.KeyStream64(x, c);
}
}
}
#endif
78 changes: 35 additions & 43 deletions src/NaCl.Core/Base/SalsaIntrinsics/Salsa64.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,59 +94,51 @@ public static unsafe void ProcessVarLength(uint* x, ref byte* m, ref byte* c, re
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void HSalsa20(ReadOnlySpan<uint> state, Span<byte> subKey)
public static unsafe void HSalsa20(uint* x, byte* sk)
{
fixed (uint* x = state)
fixed (byte* sk = subKey)
{
Vector128<uint> x_0 = Sse2.LoadVector128(x);
Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);
Vector128<uint> x_0 = Sse2.LoadVector128(x);
Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);

ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3);
ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3);

// HSalsa returns a 32 byte array of 0,5,10,15,6,7,8,9
// HSalsa returns a 32 byte array of 0,5,10,15,6,7,8,9

// <0, 5, 2, 3> + <8, 9, 10, 15> -> <0, 5, 10, 15>
var t_0 = GetDiagonal(x_0, x_1, x_2, x_3);
// <0, 5, 2, 3> + <8, 9, 10, 15> -> <0, 5, 10, 15>
var t_0 = GetDiagonal(x_0, x_1, x_2, x_3);

// Get <4, 5, 6, 7> & <8, 9, 10, 11> then unpack halves for <6, 7, 8, 9>
var t_1 = UnpackHighLow(x_1, x_2);
// Get <4, 5, 6, 7> & <8, 9, 10, 11> then unpack halves for <6, 7, 8, 9>
var t_1 = UnpackHighLow(x_1, x_2);

Sse2.Store(sk, Vector128.AsByte(t_0));
Sse2.Store(sk + 16, Vector128.AsByte(t_1));
}
Sse2.Store(sk, Vector128.AsByte(t_0));
Sse2.Store(sk + 16, Vector128.AsByte(t_1));
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void KeyStream64(ReadOnlySpan<uint> state, Span<byte> output)
public static unsafe void KeyStream64(uint* x, byte* c)
{
fixed (byte* k = output)
fixed (uint* x = state)
{
Vector128<uint> x_0 = Sse2.LoadVector128(x);
Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);

Vector128<uint> orig_0 = x_0;
Vector128<uint> orig_1 = x_1;
Vector128<uint> orig_2 = x_2;
Vector128<uint> orig_3 = x_3;

ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3);

x_0 = Sse2.Add(x_0, orig_0);
x_1 = Sse2.Add(x_1, orig_1);
x_2 = Sse2.Add(x_2, orig_2);
x_3 = Sse2.Add(x_3, orig_3);

Sse2.Store(k, x_0.AsByte());
Sse2.Store(k + 16, x_1.AsByte());
Sse2.Store(k + 32, x_2.AsByte());
Sse2.Store(k + 48, x_3.AsByte());
}
Vector128<uint> x_0 = Sse2.LoadVector128(x);
Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);

Vector128<uint> orig_0 = x_0;
Vector128<uint> orig_1 = x_1;
Vector128<uint> orig_2 = x_2;
Vector128<uint> orig_3 = x_3;

ShuffleState(ref x_0, ref x_1, ref x_2, ref x_3);

x_0 = Sse2.Add(x_0, orig_0);
x_1 = Sse2.Add(x_1, orig_1);
x_2 = Sse2.Add(x_2, orig_2);
x_3 = Sse2.Add(x_3, orig_3);

Sse2.Store(c, x_0.AsByte());
Sse2.Store(c + 16, x_1.AsByte());
Sse2.Store(c + 32, x_2.AsByte());
Sse2.Store(c + 48, x_3.AsByte());
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand Down