Skip to content

Commit

Permalink
Tweak v2.2 implementation
Browse files Browse the repository at this point in the history
Unoptimized
  • Loading branch information
SChernykh committed Sep 21, 2018
1 parent dfa4f53 commit 3b465f6
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 41 deletions.
39 changes: 27 additions & 12 deletions crypto/asm/cnv2_double_main_loop_sandybridge.inc
Original file line number Diff line number Diff line change
Expand Up @@ -161,18 +161,8 @@ main_loop_double_sandybridge:
mov r9, QWORD PTR [rbx+8]

xor edx, 16
mov eax, edx
movdqu xmm0, XMMWORD PTR [rdx+r13]
xor edx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rdx+r13]
movdqu XMMWORD PTR [rdx+r13], xmm0
paddq xmm1, xmm2
xor edx, 16
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
mov r8d, edx
mov r15d, edx

movq rdx, xmm5
shl rdx, 32
Expand All @@ -181,6 +171,25 @@ main_loop_double_sandybridge:
xor r10, rdx
mov rax, r10
mul r11
movq xmm0, rax
movq xmm1, rdx
punpcklqdq xmm1, xmm0

movdqu xmm0, XMMWORD PTR [r8+r13]
pxor xmm0, xmm1
xor r8d, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [r8+r13]
xor rdx, [r8+r13]
xor rax, [r8+r13+8]
movdqu XMMWORD PTR [r8+r13], xmm0
paddq xmm1, xmm2
xor r8d, 16
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [r15+r13], xmm0

mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
Expand Down Expand Up @@ -278,14 +287,20 @@ sqrt_fix_2_ret_sandybridge:

mov rax, r10
mul r9
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0

mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
add rdi, rdx
add rbp, rax
Expand Down
12 changes: 9 additions & 3 deletions crypto/asm/cnv2_main_loop_ivybridge.inc
Original file line number Diff line number Diff line change
Expand Up @@ -114,24 +114,30 @@ $sqrt_fixup_ivybridge_ret:
mov r9, r10
mov rax, rdi
mul rbp
movq xmm0, rax
movq xmm1, rdx
punpcklqdq xmm1, xmm0

xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
add r8, rdx
add r11, rax
movdqu xmm0, XMMWORD PTR [r10+rbx]
movdqu xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm1
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
movdqu xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
paddq xmm2, xmm4
paddq xmm1, xmm7
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm0
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
add r8, rdx
add r11, rax
mov QWORD PTR [r14], r8
xor r8, rdi
mov r10, r8
Expand Down
10 changes: 8 additions & 2 deletions crypto/asm/cnv2_main_loop_ryzen.inc
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,21 @@ $main_loop_ryzen:
$sqrt_fixup_ryzen_ret:
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1

mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm0, XMMWORD PTR [r10+rbx]
movdqa xmm2, XMMWORD PTR [r9+rbx]
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm4
paddq xmm2, xmm3
paddq xmm1, xmm6
Expand Down
31 changes: 17 additions & 14 deletions crypto/cryptonight_aesni.h
Original file line number Diff line number Diff line change
Expand Up @@ -446,11 +446,10 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
if (SHUFFLE)
{
// Shuffle constants here were chosen carefully
// to maximize permutation cycle length
// and have no 2-byte elements stay in their places
const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]);
const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi));
const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]);
hi ^= ((uint64_t*)&l0[idx1 ^ 0x20])[0];
lo ^= ((uint64_t*)&l0[idx1 ^ 0x20])[1];
const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]);
_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1));
_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0));
Expand Down Expand Up @@ -637,25 +636,27 @@ void cryptonight_double_hash(const void* input1, size_t len1, void* output1, con
cl = ((uint64_t*)&l0[idx01])[0];
ch = ((uint64_t*)&l0[idx01])[1];

if (INT_MATH)
{
const uint64_t sqrt_result0 = _mm_cvtsi128_si64(sqrt_result_xmm);
cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result0 << 32);
}

lo = _umul128(idx00, cl, &hi);

if (SHUFFLE)
{
uint32_t k = idx01 ^ 0x10;
const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[k]); k ^= 0x30;
const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[k]), _mm_set_epi64x(lo, hi)); k ^= 0x30;
const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[k]);
hi ^= ((uint64_t*)&l0[k])[0];
lo ^= ((uint64_t*)&l0[k])[1];
_mm_store_si128((__m128i *)&l0[k], _mm_add_epi64(chunk1, bx00)); k ^= 0x10;
const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[k]);
_mm_store_si128((__m128i *)&l0[k], _mm_add_epi64(chunk2, ax0)); k ^= 0x20;
_mm_store_si128((__m128i *)&l0[k], _mm_add_epi64(chunk3, bx01));
}

if (INT_MATH)
{
const uint64_t sqrt_result0 = _mm_cvtsi128_si64(sqrt_result_xmm);
cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result0 << 32);
}

lo = _umul128(idx00, cl, &hi);

axl0 += hi;
axh0 += lo;
((uint64_t*)&l0[idx01])[0] = axl0;
Expand Down Expand Up @@ -683,8 +684,10 @@ void cryptonight_double_hash(const void* input1, size_t len1, void* output1, con
if (SHUFFLE)
{
uint32_t k = idx11 ^ 0x10;
const __m128i chunk1 = _mm_load_si128((__m128i *)&l1[k]); k ^= 0x30;
const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l1[k]), _mm_set_epi64x(lo, hi)); k ^= 0x30;
const __m128i chunk2 = _mm_load_si128((__m128i *)&l1[k]);
hi ^= ((uint64_t*)&l1[k])[0];
lo ^= ((uint64_t*)&l1[k])[1];
_mm_store_si128((__m128i *)&l1[k], _mm_add_epi64(chunk1, bx10)); k ^= 0x10;
const __m128i chunk3 = _mm_load_si128((__m128i *)&l1[k]);
_mm_store_si128((__m128i *)&l1[k], _mm_add_epi64(chunk2, ax1)); k ^= 0x20;
Expand Down
16 changes: 16 additions & 0 deletions minethd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,16 @@ cryptonight_ctx* minethd_alloc_ctx()
return nullptr; //Should never happen
}

// Debug helper: writes one line to stdout of the form
//   HASH("<input>") = <64 lowercase hex digits>
// where the hex digits are the first 32 bytes read from `hash`.
// `input` is printed as a NUL-terminated C string; `hash` must point to at
// least 32 readable bytes.
static void print_hash(const char* input, const char* hash)
{
	printf("HASH(\"%s\") = ", input);

	// Walk the 32-byte digest by pointer; each byte is converted to unsigned
	// before formatting so values >= 0x80 are not sign-extended by printf.
	const char* const digest_end = hash + 32;
	for (const char* cursor = hash; cursor != digest_end; ++cursor)
	{
		const uint8_t byte_value = static_cast<uint8_t>(*cursor);
		printf("%02x", byte_value);
	}

	printf("\n");
}

bool minethd::self_test()
{
alloc_msg msg = { 0 };
Expand Down Expand Up @@ -327,6 +337,7 @@ bool minethd::self_test()

if (memcmp(hash, reference_hash[i], HASH_SIZE) != 0)
{
print_hash(input.c_str(), hash);
printer::inst()->print_msg(L0, "Cryptonight hash self-test (variant %d) failed.", i);
return false;
}
Expand All @@ -335,6 +346,8 @@ bool minethd::self_test()
{
if (memcmp(hash_dbl, reference_hash_dbl[i], HASH_SIZE * 2) != 0)
{
print_hash(prev_input.c_str(), hash_dbl);
print_hash(input.c_str(), hash_dbl + HASH_SIZE);
printer::inst()->print_msg(L0, "Cryptonight double hash self-test (variant %d) failed.", i);
return false;
}
Expand All @@ -356,13 +369,16 @@ bool minethd::self_test()

if (memcmp(hash, reference_hash[i], HASH_SIZE) != 0)
{
print_hash(input.c_str(), hash);
printer::inst()->print_msg(L0, "Cryptonight hash self-test (variant 2, asm version %d) failed.", j);
return false;
}
if (!prev_input.empty())
{
if (memcmp(hash_dbl, reference_hash_dbl[i], HASH_SIZE * 2) != 0)
{
print_hash(prev_input.c_str(), hash_dbl);
print_hash(input.c_str(), hash_dbl + HASH_SIZE);
printer::inst()->print_msg(L0, "Cryptonight double hash self-test (variant 2, asm version %d) failed.", j);
return false;
}
Expand Down
20 changes: 10 additions & 10 deletions tests.txt
Original file line number Diff line number Diff line change
@@ -1,40 +1,40 @@
This is a test This is a test This is a test
74d15836e33d14e164c2494648996eb5ed71a3ec2c72c2be225eda1b8a857aba
0157c5ee188bbec8975285a3064ee92065217672fd69a1aebd0766c7b56ee0bd
4cf1ff9ca46eb433b36cd9f70e02b14cc06bfd18ca77fa9ccaafd1fd96c674b0
353fdc068fd47b03c04b9431e005e00b68c2168a3cc7335c8b9b308156591a4f
Lorem ipsum dolor sit amet, consectetur adipiscing
22ec483997cba20105378af3ec647ee5d20401d7df21c0bf4bf866bc55383e92
755d58e48e53f795a0ed6b27c794018372922e5d1a256cdbf9fc442f59f284c9
7d292e43f4751714ec07dbcb0e4bbffe2a7afb6066420960684ff57d7474c871
72f134fc50880c330fe65a2cb7896d59b2e708a0221c6a9da3f69b3a702d8682
elit, sed do eiusmod tempor incididunt ut labore
c5efc04bf88b450e86537dc046339b16d35133c4d905ec7fa16bd28a67c4f2fe
7158c9c0d5082df7f2ee236b994f385bd96fd09eda30e21643cb7351fd7301ce
335563425256edebf1d92dc342369c2f4770ebb4112ba975659bd8a0f210abd0
410919660ec540fc49d8695ff01f974226a2a28dbbac82949c12f541b9a62d2f
et dolore magna aliqua. Ut enim ad minim veniam,
628c400e4712cecb44d88572e9e8bb9be9a1221da1cb52ff8eefaf4adcc172eb
7329cde3fbf98bec02578fcdcfeaf2cf11e2a1f105324f89c36470708bd6db16
47758e86d2f57210366cec36fff26f9464d89efd116fe6ef28b718b5da120801
4472fecfeb371e8b7942ce0378c0ba5e6d0c6361b669c587807365c787ae652d
quis nostrud exercitation ullamco laboris nisi
a0351d7aa54c2e7c774695af86f8bbb859a0ef9b0d4f0031dd1df5ea7ccc752d
05066660ea3bc0568269cd95c212ad2bf2f2ced4e4cdb1f2bc5f766e88e4862b
48787b48d5c68f0c1dd825c32580af741cc0ee314f08133135c1e86d87a24a95
577568395203f1f1225f2982b637f7d5e61b47a0f546ba16d46020b471b74076
ut aliquip ex ea commodo consequat. Duis aute
677b3a14c1875eda0ca0c3d6c340413848b1ab0bf9d448dddd5714cbc6d170b9
edc9f99dfd626ddc5604f8b387c7a88cc6fcb17cef46a3b917c2f8ffbd449982
93bdf47495854f7cfaaca1af8c0f39ef4a3024c10eb0dea23726b0e06ef29e84
f6fd7efe95a5c6c4bb46d9b429e3faf65b1ce439e116742d42b928e61de52385
irure dolor in reprehenderit in voluptate velit
8a73c33ebfd11d78db984486a298149d034051c61cdaf6ff7e783e46a6763edf
44df1cbd33439b82f901bcad232f3908331330edad0c9b9af35d62f524fd92b4
a375a71d0541057ccc96719150dfe10b6e6f486b19cf4a0835e19605413a8417
422f8cfe8060cf6c3d9fd66f68e3c9977adb683aea2788029308bbe9bc50d728
esse cillum dolore eu fugiat nulla pariatur.
021007fa46b46110e7dd6c7f1bb392499d7461950efd884e6bb4260d57906b6f
0fa9723e149c0772d16ae95b744186f419b48adcbfe685c99b53f6db44ba2668
163478a76f8f1432533fbdd1284d65c89f37479e54f20841c6ce4eba56c73854
512e62c8c8c833cfbd9d361442cb00d63c0a3fd8964cfd2fedc17c7c25ec2d4b
Excepteur sint occaecat cupidatat non proident,
d61f8a0722e9d38c691fe22613ef68c83a498dd24e3c382ee1abfa665d632371
90c71412c2ca0c2e5789a98fb7ce36179d3c7f8b164f9aa07df56d44c9e9e96d
356b0470c6eea75cad7a108179e232905b23bdaf03c2824c6e619d503ee93677
12a794c1aa13d561c9c6111cee631ca9d0a321718d67d3416add9de1693ba41e
sunt in culpa qui officia deserunt mollit anim id est laborum.
75a105029f6b8c00429c427ffc7a64d84dbcdf2728ce0d2df9133cef91c9f8d3
5944b5b0480e84dc233bcc37101c23077542433c868c67325e9c501cfd1b8151
a47e2b007dc25bb279e197a1b91f67ecebe2ddd8791cd32dd2cb76dd21ed943f
2659ff95fc74b6215c1dc741e85b7a9710101b30620212f80eb59c3c55993f9d
1 change: 1 addition & 0 deletions xmr-stak-cpu.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@
<MASM Include="crypto\asm\cnv2_main_loop.asm" />
</ItemGroup>
<ItemGroup>
<None Include="crypto\asm\cnv2_double_main_loop_sandybridge.inc" />
<None Include="crypto\asm\cnv2_main_loop_ivybridge.inc" />
<None Include="crypto\asm\cnv2_main_loop_ryzen.inc" />
</ItemGroup>
Expand Down
3 changes: 3 additions & 0 deletions xmr-stak-cpu.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -266,5 +266,8 @@
<None Include="crypto\asm\cnv2_main_loop_ryzen.inc">
<Filter>Source Files\asm</Filter>
</None>
<None Include="crypto\asm\cnv2_double_main_loop_sandybridge.inc">
<Filter>Source Files\asm</Filter>
</None>
</ItemGroup>
</Project>

0 comments on commit 3b465f6

Please sign in to comment.