Skip to content

Commit bb6920d

Browse files
committed
Updated RandomX to v1.0.4
1 parent 65def48 commit bb6920d

File tree

4 files changed

+74
-42
lines changed

4 files changed

+74
-42
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# RandomX CUDA implementation
22

3-
This repository contains full RandomX implementation for NVIDIA GPUs. The latest version of RandomX (1.0.3 as of June 1st, 2019) is supported.
3+
This repository contains full RandomX implementation for NVIDIA GPUs. The latest version of RandomX (1.0.4 as of June 23rd, 2019) is supported.
44

55
Note: it's only a benchmark/testing tool, not an actual miner. RandomX hashrate is expected to improve somewhat in the future thanks to further optimizations.
66

77
GPUs tested so far:
88

99
Model|CryptonightR H/S|RandomX H/S|Relative speed
1010
-----|---------------|-----------|--------------
11-
GTX 1660 Ti max overclock (2070/13760 MHz)|626 (98 W)|663 (103 W)|105.9%
12-
GTX 1660 Ti low power (1785/13760 MHz)|604 (70 W)|560 (70 W)|92.7%
11+
GTX 1660 Ti max overclock (2070/13760 MHz)|626 (98 W)|671 (103 W)|107.2%
12+
GTX 1660 Ti low power (1785/13760 MHz)|604 (70 W)|567 (70 W)|93.9%
1313
GTX 1070 (1850/7600 MHz) [[1]](https://termbin.com/g2z7)|612 (89 W)|609 (108 W)|99.5%
1414
GTX 1070 Ti (1900/7600 MHz) [[2]](https://termbin.com/iyo1)|625 (97 W)|769 (123 W)|123.0%
1515
GTX 1080 Ti (1930/10010 MHz)[[3]](https://termbin.com/rva66)|787 (145 W)|1136 (190 W)|144.3%

RandomX_CUDA/aes_cuda.hpp

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -645,30 +645,49 @@ __global__ void fillAes4Rx4(void* state, void* out, uint32_t batch_size)
645645
const uint32_t* const t2 = (sub & 1) ? (T + 512) : (T + 1536);
646646
const uint32_t* const t3 = (sub & 1) ? (T + 768) : (T + 1280);
647647

648+
const bool b = (sub < 2);
649+
uint32_t k[16];
650+
k[ 0] = b ? 0x6421aaddu : 0xb5826f73u;
651+
k[ 1] = b ? 0xd1833ddbu : 0xe3d6a7a6u;
652+
k[ 2] = b ? 0x2f546d2bu : 0x3d518b6du;
653+
k[ 3] = b ? 0x99e5d23fu : 0x229effb4u;
654+
k[ 4] = b ? 0xb20e3450u : 0xc7566bf3u;
655+
k[ 5] = b ? 0xb6913f55u : 0x9c10b3d9u;
656+
k[ 6] = b ? 0x06f79d53u : 0xe9024d4eu;
657+
k[ 7] = b ? 0xa5dfcde5u : 0xb272b7d2u;
658+
k[ 8] = b ? 0x5c3ed904u : 0xf273c9e7u;
659+
k[ 9] = b ? 0x515e7bafu : 0xf765a38bu;
660+
k[10] = b ? 0x0aa4679fu : 0x2ba9660au;
661+
k[11] = b ? 0x171c02bfu : 0xf63befa7u;
662+
k[12] = b ? 0x85623763u : 0x7a7cd609u;
663+
k[13] = b ? 0xe78f5d08u : 0x915839deu;
664+
k[14] = b ? 0xcd673785u : 0x0c06d1fdu;
665+
k[15] = b ? 0xd8ded291u : 0xc0b0762du;
666+
648667
#pragma unroll(((outputSize % 512) == 0) ? 8 : 2)
649668
for (uint32_t i = 0; i < outputSize / sizeof(uint4); i += 4, p += strided ? stride_size : 4)
650669
{
651670
uint32_t y[4];
652671

653-
y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ 0xf890465du;
654-
y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ 0x7ffbe4a6u;
655-
y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ 0x141f82b7u;
656-
y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ 0xcf359e95u;
657-
658-
x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ 0x6a55c450u;
659-
x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ 0xfee8278au;
660-
x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ 0xbd5c5ac3u;
661-
x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ 0x6741ffdcu;
662-
663-
y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ 0x114c47a4u;
664-
y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ 0xd524fde4u;
665-
y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ 0xa7279ad2u;
666-
y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ 0x3d324aacu;
667-
668-
x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ 0x810c3a2au;
669-
x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ 0x99a9aeffu;
670-
x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ 0x42d3dbd9u;
671-
x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ 0x76f6db08u;
672+
y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 0];
673+
y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 1];
674+
y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[ 2];
675+
y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[ 3];
676+
677+
x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[ 4];
678+
x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[ 5];
679+
x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[ 6];
680+
x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[ 7];
681+
682+
y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 8];
683+
y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 9];
684+
y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[10];
685+
y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[11];
686+
687+
x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[12];
688+
x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[13];
689+
x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[14];
690+
x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[15];
672691

673692
*p = *(uint4*)(x);
674693
}

RandomX_CUDA/randomx_cuda.hpp

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,13 @@ __device__ void print_inst(uint2 inst)
317317
}
318318
opcode -= RANDOMX_FREQ_IROR_R;
319319

320+
if (opcode < RANDOMX_FREQ_IROL_R)
321+
{
322+
printf("%s%sIROL_R r%u, r%u ", branch_target, fp_inst, dst, src);
323+
break;
324+
}
325+
opcode -= RANDOMX_FREQ_IROL_R;
326+
320327
if (opcode < RANDOMX_FREQ_ISWAP_R)
321328
{
322329
printf("%s%sISWAP_R r%u, r%u ", branch_target, fp_inst, dst, src);
@@ -475,13 +482,13 @@ __global__ void __launch_bounds__(32, 16) init_vm(void* entropy_data, void* vm_s
475482
}
476483
opcode -= RANDOMX_FREQ_IMUL_RCP;
477484

478-
if (opcode < RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R)
485+
if (opcode < RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
479486
{
480487
set_byte(registerLastChanged, dst, i);
481488
set_byte(registerWasChanged, dst, 1);
482489
continue;
483490
}
484-
opcode -= RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R;
491+
opcode -= RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
485492

486493
if (opcode < RANDOMX_FREQ_ISWAP_R)
487494
{
@@ -703,12 +710,12 @@ __global__ void __launch_bounds__(32, 16) init_vm(void* entropy_data, void* vm_s
703710
}
704711
opcode -= RANDOMX_FREQ_IXOR_M;
705712

706-
if (opcode < RANDOMX_FREQ_IROR_R)
713+
if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
707714
{
708715
latency = reg_read_latency;
709716
break;
710717
}
711-
opcode -= RANDOMX_FREQ_IROR_R;
718+
opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
712719

713720
if (opcode < RANDOMX_FREQ_ISWAP_R)
714721
{
@@ -1376,20 +1383,24 @@ __global__ void __launch_bounds__(32, 16) init_vm(void* entropy_data, void* vm_s
13761383
}
13771384
opcode -= RANDOMX_FREQ_IXOR_M;
13781385

1379-
if (opcode < RANDOMX_FREQ_IROR_R)
1386+
if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
13801387
{
13811388
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (7 << OPCODE_OFFSET);
13821389
if (src == dst)
13831390
{
13841391
inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET);
13851392
if (imm_index < IMM_INDEX_COUNT)
1386-
imm_buf[imm_index++] = inst.y;
1393+
imm_buf[imm_index++] = (opcode < RANDOMX_FREQ_IROR_R) ? inst.y : -inst.y;
1394+
}
1395+
else if (opcode >= RANDOMX_FREQ_IROR_R)
1396+
{
1397+
inst.x |= (1 << NEGATIVE_SRC_OFFSET);
13871398
}
13881399

13891400
*(compiled_program++) = inst.x | num_workers;
13901401
continue;
13911402
}
1392-
opcode -= RANDOMX_FREQ_IROR_R;
1403+
opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
13931404

13941405
if (opcode < RANDOMX_FREQ_ISWAP_R)
13951406
{
@@ -1848,7 +1859,7 @@ __device__ void inner_loop(
18481859
}
18491860
else if (opcode == 12)
18501861
{
1851-
asm("// FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (70/256) ------>");
1862+
asm("// FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (74/256) ------>");
18521863

18531864
if (location) src = bit_cast<uint64_t>(__int2double_rn(static_cast<int32_t>(src >> ((sub & 1) * 32))));
18541865
if (inst & (1 << NEGATIVE_SRC_OFFSET)) src ^= 0x8000000000000000ULL;
@@ -1859,7 +1870,7 @@ __device__ void inner_loop(
18591870

18601871
dst = bit_cast<uint64_t>(fma_rnd<ROUNDING_MODE>(a, is_mul ? b : 1.0, is_mul ? 0.0 : b, fprc));
18611872

1862-
asm("// <------ FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (70/256)");
1873+
asm("// <------ FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (74/256)");
18631874
}
18641875
else if (opcode == 9)
18651876
{
@@ -1873,16 +1884,12 @@ __device__ void inner_loop(
18731884
}
18741885
else if (opcode == 7)
18751886
{
1876-
asm("// IROR_R (10/256) ------>");
1877-
const uint32_t shift = src & 63;
1878-
dst = (dst >> shift) | (dst << (64 - shift));
1879-
asm("// <------ IROR_R (10/256)");
1880-
}
1881-
else if (opcode == 11)
1882-
{
1883-
asm("// FSWAP_R (8/256) ------>");
1884-
dst = __shfl_xor_sync(fp_workers_mask, dst, 1, 8);
1885-
asm("// <------ FSWAP_R (8/256)");
1887+
asm("// IROR_R, IROL_R (10/256) ------>");
1888+
const uint32_t shift1 = src & 63;
1889+
const uint32_t shift2 = 64 - shift1;
1890+
const bool is_rol = (inst & (1 << NEGATIVE_SRC_OFFSET));
1891+
dst = (dst >> (is_rol ? shift2 : shift1)) | (dst << (is_rol ? shift1 : shift2));
1892+
asm("// <------ IROR_R, IROL_R (10/256)");
18861893
}
18871894
else if (opcode == 14)
18881895
{
@@ -1902,6 +1909,12 @@ __device__ void inner_loop(
19021909
dst = static_cast<uint64_t>(__mul64hi(static_cast<int64_t>(dst), static_cast<int64_t>(src)));
19031910
asm("// <------ ISMULH_R, ISMULH_M (5/256)");
19041911
}
1912+
else if (opcode == 11)
1913+
{
1914+
asm("// FSWAP_R (4/256) ------>");
1915+
dst = __shfl_xor_sync(fp_workers_mask, dst, 1, 8);
1916+
asm("// <------ FSWAP_R (8/256)");
1917+
}
19051918
else if (opcode == 8)
19061919
{
19071920
asm("// ISWAP_R (4/256) ------>");

0 commit comments

Comments
 (0)