From 7a051638949a01c4cc22fae3850176c4e9bfdcd3 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 9 Apr 2018 18:22:07 +0200 Subject: [PATCH] TITAN V fix + code style + spaces (to be continued) revert some useless changes, to continue... --- ccminer.cpp | 66 ++++----- crypto/cn_blake.cuh | 269 ++++++++++++++++++------------------ crypto/cn_keccak.cuh | 14 +- crypto/cryptonight-core.cu | 33 +++-- crypto/cryptonight-extra.cu | 93 +++++++------ crypto/cryptonight.cu | 6 +- crypto/xmr-rpc.cpp | 9 +- 7 files changed, 249 insertions(+), 241 deletions(-) diff --git a/ccminer.cpp b/ccminer.cpp index 344b364f1c..89a4f2f502 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -269,7 +269,7 @@ Options:\n\ lyra2v2 VertCoin\n\ lyra2z ZeroCoin (3rd impl)\n\ myr-gr Myriad-Groestl\n\ - monero XMR cryptonight v7 (new)\n\ + monero XMR cryptonight (v7)\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ @@ -577,13 +577,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work); void get_currentalgo(char* buf, int sz) { - int algo = opt_algo; - - if (algo == ALGO_CRYPTONIGHT) { - algo = get_cryptonight_algo(cryptonight_fork); - } - - snprintf(buf, sz, "%s", algo_names[algo]); + int algo = opt_algo; + if (algo == ALGO_CRYPTONIGHT) + algo = get_cryptonight_algo(cryptonight_fork); + snprintf(buf, sz, "%s", algo_names[algo]); } void format_hashrate(double hashrate, char *output) @@ -1828,7 +1825,6 @@ static void *miner_thread(void *userdata) bool extrajob = false; char s[16]; int rc = 0; - int variant; memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized @@ -2366,14 +2362,13 @@ static void *miner_thread(void *userdata) rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_CRYPTONIGHT: - variant = 0; - - if (cryptonight_fork > 1) { - variant = ((((unsigned char*)work.data)[0] >= cryptonight_fork) ? ((unsigned char*)work.data)[0] - cryptonight_fork + 1 : 0 ); - } - - rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, variant); + { + int cn_variant = 0; + if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork) + cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1; + rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant); break; + } case ALGO_DECRED: rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done); break; @@ -3136,27 +3131,24 @@ void parse_arg(int key, char *arg) } } - //fix cryptonight - switch (opt_algo) { - case ALGO_MONERO: - opt_algo = ALGO_CRYPTONIGHT; - cryptonight_fork = 7; - break; - - case ALGO_GRAFT: - opt_algo = ALGO_CRYPTONIGHT; - cryptonight_fork = 8; - break; - - case ALGO_STELLITE: - opt_algo = ALGO_CRYPTONIGHT; - cryptonight_fork = 3; - break; - - case ALGO_CRYPTONIGHT: - cryptonight_fork = 1; - break; - } + // cryptonight variants + switch (opt_algo) { + case ALGO_MONERO: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 7; + break; + case ALGO_GRAFT: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 8; + break; + case ALGO_STELLITE: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 3; + break; + case ALGO_CRYPTONIGHT: + cryptonight_fork = 1; + break; + } break; case 'b': diff --git a/crypto/cn_blake.cuh b/crypto/cn_blake.cuh index 3f410f73d6..5c0d09f27a 100644 --- a/crypto/cn_blake.cuh +++ b/crypto/cn_blake.cuh @@ -1,3 +1,4 @@ +//#include typedef struct { uint32_t h[8], s[4], t[2]; @@ -6,24 +7,25 @@ typedef struct { } blake_state; #define U8TO32(p) \ - (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ - ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + #define U32TO8(p, v) \ - (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ - (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + #define BLAKE_ROT(x,n) ROTR32(x, n) #define BLAKE_G(a,b,c,d,e) \ - v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \ - v[d] = BLAKE_ROT(v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = BLAKE_ROT(v[b] ^ v[c],12); \ - v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \ - v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ - v[c] += v[d]; \ - v[b] = BLAKE_ROT(v[b] ^ v[c], 7); - -__constant__ uint8_t d_blake_sigma[14][16] = -{ + v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c],12); \ + v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c], 7); + +__constant__ uint8_t d_blake_sigma[14][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, @@ -39,137 +41,140 @@ __constant__ uint8_t d_blake_sigma[14][16] = {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8} }; -__constant__ uint32_t d_blake_cst[16] -= { + +__constant__ uint32_t d_blake_cst[16] = { 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 }; -__device__ void cn_blake_compress(blake_state * __restrict__ S, const uint8_t * __restrict__ block) +__device__ +void cn_blake_compress(blake_state * __restrict__ S, const uint8_t * __restrict__ block) { - uint32_t v[16], m[16], i; - - for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); - for (i = 0; i < 8; ++i) v[i] = S->h[i]; - v[ 8] = S->s[0] ^ 0x243F6A88; - v[ 9] = S->s[1] ^ 0x85A308D3; - v[10] = S->s[2] ^ 0x13198A2E; - v[11] = S->s[3] ^ 0x03707344; - v[12] = 0xA4093822; - v[13] = 0x299F31D0; - v[14] = 0x082EFA98; - v[15] = 0xEC4E6C89; - - if (S->nullt == 0) { - v[12] ^= S->t[0]; - v[13] ^= S->t[0]; - v[14] ^= S->t[1]; - v[15] ^= S->t[1]; - } - - for (i = 0; i < 14; ++i) { - BLAKE_G(0, 4, 8, 12, 0); - BLAKE_G(1, 5, 9, 13, 2); - BLAKE_G(2, 6, 10, 14, 4); - BLAKE_G(3, 7, 11, 15, 6); - BLAKE_G(3, 4, 9, 14, 14); - BLAKE_G(2, 7, 8, 13, 12); - BLAKE_G(0, 5, 10, 15, 8); - BLAKE_G(1, 6, 11, 12, 10); - } - - for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; - for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; + uint32_t v[16], m[16], i; + + for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); + for (i = 0; i < 8; ++i) v[i] = S->h[i]; + v[ 8] = S->s[0] ^ 0x243F6A88; + v[ 9] = S->s[1] ^ 0x85A308D3; + v[10] = S->s[2] ^ 0x13198A2E; + v[11] = S->s[3] ^ 0x03707344; + v[12] = 0xA4093822; + v[13] = 0x299F31D0; + v[14] = 0x082EFA98; + v[15] = 0xEC4E6C89; + + if (S->nullt == 0) { + v[12] ^= S->t[0]; + v[13] ^= S->t[0]; + v[14] ^= S->t[1]; + v[15] ^= S->t[1]; + } + + for (i = 0; i < 14; ++i) { + BLAKE_G(0, 4, 8, 12, 0); + BLAKE_G(1, 5, 9, 13, 2); + BLAKE_G(2, 6, 10, 14, 4); + BLAKE_G(3, 7, 11, 15, 6); + BLAKE_G(3, 4, 9, 14, 14); + BLAKE_G(2, 7, 8, 13, 12); + BLAKE_G(0, 5, 10, 15, 8); + BLAKE_G(1, 6, 11, 12, 10); + } + + for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; + for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; } __device__ void cn_blake_update(blake_state * __restrict__ S, const uint8_t * __restrict__ data, uint64_t datalen) { - int left = S->buflen >> 3; - int fill = 64 - left; - - if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) { - memcpy((void *) (S->buf + left), (void *) data, fill); - S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; - cn_blake_compress(S, S->buf); - data += fill; - datalen -= (fill << 3); - left = 0; - } - - while (datalen >= 512) { - S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; - cn_blake_compress(S, data); - data += 64; - datalen -= 512; - } - - if (datalen > 0) { - memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); - S->buflen = (left << 3) + datalen; - } else { - S->buflen = 0; - } + int left = S->buflen >> 3; + int fill = 64 - left; + + if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) { + memcpy((void *) (S->buf + left), (void *) data, fill); + S->t[0] += 512; + if (S->t[0] == 0) S->t[1]++; + cn_blake_compress(S, S->buf); + data += fill; + datalen -= (fill << 3); + left = 0; + } + + while (datalen >= 512) { + S->t[0] += 512; + if (S->t[0] == 0) S->t[1]++; + cn_blake_compress(S, data); + data += 64; + datalen -= 512; + } + + if (datalen > 0) { + memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); + S->buflen = (left << 3) + datalen; + } else { + S->buflen = 0; + } } -__device__ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest) +__device__ +void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest) { - const uint8_t padding[] = { - 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 - }; - uint8_t pa = 0x81, pb = 0x01; - uint8_t msglen[8]; - uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; - if (lo < (unsigned) S->buflen) hi++; - U32TO8(msglen + 0, hi); - U32TO8(msglen + 4, lo); - - if (S->buflen == 440) { - S->t[0] -= 8; - cn_blake_update(S, &pa, 8); - } else { - if (S->buflen < 440) { - if (S->buflen == 0) S->nullt = 1; - S->t[0] -= 440 - S->buflen; - cn_blake_update(S, padding, 440 - S->buflen); - } else { - S->t[0] -= 512 - S->buflen; - cn_blake_update(S, padding, 512 - S->buflen); - S->t[0] -= 440; - cn_blake_update(S, padding + 1, 440); - S->nullt = 1; - } - cn_blake_update(S, &pb, 8); - S->t[0] -= 8; - } - S->t[0] -= 64; - cn_blake_update(S, msglen, 64); - - U32TO8(digest + 0, S->h[0]); - U32TO8(digest + 4, S->h[1]); - U32TO8(digest + 8, S->h[2]); - U32TO8(digest + 12, S->h[3]); - U32TO8(digest + 16, S->h[4]); - U32TO8(digest + 20, S->h[5]); - U32TO8(digest + 24, S->h[6]); - U32TO8(digest + 28, S->h[7]); + const uint8_t padding[] = { + 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + }; + uint8_t pa = 0x81, pb = 0x01; + uint8_t msglen[8]; + uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; + if (lo < (unsigned) S->buflen) hi++; + U32TO8(msglen + 0, hi); + U32TO8(msglen + 4, lo); + + if (S->buflen == 440) { + S->t[0] -= 8; + cn_blake_update(S, &pa, 8); + } else { + if (S->buflen < 440) { + if (S->buflen == 0) S->nullt = 1; + S->t[0] -= 440 - S->buflen; + cn_blake_update(S, padding, 440 - S->buflen); + } else { + S->t[0] -= 512 - S->buflen; + cn_blake_update(S, padding, 512 - S->buflen); + S->t[0] -= 440; + cn_blake_update(S, padding + 1, 440); + S->nullt = 1; + } + cn_blake_update(S, &pb, 8); + S->t[0] -= 8; + } + S->t[0] -= 64; + cn_blake_update(S, msglen, 64); + + U32TO8(digest + 0, S->h[0]); + U32TO8(digest + 4, S->h[1]); + U32TO8(digest + 8, S->h[2]); + U32TO8(digest + 12, S->h[3]); + U32TO8(digest + 16, S->h[4]); + U32TO8(digest + 20, S->h[5]); + U32TO8(digest + 24, S->h[6]); + U32TO8(digest + 28, S->h[7]); } -__device__ void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out) +__device__ +void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out) { - blake_state bs; - blake_state *S = (blake_state *)&bs; - - S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; - S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C; - S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19; - S->t[0] = S->t[1] = S->buflen = S->nullt = 0; - S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; - - cn_blake_update(S, (uint8_t *)in, inlen * 8); - cn_blake_final(S, (uint8_t *)out); -} \ No newline at end of file + blake_state bs; + blake_state *S = (blake_state *)&bs; + + S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; + S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C; + S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19; + S->t[0] = S->t[1] = S->buflen = S->nullt = 0; + S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; + + cn_blake_update(S, (uint8_t *)in, inlen * 8); + cn_blake_final(S, (uint8_t *)out); +} diff --git a/crypto/cn_keccak.cuh b/crypto/cn_keccak.cuh index 985f798d60..c6f5908f61 100644 --- a/crypto/cn_keccak.cuh +++ b/crypto/cn_keccak.cuh @@ -1,5 +1,4 @@ -__constant__ uint64_t keccakf_rndc[24] = -{ +__constant__ uint64_t keccakf_rndc[24] = { 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, @@ -34,7 +33,8 @@ __constant__ uint64_t keccakf_rndc[24] = #define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y)) #define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -__device__ __forceinline__ void cn_keccakf2(uint64_t *s) +__device__ __forceinline__ +void cn_keccakf2(uint64_t *s) { uint8_t i; @@ -91,7 +91,8 @@ __device__ __forceinline__ void cn_keccakf2(uint64_t *s) } } -__device__ __forceinline__ void cn_keccakf(uint64_t *s) +__device__ __forceinline__ +void cn_keccakf(uint64_t *s) { uint64_t bc[5], tmpxor[5], tmp1, tmp2; @@ -193,7 +194,8 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s) } } -__device__ __forceinline__ void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md) +__device__ __forceinline__ +void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md) { uint64_t st[25]; @@ -206,4 +208,4 @@ __device__ __forceinline__ void cn_keccak(const uint32_t * __restrict__ in, uint MEMCPY8(md, st, 25); return; -} \ No newline at end of file +} diff --git a/crypto/cryptonight-core.cu b/crypto/cryptonight-core.cu index 83a2098086..cce3e1422f 100644 --- a/crypto/cryptonight-core.cu +++ b/crypto/cryptonight-core.cu @@ -145,10 +145,11 @@ void cryptonight_core_gpu_phase2(uint32_t threads, int bfactor, int partidx, uin a = ctx_a[sub]; d[1] = ctx_b[sub]; -#pragma unroll 2 + + #pragma unroll 2 for (i = start; i < end; ++i) { -#pragma unroll 2 + #pragma unroll for (int x = 0; x < 2; ++x) { j = ((__shfl(a, 0, 4) & 0x1FFFF0) >> 2) + sub; @@ -180,7 +181,7 @@ void cryptonight_core_gpu_phase2(uint32_t threads, int bfactor, int partidx, uin zz[1] = __shfl(yy[1], 0, 4); t1[1] = __shfl(d[x], 1, 4); -#pragma unroll + #pragma unroll for (k = 0; k < 2; k++) t2[k] = __shfl(a, k + sub2, 4); @@ -214,14 +215,14 @@ void cryptonight_core_gpu_phase3(int threads, const uint32_t * __restrict__ long { const int sub = (threadIdx.x & 7) << 2; const uint32_t *longstate = &long_state[(thread << 19) + sub]; - uint32_t key[40], text[4], i, j; + uint32_t key[40], text[4]; MEMCPY8(key, d_ctx_key2 + thread * 40, 20); MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2); - for(i = 0; i < 0x80000; i += 32) + for(int i = 0; i < 0x80000; i += 32) { -#pragma unroll - for(j = 0; j < 4; ++j) + #pragma unroll + for(int j = 0; j < 4; ++j) text[j] ^= longstate[i + j]; cn_aes_pseudo_round_mut(sharedMemory, text, key); @@ -239,26 +240,24 @@ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_ dim3 block4(threads << 2); dim3 block8(threads << 3); - uint32_t i; const uint32_t bfactor = (uint32_t) device_bfactor[thr_id]; const uint32_t partcount = 1 << bfactor; const uint32_t throughput = (uint32_t) (blocks*threads); - const int bsleep = bfactor ? 100 : 0; - //const int dev_id = device_map[thr_id]; + uint32_t i; - cryptonight_core_gpu_phase1 <<< grid, block8 >>>(throughput, d_long_state, d_ctx_state, d_ctx_key1); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); + cryptonight_core_gpu_phase1 <<>> (throughput, d_long_state, d_ctx_state, d_ctx_key1); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); for(i = 0; i < partcount; i++) { - cryptonight_core_gpu_phase2 <<< grid, block4 >>>(throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, variant, d_ctx_tweak1_2); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); + cryptonight_core_gpu_phase2 <<>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, variant, d_ctx_tweak1_2); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); } cudaDeviceSynchronize(); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); - cryptonight_core_gpu_phase3 <<< grid, block8 >>>(throughput, d_long_state, d_ctx_state, d_ctx_key2); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cryptonight_core_gpu_phase3 <<>> (throughput, d_long_state, d_ctx_state, d_ctx_key2); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } diff --git a/crypto/cryptonight-extra.cu b/crypto/cryptonight-extra.cu index 31eaf4ed83..97c1c3991a 100644 --- a/crypto/cryptonight-extra.cu +++ b/crypto/cryptonight-extra.cu @@ -9,12 +9,12 @@ #include #include "cryptonight.h" -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; +typedef uint8_t BitSequence; +typedef uint64_t DataLength; -static uint32_t *d_input[MAX_GPUS]; +static uint32_t *d_input[MAX_GPUS] = { 0 }; static uint32_t *d_target[MAX_GPUS]; -static uint32_t *d_resultNonce[MAX_GPUS]; +static uint32_t *d_result[MAX_GPUS]; #include "cn_keccak.cuh" #include "cn_blake.cuh" @@ -22,8 +22,7 @@ static uint32_t *d_resultNonce[MAX_GPUS]; #include "cn_jh.cuh" #include "cn_skein.cuh" -__constant__ uint8_t d_sub_byte[16][16] = -{ +__constant__ uint8_t d_sub_byte[16][16] = { {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76}, {0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0}, {0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15}, @@ -42,37 +41,40 @@ __constant__ uint8_t d_sub_byte[16][16] = {0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16} }; -__device__ __forceinline__ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data) +__device__ __forceinline__ +void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data) { - int i, j; - uint8_t temp[4]; - const uint32_t aes_gf[10] = - { + const uint32_t aes_gf[10] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; MEMCPY4(key, data, 8); -#pragma unroll - for(i = 8; i < 40; i++) + #pragma unroll + for(int i = 8; i < 40; i++) { + uint8_t temp[4]; *(uint32_t *)temp = key[i - 1]; - if(i % 8 == 0) - { + + if(i % 8 == 0) { *(uint32_t *)temp = ROTR32(*(uint32_t *)temp, 8); - for(j = 0; j < 4; j++) + for(int j = 0; j < 4; j++) temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f]; *(uint32_t *)temp ^= aes_gf[i / 8 - 1]; } - else - if(i % 8 == 4) -#pragma unroll - for(j = 0; j < 4; j++) - temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f]; + else if(i % 8 == 4) { + #pragma unroll + for(int j = 0; j < 4; j++) + temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f]; + } + key[i] = key[(i - 8)] ^ *(uint32_t *)temp; } } -__global__ void cryptonight_extra_gpu_prepare(int threads, const uint32_t * __restrict__ d_input, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint32_t * d_ctx_tweak1_2) +__global__ +void cryptonight_extra_gpu_prepare(int threads, const uint32_t * __restrict__ d_input, uint32_t startNonce, + uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint32_t * d_ctx_tweak1_2) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -117,10 +119,12 @@ __global__ void cryptonight_extra_gpu_prepare(int threads, const uint32_t * __re } } -__global__ void cryptonight_extra_gpu_final(int threads, uint32_t startNonce, const uint32_t * __restrict__ d_target, uint32_t * __restrict__ resNonce, uint32_t * __restrict__ d_ctx_state) +__global__ +void cryptonight_extra_gpu_final(int threads, uint32_t startNonce, const uint32_t * __restrict__ d_target, + uint32_t * __restrict__ resNonce, uint32_t * __restrict__ d_ctx_state) { const int thread = blockDim.x * blockIdx.x + threadIdx.x; - + if(thread < threads) { int i; @@ -172,59 +176,64 @@ __global__ void cryptonight_extra_gpu_final(int threads, uint32_t startNonce, co if(rc == true) { uint32_t tmp = atomicExch(resNonce, nonce); - if(tmp != 0xffffffff) + if(tmp != UINT32_MAX) resNonce[1] = tmp; } } } -__host__ void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn) +__host__ +void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn) { cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice); cudaMemcpy(d_target[thr_id], pTargetIn, 8 * sizeof(uint32_t), cudaMemcpyHostToDevice); - cudaMemset(d_resultNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); + cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t)); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } -__host__ void cryptonight_extra_cpu_init(int thr_id) +__host__ +void cryptonight_extra_cpu_init(int thr_id) { cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t)); cudaMalloc(&d_target[thr_id], 8 * sizeof(uint32_t)); - cudaMalloc(&d_resultNonce[thr_id], 2*sizeof(uint32_t)); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); + cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t)); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } -__host__ void cryptonight_extra_cpu_prepare(int thr_id, int threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint32_t *d_ctx_tweak1_2) +__host__ +void cryptonight_extra_cpu_prepare(int thr_id, int threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint32_t *d_ctx_tweak1_2) { int threadsperblock = 128; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cryptonight_extra_gpu_prepare << > >(threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak1_2); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); + cryptonight_extra_gpu_prepare <<>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak1_2); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } -__host__ void cryptonight_extra_cpu_final(int thr_id, int threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state) +__host__ +void cryptonight_extra_cpu_final(int thr_id, int threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state) { int threadsperblock = 128; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); - cryptonight_extra_gpu_final << > >(threads, startNonce, d_target[thr_id], d_resultNonce[thr_id], d_ctx_state); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); - cudaMemcpy(resnonce, d_resultNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); - exit_if_cudaerror(thr_id, __FILE__, __LINE__); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cryptonight_extra_gpu_final <<>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } -__host__ void cryptonight_extra_cpu_free(int thr_id) +__host__ +void cryptonight_extra_cpu_free(int thr_id) { if (d_input[thr_id]) { cudaFree(d_input[thr_id]); cudaFree(d_target[thr_id]); - cudaFree(d_resultNonce[thr_id]); + cudaFree(d_result[thr_id]); d_input[thr_id] = NULL; } } diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index 2b82468d03..739321bf25 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -50,6 +50,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id], mem, device_mpcount[dev_id]); + if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) { + device_config[thr_id] = strdup("80x24"); + } + if (device_config[thr_id]) { int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads); throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); @@ -71,7 +75,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ exit(1); } - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp index 3c6118c44b..6d5f53b281 100644 --- a/crypto/xmr-rpc.cpp +++ b/crypto/xmr-rpc.cpp @@ -559,14 +559,11 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work) else if (opt_algo == ALGO_CRYPTONIGHT) { uint32_t nonce = work->nonces[idnonce]; + int variant = 0; noncestr = bin2hex((unsigned char*) &nonce, 4); last_found_nonce = nonce; - int variant = 0; - - if (cryptonight_fork > 1) { - variant = ((((unsigned char*)work->data)[0] >= cryptonight_fork) ? ((unsigned char*)work->data)[0] - cryptonight_fork + 1 : 0 ); - } - + if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; cryptonight_hash(hash, data, 76, variant); work_set_target_ratio(work, (uint32_t*) hash); }