Skip to content

Commit

Permalink
xmr: fix decimal diff + aes cleanup
Browse files Browse the repository at this point in the history
change default launch config to -l 32x16 to handle the 750 Ti better
not definitive, doing tests..
  • Loading branch information
tpruvot committed Jan 8, 2017
1 parent c1f1ad9 commit 2479ffa
Show file tree
Hide file tree
Showing 7 changed files with 222 additions and 325 deletions.
2 changes: 1 addition & 1 deletion configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16"

CUDA_CFLAGS="-O3 -lineno -Xcompiler -Wall -D_FORCE_INLINES" \
./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda-7.5 --with-nvml=libnvidia-ml.so
./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda --with-nvml=libnvidia-ml.so

456 changes: 172 additions & 284 deletions crypto/cn_aes.cuh

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions crypto/cryptonight.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
#include <miner.h>
#include "cryptonight.h"

extern char *device_config[MAX_GPUS]; // -l 24x32
extern char *device_config[MAX_GPUS]; // -l 32x16

uint32_t cn_blocks = 24;
uint32_t cn_threads = 32;
uint32_t cn_blocks = 32;
uint32_t cn_threads = 16;

static uint32_t *d_long_state[MAX_GPUS];
static uint32_t *d_ctx_state[MAX_GPUS];
Expand Down Expand Up @@ -43,11 +43,13 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
{
if (device_config[thr_id]) {
sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
gpulog(LOG_INFO, thr_id, "Using %u x %u threads kernel launch config", cn_blocks, cn_threads);
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
gpulog(LOG_INFO, thr_id, "Using %u x %u (%u) threads kernel launch config",
cn_blocks, cn_threads, throughput);
} else {
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
gpulog(LOG_INFO, thr_id, "Intensity set to %g, (%u x %u) %u threads",
throughput2intensity(throughput), cn_blocks, cn_threads, throughput);
}

if(sizeof(size_t) == 4 && throughput > UINT32_MAX / MEMORY) {
Expand All @@ -59,9 +61,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage (linux)
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
CUDA_LOG_ERROR();
}

Expand Down Expand Up @@ -107,7 +108,6 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
memcpy(tempdata, pdata, 76);
*tempnonceptr = resNonces[0];
gpulog(LOG_DEBUG, thr_id, "found nonce %x", resNonces[0]);
cryptonight_hash(vhash, tempdata, 76);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
Expand Down
12 changes: 0 additions & 12 deletions crypto/cryptonight.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,22 +105,10 @@ struct uint3 blockDim;
for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \
out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }

#define XOR_BLOCKS(a,b) { \
((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \
((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; }

#define XOR_BLOCKS_DST(x,y,z) { \
((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \
((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }

#define MUL_SUM_XOR_DST(a,c,dst) { \
uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
hi += ((uint64_t *)c)[0]; \
((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
((uint64_t *)dst)[0] = hi; \
((uint64_t *)dst)[1] = lo; }

#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))

union hash_state {
Expand Down
36 changes: 23 additions & 13 deletions crypto/cuda_cryptonight_core.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include "cryptonight.h"

#ifdef WIN32
int cn_bfactor = 6;
int cn_bfactor = 8;
int cn_bsleep = 100;
#else
int cn_bfactor = 0;
Expand All @@ -19,6 +19,14 @@ int cn_bsleep = 0;

#include "cn_aes.cuh"

#define MUL_SUM_XOR_DST(a,c,dst) { \
uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
hi += ((uint64_t *)c)[0]; \
((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
((uint64_t *)dst)[0] = hi; \
((uint64_t *)dst)[1] = lo; }

__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
{
*product_hi = __umul64hi(multiplier, multiplicand);
Expand Down Expand Up @@ -52,15 +60,15 @@ void cryptonight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state
}

__global__
void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
{
__shared__ uint32_t sharedMemory[1024];

cn_aes_gpu_init(sharedMemory);

__syncthreads();

#if __CUDA_ARCH__ >= 300
#if 0 && __CUDA_ARCH__ >= 300

const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
const int sub = threadIdx.x & 3;
Expand Down Expand Up @@ -171,25 +179,27 @@ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint32_t
const int batchsize = ITER >> (2 + bfactor);
const int start = partidx * batchsize;
const int end = start + batchsize;
uint32_t * __restrict__ long_state = &d_long_state[thread << 19];
uint32_t * __restrict__ ctx_a = d_ctx_a + thread * 4;
uint32_t * __restrict__ ctx_b = d_ctx_b + thread * 4;
uint32_t a[4], b[4], c[4];
int j;
const off_t longptr = (off_t) thread << 19;
uint32_t * long_state = &d_long_state[longptr];
uint32_t * ctx_a = &d_ctx_a[thread * 4];
uint32_t * ctx_b = &d_ctx_b[thread * 4];
uint32_t a[4], b[4];

MEMCPY8(a, ctx_a, 2);
MEMCPY8(b, ctx_b, 2);

for(int i = start; i < end; ++i)
for(int i = start; i < end; i++) // end = 262144
{
j = (a[0] & 0x1FFFF0) >> 2;
uint32_t c[4];
uint32_t j = (a[0] >> 2) & 0x7FFFC;
cn_aes_single_round(sharedMemory, &long_state[j], c, a);
XOR_BLOCKS_DST(c, b, &long_state[j]);
MUL_SUM_XOR_DST(c, a, (uint8_t *)&long_state[(c[0] & 0x1FFFF0) >> 2]);
j = (a[0] & 0x1FFFF0) >> 2;
MUL_SUM_XOR_DST(c, a, &long_state[(c[0] >> 2) & 0x7FFFC]);

j = (a[0] >> 2) & 0x7FFFC;
cn_aes_single_round(sharedMemory, &long_state[j], b, a);
XOR_BLOCKS_DST(b, c, &long_state[j]);
MUL_SUM_XOR_DST(b, a, &long_state[(b[0] & 0x1FFFF0) >> 2]);
MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & 0x7FFFC]);
}

if(bfactor > 0)
Expand Down
21 changes: 16 additions & 5 deletions crypto/xmr-rpc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,18 @@
#include "xmr-rpc.h"
#include "wildkeccak.h"

//#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
double target_to_diff_rpc2(uint32_t* target)
{
// unlike other algos, xmr diff is very low
if (opt_algo == ALGO_CRYPTONIGHT) {
// simplified to get 1.0 for 10K
return (double) (UINT32_MAX / target[7]) / 10000;
}
else if (opt_algo == ALGO_WILDKECCAK) {
return target_to_diff(target) * 1000;
}
return target_to_diff(target); // util.cpp
}

extern struct stratum_ctx stratum;

Expand Down Expand Up @@ -441,7 +452,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
memcpy(work->data, rpc2_blob, rpc2_bloblen);
memset(work->target, 0xff, sizeof(work->target));
work->target[7] = rpc2_target;
work->targetdiff = target_to_diff(work->target);
work->targetdiff = target_to_diff_rpc2(work->target);

snprintf(work->job_id, sizeof(work->job_id), "%s", rpc2_job_id);
}
Expand Down Expand Up @@ -479,15 +490,15 @@ bool rpc2_stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
char sdiff[32] = { 0 };
stratum_diff = sctx->job.diff;
if (opt_showdiff && work->targetdiff != stratum_diff)
snprintf(sdiff, 32, " (%.5f)", work->targetdiff);
snprintf(sdiff, 32, " (%g)", work->targetdiff);
if (stratum_diff >= 1e6)
applog(LOG_WARNING, "Stratum difficulty set to %.1f M%s", stratum_diff/1e6, sdiff);
else
applog(LOG_WARNING, "Stratum difficulty set to %.0f%s", stratum_diff, sdiff);
}
if (work->target[7] != rpc2_target) {
work->target[7] = rpc2_target;
work->targetdiff = target_to_diff(work->target);
work->targetdiff = target_to_diff_rpc2(work->target);
g_work_time = 0;
restart_threads();
}
Expand Down Expand Up @@ -547,7 +558,7 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
return false;
}

stratum.sharediff = target_to_diff((uint32_t*)hash);
stratum.sharediff = target_to_diff_rpc2((uint32_t*)hash);

return true;
}
Expand Down
4 changes: 2 additions & 2 deletions cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ void cuda_print_devices()
void cuda_shutdown()
{
// require gpu init first
if (thr_info != NULL)
cudaDeviceSynchronize();
//if (thr_info != NULL)
// cudaDeviceSynchronize();
cudaDeviceReset();
}

Expand Down

0 comments on commit 2479ffa

Please sign in to comment.