xmr: fix decimal diff + aes cleanup

change default launch config to -l 32x16 to handle the 750 Ti better not definitive, doing tests..
raziel1000 · Jan 8, 2017 · 2479ffa · 2479ffa
1 parent c1f1ad9
commit 2479ffa
Show file tree

Hide file tree

Showing 7 changed files with 222 additions and 325 deletions.
diff --git a/configure.sh b/configure.sh
@@ -3,5 +3,5 @@
 extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16"
 
 CUDA_CFLAGS="-O3 -lineno -Xcompiler -Wall  -D_FORCE_INLINES" \
-	./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda-7.5 --with-nvml=libnvidia-ml.so
+	./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda --with-nvml=libnvidia-ml.so
 
diff --git a/crypto/cn_aes.cuh b/crypto/cn_aes.cuh
diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu
@@ -9,10 +9,10 @@
 #include <miner.h>
 #include "cryptonight.h"
 
-extern char *device_config[MAX_GPUS]; // -l 24x32
+extern char *device_config[MAX_GPUS]; // -l 32x16
 
-uint32_t cn_blocks  = 24;
-uint32_t cn_threads = 32;
+uint32_t cn_blocks  = 32;
+uint32_t cn_threads = 16;
 
 static uint32_t *d_long_state[MAX_GPUS];
 static uint32_t *d_ctx_state[MAX_GPUS];
@@ -43,11 +43,13 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
 	{
 		if (device_config[thr_id]) {
 			sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
-			gpulog(LOG_INFO, thr_id, "Using %u x %u threads kernel launch config", cn_blocks, cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
+			gpulog(LOG_INFO, thr_id, "Using %u x %u (%u) threads kernel launch config",
+				cn_blocks, cn_threads, throughput);
 		} else {
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
-			gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+			gpulog(LOG_INFO, thr_id, "Intensity set to %g, (%u x %u) %u threads",
+				throughput2intensity(throughput), cn_blocks, cn_threads, throughput);
 		}
 
 		if(sizeof(size_t) == 4 && throughput > UINT32_MAX / MEMORY) {
@@ -59,9 +61,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
 		cudaSetDevice(device_map[thr_id]);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
-			// reduce cpu usage (linux)
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-			//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 			CUDA_LOG_ERROR();
 		}
 
@@ -107,7 +108,6 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
 			uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
 			memcpy(tempdata, pdata, 76);
 			*tempnonceptr = resNonces[0];
-			gpulog(LOG_DEBUG, thr_id, "found nonce %x", resNonces[0]);
 			cryptonight_hash(vhash, tempdata, 76);
 			if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
 			{

diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h
@@ -105,22 +105,10 @@ struct uint3  blockDim;
     for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \
         out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }
 
-#define XOR_BLOCKS(a,b) { \
-    ((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \
-    ((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; }
-
 #define XOR_BLOCKS_DST(x,y,z) { \
     ((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \
     ((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }
 
-#define MUL_SUM_XOR_DST(a,c,dst) { \
-    uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
-    hi += ((uint64_t *)c)[0]; \
-    ((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
-    ((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
-    ((uint64_t *)dst)[0] = hi; \
-    ((uint64_t *)dst)[1] = lo; }
-
 #define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))
 
 union hash_state {

diff --git a/crypto/cuda_cryptonight_core.cu b/crypto/cuda_cryptonight_core.cu
@@ -10,7 +10,7 @@
 #include "cryptonight.h"
 
 #ifdef WIN32
-int cn_bfactor = 6;
+int cn_bfactor = 8;
 int cn_bsleep = 100;
 #else
 int cn_bfactor = 0;
@@ -19,6 +19,14 @@ int cn_bsleep = 0;
 
 #include "cn_aes.cuh"
 
+#define MUL_SUM_XOR_DST(a,c,dst) { \
+	uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
+	hi += ((uint64_t *)c)[0]; \
+	((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
+	((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
+	((uint64_t *)dst)[0] = hi; \
+	((uint64_t *)dst)[1] = lo; }
+
 __device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
 {
 	*product_hi = __umul64hi(multiplier, multiplicand);
@@ -52,15 +60,15 @@ void cryptonight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state
 }
 
 __global__
-void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
+void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
 	cn_aes_gpu_init(sharedMemory);
 
 	__syncthreads();
 
-#if __CUDA_ARCH__ >= 300
+#if 0 && __CUDA_ARCH__ >= 300
 
 	const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
 	const int sub = threadIdx.x & 3;
@@ -171,25 +179,27 @@ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint32_t
 		const int batchsize = ITER >> (2 + bfactor);
 		const int start = partidx * batchsize;
 		const int end = start + batchsize;
-		uint32_t * __restrict__ long_state = &d_long_state[thread << 19];
-		uint32_t * __restrict__ ctx_a = d_ctx_a + thread * 4;
-		uint32_t * __restrict__ ctx_b = d_ctx_b + thread * 4;
-		uint32_t a[4], b[4], c[4];
-		int j;
+		const off_t longptr = (off_t) thread << 19;
+		uint32_t * long_state = &d_long_state[longptr];
+		uint32_t * ctx_a = &d_ctx_a[thread * 4];
+		uint32_t * ctx_b = &d_ctx_b[thread * 4];
+		uint32_t a[4], b[4];
 
 		MEMCPY8(a, ctx_a, 2);
 		MEMCPY8(b, ctx_b, 2);
 
-		for(int i = start; i < end; ++i)
+		for(int i = start; i < end; i++) // end = 262144
 		{
-			j = (a[0] & 0x1FFFF0) >> 2;
+			uint32_t c[4];
+			uint32_t j = (a[0] >> 2) & 0x7FFFC;
 			cn_aes_single_round(sharedMemory, &long_state[j], c, a);
 			XOR_BLOCKS_DST(c, b, &long_state[j]);
-			MUL_SUM_XOR_DST(c, a, (uint8_t *)&long_state[(c[0] & 0x1FFFF0) >> 2]);
-			j = (a[0] & 0x1FFFF0) >> 2;
+			MUL_SUM_XOR_DST(c, a, &long_state[(c[0] >> 2) & 0x7FFFC]);
+
+			j = (a[0] >> 2) & 0x7FFFC;
 			cn_aes_single_round(sharedMemory, &long_state[j], b, a);
 			XOR_BLOCKS_DST(b, c, &long_state[j]);
-			MUL_SUM_XOR_DST(b, a, &long_state[(b[0] & 0x1FFFF0) >> 2]);
+			MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & 0x7FFFC]);
 		}
 
 		if(bfactor > 0)

diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp
@@ -31,7 +31,18 @@
 #include "xmr-rpc.h"
 #include "wildkeccak.h"
 
-//#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
+double target_to_diff_rpc2(uint32_t* target)
+{
+	// unlike other algos, xmr diff is very low
+	if (opt_algo == ALGO_CRYPTONIGHT) {
+		// simplified to get 1.0 for 10K
+		return (double) (UINT32_MAX / target[7]) / 10000;
+	}
+	else if (opt_algo == ALGO_WILDKECCAK) {
+		return target_to_diff(target) * 1000;
+	}
+	return target_to_diff(target); // util.cpp
+}
 
 extern struct stratum_ctx stratum;
 
@@ -441,7 +452,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
 		memcpy(work->data, rpc2_blob, rpc2_bloblen);
 		memset(work->target, 0xff, sizeof(work->target));
 		work->target[7] = rpc2_target;
-		work->targetdiff = target_to_diff(work->target);
+		work->targetdiff = target_to_diff_rpc2(work->target);
 
 		snprintf(work->job_id, sizeof(work->job_id), "%s", rpc2_job_id);
 	}
@@ -479,15 +490,15 @@ bool rpc2_stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		char sdiff[32] = { 0 };
 		stratum_diff = sctx->job.diff;
 		if (opt_showdiff && work->targetdiff != stratum_diff)
-			snprintf(sdiff, 32, " (%.5f)", work->targetdiff);
+			snprintf(sdiff, 32, " (%g)", work->targetdiff);
 		if (stratum_diff >= 1e6)
 			applog(LOG_WARNING, "Stratum difficulty set to %.1f M%s", stratum_diff/1e6, sdiff);
 		else
 			applog(LOG_WARNING, "Stratum difficulty set to %.0f%s", stratum_diff, sdiff);
 	}
 	if (work->target[7] != rpc2_target) {
 		work->target[7] = rpc2_target;
-		work->targetdiff = target_to_diff(work->target);
+		work->targetdiff = target_to_diff_rpc2(work->target);
 		g_work_time = 0;
 		restart_threads();
 	}
@@ -547,7 +558,7 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
 		return false;
 	}
 
-	stratum.sharediff = target_to_diff((uint32_t*)hash);
+	stratum.sharediff = target_to_diff_rpc2((uint32_t*)hash);
 
 	return true;
 }

diff --git a/cuda.cpp b/cuda.cpp
@@ -123,8 +123,8 @@ void cuda_print_devices()
 void cuda_shutdown()
 {
 	// require gpu init first
-	if (thr_info != NULL)
-		cudaDeviceSynchronize();
+	//if (thr_info != NULL)
+	//	cudaDeviceSynchronize();
 	cudaDeviceReset();
 }