From 5836da60961720778a20349a5e622a8d659a13b5 Mon Sep 17 00:00:00 2001
From: Tony Butler
Date: Fri, 17 Dec 2021 15:28:43 -0700
Subject: [PATCH] Fix mismatch in CN_GPU algorithm id (upstream changed to 0300 vs 00ff)

---
 CMakeLists.txt                |  10 +-
 src/crypto/cn/CnAlgo.h        |  26 +-
 src/crypto/common/Algorithm.h |   3 +-
 src/cuda_core.cu              |   4 +-
 src/cuda_cryptonight_gpu.hpp  | 840 +++++++++++++++++-----------------
 src/cuda_extra.cu             |  81 ++--
 6 files changed, 492 insertions(+), 472 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ca8d44..547f042 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,8 +10,8 @@ option(WITH_CN_R "Enable CryptoNight-R algorithm" ON)
 option(WITH_CN_LITE "Enable CryptoNight-Lite algorithms family" ON)
 option(WITH_CN_HEAVY "Enable CryptoNight-Heavy algorithms family" ON)
 option(WITH_CN_PICO "Enable CryptoNight-Pico algorithm" ON)
-option(WITH_CN_GPU "Enable CryptoNight-GPU algorithm" ON)
 option(WITH_CN_FEMTO "Enable CryptoNight-UPX2 algorithm" ON)
+option(WITH_CN_GPU "Enable CryptoNight-GPU algorithm" ON)
 option(WITH_ARGON2 "Enable Argon2 algorithms family" OFF) #unsupported
 
 if (CUDA_VERSION VERSION_LESS 9.0)
@@ -37,14 +37,14 @@ if (WITH_CN_PICO)
     add_definitions(/DXMRIG_ALGO_CN_PICO)
 endif()
 
-if (WITH_CN_GPU)
-    add_definitions(/DXMRIG_ALGO_CN_GPU)
-endif()
-
 if (WITH_CN_FEMTO)
     add_definitions(/DXMRIG_ALGO_CN_FEMTO)
 endif()
 
+if (WITH_CN_GPU)
+    add_definitions(/DXMRIG_ALGO_CN_GPU)
+endif()
+
 if (WITH_RANDOMX)
     add_definitions(/DXMRIG_ALGO_RANDOMX)
 endif()
diff --git a/src/crypto/cn/CnAlgo.h b/src/crypto/cn/CnAlgo.h
index 3d3eb7e..e7387d8 100644
--- a/src/crypto/cn/CnAlgo.h
+++ b/src/crypto/cn/CnAlgo.h
@@ -82,16 +82,16 @@ class CnAlgo
         return CN_ITER / 8;
 #   endif
 
-#   ifdef XMRIG_ALGO_CN_GPU
-    case Algorithm::CN_GPU:
-        return 0xC000;
-#   endif
-
 #   ifdef XMRIG_ALGO_CN_FEMTO
     case Algorithm::CN_UPX2:
         return CN_ITER / 32;
 #   endif
 
+#   ifdef XMRIG_ALGO_CN_GPU
+    case Algorithm::CN_GPU:
+        return 0xC000;
+#   endif
+
     default:
         break;
     }
@@ -107,18 +107,18 @@ class CnAlgo
     }
 #   endif
 
-#   ifdef XMRIG_ALGO_CN_GPU
-    if (algo == Algorithm::CN_GPU) {
-        return 0x1FFFC0;
-    }
-#   endif
-
 #   ifdef XMRIG_ALGO_CN_FEMTO
     if (algo == Algorithm::CN_UPX2) {
         return 0x1FFF0;
     }
 #   endif
 
+#   ifdef XMRIG_ALGO_CN_GPU
+    if (algo == Algorithm::CN_GPU) {
+        return 0x1FFFC0;
+    }
+#   endif
+
     return ((Algorithm::l3(algo) - 1) / 16) * 16;
 }
@@ -140,14 +140,14 @@ template<> constexpr inline uint32_t CnAlgo::iterations() con
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0x60000; }
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 8; }
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 8; }
-template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0xC000; }
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; }
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 32; }
+template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0xC000; }
 
 template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFF0; }
-template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFFC0; }
 template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFF0; }
+template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFFC0; }
 
 } /* namespace xmrig_cuda */
diff --git a/src/crypto/common/Algorithm.h b/src/crypto/common/Algorithm.h
index 6364d34..86d1e96 100644
--- a/src/crypto/common/Algorithm.h
+++
b/src/crypto/common/Algorithm.h @@ -21,6 +21,7 @@ #define XMRIG_ALGORITHM_H +#include #include #include @@ -53,6 +54,7 @@ class Algorithm CN_PICO_0 = 0x63120200, // "cn-pico" CryptoNight-Pico CN_PICO_TLO = 0x63120274, // "cn-pico/tlo" CryptoNight-Pico (TLO) CN_UPX2 = 0x63110200, // "cn/upx2" Uplexa (UPX2) + CN_GPU = 0x63150300, // "cn/gpu" CryptoNight-GPU (Ryo). RX_0 = 0x72151200, // "rx/0" RandomX (reference configuration). RX_WOW = 0x72141177, // "rx/wow" RandomWOW (Wownero). RX_ARQ = 0x72121061, // "rx/arq" RandomARQ (Arqma). @@ -65,7 +67,6 @@ class Algorithm ASTROBWT_DERO = 0x41000000, // "astrobwt" AstroBWT (Dero) KAWPOW_RVN = 0x6b0f0000, // "kawpow/rvn" KawPow (RVN) - CN_GPU = 0x631500ff, // "cn/gpu" CryptoNight-GPU (Ryo). RX_XLA = 0x721211ff, // "panthera" Panthera (Scala2). }; diff --git a/src/cuda_core.cu b/src/cuda_core.cu index df1f764..f083b73 100644 --- a/src/cuda_core.cu +++ b/src/cuda_core.cu @@ -837,7 +837,7 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce) CUDA_CHECK_KERNEL( ctx->device_id, - xmrig::cn_gpu::cn_explode_gpu<<>>((int*)ctx->d_ctx_state, (int*)ctx->d_long_state) + xmrig_cuda::cn_gpu::cn_explode_gpu<<>>((int*)ctx->d_ctx_state, (int*)ctx->d_long_state) ); int partcount = 1 << ctx->device_bfactor; @@ -846,7 +846,7 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce) CUDA_CHECK_KERNEL( ctx->device_id, // 36 x 16byte x numThreads - xmrig::cn_gpu::cryptonight_core_gpu_phase2_gpu + xmrig_cuda::cn_gpu::cryptonight_core_gpu_phase2_gpu <<device_blocks, ctx->device_threads * 16, 36 * 16 * ctx->device_threads>>> ( (int*)ctx->d_ctx_state, diff --git a/src/cuda_cryptonight_gpu.hpp b/src/cuda_cryptonight_gpu.hpp index 3528e5d..e789286 100644 --- a/src/cuda_cryptonight_gpu.hpp +++ b/src/cuda_cryptonight_gpu.hpp @@ -7,7 +7,7 @@ #include "cuda_keccak.hpp" #include "cuda_extra.h" -namespace xmrig +namespace xmrig_cuda { namespace cn_gpu { @@ -15,180 +15,180 @@ namespace cn_gpu struct __m128i : public int4 { - __forceinline__ __device__ __m128i(){} - - __forceinline__ __device__ __m128i( - const uint32_t x0, const uint32_t x1, - const uint32_t x2, const uint32_t x3) - { - x = x0; - y = x1; - z = x2; - w = x3; - } - - __forceinline__ __device__ __m128i( const int x0) - { - x = x0; - y = x0; - z = x0; - w = x0; - } - - __forceinline__ __device__ __m128i operator|(const __m128i& other) - { - return __m128i( - x | other.x, - y | other.y, - z | other.z, - w | other.w - ); - } - - __forceinline__ __device__ __m128i operator^(const __m128i& other) - { - return __m128i( - x ^ other.x, - y ^ other.y, - z ^ other.z, - w ^ other.w - ); - } + __forceinline__ __device__ __m128i(){} + + __forceinline__ __device__ __m128i( + const uint32_t x0, const uint32_t x1, + const uint32_t x2, const uint32_t x3) + { + x = x0; + y = x1; + z = x2; + w = x3; + } + + __forceinline__ __device__ __m128i( const int x0) + { + x = x0; + y = x0; + z = x0; + w = x0; + } + + __forceinline__ __device__ __m128i operator|(const __m128i& other) + { + return __m128i( + x | other.x, + y | other.y, + z | other.z, + w | other.w + ); + } + + __forceinline__ __device__ __m128i operator^(const __m128i& other) + { + return __m128i( + x ^ other.x, + y ^ other.y, + z ^ other.z, + w ^ other.w + ); + } }; struct __m128 : public float4 { - __forceinline__ __device__ __m128(){} - - __forceinline__ __device__ __m128( - const float x0, const float x1, - const float x2, const float x3) - { - float4::x = x0; - float4::y = x1; - float4::z = x2; - float4::w = x3; - } - - __forceinline__ __device__ __m128( 
const float x0) - { - float4::x = x0; - float4::y = x0; - float4::z = x0; - float4::w = x0; - } - - __forceinline__ __device__ __m128( const __m128i& x0) - { - float4::x = int2float(x0.x); - float4::y = int2float(x0.y); - float4::z = int2float(x0.z); - float4::w = int2float(x0.w); - } - - __forceinline__ __device__ __m128i get_int( ) - { - return __m128i( - (int)x, - (int)y, - (int)z, - (int)w - ); - } - - __forceinline__ __device__ __m128 operator+(const __m128& other) - { - return __m128( - x + other.x, - y + other.y, - z + other.z, - w + other.w - ); - } - - __forceinline__ __device__ __m128 operator-(const __m128& other) - { - return __m128( - x - other.x, - y - other.y, - z - other.z, - w - other.w - ); - } - - __forceinline__ __device__ __m128 operator*(const __m128& other) - { - return __m128( - x * other.x, - y * other.y, - z * other.z, - w * other.w - ); - } - - __forceinline__ __device__ __m128 operator/(const __m128& other) - { - return __m128( - x / other.x, - y / other.y, - z / other.z, - w / other.w - ); - } - - __forceinline__ __device__ __m128& trunc() - { - x=::truncf(x); - y=::truncf(y); - z=::truncf(z); - w=::truncf(w); - - return *this; - } - - __forceinline__ __device__ __m128& abs() - { - x=::fabsf(x); - y=::fabsf(y); - z=::fabsf(z); - w=::fabsf(w); - - return *this; - } - - __forceinline__ __device__ __m128& floor() - { - x=::floorf(x); - y=::floorf(y); - z=::floorf(z); - w=::floorf(w); - - return *this; - } + __forceinline__ __device__ __m128(){} + + __forceinline__ __device__ __m128( + const float x0, const float x1, + const float x2, const float x3) + { + float4::x = x0; + float4::y = x1; + float4::z = x2; + float4::w = x3; + } + + __forceinline__ __device__ __m128( const float x0) + { + float4::x = x0; + float4::y = x0; + float4::z = x0; + float4::w = x0; + } + + __forceinline__ __device__ __m128( const __m128i& x0) + { + float4::x = int2float(x0.x); + float4::y = int2float(x0.y); + float4::z = int2float(x0.z); + float4::w = int2float(x0.w); + } + + __forceinline__ __device__ __m128i get_int( ) + { + return __m128i( + (int)x, + (int)y, + (int)z, + (int)w + ); + } + + __forceinline__ __device__ __m128 operator+(const __m128& other) + { + return __m128( + x + other.x, + y + other.y, + z + other.z, + w + other.w + ); + } + + __forceinline__ __device__ __m128 operator-(const __m128& other) + { + return __m128( + x - other.x, + y - other.y, + z - other.z, + w - other.w + ); + } + + __forceinline__ __device__ __m128 operator*(const __m128& other) + { + return __m128( + x * other.x, + y * other.y, + z * other.z, + w * other.w + ); + } + + __forceinline__ __device__ __m128 operator/(const __m128& other) + { + return __m128( + x / other.x, + y / other.y, + z / other.z, + w / other.w + ); + } + + __forceinline__ __device__ __m128& trunc() + { + x=::truncf(x); + y=::truncf(y); + z=::truncf(z); + w=::truncf(w); + + return *this; + } + + __forceinline__ __device__ __m128& abs() + { + x=::fabsf(x); + y=::fabsf(y); + z=::fabsf(z); + w=::fabsf(w); + + return *this; + } + + __forceinline__ __device__ __m128& floor() + { + x=::floorf(x); + y=::floorf(y); + z=::floorf(z); + w=::floorf(w); + + return *this; + } }; template __device__ void print(const char* name, T value) { - printf("g %s: ", name); - for(int i = 0; i < 4; ++i) - { - printf("%08X ",((uint32_t*)&value)[i]); - } - printf("\n"); + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%08X ",((uint32_t*)&value)[i]); + } + printf("\n"); } template<> __device__ void print<__m128>(const char* name, __m128 
value) { - printf("g %s: ", name); - for(int i = 0; i < 4; ++i) - { - printf("%f ",((float*)&value)[i]); - } - printf("\n"); + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%f ",((float*)&value)[i]); + } + printf("\n"); } #define SHOW(name) print(#name, name) @@ -196,83 +196,83 @@ __device__ void print<__m128>(const char* name, __m128 value) __forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b) { - return a + b; + return a + b; } __forceinline__ __device__ __m128 _mm_sub_ps(__m128 a, __m128 b) { - return a - b; + return a - b; } __forceinline__ __device__ __m128 _mm_mul_ps(__m128 a, __m128 b) { - return a * b; + return a * b; } __forceinline__ __device__ __m128 _mm_div_ps(__m128 a, __m128 b) { - return a / b; + return a / b; } __forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b) { - return __m128( - int_as_float(float_as_int(a.x) & b), - int_as_float(float_as_int(a.y) & b), - int_as_float(float_as_int(a.z) & b), - int_as_float(float_as_int(a.w) & b) - ); + return __m128( + int_as_float(float_as_int(a.x) & b), + int_as_float(float_as_int(a.y) & b), + int_as_float(float_as_int(a.z) & b), + int_as_float(float_as_int(a.w) & b) + ); } __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) { - return __m128( - int_as_float(float_as_int(a.x) | b), - int_as_float(float_as_int(a.y) | b), - int_as_float(float_as_int(a.z) | b), - int_as_float(float_as_int(a.w) | b) - ); + return __m128( + int_as_float(float_as_int(a.x) | b), + int_as_float(float_as_int(a.y) | b), + int_as_float(float_as_int(a.z) | b), + int_as_float(float_as_int(a.w) | b) + ); } __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) { - return __m128( - int_as_float(float_as_int(a.x) ^ b), - int_as_float(float_as_int(a.y) ^ b), - int_as_float(float_as_int(a.z) ^ b), - int_as_float(float_as_int(a.w) ^ b) - ); + return __m128( + int_as_float(float_as_int(a.x) ^ b), + int_as_float(float_as_int(a.y) ^ b), + int_as_float(float_as_int(a.z) ^ b), + int_as_float(float_as_int(a.w) ^ b) + ); } __forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc) { - __m128 d(dc); - __m128 c = _mm_div_ps(v, d); - c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); - // c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2 - c = _mm_mul_ps(c, d); - return _mm_sub_ps(v, c); + __m128 d(dc); + __m128 c = _mm_div_ps(v, d); + c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); + // c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2 + c = _mm_mul_ps(c, d); + return _mm_sub_ps(v, c); - //return a.fmodf(b); + //return a.fmodf(b); } __forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b) { - return a ^ b; + return a ^ b; } __forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot) { - const uint32_t right = 8 * rot; - const uint32_t left = (32 - 8 * rot); - return __m128i( - ((uint32_t)a.x >> right) | ( a.y << left ), - ((uint32_t)a.y >> right) | ( a.z << left ), - ((uint32_t)a.z >> right) | ( a.w << left ), - ((uint32_t)a.w >> right) | ( a.x << left ) - ); + const uint32_t right = 8 * rot; + const uint32_t left = (32 - 8 * rot); + return __m128i( + ((uint32_t)a.x >> right) | ( a.y << left ), + ((uint32_t)a.y >> right) | ( a.z << left ), + ((uint32_t)a.z >> right) | ( a.w << left ), + ((uint32_t)a.w >> right) | ( a.x << left ) + ); } template @@ -281,282 +281,282 @@ __device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad) { return __forceinline__ __device__ __m128 fma_break(__m128 x) { - // Break the dependency chain by setitng 
the exp to ?????01 - x = _mm_and_ps(x, 0xFEFFFFFF); - return _mm_or_ps(x, 0x00800000); + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); } // 9 __forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c) { - n1 = _mm_add_ps(n1, c); - __m128 nn = _mm_mul_ps(n0, c); - nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); - nn = fma_break(nn); - n = _mm_add_ps(n, nn); - - n3 = _mm_sub_ps(n3, c); - __m128 dd = _mm_mul_ps(n2, c); - dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); - dd = fma_break(dd); - d = _mm_add_ps(d, dd); - - //Constant feedback - c = _mm_add_ps(c, rnd_c); - c = _mm_add_ps(c, 0.734375f); - __m128 r = _mm_add_ps(nn, dd); - r = _mm_and_ps(r, 0x807FFFFF); - r = _mm_or_ps(r, 0x40000000); - c = _mm_add_ps(c, r); + n1 = _mm_add_ps(n1, c); + __m128 nn = _mm_mul_ps(n0, c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + n = _mm_add_ps(n, nn); + + n3 = _mm_sub_ps(n3, c); + __m128 dd = _mm_mul_ps(n2, c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + d = _mm_add_ps(d, dd); + + //Constant feedback + c = _mm_add_ps(c, rnd_c); + c = _mm_add_ps(c, 0.734375f); + __m128 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + c = _mm_add_ps(c, r); } // 9*8 + 2 = 74 __forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r) { - __m128 n(0.0f), d(0.0f); - - sub_round(n0, n1, n2, n3, rnd_c, n, d, c); - sub_round(n1, n2, n3, n0, rnd_c, n, d, c); - sub_round(n2, n3, n0, n1, rnd_c, n, d, c); - sub_round(n3, n0, n1, n2, rnd_c, n, d, c); - sub_round(n3, n2, n1, n0, rnd_c, n, d, c); - sub_round(n2, n1, n0, n3, rnd_c, n, d, c); - sub_round(n1, n0, n3, n2, rnd_c, n, d, c); - sub_round(n0, n3, n2, n1, rnd_c, n, d, c); - - // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 - d = _mm_and_ps(d, 0xFF7FFFFF); - d = _mm_or_ps(d, 0x40000000); - r =_mm_add_ps(r, _mm_div_ps(n,d)); + __m128 n(0.0f), d(0.0f); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + r =_mm_add_ps(r, _mm_div_ps(n,d)); } // 74*8 = 595 __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) { - __m128 c(cnt); - // 35 maths calls follow (140 FLOPS) - __m128 r = __m128(0.0f); - for(int i=0; i< 4; ++i) - round_compute(n0, n1, n2, n3, rnd_c, c, r); - // do a quick fmod by setting exp to 2 - r = _mm_and_ps(r, 0x807FFFFF); - r = _mm_or_ps(r, 0x40000000); - sum = r; // 34 - r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 - return r.get_int(); + __m128 c(cnt); + // 35 maths calls follow (140 FLOPS) + __m128 r = __m128(0.0f); + for(int i=0; i< 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, c, r); + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + sum = r; // 34 + r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 + return 
r.get_int(); } __forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) { - __m128 n0(v0); - __m128 n1(v1); - __m128 n2(v2); - __m128 n3(v3); + __m128 n0(v0); + __m128 n1(v1); + __m128 n2(v2); + __m128 n3(v3); - __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); - out = rot == 0 ? r : _mm_alignr_epi8(r, rot); + __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + out = rot == 0 ? r : _mm_alignr_epi8(r, rot); } __constant__ uint32_t look[16][4] = { - {0, 1, 2, 3}, - {0, 2, 3, 1}, - {0, 3, 1, 2}, - {0, 3, 2, 1}, - - {1, 0, 2, 3}, - {1, 2, 3, 0}, - {1, 3, 0, 2}, - {1, 3, 2, 0}, - - {2, 1, 0, 3}, - {2, 0, 3, 1}, - {2, 3, 1, 0}, - {2, 3, 0, 1}, - - {3, 1, 2, 0}, - {3, 2, 0, 1}, - {3, 0, 1, 2}, - {3, 0, 2, 1} + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} }; __constant__ float ccnt[16] = { - 1.34375f, - 1.28125f, - 1.359375f, - 1.3671875f, - - 1.4296875f, - 1.3984375f, - 1.3828125f, - 1.3046875f, - - 1.4140625f, - 1.2734375f, - 1.2578125f, - 1.2890625f, - - 1.3203125f, - 1.3515625f, - 1.3359375f, - 1.4609375f + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f }; __forceinline__ __device__ void sync() { #if (__CUDACC_VER_MAJOR__ >= 9) - __syncwarp(); + __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif } template __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS) { - static constexpr uint32_t MASK = ((MEMORY-1) >> 6) << 6; - - const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); - - extern __shared__ __m128i smemExtern_in[]; - - const uint32_t chunk = threadIdx.x / 16; - const uint32_t numHashPerBlock = blockDim.x / 16; - - int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk)); - - __m128i* smem = smemExtern_in + 4 * chunk; - - __m128i* smemExtern = smemExtern_in + numHashPerBlock * 4; - __m128i* smemOut = smemExtern + 16 * chunk; - - smemExtern = smemExtern + numHashPerBlock * 16; - __m128* smemVa = (__m128*)smemExtern + 16 * chunk; - - uint32_t tid = threadIdx.x % 16; - - const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; - uint32_t s = 0; - - __m128 vs(0); - if(partidx != 0) - { - vs = ((__m128*)roundVs)[idxHash]; - s = roundS[idxHash]; - } - else - { - s = ((uint32_t*)spad)[idxHash * 50] >> 8; - } - - const uint32_t b = tid / 4; - const uint32_t bb = tid % 4; - const uint32_t block = b * 16 + bb; - - for(size_t i = 0; i < batchsize; i++) - { - sync(); - ((int*)smem)[tid] = ((int*)scratchpad_ptr(s, b, lpad))[bb]; - sync(); - - __m128 rc = vs; - single_comupte_wrap( - bb, - *(smem + look[tid][0]), - *(smem + look[tid][1]), - *(smem + look[tid][2]), - *(smem + look[tid][3]), - ccnt[tid], rc, smemVa[tid], - smemOut[tid] - ); - - sync(); - - int outXor = ((int*)smemOut)[block]; - for(uint32_t dd = block + 4; dd < (b + 1) * 16; dd += 4) - outXor ^= ((int*)smemOut)[dd]; - - ((int*)scratchpad_ptr(s, b, lpad))[bb] = outXor ^ ((int*)smem)[tid]; - ((int*)smemOut)[tid] = outXor; - - float 
va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; - float va_tmp2 = ((float*)smemVa)[block+ 8] + ((float*)smemVa)[block + 12]; - ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; - - sync(); - - __m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3]; - va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; - va_tmp2 = ((float*)smemVa)[block + 8] + ((float*)smemVa)[block + 12]; - ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; - - sync(); - - vs = smemVa[0]; - vs.abs(); // take abs(va) by masking the float sign bit - auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); - // vs range 0 - 64 - auto xx_int = xx.get_int(); - out2 = _mm_xor_si128(xx_int, out2); - // vs is now between 0 and 1 - vs = _mm_div_ps(vs, __m128(64.0f)); - s = out2.x ^ out2.y ^ out2.z ^ out2.w; - } - if(partidx != ((1<> 6) << 6; + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + + extern __shared__ __m128i smemExtern_in[]; + + const uint32_t chunk = threadIdx.x / 16; + const uint32_t numHashPerBlock = blockDim.x / 16; + + int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk)); + + __m128i* smem = smemExtern_in + 4 * chunk; + + __m128i* smemExtern = smemExtern_in + numHashPerBlock * 4; + __m128i* smemOut = smemExtern + 16 * chunk; + + smemExtern = smemExtern + numHashPerBlock * 16; + __m128* smemVa = (__m128*)smemExtern + 16 * chunk; + + uint32_t tid = threadIdx.x % 16; + + const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; + uint32_t s = 0; + + __m128 vs(0); + if(partidx != 0) + { + vs = ((__m128*)roundVs)[idxHash]; + s = roundS[idxHash]; + } + else + { + s = ((uint32_t*)spad)[idxHash * 50] >> 8; + } + + const uint32_t b = tid / 4; + const uint32_t bb = tid % 4; + const uint32_t block = b * 16 + bb; + + for(size_t i = 0; i < batchsize; i++) + { + sync(); + ((int*)smem)[tid] = ((int*)scratchpad_ptr(s, b, lpad))[bb]; + sync(); + + __m128 rc = vs; + single_comupte_wrap( + bb, + *(smem + look[tid][0]), + *(smem + look[tid][1]), + *(smem + look[tid][2]), + *(smem + look[tid][3]), + ccnt[tid], rc, smemVa[tid], + smemOut[tid] + ); + + sync(); + + int outXor = ((int*)smemOut)[block]; + for(uint32_t dd = block + 4; dd < (b + 1) * 16; dd += 4) + outXor ^= ((int*)smemOut)[dd]; + + ((int*)scratchpad_ptr(s, b, lpad))[bb] = outXor ^ ((int*)smem)[tid]; + ((int*)smemOut)[tid] = outXor; + + float va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; + float va_tmp2 = ((float*)smemVa)[block+ 8] + ((float*)smemVa)[block + 12]; + ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; + + sync(); + + __m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3]; + va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; + va_tmp2 = ((float*)smemVa)[block + 8] + ((float*)smemVa)[block + 12]; + ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; + + sync(); + + vs = smemVa[0]; + vs.abs(); // take abs(va) by masking the float sign bit + auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); + // vs range 0 - 64 + auto xx_int = xx.get_int(); + out2 = _mm_xor_si128(xx_int, out2); + // vs is now between 0 and 1 + vs = _mm_div_ps(vs, __m128(64.0f)); + s = out2.x ^ out2.y ^ out2.z ^ out2.w; + } + if(partidx != ((1< __global__ void cn_explode_gpu(int32_t *spad_in, int *lpad_in) { - __shared__ uint64_t state[25]; + __shared__ uint64_t state[25]; - uint8_t* lpad = (uint8_t*)lpad_in + blockIdx.x * MEMORY; - uint64_t* spad = (uint64_t*)((uint8_t*)spad_in + blockIdx.x * 200); + uint8_t* lpad = (uint8_t*)lpad_in + blockIdx.x * MEMORY; + uint64_t* spad = 
(uint64_t*)((uint8_t*)spad_in + blockIdx.x * 200); - for(int i = threadIdx.x; i < 25; i += blockDim.x) - state[i] = spad[i]; + for(int i = threadIdx.x; i < 25; i += blockDim.x) + state[i] = spad[i]; - sync(); + sync(); - for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x) - { - generate_512(i, state, (uint8_t*)lpad + i*512); - } + for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x) + { + generate_512(i, state, (uint8_t*)lpad + i*512); + } } -} // namespace xmrig +} // namespace xmrig_cuda } // namespace cn_gpu diff --git a/src/cuda_extra.cu b/src/cuda_extra.cu index 5a2b3d6..44c6961 100644 --- a/src/cuda_extra.cu +++ b/src/cuda_extra.cu @@ -289,48 +289,50 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 template __global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 ) { - const int thread = blockDim.x * blockIdx.x + threadIdx.x; + using namespace xmrig_cuda; - __shared__ uint32_t sharedMemory[1024]; + const int thread = blockDim.x * blockIdx.x + threadIdx.x; - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + __shared__ uint32_t sharedMemory[1024]; - if ( thread >= threads ) - return; + cn_aes_gpu_init( sharedMemory ); + __syncthreads( ); - int i; - uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50; - uint32_t state[50]; + if ( thread >= threads ) + return; - #pragma unroll - for ( i = 0; i < 50; i++ ) - state[i] = ctx_state[i]; + int i; + uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50; + uint32_t state[50]; - uint32_t key[40]; + #pragma unroll + for ( i = 0; i < 50; i++ ) + state[i] = ctx_state[i]; - // load keys - MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); + uint32_t key[40]; - for(int i=0; i < 16; i++) - { - for(size_t t = 4; t < 12; ++t) - { - cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key ); - } - // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values - mix_and_propagate(state + 4 * 4); - } + // load keys + MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); - cn_keccakf2( (uint64_t *) state ); + for(int i=0; i < 16; i++) + { + for(size_t t = 4; t < 12; ++t) + { + cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key ); + } + // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values + mix_and_propagate(state + 4 * 4); + } - if ( ((uint64_t*)state)[3] < target ) - { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + cn_keccakf2( (uint64_t *) state ); + + if ( ((uint64_t*)state)[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); - if(idx < 10) - d_res_nonce[idx] = thread; - } + if(idx < 10) + d_res_nonce[idx] = thread; + } } void cryptonight_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len) @@ -728,6 +730,23 @@ int cuda_get_deviceinfo(nvid_ctx *ctx) } } + if (ctx->algorithm == Algorithm::CN_GPU && props.major < 7) { + int t = 32; + int b = ctx->device_blocks; + int target_intensity = ctx->device_threads * b; + for (; t * b <= target_intensity; b++) {} + b--; + if (t != ctx->device_threads || b != ctx->device_blocks) { + printf("WARNING: NVIDIA GPU %d: modified cn/gpu t/b from %d/%d to %d/%d\n", + ctx->device_id, + ctx->device_threads, ctx->device_blocks, + t, b + ); + ctx->device_threads = t; + ctx->device_blocks = b; + } + } + ctx->device_threads = std::min(ctx->device_threads, (props.major == 2 ? 64 : 128)); }
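
Note on the id values above: the 32-bit algorithm ids in Algorithm.h appear to pack a family letter into the top byte ('c' = 0x63 for CryptoNight, 'r' = 0x72 for RandomX), the log2 of the L3 scratchpad size into the next byte (0x15 -> 2 MiB), and a variant tag into the low 16 bits. Upstream xmrig moved the cn/gpu variant tag from 0x00ff to 0x0300, so a plugin still built with 0x631500ff no longer equals the id the host hands to it and cn/gpu is rejected; that is the mismatch this patch fixes. The sketch below only illustrates that inferred layout; the struct, field, and function names are hypothetical and not part of xmrig or this patch.

// Illustration only (not part of the patch): how the algorithm ids used above
// appear to be laid out. Field meanings are inferred from the constants in
// Algorithm.h; names here are made up for the example.
#include <cstdint>
#include <cstdio>

struct AlgoId {
    char     family;   // top byte: 'c' (0x63) = CryptoNight, 'r' (0x72) = RandomX
    uint32_t l3;       // scratchpad size in bytes, decoded from the size-exponent byte
    uint32_t variant;  // low 16 bits: variant tag within the family
};

static AlgoId decode(uint32_t id)
{
    return { static_cast<char>(id >> 24),
             1u << ((id >> 16) & 0xff),
             id & 0xffffu };
}

int main()
{
    const uint32_t plugin_id = 0x631500ff; // old cn/gpu id in this plugin
    const uint32_t host_id   = 0x63150300; // id current upstream xmrig uses

    const AlgoId p = decode(plugin_id);
    const AlgoId h = decode(host_id);

    // Same family ('c') and same 2 MiB scratchpad, but the variant tag differs,
    // so a plain equality test between the host id and the plugin id fails and
    // cn/gpu is treated as unsupported until the plugin adopts the new value.
    std::printf("family %c/%c  l3 %u/%u  variant %04x/%04x  equal=%d\n",
                p.family, h.family, p.l3, h.l3, p.variant, h.variant,
                static_cast<int>(plugin_id == host_id));
    return 0;
}

The cuda_get_deviceinfo() hunk at the end is a separate tweak bundled with the id fix: for cn/gpu on pre-Volta devices (props.major < 7) it pins device_threads to 32 and grows device_blocks while 32 * blocks still fits within the previously configured threads * blocks intensity, printing a warning whenever it overrides the configured values.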