From 5836da60961720778a20349a5e622a8d659a13b5 Mon Sep 17 00:00:00 2001
From: Tony Butler
Date: Fri, 17 Dec 2021 15:28:43 -0700
Subject: [PATCH] Fix mismatch in CN_GPU algorithm id (upstream changed to 0300 vs 00ff)

---
 CMakeLists.txt                |  10 +-
 src/crypto/cn/CnAlgo.h        |  26 +-
 src/crypto/common/Algorithm.h |   3 +-
 src/cuda_core.cu              |   4 +-
 src/cuda_cryptonight_gpu.hpp  | 840 +++++++++++++++++-----------------
 src/cuda_extra.cu             |  81 ++--
 6 files changed, 492 insertions(+), 472 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ca8d44..547f042 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,8 +10,8 @@ option(WITH_CN_R "Enable CryptoNight-R algorithm" ON)
 option(WITH_CN_LITE "Enable CryptoNight-Lite algorithms family" ON)
 option(WITH_CN_HEAVY "Enable CryptoNight-Heavy algorithms family" ON)
 option(WITH_CN_PICO "Enable CryptoNight-Pico algorithm" ON)
-option(WITH_CN_GPU "Enable CryptoNight-GPU algorithm" ON)
 option(WITH_CN_FEMTO "Enable CryptoNight-UPX2 algorithm" ON)
+option(WITH_CN_GPU "Enable CryptoNight-GPU algorithm" ON)
 option(WITH_ARGON2 "Enable Argon2 algorithms family" OFF) #unsupported
 
 if (CUDA_VERSION VERSION_LESS 9.0)
@@ -37,14 +37,14 @@ if (WITH_CN_PICO)
     add_definitions(/DXMRIG_ALGO_CN_PICO)
 endif()
 
-if (WITH_CN_GPU)
-    add_definitions(/DXMRIG_ALGO_CN_GPU)
-endif()
-
 if (WITH_CN_FEMTO)
     add_definitions(/DXMRIG_ALGO_CN_FEMTO)
 endif()
 
+if (WITH_CN_GPU)
+    add_definitions(/DXMRIG_ALGO_CN_GPU)
+endif()
+
 if (WITH_RANDOMX)
     add_definitions(/DXMRIG_ALGO_RANDOMX)
 endif()
diff --git a/src/crypto/cn/CnAlgo.h b/src/crypto/cn/CnAlgo.h
index 3d3eb7e..e7387d8 100644
--- a/src/crypto/cn/CnAlgo.h
+++ b/src/crypto/cn/CnAlgo.h
@@ -82,16 +82,16 @@ class CnAlgo
         return CN_ITER / 8;
 #   endif
 
-#   ifdef XMRIG_ALGO_CN_GPU
-    case Algorithm::CN_GPU:
-        return 0xC000;
-#   endif
-
 #   ifdef XMRIG_ALGO_CN_FEMTO
     case Algorithm::CN_UPX2:
         return CN_ITER / 32;
 #   endif
 
+#   ifdef XMRIG_ALGO_CN_GPU
+    case Algorithm::CN_GPU:
+        return 0xC000;
+#   endif
+
     default:
         break;
     }
@@ -107,18 +107,18 @@ class CnAlgo
     }
 #   endif
 
-#   ifdef XMRIG_ALGO_CN_GPU
-    if (algo == Algorithm::CN_GPU) {
-        return 0x1FFFC0;
-    }
-#   endif
-
 #   ifdef XMRIG_ALGO_CN_FEMTO
     if (algo == Algorithm::CN_UPX2) {
         return 0x1FFF0;
     }
 #   endif
 
+#   ifdef XMRIG_ALGO_CN_GPU
+    if (algo == Algorithm::CN_GPU) {
+        return 0x1FFFC0;
+    }
+#   endif
+
     return ((Algorithm::l3(algo) - 1) / 16) * 16;
 }
@@ -140,14 +140,14 @@ template<> constexpr inline uint32_t CnAlgo::iterations() con
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0x60000; }
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 8; }
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 8; }
-template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0xC000; }
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 2; }
 template<> constexpr inline uint32_t CnAlgo::iterations() const { return CN_ITER / 32; }
+template<> constexpr inline uint32_t CnAlgo::iterations() const { return 0xC000; }
 
 template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFF0; }
-template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFFC0; }
 template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFF0; }
+template<> constexpr inline uint32_t CnAlgo::mask() const { return 0x1FFFC0; }
 
 } /* namespace xmrig_cuda */
diff --git a/src/crypto/common/Algorithm.h b/src/crypto/common/Algorithm.h
index 6364d34..86d1e96 100644
--- a/src/crypto/common/Algorithm.h
+++
b/src/crypto/common/Algorithm.h @@ -21,6 +21,7 @@ #define XMRIG_ALGORITHM_H +#include #include #include @@ -53,6 +54,7 @@ class Algorithm CN_PICO_0 = 0x63120200, // "cn-pico" CryptoNight-Pico CN_PICO_TLO = 0x63120274, // "cn-pico/tlo" CryptoNight-Pico (TLO) CN_UPX2 = 0x63110200, // "cn/upx2" Uplexa (UPX2) + CN_GPU = 0x63150300, // "cn/gpu" CryptoNight-GPU (Ryo). RX_0 = 0x72151200, // "rx/0" RandomX (reference configuration). RX_WOW = 0x72141177, // "rx/wow" RandomWOW (Wownero). RX_ARQ = 0x72121061, // "rx/arq" RandomARQ (Arqma). @@ -65,7 +67,6 @@ class Algorithm ASTROBWT_DERO = 0x41000000, // "astrobwt" AstroBWT (Dero) KAWPOW_RVN = 0x6b0f0000, // "kawpow/rvn" KawPow (RVN) - CN_GPU = 0x631500ff, // "cn/gpu" CryptoNight-GPU (Ryo). RX_XLA = 0x721211ff, // "panthera" Panthera (Scala2). }; diff --git a/src/cuda_core.cu b/src/cuda_core.cu index df1f764..f083b73 100644 --- a/src/cuda_core.cu +++ b/src/cuda_core.cu @@ -837,7 +837,7 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce) CUDA_CHECK_KERNEL( ctx->device_id, - xmrig::cn_gpu::cn_explode_gpu<<>>((int*)ctx->d_ctx_state, (int*)ctx->d_long_state) + xmrig_cuda::cn_gpu::cn_explode_gpu<<>>((int*)ctx->d_ctx_state, (int*)ctx->d_long_state) ); int partcount = 1 << ctx->device_bfactor; @@ -846,7 +846,7 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce) CUDA_CHECK_KERNEL( ctx->device_id, // 36 x 16byte x numThreads - xmrig::cn_gpu::cryptonight_core_gpu_phase2_gpu + xmrig_cuda::cn_gpu::cryptonight_core_gpu_phase2_gpu <<device_blocks, ctx->device_threads * 16, 36 * 16 * ctx->device_threads>>> ( (int*)ctx->d_ctx_state, diff --git a/src/cuda_cryptonight_gpu.hpp b/src/cuda_cryptonight_gpu.hpp index 3528e5d..e789286 100644 --- a/src/cuda_cryptonight_gpu.hpp +++ b/src/cuda_cryptonight_gpu.hpp @@ -7,7 +7,7 @@ #include "cuda_keccak.hpp" #include "cuda_extra.h" -namespace xmrig +namespace xmrig_cuda { namespace cn_gpu { @@ -15,180 +15,180 @@ namespace cn_gpu struct __m128i : public int4 { - __forceinline__ __device__ __m128i(){} - - __forceinline__ __device__ __m128i( - const uint32_t x0, const uint32_t x1, - const uint32_t x2, const uint32_t x3) - { - x = x0; - y = x1; - z = x2; - w = x3; - } - - __forceinline__ __device__ __m128i( const int x0) - { - x = x0; - y = x0; - z = x0; - w = x0; - } - - __forceinline__ __device__ __m128i operator|(const __m128i& other) - { - return __m128i( - x | other.x, - y | other.y, - z | other.z, - w | other.w - ); - } - - __forceinline__ __device__ __m128i operator^(const __m128i& other) - { - return __m128i( - x ^ other.x, - y ^ other.y, - z ^ other.z, - w ^ other.w - ); - } + __forceinline__ __device__ __m128i(){} + + __forceinline__ __device__ __m128i( + const uint32_t x0, const uint32_t x1, + const uint32_t x2, const uint32_t x3) + { + x = x0; + y = x1; + z = x2; + w = x3; + } + + __forceinline__ __device__ __m128i( const int x0) + { + x = x0; + y = x0; + z = x0; + w = x0; + } + + __forceinline__ __device__ __m128i operator|(const __m128i& other) + { + return __m128i( + x | other.x, + y | other.y, + z | other.z, + w | other.w + ); + } + + __forceinline__ __device__ __m128i operator^(const __m128i& other) + { + return __m128i( + x ^ other.x, + y ^ other.y, + z ^ other.z, + w ^ other.w + ); + } }; struct __m128 : public float4 { - __forceinline__ __device__ __m128(){} - - __forceinline__ __device__ __m128( - const float x0, const float x1, - const float x2, const float x3) - { - float4::x = x0; - float4::y = x1; - float4::z = x2; - float4::w = x3; - } - - __forceinline__ __device__ __m128( 
const float x0) - { - float4::x = x0; - float4::y = x0; - float4::z = x0; - float4::w = x0; - } - - __forceinline__ __device__ __m128( const __m128i& x0) - { - float4::x = int2float(x0.x); - float4::y = int2float(x0.y); - float4::z = int2float(x0.z); - float4::w = int2float(x0.w); - } - - __forceinline__ __device__ __m128i get_int( ) - { - return __m128i( - (int)x, - (int)y, - (int)z, - (int)w - ); - } - - __forceinline__ __device__ __m128 operator+(const __m128& other) - { - return __m128( - x + other.x, - y + other.y, - z + other.z, - w + other.w - ); - } - - __forceinline__ __device__ __m128 operator-(const __m128& other) - { - return __m128( - x - other.x, - y - other.y, - z - other.z, - w - other.w - ); - } - - __forceinline__ __device__ __m128 operator*(const __m128& other) - { - return __m128( - x * other.x, - y * other.y, - z * other.z, - w * other.w - ); - } - - __forceinline__ __device__ __m128 operator/(const __m128& other) - { - return __m128( - x / other.x, - y / other.y, - z / other.z, - w / other.w - ); - } - - __forceinline__ __device__ __m128& trunc() - { - x=::truncf(x); - y=::truncf(y); - z=::truncf(z); - w=::truncf(w); - - return *this; - } - - __forceinline__ __device__ __m128& abs() - { - x=::fabsf(x); - y=::fabsf(y); - z=::fabsf(z); - w=::fabsf(w); - - return *this; - } - - __forceinline__ __device__ __m128& floor() - { - x=::floorf(x); - y=::floorf(y); - z=::floorf(z); - w=::floorf(w); - - return *this; - } + __forceinline__ __device__ __m128(){} + + __forceinline__ __device__ __m128( + const float x0, const float x1, + const float x2, const float x3) + { + float4::x = x0; + float4::y = x1; + float4::z = x2; + float4::w = x3; + } + + __forceinline__ __device__ __m128( const float x0) + { + float4::x = x0; + float4::y = x0; + float4::z = x0; + float4::w = x0; + } + + __forceinline__ __device__ __m128( const __m128i& x0) + { + float4::x = int2float(x0.x); + float4::y = int2float(x0.y); + float4::z = int2float(x0.z); + float4::w = int2float(x0.w); + } + + __forceinline__ __device__ __m128i get_int( ) + { + return __m128i( + (int)x, + (int)y, + (int)z, + (int)w + ); + } + + __forceinline__ __device__ __m128 operator+(const __m128& other) + { + return __m128( + x + other.x, + y + other.y, + z + other.z, + w + other.w + ); + } + + __forceinline__ __device__ __m128 operator-(const __m128& other) + { + return __m128( + x - other.x, + y - other.y, + z - other.z, + w - other.w + ); + } + + __forceinline__ __device__ __m128 operator*(const __m128& other) + { + return __m128( + x * other.x, + y * other.y, + z * other.z, + w * other.w + ); + } + + __forceinline__ __device__ __m128 operator/(const __m128& other) + { + return __m128( + x / other.x, + y / other.y, + z / other.z, + w / other.w + ); + } + + __forceinline__ __device__ __m128& trunc() + { + x=::truncf(x); + y=::truncf(y); + z=::truncf(z); + w=::truncf(w); + + return *this; + } + + __forceinline__ __device__ __m128& abs() + { + x=::fabsf(x); + y=::fabsf(y); + z=::fabsf(z); + w=::fabsf(w); + + return *this; + } + + __forceinline__ __device__ __m128& floor() + { + x=::floorf(x); + y=::floorf(y); + z=::floorf(z); + w=::floorf(w); + + return *this; + } }; template __device__ void print(const char* name, T value) { - printf("g %s: ", name); - for(int i = 0; i < 4; ++i) - { - printf("%08X ",((uint32_t*)&value)[i]); - } - printf("\n"); + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%08X ",((uint32_t*)&value)[i]); + } + printf("\n"); } template<> __device__ void print<__m128>(const char* name, __m128 
value) { - printf("g %s: ", name); - for(int i = 0; i < 4; ++i) - { - printf("%f ",((float*)&value)[i]); - } - printf("\n"); + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%f ",((float*)&value)[i]); + } + printf("\n"); } #define SHOW(name) print(#name, name) @@ -196,83 +196,83 @@ __device__ void print<__m128>(const char* name, __m128 value) __forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b) { - return a + b; + return a + b; } __forceinline__ __device__ __m128 _mm_sub_ps(__m128 a, __m128 b) { - return a - b; + return a - b; } __forceinline__ __device__ __m128 _mm_mul_ps(__m128 a, __m128 b) { - return a * b; + return a * b; } __forceinline__ __device__ __m128 _mm_div_ps(__m128 a, __m128 b) { - return a / b; + return a / b; } __forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b) { - return __m128( - int_as_float(float_as_int(a.x) & b), - int_as_float(float_as_int(a.y) & b), - int_as_float(float_as_int(a.z) & b), - int_as_float(float_as_int(a.w) & b) - ); + return __m128( + int_as_float(float_as_int(a.x) & b), + int_as_float(float_as_int(a.y) & b), + int_as_float(float_as_int(a.z) & b), + int_as_float(float_as_int(a.w) & b) + ); } __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) { - return __m128( - int_as_float(float_as_int(a.x) | b), - int_as_float(float_as_int(a.y) | b), - int_as_float(float_as_int(a.z) | b), - int_as_float(float_as_int(a.w) | b) - ); + return __m128( + int_as_float(float_as_int(a.x) | b), + int_as_float(float_as_int(a.y) | b), + int_as_float(float_as_int(a.z) | b), + int_as_float(float_as_int(a.w) | b) + ); } __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) { - return __m128( - int_as_float(float_as_int(a.x) ^ b), - int_as_float(float_as_int(a.y) ^ b), - int_as_float(float_as_int(a.z) ^ b), - int_as_float(float_as_int(a.w) ^ b) - ); + return __m128( + int_as_float(float_as_int(a.x) ^ b), + int_as_float(float_as_int(a.y) ^ b), + int_as_float(float_as_int(a.z) ^ b), + int_as_float(float_as_int(a.w) ^ b) + ); } __forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc) { - __m128 d(dc); - __m128 c = _mm_div_ps(v, d); - c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); - // c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2 - c = _mm_mul_ps(c, d); - return _mm_sub_ps(v, c); + __m128 d(dc); + __m128 c = _mm_div_ps(v, d); + c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); + // c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2 + c = _mm_mul_ps(c, d); + return _mm_sub_ps(v, c); - //return a.fmodf(b); + //return a.fmodf(b); } __forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b) { - return a ^ b; + return a ^ b; } __forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot) { - const uint32_t right = 8 * rot; - const uint32_t left = (32 - 8 * rot); - return __m128i( - ((uint32_t)a.x >> right) | ( a.y << left ), - ((uint32_t)a.y >> right) | ( a.z << left ), - ((uint32_t)a.z >> right) | ( a.w << left ), - ((uint32_t)a.w >> right) | ( a.x << left ) - ); + const uint32_t right = 8 * rot; + const uint32_t left = (32 - 8 * rot); + return __m128i( + ((uint32_t)a.x >> right) | ( a.y << left ), + ((uint32_t)a.y >> right) | ( a.z << left ), + ((uint32_t)a.z >> right) | ( a.w << left ), + ((uint32_t)a.w >> right) | ( a.x << left ) + ); } template @@ -281,282 +281,282 @@ __device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad) { return __forceinline__ __device__ __m128 fma_break(__m128 x) { - // Break the dependency chain by setitng 
the exp to ?????01 - x = _mm_and_ps(x, 0xFEFFFFFF); - return _mm_or_ps(x, 0x00800000); + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); } // 9 __forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c) { - n1 = _mm_add_ps(n1, c); - __m128 nn = _mm_mul_ps(n0, c); - nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); - nn = fma_break(nn); - n = _mm_add_ps(n, nn); - - n3 = _mm_sub_ps(n3, c); - __m128 dd = _mm_mul_ps(n2, c); - dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); - dd = fma_break(dd); - d = _mm_add_ps(d, dd); - - //Constant feedback - c = _mm_add_ps(c, rnd_c); - c = _mm_add_ps(c, 0.734375f); - __m128 r = _mm_add_ps(nn, dd); - r = _mm_and_ps(r, 0x807FFFFF); - r = _mm_or_ps(r, 0x40000000); - c = _mm_add_ps(c, r); + n1 = _mm_add_ps(n1, c); + __m128 nn = _mm_mul_ps(n0, c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + n = _mm_add_ps(n, nn); + + n3 = _mm_sub_ps(n3, c); + __m128 dd = _mm_mul_ps(n2, c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + d = _mm_add_ps(d, dd); + + //Constant feedback + c = _mm_add_ps(c, rnd_c); + c = _mm_add_ps(c, 0.734375f); + __m128 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + c = _mm_add_ps(c, r); } // 9*8 + 2 = 74 __forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r) { - __m128 n(0.0f), d(0.0f); - - sub_round(n0, n1, n2, n3, rnd_c, n, d, c); - sub_round(n1, n2, n3, n0, rnd_c, n, d, c); - sub_round(n2, n3, n0, n1, rnd_c, n, d, c); - sub_round(n3, n0, n1, n2, rnd_c, n, d, c); - sub_round(n3, n2, n1, n0, rnd_c, n, d, c); - sub_round(n2, n1, n0, n3, rnd_c, n, d, c); - sub_round(n1, n0, n3, n2, rnd_c, n, d, c); - sub_round(n0, n3, n2, n1, rnd_c, n, d, c); - - // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 - d = _mm_and_ps(d, 0xFF7FFFFF); - d = _mm_or_ps(d, 0x40000000); - r =_mm_add_ps(r, _mm_div_ps(n,d)); + __m128 n(0.0f), d(0.0f); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + r =_mm_add_ps(r, _mm_div_ps(n,d)); } // 74*8 = 595 __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) { - __m128 c(cnt); - // 35 maths calls follow (140 FLOPS) - __m128 r = __m128(0.0f); - for(int i=0; i< 4; ++i) - round_compute(n0, n1, n2, n3, rnd_c, c, r); - // do a quick fmod by setting exp to 2 - r = _mm_and_ps(r, 0x807FFFFF); - r = _mm_or_ps(r, 0x40000000); - sum = r; // 34 - r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 - return r.get_int(); + __m128 c(cnt); + // 35 maths calls follow (140 FLOPS) + __m128 r = __m128(0.0f); + for(int i=0; i< 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, c, r); + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + sum = r; // 34 + r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 + return 
r.get_int(); } __forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) { - __m128 n0(v0); - __m128 n1(v1); - __m128 n2(v2); - __m128 n3(v3); + __m128 n0(v0); + __m128 n1(v1); + __m128 n2(v2); + __m128 n3(v3); - __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); - out = rot == 0 ? r : _mm_alignr_epi8(r, rot); + __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + out = rot == 0 ? r : _mm_alignr_epi8(r, rot); } __constant__ uint32_t look[16][4] = { - {0, 1, 2, 3}, - {0, 2, 3, 1}, - {0, 3, 1, 2}, - {0, 3, 2, 1}, - - {1, 0, 2, 3}, - {1, 2, 3, 0}, - {1, 3, 0, 2}, - {1, 3, 2, 0}, - - {2, 1, 0, 3}, - {2, 0, 3, 1}, - {2, 3, 1, 0}, - {2, 3, 0, 1}, - - {3, 1, 2, 0}, - {3, 2, 0, 1}, - {3, 0, 1, 2}, - {3, 0, 2, 1} + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} }; __constant__ float ccnt[16] = { - 1.34375f, - 1.28125f, - 1.359375f, - 1.3671875f, - - 1.4296875f, - 1.3984375f, - 1.3828125f, - 1.3046875f, - - 1.4140625f, - 1.2734375f, - 1.2578125f, - 1.2890625f, - - 1.3203125f, - 1.3515625f, - 1.3359375f, - 1.4609375f + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f }; __forceinline__ __device__ void sync() { #if (__CUDACC_VER_MAJOR__ >= 9) - __syncwarp(); + __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif } template __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS) { - static constexpr uint32_t MASK = ((MEMORY-1) >> 6) << 6; - - const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); - - extern __shared__ __m128i smemExtern_in[]; - - const uint32_t chunk = threadIdx.x / 16; - const uint32_t numHashPerBlock = blockDim.x / 16; - - int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk)); - - __m128i* smem = smemExtern_in + 4 * chunk; - - __m128i* smemExtern = smemExtern_in + numHashPerBlock * 4; - __m128i* smemOut = smemExtern + 16 * chunk; - - smemExtern = smemExtern + numHashPerBlock * 16; - __m128* smemVa = (__m128*)smemExtern + 16 * chunk; - - uint32_t tid = threadIdx.x % 16; - - const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; - uint32_t s = 0; - - __m128 vs(0); - if(partidx != 0) - { - vs = ((__m128*)roundVs)[idxHash]; - s = roundS[idxHash]; - } - else - { - s = ((uint32_t*)spad)[idxHash * 50] >> 8; - } - - const uint32_t b = tid / 4; - const uint32_t bb = tid % 4; - const uint32_t block = b * 16 + bb; - - for(size_t i = 0; i < batchsize; i++) - { - sync(); - ((int*)smem)[tid] = ((int*)scratchpad_ptr(s, b, lpad))[bb]; - sync(); - - __m128 rc = vs; - single_comupte_wrap( - bb, - *(smem + look[tid][0]), - *(smem + look[tid][1]), - *(smem + look[tid][2]), - *(smem + look[tid][3]), - ccnt[tid], rc, smemVa[tid], - smemOut[tid] - ); - - sync(); - - int outXor = ((int*)smemOut)[block]; - for(uint32_t dd = block + 4; dd < (b + 1) * 16; dd += 4) - outXor ^= ((int*)smemOut)[dd]; - - ((int*)scratchpad_ptr(s, b, lpad))[bb] = outXor ^ ((int*)smem)[tid]; - ((int*)smemOut)[tid] = outXor; - - float 
va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; - float va_tmp2 = ((float*)smemVa)[block+ 8] + ((float*)smemVa)[block + 12]; - ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; - - sync(); - - __m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3]; - va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; - va_tmp2 = ((float*)smemVa)[block + 8] + ((float*)smemVa)[block + 12]; - ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; - - sync(); - - vs = smemVa[0]; - vs.abs(); // take abs(va) by masking the float sign bit - auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); - // vs range 0 - 64 - auto xx_int = xx.get_int(); - out2 = _mm_xor_si128(xx_int, out2); - // vs is now between 0 and 1 - vs = _mm_div_ps(vs, __m128(64.0f)); - s = out2.x ^ out2.y ^ out2.z ^ out2.w; - } - if(partidx != ((1<> 6) << 6; + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + + extern __shared__ __m128i smemExtern_in[]; + + const uint32_t chunk = threadIdx.x / 16; + const uint32_t numHashPerBlock = blockDim.x / 16; + + int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk)); + + __m128i* smem = smemExtern_in + 4 * chunk; + + __m128i* smemExtern = smemExtern_in + numHashPerBlock * 4; + __m128i* smemOut = smemExtern + 16 * chunk; + + smemExtern = smemExtern + numHashPerBlock * 16; + __m128* smemVa = (__m128*)smemExtern + 16 * chunk; + + uint32_t tid = threadIdx.x % 16; + + const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; + uint32_t s = 0; + + __m128 vs(0); + if(partidx != 0) + { + vs = ((__m128*)roundVs)[idxHash]; + s = roundS[idxHash]; + } + else + { + s = ((uint32_t*)spad)[idxHash * 50] >> 8; + } + + const uint32_t b = tid / 4; + const uint32_t bb = tid % 4; + const uint32_t block = b * 16 + bb; + + for(size_t i = 0; i < batchsize; i++) + { + sync(); + ((int*)smem)[tid] = ((int*)scratchpad_ptr(s, b, lpad))[bb]; + sync(); + + __m128 rc = vs; + single_comupte_wrap( + bb, + *(smem + look[tid][0]), + *(smem + look[tid][1]), + *(smem + look[tid][2]), + *(smem + look[tid][3]), + ccnt[tid], rc, smemVa[tid], + smemOut[tid] + ); + + sync(); + + int outXor = ((int*)smemOut)[block]; + for(uint32_t dd = block + 4; dd < (b + 1) * 16; dd += 4) + outXor ^= ((int*)smemOut)[dd]; + + ((int*)scratchpad_ptr(s, b, lpad))[bb] = outXor ^ ((int*)smem)[tid]; + ((int*)smemOut)[tid] = outXor; + + float va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; + float va_tmp2 = ((float*)smemVa)[block+ 8] + ((float*)smemVa)[block + 12]; + ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; + + sync(); + + __m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3]; + va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; + va_tmp2 = ((float*)smemVa)[block + 8] + ((float*)smemVa)[block + 12]; + ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; + + sync(); + + vs = smemVa[0]; + vs.abs(); // take abs(va) by masking the float sign bit + auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); + // vs range 0 - 64 + auto xx_int = xx.get_int(); + out2 = _mm_xor_si128(xx_int, out2); + // vs is now between 0 and 1 + vs = _mm_div_ps(vs, __m128(64.0f)); + s = out2.x ^ out2.y ^ out2.z ^ out2.w; + } + if(partidx != ((1< __global__ void cn_explode_gpu(int32_t *spad_in, int *lpad_in) { - __shared__ uint64_t state[25]; + __shared__ uint64_t state[25]; - uint8_t* lpad = (uint8_t*)lpad_in + blockIdx.x * MEMORY; - uint64_t* spad = (uint64_t*)((uint8_t*)spad_in + blockIdx.x * 200); + uint8_t* lpad = (uint8_t*)lpad_in + blockIdx.x * MEMORY; + uint64_t* spad = 
(uint64_t*)((uint8_t*)spad_in + blockIdx.x * 200); - for(int i = threadIdx.x; i < 25; i += blockDim.x) - state[i] = spad[i]; + for(int i = threadIdx.x; i < 25; i += blockDim.x) + state[i] = spad[i]; - sync(); + sync(); - for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x) - { - generate_512(i, state, (uint8_t*)lpad + i*512); - } + for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x) + { + generate_512(i, state, (uint8_t*)lpad + i*512); + } } -} // namespace xmrig +} // namespace xmrig_cuda } // namespace cn_gpu diff --git a/src/cuda_extra.cu b/src/cuda_extra.cu index 5a2b3d6..44c6961 100644 --- a/src/cuda_extra.cu +++ b/src/cuda_extra.cu @@ -289,48 +289,50 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 template __global__ void cryptonight_gpu_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 ) { - const int thread = blockDim.x * blockIdx.x + threadIdx.x; + using namespace xmrig_cuda; - __shared__ uint32_t sharedMemory[1024]; + const int thread = blockDim.x * blockIdx.x + threadIdx.x; - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + __shared__ uint32_t sharedMemory[1024]; - if ( thread >= threads ) - return; + cn_aes_gpu_init( sharedMemory ); + __syncthreads( ); - int i; - uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50; - uint32_t state[50]; + if ( thread >= threads ) + return; - #pragma unroll - for ( i = 0; i < 50; i++ ) - state[i] = ctx_state[i]; + int i; + uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50; + uint32_t state[50]; - uint32_t key[40]; + #pragma unroll + for ( i = 0; i < 50; i++ ) + state[i] = ctx_state[i]; - // load keys - MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); + uint32_t key[40]; - for(int i=0; i < 16; i++) - { - for(size_t t = 4; t < 12; ++t) - { - cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key ); - } - // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values - mix_and_propagate(state + 4 * 4); - } + // load keys + MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); - cn_keccakf2( (uint64_t *) state ); + for(int i=0; i < 16; i++) + { + for(size_t t = 4; t < 12; ++t) + { + cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key ); + } + // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values + mix_and_propagate(state + 4 * 4); + } - if ( ((uint64_t*)state)[3] < target ) - { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + cn_keccakf2( (uint64_t *) state ); + + if ( ((uint64_t*)state)[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); - if(idx < 10) - d_res_nonce[idx] = thread; - } + if(idx < 10) + d_res_nonce[idx] = thread; + } } void cryptonight_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len) @@ -728,6 +730,23 @@ int cuda_get_deviceinfo(nvid_ctx *ctx) } } + if (ctx->algorithm == Algorithm::CN_GPU && props.major < 7) { + int t = 32; + int b = ctx->device_blocks; + int target_intensity = ctx->device_threads * b; + for (; t * b <= target_intensity; b++) {} + b--; + if (t != ctx->device_threads || b != ctx->device_blocks) { + printf("WARNING: NVIDIA GPU %d: modified cn/gpu t/b from %d/%d to %d/%d\n", + ctx->device_id, + ctx->device_threads, ctx->device_blocks, + t, b + ); + ctx->device_threads = t; + ctx->device_blocks = b; + } + } + ctx->device_threads = std::min(ctx->device_threads, (props.major == 2 ? 64 : 128)); }
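
Note on the id values above: the 32-bit algorithm ids in Algorithm.h appear to pack a family letter into the top byte ('c' = 0x63 for CryptoNight, 'r' = 0x72 for RandomX), the log2 of the L3 scratchpad size into the next byte (0x15 -> 2 MiB), and a variant tag into the low 16 bits. Upstream xmrig moved the cn/gpu variant tag from 0x00ff to 0x0300, so a plugin still built with 0x631500ff no longer equals the id the host hands to it and cn/gpu is rejected; that is the mismatch this patch fixes. The sketch below only illustrates that inferred layout; the struct, field, and function names are hypothetical and not part of xmrig or this patch.

// Illustration only (not part of the patch): how the algorithm ids used above
// appear to be laid out. Field meanings are inferred from the constants in
// Algorithm.h; names here are made up for the example.
#include <cstdint>
#include <cstdio>

struct AlgoId {
    char     family;   // top byte: 'c' (0x63) = CryptoNight, 'r' (0x72) = RandomX
    uint32_t l3;       // scratchpad size in bytes, decoded from the size-exponent byte
    uint32_t variant;  // low 16 bits: variant tag within the family
};

static AlgoId decode(uint32_t id)
{
    return { static_cast<char>(id >> 24),
             1u << ((id >> 16) & 0xff),
             id & 0xffffu };
}

int main()
{
    const uint32_t plugin_id = 0x631500ff; // old cn/gpu id in this plugin
    const uint32_t host_id   = 0x63150300; // id current upstream xmrig uses

    const AlgoId p = decode(plugin_id);
    const AlgoId h = decode(host_id);

    // Same family ('c') and same 2 MiB scratchpad, but the variant tag differs,
    // so a plain equality test between the host id and the plugin id fails and
    // cn/gpu is treated as unsupported until the plugin adopts the new value.
    std::printf("family %c/%c  l3 %u/%u  variant %04x/%04x  equal=%d\n",
                p.family, h.family, p.l3, h.l3, p.variant, h.variant,
                static_cast<int>(plugin_id == host_id));
    return 0;
}

The cuda_get_deviceinfo() hunk at the end is a separate tweak bundled with the id fix: for cn/gpu on pre-Volta devices (props.major < 7) it pins device_threads to 32 and grows device_blocks while 32 * blocks still fits within the previously configured threads * blocks intensity, printing a warning whenever it overrides the configured values.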