diff --git a/ccminer.VC.VC.opendb b/ccminer.VC.VC.opendb deleted file mode 100644 index c4b2a0be40..0000000000 Binary files a/ccminer.VC.VC.opendb and /dev/null differ diff --git a/ccminer.cpp b/ccminer.cpp index 0d99af2982..4660f5e207 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1,4 +1,4 @@ -/* +/* * Copyright 2010 Jeff Garzik * Copyright 2012-2014 pooler * Copyright 2014-2017 tpruvot @@ -42,7 +42,8 @@ #include "miner.h" #include "algos.h" - +#include "sia/sia-rpc.h" +#include "crypto/xmr-rpc.h" #include "equi/equihash.h" //#include @@ -1600,6 +1601,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) memcpy(&work->data[9], sctx->job.coinbase, 32+32); // merkle [9..16] + reserved work->data[25] = le32dec(sctx->job.ntime); work->data[26] = le32dec(sctx->job.nbits); + work->hash_ver = sctx->job.hash_ver; memcpy(&work->data[27], sctx->xnonce1, sctx->xnonce1_size & 0x1F); // pool extranonce work->data[35] = 0x80; //applog_hex(work->data, 140); @@ -1744,7 +1746,7 @@ static bool wanna_mine(int thr_id) float temp = gpu_temp(cgpu); if (temp > opt_max_temp) { if (!conditional_state[thr_id] && !opt_quiet) - gpulog(LOG_INFO, thr_id, "temperature too high (%.0f°c), waiting...", temp); + gpulog(LOG_INFO, thr_id, "temperature too high (%.0f°c), waiting...", temp); state = false; } else if (opt_max_temp > 0. && opt_resume_temp > 0. && conditional_state[thr_id] && temp > opt_resume_temp) { if (!thr_id && opt_debug) @@ -1838,7 +1840,7 @@ static void *miner_thread(void *userdata) } /* Cpu thread affinity */ - if (num_cpus > 1) { + /*if (num_cpus > 1) { if (opt_affinity == -1L && opt_n_threads > 1) { if (opt_debug) applog(LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)", thr_id, @@ -1850,7 +1852,7 @@ static void *miner_thread(void *userdata) (long) opt_affinity); affine_to_cpu_mask(thr_id, (unsigned long) opt_affinity); } - } + }*/ @@ -3436,6 +3438,18 @@ BOOL WINAPI ConsoleHandler(DWORD dwType) } #endif +void Clear() +{ +#if defined _WIN32 + system("cls"); +#elif defined (__LINUX__) || defined(__gnu_linux__) || defined(__linux__) + system("clear"); +#elif defined (__APPLE__) + system("clear"); +#endif +} + + int main(int argc, char *argv[]) { struct thr_info *thr; @@ -3444,15 +3458,34 @@ int main(int argc, char *argv[]) // get opt_quiet early parse_single_opt('q', argc, argv); - - printf("*** ccminer " PACKAGE_VERSION " for CPU's by Monkins1010 based on ccminer***\n"); - if (!opt_quiet) { - const char* arch = is_x64() ? "64-bits" : "32-bits"; - - printf(" Originally based on Christian Buchner and Christian H. project\n"); - printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n"); - printf("Verus donation address: REoPcdGXthL5yeTCrJtrQv5xhYTknbFbec (monkins)\n"); - } + + Clear(); + + printf(" .. ..\n"); + printf(" .lkK0o. 'd0Kkc. \n"); + printf(" cKWMMMW0; 'dXMMMMWK: \n"); + printf(" :XMMMMMMMXc. lKMMMMMMMMK; .\n"); + printf(" lWMMMMMMMMNl., OWMMMMMMMMMNkoxkO0OOko:. 'odddddl,;okOOdodddddc. ,oddddo' ., ldkO00Okxo : .\n"); + printf(" ,0MMMMMMMMMNxdXMMMMMMMMMMNNWMMMMMMMMMWK: .xWMMMMMWXWMMMNKNMMMMWd. .kMMMMM0,'oKWMMMMMMMMMMWKc\n"); + printf(" ;KMMMMMMMMMMMMMMMMMMMMWWWMMMN0kOXMMMMMK; :XMMMMMMMMMMMWKKWMMMMK, cNMMMMWdlKMMMMXxlcdXMMMMM0'\n"); + printf(" :KMMMMMMMMMMMMMMMMMWWWWMMNd'. :XMMMMNdkWMMMMWN0oc:::dXMMMMWd. .kMMMMM0o0MMMMWx. :xxxxxl. \n"); + printf(" :KMMMMMMMMMMMMMMMWWWMMMMKdllllxNMMMMN0XMMMMWx, ..xMMMMMK, cNMMMMWo; 0MMMMMNKkdl:, .\n"); + printf(" :KMMMMMMMMMMMMNXNWMMMMMMMMMMMMMMMMMXXWMMMMK, :XMMMMWo .kMMMMM0' ;ONWMMMMMMMMN0o. 
\n"); + printf(" ;KMMMMMMMMMW0coNMMMMMKkxxxxxxxxxxxONMMMMWo .xMMMMM0, cNMMMMWl ':oxOKWMMMMMWk. \n"); + printf(" ;0MMMMMMMNd.lWMMMMMO, .'cc. .kWMMMM0, ;KMMMMMk. ;0MMMMMXkddddd; .cKMMMMMk. \n"); + printf(" ,0WMMMWO;, 0MMMMMWXkxdx0NWWx' :XMMMMWo :NMMMMMNOxxONMMMMMNkOMMMMMKo::ckNMMMMK; \n"); + printf(" 'OWMXl. ,OWMMMMMMMMMMMWKo:kWMMMMK, ,0MMMMMMMMWWWMMMMM0,'kWMMMMMMMMMMMWKd' \n"); + printf(" .dd' .;oxOKXXK0kdl,. 'x0000Oc .oOKXXKOxc:x0000O: .; okOKXXK0Oxl, .\n"); +// printf(" .... .... .....\n"); +// printf("*********************************************************************************************************\n"); + + printf("\n *** ccminer CPU" PACKAGE_VERSION " for Verushash v2.1 - 2.2 by Monkins1010 based on ccminer***\n\n"); + + //printf(" Built with VC++ %d" , msver()); + printf("Originally based on Christian Buchner and Christian H. project\n"); + + printf("Located at: " PACKAGE_URL " \n\n"); + rpc_user = strdup(""); rpc_pass = strdup(""); @@ -3612,11 +3645,12 @@ int main(int argc, char *argv[]) // Enable windows high precision timer timeBeginPeriod(1); #endif - if (opt_affinity != -1) { - if (!opt_quiet) - applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity); - affine_to_cpu_mask(-1, (unsigned long)opt_affinity); - } + //if (opt_affinity != -1) { + // if (!opt_quiet) +// opt_affinity = 0xffffffffffffffff; + // applog(LOG_DEBUG, "Binding process to cpu mask %llx", opt_affinity); + // affine_to_cpu_mask(-1, (unsigned long)opt_affinity); + //} if (active_gpus == 0) { applog(LOG_ERR, "No CUDA devices found! terminating."); exit(1); diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu new file mode 100644 index 0000000000..a3b76a7733 --- /dev/null +++ b/equi/cuda_equi.cu @@ -0,0 +1,2130 @@ +/* + * Equihash solver created by djeZo (l33tsoftw@gmail.com) for NiceHash + * Adapted to be more compatible with older C++ compilers + * + * cuda_djezo solver was released by NiceHash (www.nicehash.com) under + * GPL 3.0 license. If you don't have a copy, you can obtain one from + * https://www.gnu.org/licenses/gpl-3.0.txt + * + * Based on CUDA solver by John Tromp released under MIT license. + * Some helper functions taken out of OpenCL solver by Marc Bevand + * released under MIT license. 
+ *
+ * Copyright (c) 2016 John Tromp, Marc Bevand
+ * Copyright (c) 2017 djeZo, Tanguy Pruvot (GPL v3)
+ */
+
+#ifdef WIN32
+#include <Windows.h>
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+//#include <mutex>
+
+#include "equihash.h"
+#include "eqcuda.hpp" // eq_cuda_context
+
+#include "blake2/blake2.h"
+
+//#define WN 200
+//#define WK 9
+#ifndef MAX_GPUS
+#define MAX_GPUS 16
+#endif
+
+#define NDIGITS (WK+1)
+#define DIGITBITS (WN/(NDIGITS))
+#define PROOFSIZE (1<<WK)
+#define BASE (1<<DIGITBITS)
+#define NHASHES (2*BASE)
+#define HASHESPERBLAKE (512/WN)
+#define HASHOUT (HASHESPERBLAKE*WN/8)
+#define NBLOCKS ((NHASHES + THREADS - 1) / THREADS)
+#define BUCKBITS (DIGITBITS - RB)
+#define NBUCKETS (1 << BUCKBITS)
+#define BUCKMASK (NBUCKETS - 1)
+#define SLOTBITS (RB + 2)
+#define SLOTRANGE (1 << SLOTBITS)
+#define NSLOTS SM
+#define SLOTMASK (SLOTRANGE - 1)
+#define NRESTS (1 << RB)
+#define RESTMASK (NRESTS - 1)
+#define CANTORBITS (2 * SLOTBITS - 2)
+#define CANTORMASK ((1 << CANTORBITS) - 1)
+#define CANTORMAXSQRT (2 * NSLOTS)
+
+#define RB8_NSLOTS 640
+#define RB8_NSLOTS_LD 624
+#define FD_THREADS 128
+
+#ifdef __INTELLISENSE__
+// reduce vstudio warnings (__byteperm, blockIdx...)
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#define __launch_bounds__(max_tpb, min_blocks)
+#define __CUDA_ARCH__ 520
+uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
+uint32_t __shfl2(uint32_t x, uint32_t y);
+uint32_t __shfl_sync(uint32_t mask, uint32_t x, uint32_t y);
+uint32_t atomicExch(uint32_t *x, uint32_t y);
+uint32_t atomicAdd(uint32_t *x, uint32_t y);
+void __syncthreads(void);
+void __threadfence(void);
+void __threadfence_block(void);
+uint32_t __ldg(const uint32_t* address);
+uint64_t __ldg(const uint64_t* address);
+uint4 __ldca(const uint4 *ptr);
+u32 __ldca(const u32 *ptr);
+u32 umin(const u32, const u32);
+u32 umax(const u32, const u32);
+#endif
+
+#define OPT_SYNC_ALL
+
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
+#undef __any
+#define __any(p) __any_sync(0xFFFFFFFFu, p)
+#else
+#define __shfl2 __shfl
+#endif
+
+typedef u32 proof[PROOFSIZE];
+
+struct __align__(32) slot {
+	u32 hash[8];
+};
+
+struct __align__(16) slotsmall {
+	u32 hash[4];
+};
+
+struct __align__(8) slottiny {
+	u32 hash[2];
+};
+
+template <u32 RB, u32 SM>
+struct equi
+{
+	slot round0trees[4096][RB8_NSLOTS];
+	slot trees[1][NBUCKETS][NSLOTS];
+	struct {
+		slotsmall treessmall[NSLOTS];
+		slottiny treestiny[NSLOTS];
+	} round2trees[NBUCKETS];
+	struct {
+		slotsmall treessmall[NSLOTS];
+		slottiny treestiny[NSLOTS];
+	} round3trees[NBUCKETS];
+	slotsmall treessmall[4][NBUCKETS][NSLOTS];
+	slottiny treestiny[1][4096][RB8_NSLOTS_LD];
+	u32 round4bidandsids[NBUCKETS][NSLOTS];
+	union {
+		u64 blake_h[8];
+		u32 blake_h32[16];
+	};
+	struct {
+		u32 nslots8[4096];
+		u32 nslots0[4096];
+		u32 nslots[9][NBUCKETS];
+		scontainerreal srealcont;
+	} edata;
+};
+
+// todo: use cuda_helper.h and/or cuda_vector.h
+__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b)
+{
+	return make_uint2(a.x ^ b.x, a.y ^ b.y);
+}
+
+__device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b)
+{
+	return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
+}
+
+// for ROR 63 (or ROL 1); this func only supports (32 <= offset < 64)
+__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset)
+{
+	uint2 result;
+#if __CUDA_ARCH__ > 300
+	{
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	}
+#else
+	result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
+	result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
+#endif
+	return result;
+}
+
+__device__ __forceinline__ uint2 SWAPUINT2(uint2 value)
+{
+	return make_uint2(value.y, value.x);
+}
+
+__device__ __forceinline__ uint2 ROR24(const uint2 a)
+{
+	uint2 result;
+	result.x = __byte_perm(a.y, a.x, 0x2107);
+	result.y = __byte_perm(a.y, a.x, 0x6543);
+	return result;
+}
+
+__device__ __forceinline__ uint2 ROR16(const uint2 a)
+{
+	uint2 result;
+	result.x = __byte_perm(a.y, a.x, 0x1076);
+	result.y = __byte_perm(a.y, a.x, 0x5432);
+	return result;
+}
+
+__device__ __forceinline__ void G2(u64 &
a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) +{ + a = a + b + x; + ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]); + a = a + b + y; + ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +// untested.. +struct packer_default +{ + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (((bucketid << SLOTBITS) | s0) << SLOTBITS) | s1; + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + // BUCKMASK-ed to prevent illegal memory accesses in case of memory errors + return (bid >> (2 * SLOTBITS)) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return bid & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> SLOTBITS) & SLOTMASK; + } +}; + + +struct packer_cantor +{ + __device__ __forceinline__ static u32 cantor(const u32 s0, const u32 s1) + { + u32 a = umax(s0, s1); + u32 b = umin(s0, s1); + return a * (a + 1) / 2 + b; + } + + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (bucketid << CANTORBITS) | cantor(s0, s1); + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> CANTORBITS) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return ((bid & CANTORMASK) - cantor(0, s1)) & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + u32 k, q, sqr = 8 * (bid & CANTORMASK) + 1; + // this k=sqrt(sqr) computing loop averages 3.4 iterations out of maximum 9 + for (k = CANTORMAXSQRT; (q = sqr / k) < k; k = (k + q) / 2); + return ((k - 1) / 2) & SLOTMASK; + } +}; + +__device__ __constant__ const u64 blake_iv[] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +#if CUDART_VERSION < 8000 || !defined(__ldca) +#define __ldca(ptr) *(ptr) +#endif + +template +__global__ void digit_first(equi* eq, u32 nonce) +{ + const u32 block = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ u64 hash_h[8]; + u32* hash_h32 = (u32*)hash_h; + + if (threadIdx.x < 16) + hash_h32[threadIdx.x] = __ldca(&eq->blake_h32[threadIdx.x]); + + __syncthreads(); + + u64 m = (u64)block << 32 | (u64)nonce; + + union + { + u64 v[16]; + u32 v32[32]; + uint4 v128[8]; + }; + + v[0] = hash_h[0]; + v[1] = hash_h[1]; + v[2] = hash_h[2]; + v[3] = hash_h[3]; + v[4] = hash_h[4]; + v[5] = hash_h[5]; + v[6] = hash_h[6]; + v[7] = hash_h[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4] ^ (128 + 16); + v[13] = blake_iv[5]; + v[14] = blake_iv[6] ^ 0xffffffffffffffff; + v[15] = blake_iv[7]; + + // mix 1 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], 
v[9], v[14], 0, 0); + + // mix 2 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 3 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, m); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 4 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, m); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 5 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, m); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 6 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], m, 0); + + // mix 7 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], m, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 8 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, m); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 9 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], m, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 10 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], m, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 11 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 12 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + v[0] ^= hash_h[0] ^ v[8]; + v[1] ^= hash_h[1] ^ v[9]; + v[2] ^= hash_h[2] ^ v[10]; + 
v[3] ^= hash_h[3] ^ v[11];
+	v[4] ^= hash_h[4] ^ v[12];
+	v[5] ^= hash_h[5] ^ v[13];
+	v32[12] ^= hash_h32[12] ^ v32[28];
+
+	u32 bexor = __byte_perm(v32[0], 0, 0x4012); // first 20 bits
+	u32 bucketid;
+	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
+	u32 slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
+	if (slotp < RB8_NSLOTS)
+	{
+		slot* s = &eq->round0trees[bucketid][slotp];
+
+		uint4 tt;
+		tt.x = __byte_perm(v32[0], v32[1], 0x1234);
+		tt.y = __byte_perm(v32[1], v32[2], 0x1234);
+		tt.z = __byte_perm(v32[2], v32[3], 0x1234);
+		tt.w = __byte_perm(v32[3], v32[4], 0x1234);
+		*(uint4*)(&s->hash[0]) = tt;
+
+		tt.x = __byte_perm(v32[4], v32[5], 0x1234);
+		tt.y = __byte_perm(v32[5], v32[6], 0x1234);
+		tt.z = 0;
+		tt.w = block << 1;
+		*(uint4*)(&s->hash[4]) = tt;
+	}
+
+	bexor = __byte_perm(v32[6], 0, 0x0123);
+	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
+	slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
+	if (slotp < RB8_NSLOTS)
+	{
+		slot* s = &eq->round0trees[bucketid][slotp];
+
+		uint4 tt;
+		tt.x = __byte_perm(v32[6], v32[7], 0x2345);
+		tt.y = __byte_perm(v32[7], v32[8], 0x2345);
+		tt.z = __byte_perm(v32[8], v32[9], 0x2345);
+		tt.w = __byte_perm(v32[9], v32[10], 0x2345);
+		*(uint4*)(&s->hash[0]) = tt;
+
+		tt.x = __byte_perm(v32[10], v32[11], 0x2345);
+		tt.y = __byte_perm(v32[11], v32[12], 0x2345);
+		tt.z = 0;
+		tt.w = (block << 1) + 1;
+		*(uint4*)(&s->hash[4]) = tt;
+	}
+}
+
+/*
+	Functions digit_1 to digit_8 work on the same principle:
+	each thread does 2-3 slot loads (loads are coalesced).
+	The xorwork of the slots is loaded into shared memory and kept in registers
+	(except in digit_1). At the same time, the restbits (8 or 9 bits) of the
+	xorwork are used for collision detection: they determine the position in ht.
+	Pair creation follows. The first one (or two) pairs' xorworks are written to
+	global memory as soon as possible; the remaining pairs are saved in shared
+	memory (one u32 per pair - 16-bit indices). In most cases all threads have
+	one (or two) pairs, so this trick offloads some memory writes from the last
+	step. In the last step, the xorwork of the stored pairs is written to memory.
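+
+	As a hedged illustration only (hypothetical toy types and names, not the
+	kernel code itself): a serial CPU sketch of one such round - histogram the
+	slots by their restbits, pair up colliding slots, and push each pair's xor
+	into the bucket selected by its next digit:
+
+	    #include <cstdint>
+	    #include <vector>
+	    struct toy_slot { uint32_t xorwork; uint32_t idx; };
+	    void toy_round(const std::vector<toy_slot>& in, std::vector<toy_slot>* next_buckets) {
+	        std::vector<uint32_t> ht[256];               // restbits -> slot ids
+	        for (uint32_t s = 0; s < in.size(); s++)
+	            ht[in[s].xorwork & 0xff].push_back(s);   // collide on low 8 bits
+	        for (auto& row : ht)                         // pair creation per row
+	            for (size_t i = 1; i < row.size(); i++)
+	                for (size_t j = 0; j < i; j++) {
+	                    uint32_t x = in[row[i]].xorwork ^ in[row[j]].xorwork;
+	                    next_buckets[x >> 24].push_back(toy_slot{ x, row[i] });
+	                }
+	    }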
+*/ +template +__global__ void digit_1(equi* eq) +{ + __shared__ u16 ht[256][SSM - 1]; + __shared__ uint2 lastword1[RB8_NSLOTS]; + __shared__ uint4 lastword2[RB8_NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < 256) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots0[bucketid], RB8_NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + uint2 ta[2]; + uint4 tb[2]; + + u32 si[2]; + +#ifdef OPT_SYNC_ALL + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + __syncthreads(); +#endif + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slot* pslot1 = eq->round0trees[bucketid] + si[i]; + + // get xhash + uint4 a1 = *(uint4*)(&pslot1->hash[0]); + uint2 a2 = *(uint2*)(&pslot1->hash[4]); + ta[i].x = a1.x; + ta[i].y = a1.y; + lastword1[si[i]] = ta[i]; + tb[i].x = a1.z; + tb[i].y = a1.w; + tb[i].z = a2.x; + tb[i].w = a2.y; + lastword2[si[i]] = tb[i]; + + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(hr[i]) : "r"(ta[i].x)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + int* pairs = ht_len; + + u32 xors[6]; + u32 xorbucketid, xorslot; + + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = ta[i] ^ lastword1[p]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[si[i]] ^ lastword2[p]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, si[i], p, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[i] ^ lastword2[k]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, i, k, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + } +} + + +template +__global__ void digit_2(equi* eq) +{ + __shared__ u16 ht[NRESTS][SSM - 1]; + __shared__ u32 lastword1[NSLOTS]; + __shared__ uint4 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + 
__shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slot* buck = eq->trees[0][bucketid]; + u32 bsize = umin(eq->edata.nslots[1][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 ta[2]; + uint4 tt[2]; + + u32 si[2]; +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + // get slot + const slot* pslot1 = buck + si[i]; + + uint4 ttx = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = ta[i] = ttx.x; + uint2 tty = *(uint2*)(&pslot1->hash[4]); + tt[i].x = ttx.y; + tt[i].y = ttx.z; + tt[i].z = ttx.w; + tt[i].w = tty.x; + lastword2[si[i]] = tt[i]; + + hr[i] = tty.y & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[0] = ta[i] ^ lastword1[p]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = tt[i] ^ lastword2[p]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[0] = lastword1[i] ^ lastword1[k]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = lastword2[i] ^ lastword2[k]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } +} + + +template +__global__ void digit_3(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[2][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + u32 
ta[2]; + +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round2trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round2trees[bucketid].treestiny[si[i]]; + + tt[i] = *(uint4*)(&xs.hash[0]); + lastword1[si[i]] = tt[i]; + ta[i] = xst.hash[0]; + lastword2[si[i]] = ta[i]; + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[4] = ta[i] ^ lastword2[p]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = tt[i] ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[4] = lastword2[i] ^ lastword2[k]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + } +} + + +template +__global__ void digit_4(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[3][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round3trees[bucketid].treessmall[si[i]]; + slottiny &xst = 
eq->round3trees[bucketid].treestiny[si[i]]; + + // get xhash + tt[i] = *(uint4*)(&xs.hash[0]); + lastword[si[i]] = tt[i]; + hr[i] = xst.hash[0] & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + } + } + } +} + + +template +__global__ void digit_5(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slotsmall* buck = eq->treessmall[3][bucketid]; + u32 bsize = umin(eq->edata.nslots[4][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword[si[i]] = tt[i]; + asm("bfe.u32 %0, %1, 4, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = 
eq->treessmall[2][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = xors[3]; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[1]; + tt.y = xors[2]; + tt.z = xors[3]; + tt.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = tt; + } + } + } +} + + +template +__global__ void digit_6(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint2 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[5][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[2][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = *(uint2*)(&tt[i].x); + lastword2[si[i]] = tt[i].z; + asm("bfe.u32 %0, %1, 16, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + // doing this to save shared memory + int* pairs = ht_len; + __syncthreads(); + + u32 xors[3]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = 
atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + u32 pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + xors[2] = lastword2[i] ^ lastword2[k]; + if (xors[2] == 0) + continue; + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } +} + + +template +__global__ void digit_7(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[6][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[0][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint4*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].z), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = 
atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[0]; + tt.y = xors[1]; + tt.z = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + tt.w = 0; + *(uint4*)(&xs.hash[0]) = tt; + } +} + + +template +__global__ void digit_8(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + next_pair = 0; + pairs_len = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[7][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[1][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint2 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint2*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 8, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = 
eq->treestiny[0][xorbucketid][xorslot];
+					uint2 tt;
+					tt.x = xors[1];
+					tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+					*(uint2*)(&xs.hash[0]) = tt;
+				}
+			}
+
+			for (int k = 2; k != pos[i]; ++k)
+			{
+				u32 pindex = atomicAdd(&pairs_len, 1);
+				if (pindex >= MAXPAIRS) break;
+				u16 prev = ht[hr[i]][k];
+				pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		u32 i = __byte_perm(pair, 0, 0x4510);
+		u32 k = __byte_perm(pair, 0, 0x4532);
+
+		*(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]);
+
+		if (xors[1] == 0)
+			continue;
+
+		bexor = __byte_perm(xors[0], xors[1], 0x0765);
+		xorbucketid = bexor >> (12 + 8);
+		xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1);
+		if (xorslot >= RB8_NSLOTS_LD) continue;
+		slottiny &xs = eq->treestiny[0][xorbucketid][xorslot];
+		uint2 tt;
+		tt.x = xors[1];
+		tt.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+		*(uint2*)(&xs.hash[0]) = tt;
+	}
+}
+
+/*
+	The last-round function is similar to the previous ones, but has a different
+	ending. We use warps to process the final candidates; each warp processes one
+	candidate. The first two bidandsids (a u32 packing a bucketid and two slotids)
+	are retrieved by lanes 0 and 16, the next four bidandsids by lanes 0, 8, 16
+	and 24, ... until all lanes in the warp have bidandsids from round 4. Next,
+	each thread retrieves 16 indices. While doing so, the indices are compared
+	using atomicExch to detect duplicates (tromp's method). At the end, if no
+	duplicates were found, the candidate solution (all indices) is saved. Note
+	that this dup-check method is not exact, so CPU dup checking is still needed
+	afterwards.
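+
+	A hedged scalar sketch of that atomicExch dup filter (names hypothetical;
+	the kernel below additionally wraps the test in __any so the whole warp
+	abandons a candidate together):
+
+	    __device__ bool probably_dup(u32* susp, u32 ind, u32 dupbits) {
+	        u32 slot = ind & ((1u << dupbits) - 1);   // low bits pick a table slot
+	        u32 tag  = ind >> dupbits;                // high bits are the tag
+	        u32 prev = atomicExch(&susp[slot], tag);  // publish tag, fetch old one
+	        return prev == tag;                       // same tag seen -> probable dup
+	    }
+
+	A colliding index with a different tag can overwrite a slot in between, so
+	true duplicates can slip through - hence the exact recheck on the CPU.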
+*/ +template +__global__ void digit_last_wdc(equi* eq) +{ + __shared__ u8 shared_data[8192]; + int* ht_len = (int*)(&shared_data[0]); + int* pairs = ht_len; + u32* lastword = (u32*)(&shared_data[256 * 4]); + u16* ht = (u16*)(&shared_data[256 * 4 + RB8_NSLOTS_LD * 4]); + u32* pairs_len = (u32*)(&shared_data[8188]); + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + #pragma unroll + for (u32 i = 0; i < FCT; i++) + ht_len[(i * (256 / FCT)) + threadid] = 0; + + if (threadid == ((256 / FCT) - 1)) + *pairs_len = 0; + + slottiny* buck = eq->treestiny[0][bucketid]; + u32 bsize = umin(eq->edata.nslots8[bucketid], RB8_NSLOTS_LD); + + u32 si[3 * FCT]; + u32 hr[3 * FCT]; + int pos[3 * FCT]; + u32 lw[3 * FCT]; + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + pos[i] = SSM; + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + si[i] = i * (256 / FCT) + threadid; + if (si[i] >= bsize) break; + + const slottiny* pslot1 = buck + si[i]; + + // get xhash + uint2 tt = *(uint2*)(&pslot1->hash[0]); + lw[i] = tt.x; + lastword[si[i]] = lw[i]; + + u32 a; + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(a) : "r"(lw[i])); + hr[i] = a; + + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) + ht[hr[i] * (SSM - 1) + pos[i]] = si[i]; + } + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + if (pos[i] >= SSM) continue; + + for (int k = 0; k != pos[i]; ++k) + { + u16 prev = ht[hr[i] * (SSM - 1) + k]; + if (lw[i] != lastword[prev]) continue; + u32 pindex = atomicAdd(pairs_len, 1); + if (pindex >= MAXPAIRS) break; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + + __syncthreads(); + u32 plen = umin(*pairs_len, 64); + +#define CALC_LEVEL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + u32* bucks = eq->round4bidandsids[PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1]; \ + levels[c] = bucks[slot0]; \ + } + +#define CALC_LEVEL_SMALL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + slotsmall* bucks = eq->treessmall[a][PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1].hash[d]; \ + levels[c] = bucks[slot0].hash[d]; \ + } + + u32 lane = threadIdx.x & 0x1f; + u32 par = threadIdx.x >> 5; + + u32* levels = (u32*)&pairs[MAXPAIRS + (par << DUPBITS)]; + u32* susp = levels; + + while (par < plen) + { + int pair = pairs[par]; + par += W; + + if (lane % 16 == 0) + { + u32 plvl; + if (lane == 0) plvl = buck[__byte_perm(pair, 0, 0x4510)].hash[1]; + else plvl = buck[__byte_perm(pair, 0, 0x4532)].hash[1]; + slotsmall* bucks = eq->treessmall[1][PACKER::get_bucketid(plvl, RB, SM)]; + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); + levels[lane] = bucks[slot1].hash[2]; + levels[lane + 8] = bucks[slot0].hash[2]; + } + + if (lane % 8 == 0) + CALC_LEVEL_SMALL(0, lane, lane + 4, 3); + + if (lane % 4 == 0) + CALC_LEVEL_SMALL(2, lane, lane + 2, 3); + + if (lane % 2 == 0) + CALC_LEVEL(0, lane, lane + 1, 4); + + u32 ind[16]; + + u32 f1 = levels[lane]; + const slottiny* buck_v4 = &eq->round3trees[PACKER::get_bucketid(f1, RB, SM)].treestiny[0]; + const u32 slot1_v4 = PACKER::get_slot1(f1, RB, SM); + const u32 slot0_v4 = PACKER::get_slot0(f1, slot1_v4, RB, SM); + + susp[lane] = 0xffffffff; + susp[32 + lane] = 0xffffffff; + +#define 
CHECK_DUP(a) \ + __any(atomicExch(&susp[(ind[a] & ((1 << DUPBITS) - 1))], (ind[a] >> DUPBITS)) == (ind[a] >> DUPBITS)) + + u32 f2 = buck_v4[slot1_v4].hash[1]; + const slottiny* buck_v3_1 = &eq->round2trees[PACKER::get_bucketid(f2, RB, SM)].treestiny[0]; + const u32 slot1_v3_1 = PACKER::get_slot1(f2, RB, SM); + const u32 slot0_v3_1 = PACKER::get_slot0(f2, slot1_v3_1, RB, SM); + + susp[64 + lane] = 0xffffffff; + susp[96 + lane] = 0xffffffff; + + u32 f0 = buck_v3_1[slot1_v3_1].hash[1]; + const slot* buck_v2_1 = eq->trees[0][PACKER::get_bucketid(f0, RB, SM)]; + const u32 slot1_v2_1 = PACKER::get_slot1(f0, RB, SM); + const u32 slot0_v2_1 = PACKER::get_slot0(f0, slot1_v2_1, RB, SM); + + susp[128 + lane] = 0xffffffff; + susp[160 + lane] = 0xffffffff; + + u32 f3 = buck_v2_1[slot1_v2_1].hash[6]; + const slot* buck_fin_1 = eq->round0trees[packer_default::get_bucketid(f3, 8, RB8_NSLOTS)]; + const u32 slot1_fin_1 = packer_default::get_slot1(f3, 8, RB8_NSLOTS); + const u32 slot0_fin_1 = packer_default::get_slot0(f3, slot1_fin_1, 8, RB8_NSLOTS); + + susp[192 + lane] = 0xffffffff; + susp[224 + lane] = 0xffffffff; + + ind[0] = buck_fin_1[slot1_fin_1].hash[7]; + if (CHECK_DUP(0)) continue; + ind[1] = buck_fin_1[slot0_fin_1].hash[7]; + if (CHECK_DUP(1)) continue; + + u32 f4 = buck_v2_1[slot0_v2_1].hash[6]; + const slot* buck_fin_2 = eq->round0trees[packer_default::get_bucketid(f4, 8, RB8_NSLOTS)]; + const u32 slot1_fin_2 = packer_default::get_slot1(f4, 8, RB8_NSLOTS); + const u32 slot0_fin_2 = packer_default::get_slot0(f4, slot1_fin_2, 8, RB8_NSLOTS); + + ind[2] = buck_fin_2[slot1_fin_2].hash[7]; + if (CHECK_DUP(2)) continue; + ind[3] = buck_fin_2[slot0_fin_2].hash[7]; + if (CHECK_DUP(3)) continue; + + u32 f5 = buck_v3_1[slot0_v3_1].hash[1]; + const slot* buck_v2_2 = eq->trees[0][PACKER::get_bucketid(f5, RB, SM)]; + const u32 slot1_v2_2 = PACKER::get_slot1(f5, RB, SM); + const u32 slot0_v2_2 = PACKER::get_slot0(f5, slot1_v2_2, RB, SM); + + u32 f6 = buck_v2_2[slot1_v2_2].hash[6]; + const slot* buck_fin_3 = eq->round0trees[packer_default::get_bucketid(f6, 8, RB8_NSLOTS)]; + const u32 slot1_fin_3 = packer_default::get_slot1(f6, 8, RB8_NSLOTS); + const u32 slot0_fin_3 = packer_default::get_slot0(f6, slot1_fin_3, 8, RB8_NSLOTS); + + ind[4] = buck_fin_3[slot1_fin_3].hash[7]; + if (CHECK_DUP(4)) continue; + ind[5] = buck_fin_3[slot0_fin_3].hash[7]; + if (CHECK_DUP(5)) continue; + + u32 f7 = buck_v2_2[slot0_v2_2].hash[6]; + const slot* buck_fin_4 = eq->round0trees[packer_default::get_bucketid(f7, 8, RB8_NSLOTS)]; + const u32 slot1_fin_4 = packer_default::get_slot1(f7, 8, RB8_NSLOTS); + const u32 slot0_fin_4 = packer_default::get_slot0(f7, slot1_fin_4, 8, RB8_NSLOTS); + + ind[6] = buck_fin_4[slot1_fin_4].hash[7]; + if (CHECK_DUP(6)) continue; + ind[7] = buck_fin_4[slot0_fin_4].hash[7]; + if (CHECK_DUP(7)) continue; + + u32 f8 = buck_v4[slot0_v4].hash[1]; + const slottiny* buck_v3_2 = &eq->round2trees[PACKER::get_bucketid(f8, RB, SM)].treestiny[0]; + const u32 slot1_v3_2 = PACKER::get_slot1(f8, RB, SM); + const u32 slot0_v3_2 = PACKER::get_slot0(f8, slot1_v3_2, RB, SM); + + u32 f9 = buck_v3_2[slot1_v3_2].hash[1]; + const slot* buck_v2_3 = eq->trees[0][PACKER::get_bucketid(f9, RB, SM)]; + const u32 slot1_v2_3 = PACKER::get_slot1(f9, RB, SM); + const u32 slot0_v2_3 = PACKER::get_slot0(f9, slot1_v2_3, RB, SM); + + u32 f10 = buck_v2_3[slot1_v2_3].hash[6]; + const slot* buck_fin_5 = eq->round0trees[packer_default::get_bucketid(f10, 8, RB8_NSLOTS)]; + const u32 slot1_fin_5 = packer_default::get_slot1(f10, 8, RB8_NSLOTS); + 
const u32 slot0_fin_5 = packer_default::get_slot0(f10, slot1_fin_5, 8, RB8_NSLOTS);
+
+		ind[8] = buck_fin_5[slot1_fin_5].hash[7];
+		if (CHECK_DUP(8)) continue;
+		ind[9] = buck_fin_5[slot0_fin_5].hash[7];
+		if (CHECK_DUP(9)) continue;
+
+		u32 f11 = buck_v2_3[slot0_v2_3].hash[6];
+		const slot* buck_fin_6 = eq->round0trees[packer_default::get_bucketid(f11, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_6 = packer_default::get_slot1(f11, 8, RB8_NSLOTS);
+		const u32 slot0_fin_6 = packer_default::get_slot0(f11, slot1_fin_6, 8, RB8_NSLOTS);
+
+		ind[10] = buck_fin_6[slot1_fin_6].hash[7];
+		if (CHECK_DUP(10)) continue;
+		ind[11] = buck_fin_6[slot0_fin_6].hash[7];
+		if (CHECK_DUP(11)) continue;
+
+		u32 f12 = buck_v3_2[slot0_v3_2].hash[1];
+		const slot* buck_v2_4 = eq->trees[0][PACKER::get_bucketid(f12, RB, SM)];
+		const u32 slot1_v2_4 = PACKER::get_slot1(f12, RB, SM);
+		const u32 slot0_v2_4 = PACKER::get_slot0(f12, slot1_v2_4, RB, SM);
+
+		u32 f13 = buck_v2_4[slot1_v2_4].hash[6];
+		const slot* buck_fin_7 = eq->round0trees[packer_default::get_bucketid(f13, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_7 = packer_default::get_slot1(f13, 8, RB8_NSLOTS);
+		const u32 slot0_fin_7 = packer_default::get_slot0(f13, slot1_fin_7, 8, RB8_NSLOTS);
+
+		ind[12] = buck_fin_7[slot1_fin_7].hash[7];
+		if (CHECK_DUP(12)) continue;
+		ind[13] = buck_fin_7[slot0_fin_7].hash[7];
+		if (CHECK_DUP(13)) continue;
+
+		u32 f14 = buck_v2_4[slot0_v2_4].hash[6];
+		const slot* buck_fin_8 = eq->round0trees[packer_default::get_bucketid(f14, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_8 = packer_default::get_slot1(f14, 8, RB8_NSLOTS);
+		const u32 slot0_fin_8 = packer_default::get_slot0(f14, slot1_fin_8, 8, RB8_NSLOTS);
+
+		ind[14] = buck_fin_8[slot1_fin_8].hash[7];
+		if (CHECK_DUP(14)) continue;
+		ind[15] = buck_fin_8[slot0_fin_8].hash[7];
+		if (CHECK_DUP(15)) continue;
+
+		u32 soli;
+		if (lane == 0) {
+			soli = atomicAdd(&eq->edata.srealcont.nsols, 1);
+		}
+#if __CUDA_ARCH__ >= 300
+		// all threads get the value from lane 0
+		soli = __shfl2(soli, 0);
+#else
+		__syncthreads();
+		soli = eq->edata.srealcont.nsols;
+#endif
+		if (soli < MAXREALSOLS)
+		{
+			u32 pos = lane << 4;
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos     ]) = *(uint4*)(&ind[ 0]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos +  4]) = *(uint4*)(&ind[ 4]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos +  8]) = *(uint4*)(&ind[ 8]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 12]) = *(uint4*)(&ind[12]);
+		}
+	}
+}
+
+//std::mutex dev_init;
+int dev_init_done[MAX_GPUS] = { 0 };
+
+__host__
+static int compu32(const void *pa, const void *pb)
+{
+	uint32_t a = *(uint32_t *)pa, b = *(uint32_t *)pb;
+	return a < b ? -1 : a == b ? 0 : +1;
+}
+
+__host__
+static bool duped(proof prf)
+{
+	proof sortprf;
+	memcpy(sortprf, prf, sizeof(proof));
+	qsort(sortprf, PROOFSIZE, sizeof(uint32_t), &compu32);
+	for (uint32_t i = 1; i < PROOFSIZE; i++)
+		if (sortprf[i] <= sortprf[i - 1])
+			return true;
+	return false;
+}
+
+__host__
+static void sort_pair(uint32_t *a, uint32_t len)
+{
+	uint32_t *b = a + len;
+	uint32_t tmp, need_sorting = 0;
+	for (uint32_t i = 0; i < len; i++)
+	{
+		if (need_sorting || a[i] > b[i])
+		{
+			need_sorting = 1;
+			tmp = a[i];
+			a[i] = b[i];
+			b[i] = tmp;
+		}
+		else if (a[i] < b[i])
+			return;
+	}
+}
+
+__host__
+static void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen)
+{
+	uint32_t le_N = WN;
+	uint32_t le_K = WK;
+	uchar personal[] = "ZcashPoW01230123";
+	memcpy(personal + 8, &le_N, 4);
+	memcpy(personal + 12, &le_K, 4);
+	blake2b_param P[1];
+	P->digest_length = HASHOUT;
+	P->key_length = 0;
+	P->fanout = 1;
+	P->depth = 1;
+	P->leaf_length = 0;
+	P->node_offset = 0;
+	P->node_depth = 0;
+	P->inner_length = 0;
+	memset(P->reserved, 0, sizeof(P->reserved));
+	memset(P->salt, 0, sizeof(P->salt));
+	memcpy(P->personal, (const uint8_t *)personal, 16);
+	eq_blake2b_init_param(ctx, P);
+	eq_blake2b_update(ctx, (const uchar *)header, headerLen);
+	if (nonceLen) eq_blake2b_update(ctx, (const uchar *)nce, nonceLen);
+}
+
+#ifdef WIN32
+typedef CUresult(CUDAAPI *dec_cuDeviceGet)(CUdevice*, int);
+typedef CUresult(CUDAAPI *dec_cuCtxCreate)(CUcontext*, unsigned int, CUdevice);
+typedef CUresult(CUDAAPI *dec_cuCtxPushCurrent)(CUcontext);
+typedef CUresult(CUDAAPI *dec_cuCtxDestroy)(CUcontext);
+
+dec_cuDeviceGet _cuDeviceGet = nullptr;
+dec_cuCtxCreate _cuCtxCreate = nullptr;
+dec_cuCtxPushCurrent _cuCtxPushCurrent = nullptr;
+dec_cuCtxDestroy _cuCtxDestroy = nullptr;
+#endif
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, class PACKER>
+__host__ eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::eq_cuda_context(int thr_id, int dev_id)
+{
+	thread_id = thr_id;
+	device_id = dev_id;
+	solutions = nullptr;
+	equi_mem_sz = sizeof(equi<RB, SM>);
+	throughput = NBLOCKS;
+	totalblocks = NBLOCKS/FD_THREADS;
+	threadsperblock = FD_THREADS;
+	threadsperblock_digits = THREADS;
+
+	//dev_init.lock();
+	if (!dev_init_done[device_id])
+	{
+		// only first thread shall init device
+		checkCudaErrors(cudaSetDevice(device_id));
+		checkCudaErrors(cudaDeviceReset());
+		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+
+		pctx = nullptr;
+	}
+	else
+	{
+		// create new context
+		CUdevice dev;
+
+#ifdef WIN32
+		if (_cuDeviceGet == nullptr)
+		{
+			HMODULE hmod = LoadLibraryA("nvcuda.dll");
+			if (hmod == NULL)
+				throw std::runtime_error("Failed to load nvcuda.dll");
+			_cuDeviceGet = (dec_cuDeviceGet)GetProcAddress(hmod, "cuDeviceGet");
+			if (_cuDeviceGet == nullptr)
+				throw std::runtime_error("Failed to get cuDeviceGet address");
+			_cuCtxCreate = (dec_cuCtxCreate)GetProcAddress(hmod, "cuCtxCreate_v2");
+			if (_cuCtxCreate == nullptr)
+				throw std::runtime_error("Failed to get cuCtxCreate address");
+			_cuCtxPushCurrent = (dec_cuCtxPushCurrent)GetProcAddress(hmod, "cuCtxPushCurrent_v2");
+			if (_cuCtxPushCurrent == nullptr)
+				throw std::runtime_error("Failed to get cuCtxPushCurrent address");
+			_cuCtxDestroy = (dec_cuCtxDestroy)GetProcAddress(hmod, "cuCtxDestroy_v2");
+			if (_cuCtxDestroy == nullptr)
+				throw std::runtime_error("Failed to get cuCtxDestroy address");
+		}
+
+		checkCudaDriverErrors(_cuDeviceGet(&dev, device_id));
+		checkCudaDriverErrors(_cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
+		checkCudaDriverErrors(_cuCtxPushCurrent(pctx));
+#else
+		checkCudaDriverErrors(cuDeviceGet(&dev, device_id));
+		checkCudaDriverErrors(cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
+		checkCudaDriverErrors(cuCtxPushCurrent(pctx));
+#endif
+	}
+	++dev_init_done[device_id];
+	//dev_init.unlock();
+
+	if (cudaMalloc((void**)&device_eq, equi_mem_sz) != cudaSuccess)
+		throw std::runtime_error("CUDA: failed to alloc memory");
+
+	solutions = (scontainerreal*) malloc(sizeof(scontainerreal));
+	if (!solutions)
+		throw std::runtime_error("EOM: failed to alloc solutions memory");
+}
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, class PACKER>
+__host__ void eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::solve(const char *tequihash_header,
+	unsigned int tequihash_header_len,
+	const char* nonce,
+	unsigned int nonce_len,
+	fn_cancel cancelf,
+	fn_solution solutionf,
+	fn_hashdone hashdonef)
+{
+	blake2b_state blake_ctx;
+
+	int blocks = NBUCKETS;
+
+	setheader(&blake_ctx, tequihash_header, tequihash_header_len, nonce, nonce_len);
+
+	// todo: improve
+	// djezo solver allows the last 4 bytes of the nonce to be iterated
+	// this can be used to create an internal loop - calc the initial blake hash only once, then load 8*8 bytes on device (blake state h)
+	// then just iterate nn++
+	// less CPU load, 1 cudaMemcpy less -> faster
+	//u32 nn = *(u32*)&nonce[28];
+	u32 nn = 0;
+
+	checkCudaErrors(cudaMemcpy(&device_eq->blake_h, &blake_ctx.h, sizeof(u64) * 8, cudaMemcpyHostToDevice));
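+
+	/* A hedged sketch of the internal-loop idea from the todo above (not wired
+	   in; `iterations` is a hypothetical count): with the blake midstate kept on
+	   the device, only the counters need resetting between nonce words:
+
+	       for (u32 nn2 = 0; nn2 < iterations && !cancelf(thread_id); nn2++) {
+	           checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata)));
+	           // launch digit_first with nn2, then digit_1..digit_8 and the
+	           // last round, harvesting solutions once per iteration
+	       }
+	*/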
+	checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata)));
+
+	digit_first <<<NBLOCKS/FD_THREADS, FD_THREADS >>>(device_eq, nn);
+
+	digit_1 <<<4096, 512 >>>(device_eq);
+	digit_2 <<<NBUCKETS, THREADS >>>(device_eq);
+	digit_3 <<<NBUCKETS, THREADS >>>(device_eq);
+
+	if (cancelf(thread_id)) return;
+
+	digit_4 <<<NBUCKETS, THREADS >>>(device_eq);
+	digit_5 <<<NBUCKETS, THREADS >>>(device_eq);
+
+	digit_6 <<<NBUCKETS, THREADS >>>(device_eq);
+	digit_7 <<<NBUCKETS, THREADS >>>(device_eq);
+	digit_8 <<<NBUCKETS, THREADS >>>(device_eq);
+
+	digit_last_wdc <<<4096, 256 / 2 >>>(device_eq);
+
+	checkCudaErrors(cudaMemcpy(solutions, &device_eq->edata.srealcont, (MAXREALSOLS * (512 * 4)) + 4, cudaMemcpyDeviceToHost));
+
+	//printf("T%d nsols: %u\n", thread_id, solutions->nsols);
+	//if (solutions->nsols > 9)
+	//	printf("missing sol, total: %u\n", solutions->nsols);
+
+	for (u32 s = 0; (s < solutions->nsols) && (s < MAXREALSOLS); s++)
+	{
+		// remove dups on CPU (dup removal on GPU is not fully exact and can pass on some invalid solutions)
+		if (duped(solutions->sols[s])) continue;
+
+		// perform sort of pairs
+		for (uint32_t level = 0; level < 9; level++)
+			for (uint32_t i = 0; i < (1 << 9); i += (2 << level))
+				sort_pair(&solutions->sols[s][i], 1 << level);
+
+		std::vector<uint32_t> index_vector(PROOFSIZE);
+		for (u32 i = 0; i < PROOFSIZE; i++) {
+			index_vector[i] = solutions->sols[s][i];
+		}
+
+		solutionf(thread_id, index_vector, DIGITBITS, nullptr);
+	}
+
+	// ccminer: only use hashdonef if no solutions...
+	if (!solutions->nsols)
+		hashdonef(thread_id);
+}
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__
+void eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::freemem()
+{
+	if (solutions)
+		free(solutions);
+
+	if (device_eq) {
+		cudaFree(device_eq);
+		device_eq = NULL;
+	}
+
+	if (pctx) {
+		// non primary thread, destroy context
+#ifdef WIN32
+		checkCudaDriverErrors(_cuCtxDestroy(pctx));
+#else
+		checkCudaDriverErrors(cuCtxDestroy(pctx));
+#endif
+	} else {
+		checkCudaErrors(cudaDeviceReset());
+		dev_init_done[device_id] = 0;
+	}
+}
+
+// destructor
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__
+eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::~eq_cuda_context()
+{
+	freemem();
+}
+
+#ifdef CONFIG_MODE_1
+template class eq_cuda_context<CONFIG_MODE_1>;
+#endif
+
+#ifdef CONFIG_MODE_2
+template class eq_cuda_context<CONFIG_MODE_2>;
+#endif
+
+#ifdef CONFIG_MODE_3
+template class eq_cuda_context<CONFIG_MODE_3>;
+#endif
diff --git a/equi/equi-stratum.cpp b/equi/equi-stratum.cpp
index a643381ea9..fa15fa7b1e 100644
--- a/equi/equi-stratum.cpp
+++ b/equi/equi-stratum.cpp
@@ -118,7 +118,7 @@ bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params)
 bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params)
 {
-	const char *job_id, *version, *prevhash, *coinb1, *coinb2, *nbits, *stime;
+	const char *job_id, *version, *prevhash, *coinb1, *coinb2, *nbits, *stime, *hash_version = NULL;
 	size_t coinb1_size, coinb2_size;
 	bool clean, ret = false;
 	int ntime, i, p=0;
@@ -130,6 +130,7 @@ bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	stime = json_string_value(json_array_get(params, p++));
 	nbits = json_string_value(json_array_get(params, p++));
 	clean = json_is_true(json_array_get(params, p)); p++;
+	hash_version = json_string_value(json_array_get(params, p++));
 	if (!job_id || !prevhash || !coinb1 || !coinb2 ||
 		!version || !nbits || !stime ||
 		strlen(prevhash) != 64 || strlen(version) != 8 ||
@@ -138,7 +139,7 @@
 		applog(LOG_ERR, "Stratum notify: invalid parameters");
 		goto out;
 	}
-
+	if (hash_version) hex2bin((uchar *)&sctx->job.hash_ver, hash_version, 1);
 	/* store stratum server time diff */
 	hex2bin((uchar *)&ntime, stime, 4);
 	ntime = ntime - (int) time(0);
diff --git a/miner.h b/miner.h
index 6375fe7393..b67aaae301 100644
--- a/miner.h
+++ b/miner.h
@@ 
-308,24 +308,7 @@ struct cgpu_info { uint32_t rejected; double khashes; int has_monitoring; - float gpu_temp; - uint16_t gpu_fan; - uint16_t gpu_fan_rpm; - uint16_t gpu_arch; - uint32_t gpu_clock; - uint32_t gpu_memclock; - uint64_t gpu_mem; - uint64_t gpu_memfree; - uint32_t gpu_power; - uint32_t gpu_plimit; - double gpu_vddc; - int16_t gpu_pstate; - int16_t gpu_bus; - uint16_t gpu_vid; - uint16_t gpu_pid; - - int8_t nvml_id; - int8_t nvapi_id; + char gpu_sn[64]; char gpu_desc[64] = "cpu"; @@ -442,7 +425,7 @@ extern uint64_t net_hashrate; extern double net_diff; extern double stratum_diff; -#define MAX_GPUS 80 +#define MAX_GPUS 140 //#define MAX_THREADS 32 todo extern char* device_name[MAX_GPUS]; extern short device_map[MAX_GPUS]; @@ -554,6 +537,7 @@ struct stratum_job { uint32_t height; uint32_t shares_count; double diff; + int hash_ver; }; struct stratum_ctx { @@ -627,6 +611,7 @@ struct work { struct tx txs[POK_MAX_TXS]; // zec solution uint8_t extra[1388]; + int hash_ver; }; #define POK_BOOL_MASK 0x00008000 @@ -770,71 +755,7 @@ void applog_hash64(void *hash); void applog_compare_hash(void *hash, void *hash_ref); void print_hash_tests(void); -void allium_hash(void *state, const void *input); -void bastionhash(void* output, const unsigned char* input); -void blake256hash(void *output, const void *input, int8_t rounds); -void blake2b_hash(void *output, const void *input); -void blake2s_hash(void *output, const void *input); -void bmw_hash(void *state, const void *input); -void c11hash(void *output, const void *input); -void cryptolight_hash_variant(void* output, const void* input, int len, int variant); -void cryptolight_hash(void* output, const void* input); -void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); -void cryptonight_hash(void* output, const void* input); -void monero_hash(void* output, const void* input); -void stellite_hash(void* output, const void* input); -void decred_hash(void *state, const void *input); -void deephash(void *state, const void *input); -void luffa_hash(void *state, const void *input); -void fresh_hash(void *state, const void *input); -void fugue256_hash(unsigned char* output, const unsigned char* input, int len); -void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); -void hmq17hash(void *output, const void *input); -void hsr_hash(void *output, const void *input); -void keccak256_hash(void *state, const void *input); -void jackpothash(void *state, const void *input); -void groestlhash(void *state, const void *input); -void jha_hash(void *output, const void *input); -void lbry_hash(void *output, const void *input); -void lyra2re_hash(void *state, const void *input); -void lyra2v2_hash(void *state, const void *input); -void lyra2Z_hash(void *state, const void *input); -void myriadhash(void *state, const void *input); -void neoscrypt(uchar *output, const uchar *input, uint32_t profile); -void nist5hash(void *state, const void *input); -void pentablakehash(void *output, const void *input); -void phi_hash(void *output, const void *input); -void phi2_hash(void *output, const void *input); -void polytimos_hash(void *output, const void *input); -void quarkhash(void *state, const void *input); -void qubithash(void *state, const void *input); -void scrypthash(void* output, const void* input); -void scryptjane_hash(void* output, const void* input); -void sha256d_hash(void *output, const void *input); -void sha256t_hash(void *output, const void *input); -void sibhash(void *output, const void *input); 
-void skeincoinhash(void *output, const void *input); -void skein2hash(void *output, const void *input); -void skunk_hash(void *state, const void *input); -void s3hash(void *output, const void *input); -void timetravel_hash(void *output, const void *input); -void bitcore_hash(void *output, const void *input); -void tribus_hash(void *output, const void *input); -void veltorhash(void *output, const void *input); -void wcoinhash(void *state, const void *input); -void whirlxHash(void *state, const void *input); -void x11evo_hash(void *output, const void *input); -void x11hash(void *output, const void *input); -void x12hash(void *output, const void *input); -void x13hash(void *output, const void *input); -void x14hash(void *output, const void *input); -void x15hash(void *output, const void *input); -void x16r_hash(void *output, const void *input); -void x16s_hash(void *output, const void *input); -void x17hash(void *output, const void *input); -void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize); -void zr5hash(void *output, const void *input); -void zr5hash_pok(void *output, uint32_t *pdata); + #ifdef __cplusplus } diff --git a/verus/verus_clhash.cpp b/verus/verus_clhash.cpp index 93b7a6e17b..48f277073f 100644 --- a/verus/verus_clhash.cpp +++ b/verus/verus_clhash.cpp @@ -34,65 +34,62 @@ int __cpuverusoptimized = 0x80; // multiply the length and the some key, no modulo -static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) { +__m128i lazyLengthHash(uint64_t keylength, uint64_t length) { const __m128i lengthvector = _mm_set_epi64x(keylength,length); const __m128i clprod1 = _mm_clmulepi64_si128( lengthvector, lengthvector, 0x10); return clprod1; } // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64 -static inline __m128i precompReduction64_si128( __m128i A) { + __m128i precompReduction64_si128( __m128i A) { //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. 
(64,4,3,1,0)
 	const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
-	__m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01);
-	__m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
+	const __m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01);
+	const __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
 		_mm_srli_si128(Q2,8));
-	__m128i Q4 = _mm_xor_si128(Q2,A);
+	const __m128i Q4 = _mm_xor_si128(Q2,A);
 	const __m128i final = _mm_xor_si128(Q3,Q4);
 	return final;/// WARNING: HIGH 64 BITS CONTAIN GARBAGE
 }

-static inline uint64_t precompReduction64( __m128i A) {
+ uint64_t precompReduction64( __m128i A) {
 	return _mm_cvtsi128_si64(precompReduction64_si128(A));
 }

 // verus intermediate hash extra
-static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask,
+ __m128i __verusclmulwithoutreduction64alignedrepeatv2_1(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask,
 	uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex)
 {
-	__m128i *pbuf;
-	__m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] };
+	const __m128i *pbuf;
+	const __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] };

 	// divide key mask by 16 from bytes to __m128i
-	keyMask >>= 4;
+	// keyMask >>= 4;
+
+	__m128i acc = _mm_load_si128(randomsource + 513);

-	__m128i acc = _mm_load_si128(randomsource + (keyMask + 2));
-
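+	/* For reference: the byte table {0, 27, 54, 45, ...} fed to
+	 * _mm_shuffle_epi8 in precompReduction64_si128() above is the carry-less
+	 * product of each 4-bit index with 0x1B = 27, the low half of the
+	 * reduction polynomial x^64 + x^4 + x^3 + x + 1. An illustrative sketch
+	 * that reproduces it (helper name is ours, not part of the codebase):
+	 *
+	 *   static uint8_t gf2_mul_0x1b(uint8_t a) // valid for a < 16
+	 *   {
+	 *       uint8_t r = 0;
+	 *       for (int bit = 0; bit < 5; bit++)  // 0x1B has bits 0, 1, 3, 4 set
+	 *           if (0x1B & (1 << bit))
+	 *               r ^= (uint8_t)(a << bit);
+	 *       return r;
+	 *   }
+	 *
+	 * gf2_mul_0x1b(i) for i = 0..15 gives 0, 27, 54, 45, 108, 119, 90, 65,
+	 * 216, 195, 238, 245, 180, 175, 130, 153, matching the constants above.
+	 */
 	// the random buffer must have at least 32 16 byte dwords after the keymask to work with this
 	// algorithm. 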
we take the value from the last element inside the keyMask + 2, as that will never // be used to xor into the accumulator before it is hashed with other values first -#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0) -#define LIONELK_FETCH_DIST 0 +//#pragma unroll 32 -#pragma unroll 32 - - for (uint64_t i = 0; i < 32; i++) + for (uint32_t i = 0; i < 32; i++) { const uint64_t selector = _mm_cvtsi128_si64(acc); + uint32_t prand_idx = (selector >> 5) & keyMask; + uint32_t prandex_idx = (selector >>32) & keyMask; // get two random locations in the key, which will be mutated and swapped - __m128i *prand = randomsource + ((selector >> 5) & keyMask); - __m128i *prandex = randomsource + ((selector >> 32) & keyMask); + __m128i *prand = randomsource + prand_idx; + __m128i *prandex = randomsource + prandex_idx; // select random start and order of pbuf processing pbuf = pbuf_copy + (selector & 3); - uint32_t prand_idx = (selector >> 5) & keyMask; - uint32_t prandex_idx = (selector >>32) & keyMask; - g_prand[i] = prand[0]; - g_prandex[i] = prandex[0]; + _mm_store_si128(&g_prand[i] , prand[0]); + _mm_store_si128(&g_prandex[i], prandex[0]); fixrand[i] = prand_idx; fixrandex[i] = prandex_idx; @@ -101,140 +98,136 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource case 0: { - const __m128i temp1 = _mm_load_si128(prandex); - const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - const __m128i temp12 = _mm_load_si128(prand); + const __m128i temp12 = _mm_load_si128(prand); _mm_store_si128(prand, tempa2); - const __m128i temp22 = _mm_load_si128(pbuf); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); acc = _mm_xor_si128(clprod12, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 4: { - const __m128i temp1 = _mm_load_si128(prand); - const __m128i temp2 = _mm_load_si128(pbuf); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); - const __m128i clprod2 = _mm_clmulepi64_si128(temp2, 
temp2, 0x10); + __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10); acc = _mm_xor_si128(clprod2, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - const __m128i temp12 = _mm_load_si128(prandex); + const __m128i temp12 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa2); - const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); acc = _mm_xor_si128(add12, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prand, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 8: { - const __m128i temp1 = _mm_load_si128(prandex); - const __m128i temp2 = _mm_load_si128(pbuf); - const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); acc = _mm_xor_si128(add1, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - const __m128i temp12 = _mm_load_si128(prand); + const __m128i temp12 = _mm_load_si128(prand); _mm_store_si128(prand, tempa2); - const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); acc = _mm_xor_si128(clprod12, acc); - const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); acc = _mm_xor_si128(clprod22, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 0xc: { - const __m128i temp1 = _mm_load_si128(prand); - const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); // cannot be zero here - const int32_t divisor = (uint32_t)selector; + int32_t divisor = (uint32_t)selector; acc = _mm_xor_si128(add1, acc); - const int64_t dividend = _mm_cvtsi128_si64(acc); - const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); + int64_t dividend = 
_mm_cvtsi128_si64(acc); + const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); acc = _mm_xor_si128(modulo, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); if (dividend & 1) { - const __m128i temp12 = _mm_load_si128(prandex); + const __m128i temp12 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa2); - const __m128i temp22 = _mm_load_si128(pbuf); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); acc = _mm_xor_si128(clprod12, acc); - const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); acc = _mm_xor_si128(clprod22, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prand, tempb2); } else { - const __m128i tempb3 = _mm_load_si128(prandex); + + const __m128i tempb3 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa2); _mm_store_si128(prand, tempb3); } - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 0x10: { // a few AES operations - const __m128i *rc = prand; - __m128i tmp; + const __m128i *rc = prand; + __m128i tmp; - __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - __m128i temp2 = _mm_load_si128(pbuf); + __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + __m128i temp2 = _mm_load_si128(pbuf); AES2(temp1, temp2, 0); @@ -248,28 +241,27 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc)); - const __m128i tempa1 = _mm_load_si128(prand); - const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); - const __m128i tempa4 = _mm_load_si128(prandex); + const __m128i tempa4 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa3); _mm_store_si128(prand, tempa4); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + break; } case 0x14: { // we'll just call this one the monkins loop, inspired by Chris - const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); - __m128i tmp; // used by MIX2 + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + __m128i tmp; // used by MIX2 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times - __m128i *rc = prand; + const __m128i *rc = prand; uint64_t aesroundoffset = 0,loop_c; - __m128i onekey; + __m128i onekey; do { @@ -277,15 +269,15 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource if (loop_c) { onekey = _mm_load_si128(rc++); - const __m128i temp2 = _mm_load_si128(rounds & 1 ? 
pbuf : buftmp); - const __m128i add1 = _mm_xor_si128(onekey, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); + const __m128i add1 = _mm_xor_si128(onekey, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); } else { onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); + __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); AES2(onekey, temp2, aesroundoffset); @@ -298,21 +290,20 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource } while (rounds--); - const __m128i tempa1 = _mm_load_si128(prand); - const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); - const __m128i tempa4 = _mm_load_si128(prandex); + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + + const __m128i tempa4 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa3); _mm_store_si128(prand, tempa4); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 0x18: { - __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); __m128i tmp; // used by MIX2 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times @@ -352,16 +343,16 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource } case 0x1c: { - const __m128i temp1 = _mm_load_si128(pbuf); - const __m128i temp2 = _mm_load_si128(prandex); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i temp1 = _mm_load_si128(pbuf); + const __m128i temp2 = _mm_load_si128(prandex); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); - const __m128i tempa3 = _mm_load_si128(prand); + const __m128i tempa3 = _mm_load_si128(prand); #ifdef VERUSHASHDEBUGo printf("[cpu] tempa1 : "); @@ -379,12 +370,11 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource acc = _mm_xor_si128(tempa3, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); - const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); + const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } @@ -397,179 +387,314 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource return acc; } +__m128i __verusclmulwithoutreduction64alignedrepeatv2_2(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, + uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex) +{ + const __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] }; + const __m128i *pbuf; + + // divide key mask 
by 16 from bytes to __m128i + keyMask >>= 4; + + // the random buffer must have at least 32 16 byte dwords after the keymask to work with this + // algorithm. we take the value from the last element inside the keyMask + 2, as that will never + // be used to xor into the accumulator before it is hashed with other values first + __m128i acc = _mm_load_si128(randomsource + (keyMask + 2)); + + for (int64_t i = 0; i < 32; i++) + { + const uint64_t selector = _mm_cvtsi128_si64(acc); + + uint32_t prand_idx = (selector >> 5) & keyMask; + uint32_t prandex_idx = (selector >> 32) & keyMask; + // get two random locations in the key, which will be mutated and swapped + __m128i *prand = randomsource + prand_idx; + __m128i *prandex = randomsource + prandex_idx; + + // select random start and order of pbuf processing + pbuf = pbuf_copy + (selector & 3); + _mm_store_si128(&g_prand[i], prand[0]); + _mm_store_si128(&g_prandex[i], prandex[0]); + fixrand[i] = prand_idx; + fixrandex[i] = prandex_idx; + + switch (selector & 0x1c) + { + case 0: + { + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128(prand); + _mm_store_si128(prand, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + acc = _mm_xor_si128(clprod12, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prandex, tempb2); + break; + } + case 4: + { + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + const __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10); + acc = _mm_xor_si128(clprod2, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + acc = _mm_xor_si128(add12, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prand, tempb2); + break; + } + case 8: + { + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + acc = _mm_xor_si128(add1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128(prand); + _mm_store_si128(prand, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + acc = _mm_xor_si128(clprod12, acc); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + acc = _mm_xor_si128(clprod22, acc); + + const __m128i tempb1 
= _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prandex, tempb2); + break; + } + case 0xc: + { + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + + // cannot be zero here + const int32_t divisor = (uint32_t)selector; + + acc = _mm_xor_si128(add1, acc); + + const int64_t dividend = _mm_cvtsi128_si64(acc); + const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); + acc = _mm_xor_si128(modulo, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + if (dividend & 1) + { + const __m128i temp12 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + acc = _mm_xor_si128(clprod12, acc); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + acc = _mm_xor_si128(clprod22, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prand, tempb2); + } + else + { + const __m128i tempb3 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa2); + _mm_store_si128(prand, tempb3); + const __m128i tempb4 = _mm_load_si128(pbuf); + acc = _mm_xor_si128(tempb4, acc); + } + break; + } + case 0x10: + { + // a few AES operations + const __m128i *rc = prand; + __m128i tmp; + + __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + __m128i temp2 = _mm_load_si128(pbuf); + + AES2(temp1, temp2, 0); + MIX2(temp1, temp2); + + AES2(temp1, temp2, 4); + MIX2(temp1, temp2); + + AES2(temp1, temp2, 8); + MIX2(temp1, temp2); + + acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc)); + + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + + const __m128i tempa4 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa3); + _mm_store_si128(prand, tempa4); + break; + } + case 0x14: + { + // we'll just call this one the monkins loop, inspired by Chris - modified to cast to uint64_t on shift for more variability in the loop + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + __m128i tmp; // used by MIX2 + + uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times + __m128i *rc = prand; + uint64_t aesroundoffset = 0; + __m128i onekey; + + do + { + if (selector & (((uint64_t)0x10000000) << rounds)) + { + onekey = _mm_load_si128(rc++); + const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); + const __m128i add1 = _mm_xor_si128(onekey, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + } + else + { + onekey = _mm_load_si128(rc++); + __m128i temp2 = _mm_load_si128(rounds & 1 ? 
buftmp : pbuf); + AES2(onekey, temp2, aesroundoffset); + aesroundoffset += 4; + MIX2(onekey, temp2); + acc = _mm_xor_si128(onekey, acc); + acc = _mm_xor_si128(temp2, acc); + } + } while (rounds--); + + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + + const __m128i tempa4 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa3); + _mm_store_si128(prand, tempa4); + break; + } + case 0x18: + { + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + __m128i tmp; // used by MIX2 + + uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times + __m128i *rc = prand; + __m128i onekey; + + do + { + if (selector & (((uint64_t)0x10000000) << rounds)) + { + onekey = _mm_load_si128(rc++); + const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); + onekey = _mm_xor_si128(onekey, temp2); + // cannot be zero here, may be negative + const int32_t divisor = (uint32_t)selector; + const int64_t dividend = _mm_cvtsi128_si64(onekey); + const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); + acc = _mm_xor_si128(modulo, acc); + } + else + { + onekey = _mm_load_si128(rc++); + __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); + const __m128i add1 = _mm_xor_si128(onekey, temp2); + onekey = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i clprod2 = _mm_mulhrs_epi16(acc, onekey); + acc = _mm_xor_si128(clprod2, acc); + } + } while (rounds--); + + const __m128i tempa3 = _mm_load_si128(prandex); + const __m128i tempa4 = _mm_xor_si128(tempa3, acc); + + _mm_store_si128(prandex, onekey); + _mm_store_si128(prand, tempa4); + break; + } + case 0x1c: + { + const __m128i temp1 = _mm_load_si128(pbuf); + const __m128i temp2 = _mm_load_si128(prandex); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); + + const __m128i tempa3 = _mm_load_si128(prand); + _mm_store_si128(prand, tempa2); + + acc = _mm_xor_si128(tempa3, acc); + const __m128i temp4 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + acc = _mm_xor_si128(temp4, acc); + const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); + const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); + _mm_store_si128(prandex, tempb2); + break; + } + } + } + return acc; +} + // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times, // returning a 64 bit hash value -uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, +uint64_t verusclhashv2_1(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex) { - __m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, fixrand, fixrandex, g_prand, g_prandex); + __m128i acc = __verusclmulwithoutreduction64alignedrepeatv2_1((__m128i *)random, (const __m128i *)buf, 511, fixrand, fixrandex, g_prand, g_prandex); acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64)); return precompReduction64(acc); } -inline void haraka512_keyed_local(unsigned char *out, const unsigned char *in, const u128 *rc) { - u128 s[4], tmp; - - s[0] = LOAD(in); - s[1] = LOAD(in + 16); - s[2] = LOAD(in + 32); - s[3] = LOAD(in + 48); - - AES4(s[0], s[1], 
s[2], s[3], 0); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 8); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 16); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 24); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 32); - MIX4(s[0], s[1], s[2], s[3]); - - s[0] = _mm_xor_si128(s[0], LOAD(in)); - s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); - s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); - s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); - - // TRUNCSTORE(out, s[0], s[1], s[2], s[3]); -} -/* -void cpu_verushash::solve_verus_v2_opt(CBlockHeader &bh, - arith_uint256 &target, - std::function cancelf, - std::function&, size_t, const unsigned char*)> solutionf, - std::function hashdonef, - cpu_verushash &device_context) -{ - CVerusHashV2bWriter &vhw = *(device_context.pVHW2b); - CVerusHashV2 &vh = vhw.GetState(); - verusclhasher &vclh = vh.vclh; - - alignas(32) uint256 curHash, curTarget = ArithToUint256(target); - - const uint64_t *compResult = (uint64_t *)&curHash; - const uint64_t *compTarget = (uint64_t *)&curTarget; - - u128 *hashKey = (u128 *)verusclhasher_key.get(); - verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get(); - void *hasherrefresh = ((unsigned char *)hashKey) + pdesc->keySizeInBytes; - const int keyrefreshsize = vclh.keyrefreshsize(); // number of 256 bit blocks - - bh.nSolution = std::vector(1344); - bh.nSolution[0] = VERUSHHASH_SOLUTION_VERSION; // earliest VerusHash 2.0 solution version - - // prepare the hash state - vhw.Reset(); - vhw << bh; - - int64_t *extraPtr = vhw.xI64p(); - unsigned char *curBuf = vh.CurBuffer(); - - // skip keygen if it is the current key - if (pdesc->seed != *((uint256 *)curBuf)) - { - // generate a new key by chain hashing with Haraka256 from the last curbuf - // assume 256 bit boundary - int n256blks = pdesc->keySizeInBytes >> 5; - unsigned char *pkey = ((unsigned char *)hashKey); - unsigned char *psrc = curBuf; - for (int i = 0; i < n256blks; i++) - { - haraka256(pkey, psrc); - psrc = pkey; - pkey += 32; - } - pdesc->seed = *((uint256 *)curBuf); - memcpy(hasherrefresh, hashKey, pdesc->keySizeInBytes); - } - - const __m128i shuf1 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0); - const __m128i fill1 = _mm_shuffle_epi8(_mm_load_si128((u128 *)curBuf), shuf1); - const __m128i shuf2 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0); - unsigned char ch = curBuf[0]; - - // loop the requested number of times or until canceled. determine if we - // found a winner, and send all winners found as solutions. count only one hash. 
- // hashrate is determined by multiplying hash by VERUSHASHES_PER_SOLVE, with VerusHash, only - // hashrate and sharerate are valid, solutionrate will equal sharerate - for (int64_t i = 0; i < VERUSHASHES_PER_SOLVE; i++) - { - *extraPtr = i; - - // prepare the buffer - _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill1); - curBuf[32 + 15] = ch; - - // run verusclhash on the buffer - const uint64_t intermediate = vclh(curBuf, hashKey); - - // fill buffer to the end with the result and final hash - __m128i fill2 = _mm_shuffle_epi8(_mm_loadl_epi64((u128 *)&intermediate), shuf2); - _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill2); - curBuf[32 + 15] = *((unsigned char *)&intermediate); - - haraka512_keyed_local((unsigned char *)&curHash, curBuf, hashKey + vh.IntermediateTo128Offset(intermediate)); - - if (compResult[3] > compTarget[3] || (compResult[3] == compTarget[3] && compResult[2] > compTarget[2]) || - (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] > compTarget[1]) || - (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] == compTarget[1] && compResult[0] > compTarget[0])) - { - // refresh the key - memcpy(hashKey, hasherrefresh, keyrefreshsize); - continue; - } - - std::vector solution = bh.nSolution; - int extraSpace = (solution.size() % 32) + 15; - assert(solution.size() > 32); - *((int64_t *)&(solution.data()[solution.size() - extraSpace])) = i; - - solutionf(std::vector(0), solution.size(), solution.data()); - if (cancelf()) return; - - // refresh the key - memcpy(hashKey, hasherrefresh, keyrefreshsize); - } - hashdonef(); -} - - -void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) { - u128 s[4], tmp; - - s[0] = LOAD(in); - s[1] = LOAD(in + 16); - s[2] = LOAD(in + 32); - s[3] = LOAD(in + 48); - - AES4(s[0], s[1], s[2], s[3], 0); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 8); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 16); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 24); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 32); - MIX4(s[0], s[1], s[2], s[3]); +uint64_t verusclhashv2_2(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, + u128 *g_prand, u128 *g_prandex) { + __m128i acc = __verusclmulwithoutreduction64alignedrepeatv2_2((__m128i *)random, (const __m128i *)buf, 511, fixrand, fixrandex, g_prand, g_prandex); + acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64)); - s[0] = _mm_xor_si128(s[0], LOAD(in)); - s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); - s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); - s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); - TRUNCSTORE(out, s[0], s[1], s[2], s[3]); + return precompReduction64(acc); } -*/ #ifdef _WIN32 diff --git a/verus/verus_clhash.h b/verus/verus_clhash.h index ea3e94b3a2..5a5a2723f9 100644 --- a/verus/verus_clhash.h +++ b/verus/verus_clhash.h @@ -123,7 +123,9 @@ inline void ForceCPUVerusOptimized(bool trueorfalse) __cpuverusoptimized = trueorfalse; }; -uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, +uint64_t verusclhashv2_1(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, + u128 *g_prand, u128 *g_prandex); +uint64_t verusclhashv2_2(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex); uint64_t verusclhash_port(void * 
random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex); diff --git a/verus/verus_hash.cpp b/verus/verus_hash.cpp index 93b7a6e17b..8b153e83cd 100644 --- a/verus/verus_hash.cpp +++ b/verus/verus_hash.cpp @@ -1,590 +1,181 @@ -/* - * This uses veriations of the clhash algorithm for Verus Coin, licensed - * with the Apache-2.0 open source license. - * - * Copyright (c) 2018 Michael Toutonghi - * Distributed under the Apache 2.0 software license, available in the original form for clhash - * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a - * - * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser - * Faster 64-bit universal hashing - * using carry-less multiplications, Journal of Cryptographic Engineering (to appear) - * - * Best used on recent x64 processors (Haswell or better). - * - * This implements an intermediate step in the last part of a Verus block hash. The intent of this step - * is to more effectively equalize FPGAs over GPUs and CPUs. - * - **/ - - -#include "verus_clhash.h" +// (C) 2018 The Verus Developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. - -#include +/* +This provides the PoW hash function for Verus, a CPU-optimized hash +function with a Haraka V2 core. Unlike Haraka, which is made for short +inputs only, Verus Hash takes any length of input and produces a 256 +bit output. +*/ #include -//#include -//#include "cpu_verushash.hpp" - -#ifdef _WIN32 -#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) -#endif +//#include "common.h" +#include "verus_hash.h" +void (*CVerusHash::haraka512Function)(unsigned char *out, const unsigned char *in); -int __cpuverusoptimized = 0x80; - -// multiply the length and the some key, no modulo -static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) { - const __m128i lengthvector = _mm_set_epi64x(keylength,length); - const __m128i clprod1 = _mm_clmulepi64_si128( lengthvector, lengthvector, 0x10); - return clprod1; -} - -// modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64 -static inline __m128i precompReduction64_si128( __m128i A) { - - //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. 
(64,4,3,1,0) - const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); - __m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01); - __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153), - _mm_srli_si128(Q2,8)); - __m128i Q4 = _mm_xor_si128(Q2,A); - const __m128i final = _mm_xor_si128(Q3,Q4); - return final;/// WARNING: HIGH 64 BITS CONTAIN GARBAGE -} +void CVerusHash::Hash(void *result, const void *data, size_t _len) +{ + unsigned char buf[128]; + unsigned char *bufPtr = buf; + int nextOffset = 64; + uint32_t pos = 0, len = _len; + unsigned char *bufPtr2 = bufPtr + nextOffset; + unsigned char *ptr = (unsigned char *)data; + + // put our last result or zero at beginning of buffer each time + memset(bufPtr, 0, 32); + + // digest up to 32 bytes at a time + for ( ; pos < len; pos += 32) + { + if (len - pos >= 32) + { + memcpy(bufPtr + 32, ptr + pos, 32); + } + else + { + int i = (int)(len - pos); + memcpy(bufPtr + 32, ptr + pos, i); + memset(bufPtr + 32 + i, 0, 32 - i); + } + (*haraka512Function)(bufPtr2, bufPtr); + bufPtr2 = bufPtr; + bufPtr += nextOffset; + nextOffset *= -1; + } + memcpy(result, bufPtr, 32); +}; -static inline uint64_t precompReduction64( __m128i A) { - return _mm_cvtsi128_si64(precompReduction64_si128(A)); +void CVerusHash::init() +{ + + haraka512Function = &haraka512_port_zero; + } -// verus intermediate hash extra -static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, - uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex) +CVerusHash &CVerusHash::Write(const unsigned char *data, size_t _len) { - __m128i *pbuf; - __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] }; - - // divide key mask by 16 from bytes to __m128i - keyMask >>= 4; - - __m128i acc = _mm_load_si128(randomsource + (keyMask + 2)); - - // the random buffer must have at least 32 16 byte dwords after the keymask to work with this - // algorithm. 
we take the value from the last element inside the keyMask + 2, as that will never - // be used to xor into the accumulator before it is hashed with other values first -#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0) + unsigned char *tmp; + uint32_t pos, len = _len; -#define LIONELK_FETCH_DIST 0 - -#pragma unroll 32 - - for (uint64_t i = 0; i < 32; i++) + // digest up to 32 bytes at a time + for ( pos = 0; pos < len; ) { - - const uint64_t selector = _mm_cvtsi128_si64(acc); - - // get two random locations in the key, which will be mutated and swapped - __m128i *prand = randomsource + ((selector >> 5) & keyMask); - __m128i *prandex = randomsource + ((selector >> 32) & keyMask); - - // select random start and order of pbuf processing - pbuf = pbuf_copy + (selector & 3); - uint32_t prand_idx = (selector >> 5) & keyMask; - uint32_t prandex_idx = (selector >>32) & keyMask; - g_prand[i] = prand[0]; - g_prandex[i] = prandex[0]; - fixrand[i] = prand_idx; - fixrandex[i] = prandex_idx; + uint32_t room = 32 - curPos; - switch (selector & 0x1c) + if (len - pos >= room) { - - case 0: - { - const __m128i temp1 = _mm_load_si128(prandex); - const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - const __m128i temp12 = _mm_load_si128(prand); - _mm_store_si128(prand, tempa2); - - const __m128i temp22 = _mm_load_si128(pbuf); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); - acc = _mm_xor_si128(clprod12, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 4: - { - const __m128i temp1 = _mm_load_si128(prand); - const __m128i temp2 = _mm_load_si128(pbuf); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - const __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10); - acc = _mm_xor_si128(clprod2, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - const __m128i temp12 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - - const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - acc = _mm_xor_si128(add12, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prand, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 8: - { - const __m128i temp1 = _mm_load_si128(prandex); - const __m128i temp2 = _mm_load_si128(pbuf); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - acc = _mm_xor_si128(add1, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - const __m128i temp12 = 
_mm_load_si128(prand); - _mm_store_si128(prand, tempa2); - - const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); - acc = _mm_xor_si128(clprod12, acc); - const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); - acc = _mm_xor_si128(clprod22, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 0xc: - { - const __m128i temp1 = _mm_load_si128(prand); - const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - - // cannot be zero here - const int32_t divisor = (uint32_t)selector; - - acc = _mm_xor_si128(add1, acc); - - const int64_t dividend = _mm_cvtsi128_si64(acc); - const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); - acc = _mm_xor_si128(modulo, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - if (dividend & 1) - { - const __m128i temp12 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - - const __m128i temp22 = _mm_load_si128(pbuf); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); - acc = _mm_xor_si128(clprod12, acc); - const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); - acc = _mm_xor_si128(clprod22, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prand, tempb2); - } - else - { - const __m128i tempb3 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - _mm_store_si128(prand, tempb3); - } - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 0x10: - { - // a few AES operations - const __m128i *rc = prand; - __m128i tmp; - - __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - __m128i temp2 = _mm_load_si128(pbuf); - - AES2(temp1, temp2, 0); - - MIX2(temp1, temp2); - - AES2(temp1, temp2, 4); - MIX2(temp1, temp2); - - AES2(temp1, temp2, 8); - MIX2(temp1, temp2); - - acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc)); - - const __m128i tempa1 = _mm_load_si128(prand); - const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); - - const __m128i tempa4 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa3); - _mm_store_si128(prand, tempa4); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 0x14: - { - // we'll just call this one the monkins loop, inspired by Chris - const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); - __m128i tmp; // used by MIX2 - - uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times - __m128i *rc = prand; - uint64_t aesroundoffset = 0,loop_c; - __m128i onekey; - - do - { - loop_c = selector & (((uint64_t)0x10000000) << rounds); - if (loop_c) - { - onekey = _mm_load_si128(rc++); - const __m128i temp2 = _mm_load_si128(rounds & 
1 ? pbuf : buftmp); - const __m128i add1 = _mm_xor_si128(onekey, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - } - else - { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); - - AES2(onekey, temp2, aesroundoffset); - - aesroundoffset += 4; - MIX2(onekey, temp2); - - acc = _mm_xor_si128(onekey, acc); - acc = _mm_xor_si128(temp2, acc); - } - - } while (rounds--); - - const __m128i tempa1 = _mm_load_si128(prand); - const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); - - const __m128i tempa4 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa3); - _mm_store_si128(prand, tempa4); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 0x18: - { - __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); - __m128i tmp; // used by MIX2 - - uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times - __m128i *rc = prand; - uint64_t aesroundoffset = 0; - __m128i onekey; - - do - { - if (selector & (((uint64_t)0x10000000) << rounds)) - { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); - __m128i add1 = _mm_xor_si128(onekey, temp2); - // cannot be zero here, may be negative - int32_t divisor = (uint32_t)selector; - int64_t dividend = _mm_cvtsi128_si64(add1); - __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); - acc = _mm_xor_si128(modulo, acc); - } - else - { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); - __m128i add1 = _mm_xor_si128(onekey, temp2); - __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - __m128i clprod2 = _mm_mulhrs_epi16(acc, clprod1); - acc = _mm_xor_si128(clprod2, acc); - } - } while (rounds--); - - __m128i tempa3 = _mm_load_si128(prandex); - __m128i tempa4 = _mm_xor_si128(tempa3, acc); - _mm_store_si128(prandex, tempa4); - _mm_store_si128(prand, onekey); - break; - } - case 0x1c: - { - const __m128i temp1 = _mm_load_si128(pbuf); - const __m128i temp2 = _mm_load_si128(prandex); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); - - const __m128i tempa3 = _mm_load_si128(prand); -#ifdef VERUSHASHDEBUGo - - printf("[cpu] tempa1 : "); - printf("%016llx%016llx", ((uint64_t*)&tempa1)[0], ((uint64_t*)&tempa1)[1]); - printf("\n"); - printf("[cpu] tempa2 : "); - printf("%016llx%016llx", ((uint64_t*)&tempa2)[0], ((uint64_t*)&tempa2)[1]); - printf("\n"); - printf("[cpu] tempa3 : "); - printf("%016llx%016llx", ((uint64_t*)&tempa3)[0], ((uint64_t*)&tempa3)[1]); - printf("\n"); - -#endif - _mm_store_si128(prand, tempa2); - - acc = _mm_xor_si128(tempa3, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); - const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); - _mm_store_si128(prandex, tempb2); - - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } + memcpy(curBuf + 32 + curPos, data + pos, room); + (*haraka512Function)(result, curBuf); + tmp = curBuf; + curBuf = result; + result = tmp; + pos += room; + curPos = 0; + } + else + { + 
memcpy(curBuf + 32 + curPos, data + pos, len - pos); + curPos += len - pos; + pos = len; } - - - } - - return acc; + return *this; } -// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times, -// returning a 64 bit hash value -uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, - u128 *g_prand, u128 *g_prandex) { - __m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, fixrand, fixrandex, g_prand, g_prandex); - acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64)); - - - return precompReduction64(acc); +// to be declared and accessed from C +void verus_hash(void *result, const void *data, size_t len) +{ + return CVerusHash::Hash(result, data, len); } -inline void haraka512_keyed_local(unsigned char *out, const unsigned char *in, const u128 *rc) { - u128 s[4], tmp; - - s[0] = LOAD(in); - s[1] = LOAD(in + 16); - s[2] = LOAD(in + 32); - s[3] = LOAD(in + 48); - - AES4(s[0], s[1], s[2], s[3], 0); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 8); - MIX4(s[0], s[1], s[2], s[3]); +void (*CVerusHashV2::haraka512Function)(unsigned char *out, const unsigned char *in); +void (*CVerusHashV2::haraka512KeyedFunction)(unsigned char *out, const unsigned char *in, const u128 *rc); +void (*CVerusHashV2::haraka256Function)(unsigned char *out, const unsigned char *in); - AES4(s[0], s[1], s[2], s[3], 16); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 24); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 32); - MIX4(s[0], s[1], s[2], s[3]); - - s[0] = _mm_xor_si128(s[0], LOAD(in)); - s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); - s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); - s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); - - // TRUNCSTORE(out, s[0], s[1], s[2], s[3]); -} -/* -void cpu_verushash::solve_verus_v2_opt(CBlockHeader &bh, - arith_uint256 &target, - std::function cancelf, - std::function&, size_t, const unsigned char*)> solutionf, - std::function hashdonef, - cpu_verushash &device_context) +void CVerusHashV2::init() { - CVerusHashV2bWriter &vhw = *(device_context.pVHW2b); - CVerusHashV2 &vh = vhw.GetState(); - verusclhasher &vclh = vh.vclh; - - alignas(32) uint256 curHash, curTarget = ArithToUint256(target); - - const uint64_t *compResult = (uint64_t *)&curHash; - const uint64_t *compTarget = (uint64_t *)&curTarget; - - u128 *hashKey = (u128 *)verusclhasher_key.get(); - verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get(); - void *hasherrefresh = ((unsigned char *)hashKey) + pdesc->keySizeInBytes; - const int keyrefreshsize = vclh.keyrefreshsize(); // number of 256 bit blocks - - bh.nSolution = std::vector(1344); - bh.nSolution[0] = VERUSHHASH_SOLUTION_VERSION; // earliest VerusHash 2.0 solution version + if (IsCPUVerusOptimized()) + { + load_constants(); + haraka512Function = &haraka512; + haraka512KeyedFunction = &haraka512_keyed; + haraka256Function = &haraka256; + } + else + { + // load the haraka constants + load_constants_port(); + haraka512Function = &haraka512_port; + haraka512KeyedFunction = &haraka512_port_keyed; + haraka256Function = &haraka256_port; + } +} - // prepare the hash state - vhw.Reset(); - vhw << bh; +void CVerusHashV2::Hash(void *result, const void *data, size_t len) +{ + unsigned char buf[128]; + unsigned char *bufPtr = buf; + int pos = 0, nextOffset = 64; + unsigned char *bufPtr2 = bufPtr + nextOffset; + unsigned char *ptr = (unsigned 
-    int64_t *extraPtr = vhw.xI64p();
-    unsigned char *curBuf = vh.CurBuffer();
+    // put our last result or zero at beginning of buffer each time
+    memset(bufPtr, 0, 32);
-    // skip keygen if it is the current key
-    if (pdesc->seed != *((uint256 *)curBuf))
+    // digest up to 32 bytes at a time
+    for ( ; pos < len; pos += 32)
     {
-        // generate a new key by chain hashing with Haraka256 from the last curbuf
-        // assume 256 bit boundary
-        int n256blks = pdesc->keySizeInBytes >> 5;
-        unsigned char *pkey = ((unsigned char *)hashKey);
-        unsigned char *psrc = curBuf;
-        for (int i = 0; i < n256blks; i++)
+        if (len - pos >= 32)
         {
-            haraka256(pkey, psrc);
-            psrc = pkey;
-            pkey += 32;
+            memcpy(bufPtr + 32, ptr + pos, 32);
         }
-        pdesc->seed = *((uint256 *)curBuf);
-        memcpy(hasherrefresh, hashKey, pdesc->keySizeInBytes);
+        else
+        {
+            int i = (int)(len - pos);
+            memcpy(bufPtr + 32, ptr + pos, i);
+            memset(bufPtr + 32 + i, 0, 32 - i);
+        }
+        (*haraka512Function)(bufPtr2, bufPtr);
+        bufPtr2 = bufPtr;
+        bufPtr += nextOffset;
+        nextOffset *= -1;
     }
+    memcpy(result, bufPtr, 32);
+};
-    const __m128i shuf1 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0);
-    const __m128i fill1 = _mm_shuffle_epi8(_mm_load_si128((u128 *)curBuf), shuf1);
-    const __m128i shuf2 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0);
-    unsigned char ch = curBuf[0];
-
-    // loop the requested number of times or until canceled. determine if we
-    // found a winner, and send all winners found as solutions. count only one hash.
-    // hashrate is determined by multiplying hash by VERUSHASHES_PER_SOLVE, with VerusHash, only
-    // hashrate and sharerate are valid, solutionrate will equal sharerate
-    for (int64_t i = 0; i < VERUSHASHES_PER_SOLVE; i++)
-    {
-        *extraPtr = i;
-
-        // prepare the buffer
-        _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill1);
-        curBuf[32 + 15] = ch;
-
-        // run verusclhash on the buffer
-        const uint64_t intermediate = vclh(curBuf, hashKey);
-
-        // fill buffer to the end with the result and final hash
-        __m128i fill2 = _mm_shuffle_epi8(_mm_loadl_epi64((u128 *)&intermediate), shuf2);
-        _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill2);
-        curBuf[32 + 15] = *((unsigned char *)&intermediate);
+CVerusHashV2 &CVerusHashV2::Write(const unsigned char *data, size_t len)
+{
+    unsigned char *tmp;
-        haraka512_keyed_local((unsigned char *)&curHash, curBuf, hashKey + vh.IntermediateTo128Offset(intermediate));
+    // digest up to 32 bytes at a time
+    for ( int pos = 0; pos < len; )
+    {
+        int room = 32 - curPos;
-        if (compResult[3] > compTarget[3] || (compResult[3] == compTarget[3] && compResult[2] > compTarget[2]) ||
-            (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] > compTarget[1]) ||
-            (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] == compTarget[1] && compResult[0] > compTarget[0]))
+        if (len - pos >= room)
         {
-            // refresh the key
-            memcpy(hashKey, hasherrefresh, keyrefreshsize);
-            continue;
+            memcpy(curBuf + 32 + curPos, data + pos, room);
+            (*haraka512Function)(result, curBuf);
+            tmp = curBuf;
+            curBuf = result;
+            result = tmp;
+            pos += room;
+            curPos = 0;
         }
-
-        std::vector<unsigned char> solution = bh.nSolution;
-        int extraSpace = (solution.size() % 32) + 15;
-        assert(solution.size() > 32);
-        *((int64_t *)&(solution.data()[solution.size() - extraSpace])) = i;
-
-        solutionf(std::vector<uint32_t>(0), solution.size(), solution.data());
-        if (cancelf()) return;
-
-        // refresh the key
-        memcpy(hashKey, hasherrefresh, keyrefreshsize);
-    }
-    hashdonef();
-}
-
-
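The CVerusHashV2::Hash routine added above absorbs the input 32 bytes per round: each round compresses a 64-byte block (32-byte chain value followed by a 32-byte message chunk) and ping-pongs between the two halves of a single 128-byte buffer, so the output of one compression sits in place as the chain value of the next. A self-contained sketch of that choreography, where toy_compress512 is a made-up stand-in for the real (*haraka512Function):

    // Sketch only: toy_compress512 is NOT Haraka512, just a placeholder
    // 64 -> 32 byte mix so the buffer choreography can run standalone.
    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    static void toy_compress512(unsigned char *out, const unsigned char *in)
    {
        for (int i = 0; i < 32; i++)
            out[i] = (unsigned char)(in[i] * 31 + in[i + 32] * 131 + i);
    }

    static void toy_hash(void *result, const void *data, size_t len)
    {
        unsigned char buf[128];
        unsigned char *bufPtr  = buf;       // active block: chain value + message chunk
        unsigned char *bufPtr2 = buf + 64;  // the other half; receives each compression
        int nextOffset = 64;
        const unsigned char *ptr = (const unsigned char *)data;

        memset(bufPtr, 0, 32);              // initial chain value is all zero
        for (size_t pos = 0; pos < len; pos += 32)
        {
            size_t n = (len - pos >= 32) ? 32 : len - pos;
            memcpy(bufPtr + 32, ptr + pos, n);   // next 32 message bytes
            memset(bufPtr + 32 + n, 0, 32 - n);  // zero-pad the final chunk
            toy_compress512(bufPtr2, bufPtr);    // compress into the other half,
            bufPtr2 = bufPtr;                    // then swap roles: the output is the
            bufPtr += nextOffset;                // chain value of the next round
            nextOffset *= -1;
        }
        memcpy(result, bufPtr, 32);              // first 32 bytes of the last output
    }

    int main()
    {
        unsigned char digest[32];
        const char msg[] = "ping-pong buffering demo";
        toy_hash(digest, msg, sizeof(msg));
        for (int i = 0; i < 32; i++) printf("%02x", digest[i]);
        printf("\n");
        return 0;
    }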
-void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) {
-    u128 s[4], tmp;
-
-    s[0] = LOAD(in);
-    s[1] = LOAD(in + 16);
-    s[2] = LOAD(in + 32);
-    s[3] = LOAD(in + 48);
-
-    AES4(s[0], s[1], s[2], s[3], 0);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    AES4(s[0], s[1], s[2], s[3], 8);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    AES4(s[0], s[1], s[2], s[3], 16);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    AES4(s[0], s[1], s[2], s[3], 24);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    AES4(s[0], s[1], s[2], s[3], 32);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    s[0] = _mm_xor_si128(s[0], LOAD(in));
-    s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
-    s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
-    s[3] = _mm_xor_si128(s[3], LOAD(in + 48));
-
-    TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
+        else
+        {
+            memcpy(curBuf + 32 + curPos, data + pos, len - pos);
+            curPos += len - pos;
+            pos = len;
+        }
+    }
+    return *this;
 }
-*/
-
-#ifdef _WIN32
-#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
-#endif
-
-void *alloc_aligned_buffer(uint64_t bufSize)
+// to be declared and accessed from C
+void verus_hash_v2(void *result, const void *data, size_t len)
 {
-    void *answer = NULL;
-    if (posix_memalign(&answer, sizeof(__m256i), bufSize))
-    {
-        return NULL;
-    }
-    else
-    {
-        return answer;
-    }
+    return CVerusHashV2::Hash(result, data, len);
 }
diff --git a/verus/verusscan.cpp b/verus/verusscan.cpp
index 4d5c2d536c..f8dae57bf5 100644
--- a/verus/verusscan.cpp
+++ b/verus/verusscan.cpp
@@ -130,20 +130,27 @@ extern "C" void VerusHashHalf(void *result2, unsigned char *data, size_t len)
 extern "C" void Verus2hash(unsigned char *hash, unsigned char *curBuf, uint32_t nonce,
-    u128 *data_key, uint8_t *gpu_init, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex)
+    u128 *data_key, uint8_t *gpu_init, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand,
+    u128 *g_prandex, int version)
 {
     //uint64_t mask = VERUS_KEY_SIZE128; //552
-
-    memcpy(curBuf + 47, curBuf, 16);
-    memcpy(curBuf + 63, curBuf, 1);
+    static const __m128i shuf1 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0);
+    const __m128i fill1 = _mm_shuffle_epi8(_mm_load_si128((u128 *)curBuf), shuf1);
+    static const __m128i shuf2 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0);
+    unsigned char ch = curBuf[0];
+    _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill1);
+    curBuf[32 + 15] = ch;
     // FillExtra((u128 *)curBuf);
-
+    uint64_t intermediate;
     ((uint32_t*)&curBuf[0])[8] = nonce;
-    uint64_t intermediate = verusclhash(data_key, curBuf, 8191, fixrand, fixrandex, g_prand, g_prandex);
+    if (version == 3)
+        intermediate = verusclhashv2_1(data_key, curBuf, 511, fixrand, fixrandex, g_prand, g_prandex);
+    else
+        intermediate = verusclhashv2_2(data_key, curBuf, 511, fixrand, fixrandex, g_prand, g_prandex);
     //FillExtra
-    memcpy(curBuf + 47, &intermediate, 8);
-    memcpy(curBuf + 55, &intermediate, 8);
-    memcpy(curBuf + 63, &intermediate, 1);
+    __m128i fill2 = _mm_shuffle_epi8(_mm_loadl_epi64((u128 *)&intermediate), shuf2);
+    _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill2);
+    curBuf[32 + 15] = *((unsigned char *)&intermediate);
     intermediate &= 511;
     haraka512_keyed(hash, curBuf, data_key + intermediate);
     FixKey(fixrand, fixrandex, data_key, g_prand, g_prandex);
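The hunk above replaces the three byte-wise memcpy() fills at offsets 47/55/63 with a single byte store plus one shuffled 16-byte store at curBuf[48]. A standalone sanity check (not part of the patch; little-endian x86, compile with -mssse3) showing that the two write patterns leave identical buffer contents:

    #include <emmintrin.h>
    #include <tmmintrin.h>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        uint64_t intermediate = 0x1122334455667788ULL;
        unsigned char a[64] = { 0 }, b[64] = { 0 };

        // removed path: repeat the 8 result bytes from offset 47 to the block end
        memcpy(a + 47, &intermediate, 8);
        memcpy(a + 55, &intermediate, 8);
        memcpy(a + 63, &intermediate, 1);

        // added path: one byte store plus one shuffled 16-byte store
        const __m128i shuf2 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0);
        __m128i fill2 = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i *)&intermediate), shuf2);
        _mm_storeu_si128((__m128i *)&b[32 + 16], fill2);   // unaligned store here; the
        b[32 + 15] = *((unsigned char *)&intermediate);    // patch uses an aligned buffer

        printf("FillExtra write patterns %s\n", memcmp(a, b, 64) == 0 ? "match" : "differ");
        return 0;
    }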
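The hunk that follows folds the two 32-entry scratch tables (data_key_prand / data_key_prandex), previously stack arrays, into the tail of the single data_key heap allocation. The extra 1024 bytes are exactly two tables of 32 sixteen-byte u128 entries. A minimal sketch of the layout arithmetic; u128_t is a stand-in for the real u128 vector type, and the 552-word key size is taken from the "//552" comments above:

    #include <cstdint>
    #include <cstdlib>

    typedef struct { uint64_t lo, hi; } u128_t;  // 16 bytes, like u128

    #define VERUS_KEY_SIZE128 552                // key length in 128-bit words
    #define VERUS_KEY_SIZE (VERUS_KEY_SIZE128 * sizeof(u128_t))

    int main()
    {
        static_assert(2 * 32 * sizeof(u128_t) == 1024, "scratch area is 1024 bytes");

        // one block: the key, then 32 prand entries, then 32 prandex entries
        u128_t *data_key = (u128_t *)malloc(VERUS_KEY_SIZE + 1024);
        if (!data_key) return 1;
        u128_t *data_key_prand   = data_key + VERUS_KEY_SIZE128;       // key end .. +512 bytes
        u128_t *data_key_prandex = data_key + VERUS_KEY_SIZE128 + 32;  // +512 .. +1024 bytes
        (void)data_key_prand; (void)data_key_prandex;
        free(data_key);
        return 0;
    }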
@@ -158,30 +165,31 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce,
 //    unsigned char data[] = { 0x04, 0x00, 0x01, /* ~1.5 KB of commented-out test-block bytes, nearly all 0x00 apart from the 0xfd, 0x40, 0x05, 0x01 solution marker, elided */ };
-    uint32_t _ALIGN(64) endiandata[35];
+    uint32_t _ALIGN(4) endiandata[35];
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
-    uint8_t blockhash_half[64] = { 0 };
+    uint8_t _ALIGN(4) blockhash_half[64] = { 0 };
     uint8_t gpuinit = 0;
     struct timeval tv_start, tv_end, diff;
-    double secs, solps;
-    u128 *data_key = (u128*)malloc(VERUS_KEY_SIZE);
+    //double secs, solps;
+    u128 *data_key = (u128*)malloc(VERUS_KEY_SIZE + 1024);
     //u128 *data_key_master = NULL;
     // posix_memalign((void**)&data_key, sizeof(__m128i), VERUS_KEY_SIZE);
-    u128 data_key_prand[32];
-    u128 data_key_prandex[32];
+    u128 *data_key_prand = data_key + VERUS_KEY_SIZE128;
+    u128 *data_key_prandex = data_key + VERUS_KEY_SIZE128 + 32;
     //u128 data_key[VERUS_KEY_SIZE128] = { 0 }; // 552 required
     //u128 data_key_master[VERUS_KEY_SIZE128] = { 0 };
     uint32_t nonce_buf = 0;
-    uint32_t fixrand[32];
-    uint32_t fixrandex[32];
+    uint32_t _ALIGN(4) fixrand[32];
+    uint32_t _ALIGN(4) fixrandex[32];
-    unsigned char block_41970[] = { 0xfd, 0x40, 0x05, 0x03 };
-    uint8_t _ALIGN(64) full_data[140 + 3 + 1344] = { 0 };
+    uint8_t version = work->hash_ver;
+    unsigned char block_41970[4] = { 0xfd, 0x40, 0x05 };
+    block_41970[3] = version;
+    uint8_t _ALIGN(4) full_data[140 + 3 + 1344] = { 0 };
     uint8_t* sol_data = &full_data[140];
-    memcpy(endiandata, pdata, 140);
     memcpy(sol_data, block_41970, 4);
     memcpy(full_data, endiandata, 140);
@@ -202,7 +210,7 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce,
     *hashes_done = nonce_buf + throughput;
     Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, data_key,
-        &gpuinit, fixrand, fixrandex, data_key_prand, data_key_prandex);
+        &gpuinit, fixrand, fixrandex, data_key_prand, data_key_prandex, version);
     if (vhash[7] <= Htarg)
     {
@@ -230,10 +238,10 @@ Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, data_key,
 out:
-//    gettimeofday(&tv_end, NULL);
-//    timeval_subtract(&diff, &tv_end, &tv_start);
-//    secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec);
-//    solps = (double)nonce_buf / secs;
+    gettimeofday(&tv_end, NULL);
+    //timeval_subtract(&diff, &tv_end, &tv_start);
+    //secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec);
+    //solps = (double)nonce_buf / secs;
     pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
     free(data_key);
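Tying the two hunks together: work->hash_ver supplies both the solution-version byte and the round-function selection. In scanhash_verus it becomes block_41970[3], the first byte of the 1344-byte solution that follows the 0xfd 0x40 0x05 compact-size marker (0x0540 = 1344), and inside Verus2hash the same value picks verusclhashv2_1 for version 3 and verusclhashv2_2 otherwise. A hedged end-to-end sketch, with a string standing in for the real dispatch:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static const char *clhash_variant(int version)
    {
        // mirrors the dispatch in Verus2hash: solution version 3 selects the
        // VerusHash v2.1 round function, anything else the v2.2 one
        return (version == 3) ? "verusclhashv2_1" : "verusclhashv2_2";
    }

    int main()
    {
        uint8_t hash_ver = 3;                           // as read from work->hash_ver
        unsigned char block_41970[4] = { 0xfd, 0x40, 0x05 };
        block_41970[3] = hash_ver;                      // compact-size marker + version byte

        uint8_t full_data[140 + 3 + 1344] = { 0 };
        uint8_t *sol_data = &full_data[140];            // solution sits after the 140-byte header
        memcpy(sol_data, block_41970, 4);

        printf("version %u -> %s, solution prefix %02x %02x %02x %02x\n",
               (unsigned)block_41970[3], clhash_variant(block_41970[3]),
               sol_data[0], sol_data[1], sol_data[2], sol_data[3]);
        return 0;
    }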