diff --git a/ccminer.VC.VC.opendb b/ccminer.VC.VC.opendb deleted file mode 100644 index c4b2a0be40..0000000000 Binary files a/ccminer.VC.VC.opendb and /dev/null differ diff --git a/ccminer.cpp b/ccminer.cpp index 0d99af2982..4660f5e207 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1,4 +1,4 @@ -/* +/* * Copyright 2010 Jeff Garzik * Copyright 2012-2014 pooler * Copyright 2014-2017 tpruvot @@ -42,7 +42,8 @@ #include "miner.h" #include "algos.h" - +#include "sia/sia-rpc.h" +#include "crypto/xmr-rpc.h" #include "equi/equihash.h" //#include @@ -1600,6 +1601,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) memcpy(&work->data[9], sctx->job.coinbase, 32+32); // merkle [9..16] + reserved work->data[25] = le32dec(sctx->job.ntime); work->data[26] = le32dec(sctx->job.nbits); + work->hash_ver = sctx->job.hash_ver; memcpy(&work->data[27], sctx->xnonce1, sctx->xnonce1_size & 0x1F); // pool extranonce work->data[35] = 0x80; //applog_hex(work->data, 140); @@ -1744,7 +1746,7 @@ static bool wanna_mine(int thr_id) float temp = gpu_temp(cgpu); if (temp > opt_max_temp) { if (!conditional_state[thr_id] && !opt_quiet) - gpulog(LOG_INFO, thr_id, "temperature too high (%.0f°c), waiting...", temp); + gpulog(LOG_INFO, thr_id, "temperature too high (%.0f°c), waiting...", temp); state = false; } else if (opt_max_temp > 0. && opt_resume_temp > 0. && conditional_state[thr_id] && temp > opt_resume_temp) { if (!thr_id && opt_debug) @@ -1838,7 +1840,7 @@ static void *miner_thread(void *userdata) } /* Cpu thread affinity */ - if (num_cpus > 1) { + /*if (num_cpus > 1) { if (opt_affinity == -1L && opt_n_threads > 1) { if (opt_debug) applog(LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)", thr_id, @@ -1850,7 +1852,7 @@ static void *miner_thread(void *userdata) (long) opt_affinity); affine_to_cpu_mask(thr_id, (unsigned long) opt_affinity); } - } + }*/ @@ -3436,6 +3438,18 @@ BOOL WINAPI ConsoleHandler(DWORD dwType) } #endif +void Clear() +{ +#if defined _WIN32 + system("cls"); +#elif defined (__LINUX__) || defined(__gnu_linux__) || defined(__linux__) + system("clear"); +#elif defined (__APPLE__) + system("clear"); +#endif +} + + int main(int argc, char *argv[]) { struct thr_info *thr; @@ -3444,15 +3458,34 @@ int main(int argc, char *argv[]) // get opt_quiet early parse_single_opt('q', argc, argv); - - printf("*** ccminer " PACKAGE_VERSION " for CPU's by Monkins1010 based on ccminer***\n"); - if (!opt_quiet) { - const char* arch = is_x64() ? "64-bits" : "32-bits"; - - printf(" Originally based on Christian Buchner and Christian H. project\n"); - printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n"); - printf("Verus donation address: REoPcdGXthL5yeTCrJtrQv5xhYTknbFbec (monkins)\n"); - } + + Clear(); + + printf(" .. ..\n"); + printf(" .lkK0o. 'd0Kkc. \n"); + printf(" cKWMMMW0; 'dXMMMMWK: \n"); + printf(" :XMMMMMMMXc. lKMMMMMMMMK; .\n"); + printf(" lWMMMMMMMMNl., OWMMMMMMMMMNkoxkO0OOko:. 'odddddl,;okOOdodddddc. ,oddddo' ., ldkO00Okxo : .\n"); + printf(" ,0MMMMMMMMMNxdXMMMMMMMMMMNNWMMMMMMMMMWK: .xWMMMMMWXWMMMNKNMMMMWd. .kMMMMM0,'oKWMMMMMMMMMMWKc\n"); + printf(" ;KMMMMMMMMMMMMMMMMMMMMWWWMMMN0kOXMMMMMK; :XMMMMMMMMMMMWKKWMMMMK, cNMMMMWdlKMMMMXxlcdXMMMMM0'\n"); + printf(" :KMMMMMMMMMMMMMMMMMWWWWMMNd'. :XMMMMNdkWMMMMWN0oc:::dXMMMMWd. .kMMMMM0o0MMMMWx. :xxxxxl. \n"); + printf(" :KMMMMMMMMMMMMMMMWWWMMMMKdllllxNMMMMN0XMMMMWx, ..xMMMMMK, cNMMMMWo; 0MMMMMNKkdl:, .\n"); + printf(" :KMMMMMMMMMMMMNXNWMMMMMMMMMMMMMMMMMXXWMMMMK, :XMMMMWo .kMMMMM0' ;ONWMMMMMMMMN0o. 
\n"); + printf(" ;KMMMMMMMMMW0coNMMMMMKkxxxxxxxxxxxONMMMMWo .xMMMMM0, cNMMMMWl ':oxOKWMMMMMWk. \n"); + printf(" ;0MMMMMMMNd.lWMMMMMO, .'cc. .kWMMMM0, ;KMMMMMk. ;0MMMMMXkddddd; .cKMMMMMk. \n"); + printf(" ,0WMMMWO;, 0MMMMMWXkxdx0NWWx' :XMMMMWo :NMMMMMNOxxONMMMMMNkOMMMMMKo::ckNMMMMK; \n"); + printf(" 'OWMXl. ,OWMMMMMMMMMMMWKo:kWMMMMK, ,0MMMMMMMMWWWMMMMM0,'kWMMMMMMMMMMMWKd' \n"); + printf(" .dd' .;oxOKXXK0kdl,. 'x0000Oc .oOKXXKOxc:x0000O: .; okOKXXK0Oxl, .\n"); +// printf(" .... .... .....\n"); +// printf("*********************************************************************************************************\n"); + + printf("\n *** ccminer CPU" PACKAGE_VERSION " for Verushash v2.1 - 2.2 by Monkins1010 based on ccminer***\n\n"); + + //printf(" Built with VC++ %d" , msver()); + printf("Originally based on Christian Buchner and Christian H. project\n"); + + printf("Located at: " PACKAGE_URL " \n\n"); + rpc_user = strdup(""); rpc_pass = strdup(""); @@ -3612,11 +3645,12 @@ int main(int argc, char *argv[]) // Enable windows high precision timer timeBeginPeriod(1); #endif - if (opt_affinity != -1) { - if (!opt_quiet) - applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity); - affine_to_cpu_mask(-1, (unsigned long)opt_affinity); - } + //if (opt_affinity != -1) { + // if (!opt_quiet) +// opt_affinity = 0xffffffffffffffff; + // applog(LOG_DEBUG, "Binding process to cpu mask %llx", opt_affinity); + // affine_to_cpu_mask(-1, (unsigned long)opt_affinity); + //} if (active_gpus == 0) { applog(LOG_ERR, "No CUDA devices found! terminating."); exit(1); diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu new file mode 100644 index 0000000000..a3b76a7733 --- /dev/null +++ b/equi/cuda_equi.cu @@ -0,0 +1,2130 @@ +/* + * Equihash solver created by djeZo (l33tsoftw@gmail.com) for NiceHash + * Adapted to be more compatible with older C++ compilers + * + * cuda_djezo solver was released by NiceHash (www.nicehash.com) under + * GPL 3.0 license. If you don't have a copy, you can obtain one from + * https://www.gnu.org/licenses/gpl-3.0.txt + * + * Based on CUDA solver by John Tromp released under MIT license. + * Some helper functions taken out of OpenCL solver by Marc Bevand + * released under MIT license. 
+ *
+ * Copyright (c) 2016 John Tromp, Marc Bevand
+ * Copyright (c) 2017 djeZo, Tanguy Pruvot (GPL v3)
+ */
+
+#ifdef WIN32
+#include <Windows.h>
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+//#include <mutex>
+
+#include "equihash.h"
+#include "eqcuda.hpp" // eq_cuda_context
+
+#include "blake2/blake2.h"
+
+//#define WN 200
+//#define WK 9
+#ifndef MAX_GPUS
+#define MAX_GPUS 16
+#endif
+
+#define NDIGITS (WK+1)
+#define DIGITBITS (WN/(NDIGITS))
+#define PROOFSIZE (1<<WK)
+#define BASE (1<<DIGITBITS)
+#define NHASHES (2*BASE)
+#define HASHESPERBLAKE (512/WN)
+#define HASHOUT (HASHESPERBLAKE*WN/8)
+#define NBLOCKS ((NHASHES + THREADS - 1) / THREADS)
+#define BUCKBITS (DIGITBITS - RB)
+#define NBUCKETS (1 << BUCKBITS)
+#define BUCKMASK (NBUCKETS - 1)
+#define SLOTBITS (RB + 2)
+#define SLOTRANGE (1 << SLOTBITS)
+#define NSLOTS SM
+#define SLOTMASK (SLOTRANGE - 1)
+#define NRESTS (1 << RB)
+#define RESTMASK (NRESTS - 1)
+#define CANTORBITS (2 * SLOTBITS - 2)
+#define CANTORMASK ((1 << CANTORBITS) - 1)
+#define CANTORMAXSQRT (2 * NSLOTS)
+
+#define RB8_NSLOTS 640
+#define RB8_NSLOTS_LD 624
+#define FD_THREADS 128
+
+#ifdef __INTELLISENSE__
+// reduce vstudio warnings (__byteperm, blockIdx...)
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#define __launch_bounds__(max_tpb, min_blocks)
+#define __CUDA_ARCH__ 520
+uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
+uint32_t __shfl2(uint32_t x, uint32_t y);
+uint32_t __shfl_sync(uint32_t mask, uint32_t x, uint32_t y);
+uint32_t atomicExch(uint32_t *x, uint32_t y);
+uint32_t atomicAdd(uint32_t *x, uint32_t y);
+void __syncthreads(void);
+void __threadfence(void);
+void __threadfence_block(void);
+uint32_t __ldg(const uint32_t* address);
+uint64_t __ldg(const uint64_t* address);
+uint4 __ldca(const uint4 *ptr);
+u32 __ldca(const u32 *ptr);
+u32 umin(const u32, const u32);
+u32 umax(const u32, const u32);
+#endif
+
+#define OPT_SYNC_ALL
+
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
+#undef __any
+#define __any(p) __any_sync(0xFFFFFFFFu, p)
+#else
+#define __shfl2 __shfl
+#endif
+
+typedef u32 proof[PROOFSIZE];
+
+struct __align__(32) slot {
+	u32 hash[8];
+};
+
+struct __align__(16) slotsmall {
+	u32 hash[4];
+};
+
+struct __align__(8) slottiny {
+	u32 hash[2];
+};
+
+template <u32 RB, u32 SM>
+struct equi
+{
+	slot round0trees[4096][RB8_NSLOTS];
+	slot trees[1][NBUCKETS][NSLOTS];
+	struct {
+		slotsmall treessmall[NSLOTS];
+		slottiny treestiny[NSLOTS];
+	} round2trees[NBUCKETS];
+	struct {
+		slotsmall treessmall[NSLOTS];
+		slottiny treestiny[NSLOTS];
+	} round3trees[NBUCKETS];
+	slotsmall treessmall[4][NBUCKETS][NSLOTS];
+	slottiny treestiny[1][4096][RB8_NSLOTS_LD];
+	u32 round4bidandsids[NBUCKETS][NSLOTS];
+	union {
+		u64 blake_h[8];
+		u32 blake_h32[16];
+	};
+	struct {
+		u32 nslots8[4096];
+		u32 nslots0[4096];
+		u32 nslots[9][NBUCKETS];
+		scontainerreal srealcont;
+	} edata;
+};
+
+// todo: use cuda_helper.h and/or cuda_vector.h
+__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b)
+{
+	return make_uint2(a.x ^ b.x, a.y ^ b.y);
+}
+
+__device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b)
+{
+	return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
+}
+
+// for ROR 63 (or ROL 1); this func only supports (32 <= offset < 64)
+__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset)
+{
+	uint2 result;
+#if __CUDA_ARCH__ > 300
+	{
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	}
+#else
+	result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
+	result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
+#endif
+	return result;
+}
+
+__device__ __forceinline__ uint2 SWAPUINT2(uint2 value)
+{
+	return make_uint2(value.y, value.x);
+}
+
+__device__ __forceinline__ uint2 ROR24(const uint2 a)
+{
+	uint2 result;
+	result.x = __byte_perm(a.y, a.x, 0x2107);
+	result.y = __byte_perm(a.y, a.x, 0x6543);
+	return result;
+}
+
+__device__ __forceinline__ uint2 ROR16(const uint2 a)
+{
+	uint2 result;
+	result.x = __byte_perm(a.y, a.x, 0x1076);
+	result.y = __byte_perm(a.y, a.x, 0x5432);
+	return result;
+}
+
+__device__ __forceinline__ void G2(u64 &
a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) +{ + a = a + b + x; + ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]); + a = a + b + y; + ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +// untested.. +struct packer_default +{ + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (((bucketid << SLOTBITS) | s0) << SLOTBITS) | s1; + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + // BUCKMASK-ed to prevent illegal memory accesses in case of memory errors + return (bid >> (2 * SLOTBITS)) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return bid & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> SLOTBITS) & SLOTMASK; + } +}; + + +struct packer_cantor +{ + __device__ __forceinline__ static u32 cantor(const u32 s0, const u32 s1) + { + u32 a = umax(s0, s1); + u32 b = umin(s0, s1); + return a * (a + 1) / 2 + b; + } + + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (bucketid << CANTORBITS) | cantor(s0, s1); + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> CANTORBITS) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return ((bid & CANTORMASK) - cantor(0, s1)) & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + u32 k, q, sqr = 8 * (bid & CANTORMASK) + 1; + // this k=sqrt(sqr) computing loop averages 3.4 iterations out of maximum 9 + for (k = CANTORMAXSQRT; (q = sqr / k) < k; k = (k + q) / 2); + return ((k - 1) / 2) & SLOTMASK; + } +}; + +__device__ __constant__ const u64 blake_iv[] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +#if CUDART_VERSION < 8000 || !defined(__ldca) +#define __ldca(ptr) *(ptr) +#endif + +template +__global__ void digit_first(equi* eq, u32 nonce) +{ + const u32 block = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ u64 hash_h[8]; + u32* hash_h32 = (u32*)hash_h; + + if (threadIdx.x < 16) + hash_h32[threadIdx.x] = __ldca(&eq->blake_h32[threadIdx.x]); + + __syncthreads(); + + u64 m = (u64)block << 32 | (u64)nonce; + + union + { + u64 v[16]; + u32 v32[32]; + uint4 v128[8]; + }; + + v[0] = hash_h[0]; + v[1] = hash_h[1]; + v[2] = hash_h[2]; + v[3] = hash_h[3]; + v[4] = hash_h[4]; + v[5] = hash_h[5]; + v[6] = hash_h[6]; + v[7] = hash_h[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4] ^ (128 + 16); + v[13] = blake_iv[5]; + v[14] = blake_iv[6] ^ 0xffffffffffffffff; + v[15] = blake_iv[7]; + + // mix 1 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], 
v[9], v[14], 0, 0); + + // mix 2 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 3 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, m); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 4 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, m); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 5 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, m); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 6 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], m, 0); + + // mix 7 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], m, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 8 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, m); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 9 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], m, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 10 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], m, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 11 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 12 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + v[0] ^= hash_h[0] ^ v[8]; + v[1] ^= hash_h[1] ^ v[9]; + v[2] ^= hash_h[2] ^ v[10]; + 
v[3] ^= hash_h[3] ^ v[11];
+	v[4] ^= hash_h[4] ^ v[12];
+	v[5] ^= hash_h[5] ^ v[13];
+	v32[12] ^= hash_h32[12] ^ v32[28];
+
+	u32 bexor = __byte_perm(v32[0], 0, 0x4012); // first 20 bits
+	u32 bucketid;
+	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
+	u32 slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
+	if (slotp < RB8_NSLOTS)
+	{
+		slot* s = &eq->round0trees[bucketid][slotp];
+
+		uint4 tt;
+		tt.x = __byte_perm(v32[0], v32[1], 0x1234);
+		tt.y = __byte_perm(v32[1], v32[2], 0x1234);
+		tt.z = __byte_perm(v32[2], v32[3], 0x1234);
+		tt.w = __byte_perm(v32[3], v32[4], 0x1234);
+		*(uint4*)(&s->hash[0]) = tt;
+
+		tt.x = __byte_perm(v32[4], v32[5], 0x1234);
+		tt.y = __byte_perm(v32[5], v32[6], 0x1234);
+		tt.z = 0;
+		tt.w = block << 1;
+		*(uint4*)(&s->hash[4]) = tt;
+	}
+
+	bexor = __byte_perm(v32[6], 0, 0x0123);
+	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
+	slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
+	if (slotp < RB8_NSLOTS)
+	{
+		slot* s = &eq->round0trees[bucketid][slotp];
+
+		uint4 tt;
+		tt.x = __byte_perm(v32[6], v32[7], 0x2345);
+		tt.y = __byte_perm(v32[7], v32[8], 0x2345);
+		tt.z = __byte_perm(v32[8], v32[9], 0x2345);
+		tt.w = __byte_perm(v32[9], v32[10], 0x2345);
+		*(uint4*)(&s->hash[0]) = tt;
+
+		tt.x = __byte_perm(v32[10], v32[11], 0x2345);
+		tt.y = __byte_perm(v32[11], v32[12], 0x2345);
+		tt.z = 0;
+		tt.w = (block << 1) + 1;
+		*(uint4*)(&s->hash[4]) = tt;
+	}
+}
+
+/*
+	Functions digit_1 to digit_8 work on the same principle:
+	each thread does 2-3 slot loads (loads are coalesced).
+	The xorwork of the slots is loaded into shared memory and kept in registers
+	(except in digit_1). At the same time, the restbits (8 or 9 bits) of the
+	xorwork are used for collision detection: they determine the position in ht.
+	Pair creation follows. The first one (or two) pairs' xorworks are written to
+	global memory as soon as possible; the remaining pairs are saved in shared
+	memory (one u32 per pair - 16-bit indices). In most cases all threads have
+	one (or two) pairs, so this trick offloads some memory writes from the last
+	step. In the last step, the xorwork of the stored pairs is written to memory.
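+
+	As a hedged illustration only (hypothetical toy types and names, not the
+	kernel code itself): a serial CPU sketch of one such round - histogram the
+	slots by their restbits, pair up colliding slots, and push each pair's xor
+	into the bucket selected by its next digit:
+
+	    #include <cstdint>
+	    #include <vector>
+	    struct toy_slot { uint32_t xorwork; uint32_t idx; };
+	    void toy_round(const std::vector<toy_slot>& in, std::vector<toy_slot>* next_buckets) {
+	        std::vector<uint32_t> ht[256];               // restbits -> slot ids
+	        for (uint32_t s = 0; s < in.size(); s++)
+	            ht[in[s].xorwork & 0xff].push_back(s);   // collide on low 8 bits
+	        for (auto& row : ht)                         // pair creation per row
+	            for (size_t i = 1; i < row.size(); i++)
+	                for (size_t j = 0; j < i; j++) {
+	                    uint32_t x = in[row[i]].xorwork ^ in[row[j]].xorwork;
+	                    next_buckets[x >> 24].push_back(toy_slot{ x, row[i] });
+	                }
+	    }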
+*/ +template +__global__ void digit_1(equi* eq) +{ + __shared__ u16 ht[256][SSM - 1]; + __shared__ uint2 lastword1[RB8_NSLOTS]; + __shared__ uint4 lastword2[RB8_NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < 256) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots0[bucketid], RB8_NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + uint2 ta[2]; + uint4 tb[2]; + + u32 si[2]; + +#ifdef OPT_SYNC_ALL + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + __syncthreads(); +#endif + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slot* pslot1 = eq->round0trees[bucketid] + si[i]; + + // get xhash + uint4 a1 = *(uint4*)(&pslot1->hash[0]); + uint2 a2 = *(uint2*)(&pslot1->hash[4]); + ta[i].x = a1.x; + ta[i].y = a1.y; + lastword1[si[i]] = ta[i]; + tb[i].x = a1.z; + tb[i].y = a1.w; + tb[i].z = a2.x; + tb[i].w = a2.y; + lastword2[si[i]] = tb[i]; + + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(hr[i]) : "r"(ta[i].x)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + int* pairs = ht_len; + + u32 xors[6]; + u32 xorbucketid, xorslot; + + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = ta[i] ^ lastword1[p]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[si[i]] ^ lastword2[p]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, si[i], p, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[i] ^ lastword2[k]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, i, k, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + } +} + + +template +__global__ void digit_2(equi* eq) +{ + __shared__ u16 ht[NRESTS][SSM - 1]; + __shared__ u32 lastword1[NSLOTS]; + __shared__ uint4 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + 
__shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slot* buck = eq->trees[0][bucketid]; + u32 bsize = umin(eq->edata.nslots[1][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 ta[2]; + uint4 tt[2]; + + u32 si[2]; +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + // get slot + const slot* pslot1 = buck + si[i]; + + uint4 ttx = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = ta[i] = ttx.x; + uint2 tty = *(uint2*)(&pslot1->hash[4]); + tt[i].x = ttx.y; + tt[i].y = ttx.z; + tt[i].z = ttx.w; + tt[i].w = tty.x; + lastword2[si[i]] = tt[i]; + + hr[i] = tty.y & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[0] = ta[i] ^ lastword1[p]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = tt[i] ^ lastword2[p]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[0] = lastword1[i] ^ lastword1[k]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = lastword2[i] ^ lastword2[k]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } +} + + +template +__global__ void digit_3(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[2][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + u32 
ta[2]; + +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round2trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round2trees[bucketid].treestiny[si[i]]; + + tt[i] = *(uint4*)(&xs.hash[0]); + lastword1[si[i]] = tt[i]; + ta[i] = xst.hash[0]; + lastword2[si[i]] = ta[i]; + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[4] = ta[i] ^ lastword2[p]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = tt[i] ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[4] = lastword2[i] ^ lastword2[k]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + } +} + + +template +__global__ void digit_4(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[3][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round3trees[bucketid].treessmall[si[i]]; + slottiny &xst = 
eq->round3trees[bucketid].treestiny[si[i]]; + + // get xhash + tt[i] = *(uint4*)(&xs.hash[0]); + lastword[si[i]] = tt[i]; + hr[i] = xst.hash[0] & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + } + } + } +} + + +template +__global__ void digit_5(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slotsmall* buck = eq->treessmall[3][bucketid]; + u32 bsize = umin(eq->edata.nslots[4][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword[si[i]] = tt[i]; + asm("bfe.u32 %0, %1, 4, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = 
eq->treessmall[2][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = xors[3]; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[1]; + tt.y = xors[2]; + tt.z = xors[3]; + tt.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = tt; + } + } + } +} + + +template +__global__ void digit_6(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint2 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[5][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[2][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = *(uint2*)(&tt[i].x); + lastword2[si[i]] = tt[i].z; + asm("bfe.u32 %0, %1, 16, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + // doing this to save shared memory + int* pairs = ht_len; + __syncthreads(); + + u32 xors[3]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = 
atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + u32 pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + xors[2] = lastword2[i] ^ lastword2[k]; + if (xors[2] == 0) + continue; + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } +} + + +template +__global__ void digit_7(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[6][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[0][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint4*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].z), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = 
atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[0]; + tt.y = xors[1]; + tt.z = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + tt.w = 0; + *(uint4*)(&xs.hash[0]) = tt; + } +} + + +template +__global__ void digit_8(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + next_pair = 0; + pairs_len = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[7][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[1][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint2 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint2*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 8, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = 
eq->treestiny[0][xorbucketid][xorslot];
+					uint2 tt;
+					tt.x = xors[1];
+					tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+					*(uint2*)(&xs.hash[0]) = tt;
+				}
+			}
+
+			for (int k = 2; k != pos[i]; ++k)
+			{
+				u32 pindex = atomicAdd(&pairs_len, 1);
+				if (pindex >= MAXPAIRS) break;
+				u16 prev = ht[hr[i]][k];
+				pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		u32 i = __byte_perm(pair, 0, 0x4510);
+		u32 k = __byte_perm(pair, 0, 0x4532);
+
+		*(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]);
+
+		if (xors[1] == 0)
+			continue;
+
+		bexor = __byte_perm(xors[0], xors[1], 0x0765);
+		xorbucketid = bexor >> (12 + 8);
+		xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1);
+		if (xorslot >= RB8_NSLOTS_LD) continue;
+		slottiny &xs = eq->treestiny[0][xorbucketid][xorslot];
+		uint2 tt;
+		tt.x = xors[1];
+		tt.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+		*(uint2*)(&xs.hash[0]) = tt;
+	}
+}
+
+/*
+	The last-round function is similar to the previous ones, but has a different
+	ending. We use warps to process the final candidates; each warp processes one
+	candidate. The first two bidandsids (a u32 packing a bucketid and two slotids)
+	are retrieved by lanes 0 and 16, the next four bidandsids by lanes 0, 8, 16
+	and 24, ... until all lanes in the warp have bidandsids from round 4. Next,
+	each thread retrieves 16 indices. While doing so, the indices are compared
+	using atomicExch to detect duplicates (tromp's method). At the end, if no
+	duplicates were found, the candidate solution (all indices) is saved. Note
+	that this dup-check method is not exact, so CPU dup checking is still needed
+	afterwards.
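+
+	A hedged scalar sketch of that atomicExch dup filter (names hypothetical;
+	the kernel below additionally wraps the test in __any so the whole warp
+	abandons a candidate together):
+
+	    __device__ bool probably_dup(u32* susp, u32 ind, u32 dupbits) {
+	        u32 slot = ind & ((1u << dupbits) - 1);   // low bits pick a table slot
+	        u32 tag  = ind >> dupbits;                // high bits are the tag
+	        u32 prev = atomicExch(&susp[slot], tag);  // publish tag, fetch old one
+	        return prev == tag;                       // same tag seen -> probable dup
+	    }
+
+	A colliding index with a different tag can overwrite a slot in between, so
+	true duplicates can slip through - hence the exact recheck on the CPU.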
+*/ +template +__global__ void digit_last_wdc(equi* eq) +{ + __shared__ u8 shared_data[8192]; + int* ht_len = (int*)(&shared_data[0]); + int* pairs = ht_len; + u32* lastword = (u32*)(&shared_data[256 * 4]); + u16* ht = (u16*)(&shared_data[256 * 4 + RB8_NSLOTS_LD * 4]); + u32* pairs_len = (u32*)(&shared_data[8188]); + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + #pragma unroll + for (u32 i = 0; i < FCT; i++) + ht_len[(i * (256 / FCT)) + threadid] = 0; + + if (threadid == ((256 / FCT) - 1)) + *pairs_len = 0; + + slottiny* buck = eq->treestiny[0][bucketid]; + u32 bsize = umin(eq->edata.nslots8[bucketid], RB8_NSLOTS_LD); + + u32 si[3 * FCT]; + u32 hr[3 * FCT]; + int pos[3 * FCT]; + u32 lw[3 * FCT]; + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + pos[i] = SSM; + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + si[i] = i * (256 / FCT) + threadid; + if (si[i] >= bsize) break; + + const slottiny* pslot1 = buck + si[i]; + + // get xhash + uint2 tt = *(uint2*)(&pslot1->hash[0]); + lw[i] = tt.x; + lastword[si[i]] = lw[i]; + + u32 a; + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(a) : "r"(lw[i])); + hr[i] = a; + + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) + ht[hr[i] * (SSM - 1) + pos[i]] = si[i]; + } + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + if (pos[i] >= SSM) continue; + + for (int k = 0; k != pos[i]; ++k) + { + u16 prev = ht[hr[i] * (SSM - 1) + k]; + if (lw[i] != lastword[prev]) continue; + u32 pindex = atomicAdd(pairs_len, 1); + if (pindex >= MAXPAIRS) break; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + + __syncthreads(); + u32 plen = umin(*pairs_len, 64); + +#define CALC_LEVEL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + u32* bucks = eq->round4bidandsids[PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1]; \ + levels[c] = bucks[slot0]; \ + } + +#define CALC_LEVEL_SMALL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + slotsmall* bucks = eq->treessmall[a][PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1].hash[d]; \ + levels[c] = bucks[slot0].hash[d]; \ + } + + u32 lane = threadIdx.x & 0x1f; + u32 par = threadIdx.x >> 5; + + u32* levels = (u32*)&pairs[MAXPAIRS + (par << DUPBITS)]; + u32* susp = levels; + + while (par < plen) + { + int pair = pairs[par]; + par += W; + + if (lane % 16 == 0) + { + u32 plvl; + if (lane == 0) plvl = buck[__byte_perm(pair, 0, 0x4510)].hash[1]; + else plvl = buck[__byte_perm(pair, 0, 0x4532)].hash[1]; + slotsmall* bucks = eq->treessmall[1][PACKER::get_bucketid(plvl, RB, SM)]; + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); + levels[lane] = bucks[slot1].hash[2]; + levels[lane + 8] = bucks[slot0].hash[2]; + } + + if (lane % 8 == 0) + CALC_LEVEL_SMALL(0, lane, lane + 4, 3); + + if (lane % 4 == 0) + CALC_LEVEL_SMALL(2, lane, lane + 2, 3); + + if (lane % 2 == 0) + CALC_LEVEL(0, lane, lane + 1, 4); + + u32 ind[16]; + + u32 f1 = levels[lane]; + const slottiny* buck_v4 = &eq->round3trees[PACKER::get_bucketid(f1, RB, SM)].treestiny[0]; + const u32 slot1_v4 = PACKER::get_slot1(f1, RB, SM); + const u32 slot0_v4 = PACKER::get_slot0(f1, slot1_v4, RB, SM); + + susp[lane] = 0xffffffff; + susp[32 + lane] = 0xffffffff; + +#define 
CHECK_DUP(a) \ + __any(atomicExch(&susp[(ind[a] & ((1 << DUPBITS) - 1))], (ind[a] >> DUPBITS)) == (ind[a] >> DUPBITS)) + + u32 f2 = buck_v4[slot1_v4].hash[1]; + const slottiny* buck_v3_1 = &eq->round2trees[PACKER::get_bucketid(f2, RB, SM)].treestiny[0]; + const u32 slot1_v3_1 = PACKER::get_slot1(f2, RB, SM); + const u32 slot0_v3_1 = PACKER::get_slot0(f2, slot1_v3_1, RB, SM); + + susp[64 + lane] = 0xffffffff; + susp[96 + lane] = 0xffffffff; + + u32 f0 = buck_v3_1[slot1_v3_1].hash[1]; + const slot* buck_v2_1 = eq->trees[0][PACKER::get_bucketid(f0, RB, SM)]; + const u32 slot1_v2_1 = PACKER::get_slot1(f0, RB, SM); + const u32 slot0_v2_1 = PACKER::get_slot0(f0, slot1_v2_1, RB, SM); + + susp[128 + lane] = 0xffffffff; + susp[160 + lane] = 0xffffffff; + + u32 f3 = buck_v2_1[slot1_v2_1].hash[6]; + const slot* buck_fin_1 = eq->round0trees[packer_default::get_bucketid(f3, 8, RB8_NSLOTS)]; + const u32 slot1_fin_1 = packer_default::get_slot1(f3, 8, RB8_NSLOTS); + const u32 slot0_fin_1 = packer_default::get_slot0(f3, slot1_fin_1, 8, RB8_NSLOTS); + + susp[192 + lane] = 0xffffffff; + susp[224 + lane] = 0xffffffff; + + ind[0] = buck_fin_1[slot1_fin_1].hash[7]; + if (CHECK_DUP(0)) continue; + ind[1] = buck_fin_1[slot0_fin_1].hash[7]; + if (CHECK_DUP(1)) continue; + + u32 f4 = buck_v2_1[slot0_v2_1].hash[6]; + const slot* buck_fin_2 = eq->round0trees[packer_default::get_bucketid(f4, 8, RB8_NSLOTS)]; + const u32 slot1_fin_2 = packer_default::get_slot1(f4, 8, RB8_NSLOTS); + const u32 slot0_fin_2 = packer_default::get_slot0(f4, slot1_fin_2, 8, RB8_NSLOTS); + + ind[2] = buck_fin_2[slot1_fin_2].hash[7]; + if (CHECK_DUP(2)) continue; + ind[3] = buck_fin_2[slot0_fin_2].hash[7]; + if (CHECK_DUP(3)) continue; + + u32 f5 = buck_v3_1[slot0_v3_1].hash[1]; + const slot* buck_v2_2 = eq->trees[0][PACKER::get_bucketid(f5, RB, SM)]; + const u32 slot1_v2_2 = PACKER::get_slot1(f5, RB, SM); + const u32 slot0_v2_2 = PACKER::get_slot0(f5, slot1_v2_2, RB, SM); + + u32 f6 = buck_v2_2[slot1_v2_2].hash[6]; + const slot* buck_fin_3 = eq->round0trees[packer_default::get_bucketid(f6, 8, RB8_NSLOTS)]; + const u32 slot1_fin_3 = packer_default::get_slot1(f6, 8, RB8_NSLOTS); + const u32 slot0_fin_3 = packer_default::get_slot0(f6, slot1_fin_3, 8, RB8_NSLOTS); + + ind[4] = buck_fin_3[slot1_fin_3].hash[7]; + if (CHECK_DUP(4)) continue; + ind[5] = buck_fin_3[slot0_fin_3].hash[7]; + if (CHECK_DUP(5)) continue; + + u32 f7 = buck_v2_2[slot0_v2_2].hash[6]; + const slot* buck_fin_4 = eq->round0trees[packer_default::get_bucketid(f7, 8, RB8_NSLOTS)]; + const u32 slot1_fin_4 = packer_default::get_slot1(f7, 8, RB8_NSLOTS); + const u32 slot0_fin_4 = packer_default::get_slot0(f7, slot1_fin_4, 8, RB8_NSLOTS); + + ind[6] = buck_fin_4[slot1_fin_4].hash[7]; + if (CHECK_DUP(6)) continue; + ind[7] = buck_fin_4[slot0_fin_4].hash[7]; + if (CHECK_DUP(7)) continue; + + u32 f8 = buck_v4[slot0_v4].hash[1]; + const slottiny* buck_v3_2 = &eq->round2trees[PACKER::get_bucketid(f8, RB, SM)].treestiny[0]; + const u32 slot1_v3_2 = PACKER::get_slot1(f8, RB, SM); + const u32 slot0_v3_2 = PACKER::get_slot0(f8, slot1_v3_2, RB, SM); + + u32 f9 = buck_v3_2[slot1_v3_2].hash[1]; + const slot* buck_v2_3 = eq->trees[0][PACKER::get_bucketid(f9, RB, SM)]; + const u32 slot1_v2_3 = PACKER::get_slot1(f9, RB, SM); + const u32 slot0_v2_3 = PACKER::get_slot0(f9, slot1_v2_3, RB, SM); + + u32 f10 = buck_v2_3[slot1_v2_3].hash[6]; + const slot* buck_fin_5 = eq->round0trees[packer_default::get_bucketid(f10, 8, RB8_NSLOTS)]; + const u32 slot1_fin_5 = packer_default::get_slot1(f10, 8, RB8_NSLOTS); + 
const u32 slot0_fin_5 = packer_default::get_slot0(f10, slot1_fin_5, 8, RB8_NSLOTS);
+
+		ind[8] = buck_fin_5[slot1_fin_5].hash[7];
+		if (CHECK_DUP(8)) continue;
+		ind[9] = buck_fin_5[slot0_fin_5].hash[7];
+		if (CHECK_DUP(9)) continue;
+
+		u32 f11 = buck_v2_3[slot0_v2_3].hash[6];
+		const slot* buck_fin_6 = eq->round0trees[packer_default::get_bucketid(f11, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_6 = packer_default::get_slot1(f11, 8, RB8_NSLOTS);
+		const u32 slot0_fin_6 = packer_default::get_slot0(f11, slot1_fin_6, 8, RB8_NSLOTS);
+
+		ind[10] = buck_fin_6[slot1_fin_6].hash[7];
+		if (CHECK_DUP(10)) continue;
+		ind[11] = buck_fin_6[slot0_fin_6].hash[7];
+		if (CHECK_DUP(11)) continue;
+
+		u32 f12 = buck_v3_2[slot0_v3_2].hash[1];
+		const slot* buck_v2_4 = eq->trees[0][PACKER::get_bucketid(f12, RB, SM)];
+		const u32 slot1_v2_4 = PACKER::get_slot1(f12, RB, SM);
+		const u32 slot0_v2_4 = PACKER::get_slot0(f12, slot1_v2_4, RB, SM);
+
+		u32 f13 = buck_v2_4[slot1_v2_4].hash[6];
+		const slot* buck_fin_7 = eq->round0trees[packer_default::get_bucketid(f13, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_7 = packer_default::get_slot1(f13, 8, RB8_NSLOTS);
+		const u32 slot0_fin_7 = packer_default::get_slot0(f13, slot1_fin_7, 8, RB8_NSLOTS);
+
+		ind[12] = buck_fin_7[slot1_fin_7].hash[7];
+		if (CHECK_DUP(12)) continue;
+		ind[13] = buck_fin_7[slot0_fin_7].hash[7];
+		if (CHECK_DUP(13)) continue;
+
+		u32 f14 = buck_v2_4[slot0_v2_4].hash[6];
+		const slot* buck_fin_8 = eq->round0trees[packer_default::get_bucketid(f14, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_8 = packer_default::get_slot1(f14, 8, RB8_NSLOTS);
+		const u32 slot0_fin_8 = packer_default::get_slot0(f14, slot1_fin_8, 8, RB8_NSLOTS);
+
+		ind[14] = buck_fin_8[slot1_fin_8].hash[7];
+		if (CHECK_DUP(14)) continue;
+		ind[15] = buck_fin_8[slot0_fin_8].hash[7];
+		if (CHECK_DUP(15)) continue;
+
+		u32 soli;
+		if (lane == 0) {
+			soli = atomicAdd(&eq->edata.srealcont.nsols, 1);
+		}
+#if __CUDA_ARCH__ >= 300
+		// all threads get the value from lane 0
+		soli = __shfl2(soli, 0);
+#else
+		__syncthreads();
+		soli = eq->edata.srealcont.nsols;
+#endif
+		if (soli < MAXREALSOLS)
+		{
+			u32 pos = lane << 4;
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos     ]) = *(uint4*)(&ind[ 0]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos +  4]) = *(uint4*)(&ind[ 4]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos +  8]) = *(uint4*)(&ind[ 8]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 12]) = *(uint4*)(&ind[12]);
+		}
+	}
+}
+
+//std::mutex dev_init;
+int dev_init_done[MAX_GPUS] = { 0 };
+
+__host__
+static int compu32(const void *pa, const void *pb)
+{
+	uint32_t a = *(uint32_t *)pa, b = *(uint32_t *)pb;
+	return a < b ? -1 : a == b ? 0 : +1;
+}
+
+__host__
+static bool duped(proof prf)
+{
+	proof sortprf;
+	memcpy(sortprf, prf, sizeof(proof));
+	qsort(sortprf, PROOFSIZE, sizeof(uint32_t), &compu32);
+	for (uint32_t i = 1; i < PROOFSIZE; i++)
+		if (sortprf[i] <= sortprf[i - 1])
+			return true;
+	return false;
+}
+
+__host__
+static void sort_pair(uint32_t *a, uint32_t len)
+{
+	uint32_t *b = a + len;
+	uint32_t tmp, need_sorting = 0;
+	for (uint32_t i = 0; i < len; i++)
+	{
+		if (need_sorting || a[i] > b[i])
+		{
+			need_sorting = 1;
+			tmp = a[i];
+			a[i] = b[i];
+			b[i] = tmp;
+		}
+		else if (a[i] < b[i])
+			return;
+	}
+}
+
+__host__
+static void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen)
+{
+	uint32_t le_N = WN;
+	uint32_t le_K = WK;
+	uchar personal[] = "ZcashPoW01230123";
+	memcpy(personal + 8, &le_N, 4);
+	memcpy(personal + 12, &le_K, 4);
+	blake2b_param P[1];
+	P->digest_length = HASHOUT;
+	P->key_length = 0;
+	P->fanout = 1;
+	P->depth = 1;
+	P->leaf_length = 0;
+	P->node_offset = 0;
+	P->node_depth = 0;
+	P->inner_length = 0;
+	memset(P->reserved, 0, sizeof(P->reserved));
+	memset(P->salt, 0, sizeof(P->salt));
+	memcpy(P->personal, (const uint8_t *)personal, 16);
+	eq_blake2b_init_param(ctx, P);
+	eq_blake2b_update(ctx, (const uchar *)header, headerLen);
+	if (nonceLen) eq_blake2b_update(ctx, (const uchar *)nce, nonceLen);
+}
+
+#ifdef WIN32
+typedef CUresult(CUDAAPI *dec_cuDeviceGet)(CUdevice*, int);
+typedef CUresult(CUDAAPI *dec_cuCtxCreate)(CUcontext*, unsigned int, CUdevice);
+typedef CUresult(CUDAAPI *dec_cuCtxPushCurrent)(CUcontext);
+typedef CUresult(CUDAAPI *dec_cuCtxDestroy)(CUcontext);
+
+dec_cuDeviceGet _cuDeviceGet = nullptr;
+dec_cuCtxCreate _cuCtxCreate = nullptr;
+dec_cuCtxPushCurrent _cuCtxPushCurrent = nullptr;
+dec_cuCtxDestroy _cuCtxDestroy = nullptr;
+#endif
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, class PACKER>
+__host__ eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::eq_cuda_context(int thr_id, int dev_id)
+{
+	thread_id = thr_id;
+	device_id = dev_id;
+	solutions = nullptr;
+	equi_mem_sz = sizeof(equi<RB, SM>);
+	throughput = NBLOCKS;
+	totalblocks = NBLOCKS/FD_THREADS;
+	threadsperblock = FD_THREADS;
+	threadsperblock_digits = THREADS;
+
+	//dev_init.lock();
+	if (!dev_init_done[device_id])
+	{
+		// only first thread shall init device
+		checkCudaErrors(cudaSetDevice(device_id));
+		checkCudaErrors(cudaDeviceReset());
+		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+
+		pctx = nullptr;
+	}
+	else
+	{
+		// create new context
+		CUdevice dev;
+
+#ifdef WIN32
+		if (_cuDeviceGet == nullptr)
+		{
+			HMODULE hmod = LoadLibraryA("nvcuda.dll");
+			if (hmod == NULL)
+				throw std::runtime_error("Failed to load nvcuda.dll");
+			_cuDeviceGet = (dec_cuDeviceGet)GetProcAddress(hmod, "cuDeviceGet");
+			if (_cuDeviceGet == nullptr)
+				throw std::runtime_error("Failed to get cuDeviceGet address");
+			_cuCtxCreate = (dec_cuCtxCreate)GetProcAddress(hmod, "cuCtxCreate_v2");
+			if (_cuCtxCreate == nullptr)
+				throw std::runtime_error("Failed to get cuCtxCreate address");
+			_cuCtxPushCurrent = (dec_cuCtxPushCurrent)GetProcAddress(hmod, "cuCtxPushCurrent_v2");
+			if (_cuCtxPushCurrent == nullptr)
+				throw std::runtime_error("Failed to get cuCtxPushCurrent address");
+			_cuCtxDestroy = (dec_cuCtxDestroy)GetProcAddress(hmod, "cuCtxDestroy_v2");
+			if (_cuCtxDestroy == nullptr)
+				throw std::runtime_error("Failed to get cuCtxDestroy address");
+		}
+
+		checkCudaDriverErrors(_cuDeviceGet(&dev, device_id));
+		checkCudaDriverErrors(_cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
+		checkCudaDriverErrors(_cuCtxPushCurrent(pctx));
+#else
+		checkCudaDriverErrors(cuDeviceGet(&dev, device_id));
+		checkCudaDriverErrors(cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
+		checkCudaDriverErrors(cuCtxPushCurrent(pctx));
+#endif
+	}
+	++dev_init_done[device_id];
+	//dev_init.unlock();
+
+	if (cudaMalloc((void**)&device_eq, equi_mem_sz) != cudaSuccess)
+		throw std::runtime_error("CUDA: failed to alloc memory");
+
+	solutions = (scontainerreal*) malloc(sizeof(scontainerreal));
+	if (!solutions)
+		throw std::runtime_error("EOM: failed to alloc solutions memory");
+}
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, class PACKER>
+__host__ void eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::solve(const char *tequihash_header,
+	unsigned int tequihash_header_len,
+	const char* nonce,
+	unsigned int nonce_len,
+	fn_cancel cancelf,
+	fn_solution solutionf,
+	fn_hashdone hashdonef)
+{
+	blake2b_state blake_ctx;
+
+	int blocks = NBUCKETS;
+
+	setheader(&blake_ctx, tequihash_header, tequihash_header_len, nonce, nonce_len);
+
+	// todo: improve
+	// djezo solver allows the last 4 bytes of the nonce to be iterated
+	// this can be used to create an internal loop - calc the initial blake hash only once, then load 8*8 bytes on device (blake state h)
+	// then just iterate nn++
+	// less CPU load, 1 cudaMemcpy less -> faster
+	//u32 nn = *(u32*)&nonce[28];
+	u32 nn = 0;
+
+	checkCudaErrors(cudaMemcpy(&device_eq->blake_h, &blake_ctx.h, sizeof(u64) * 8, cudaMemcpyHostToDevice));
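+
+	/* A hedged sketch of the internal-loop idea from the todo above (not wired
+	   in; `iterations` is a hypothetical count): with the blake midstate kept on
+	   the device, only the counters need resetting between nonce words:
+
+	       for (u32 nn2 = 0; nn2 < iterations && !cancelf(thread_id); nn2++) {
+	           checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata)));
+	           // launch digit_first with nn2, then digit_1..digit_8 and the
+	           // last round, harvesting solutions once per iteration
+	       }
+	*/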
+	checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata)));
+
+	digit_first <<<NBLOCKS/FD_THREADS, FD_THREADS >>>(device_eq, nn);
+
+	digit_1 <<<4096, 512 >>>(device_eq);
+	digit_2 <<<NBUCKETS, THREADS >>>(device_eq);
+	digit_3 <<<NBUCKETS, THREADS >>>(device_eq);
+
+	if (cancelf(thread_id)) return;
+
+	digit_4 <<<NBUCKETS, THREADS >>>(device_eq);
+	digit_5 <<<NBUCKETS, THREADS >>>(device_eq);
+
+	digit_6 <<<NBUCKETS, THREADS >>>(device_eq);
+	digit_7 <<<NBUCKETS, THREADS >>>(device_eq);
+	digit_8 <<<NBUCKETS, THREADS >>>(device_eq);
+
+	digit_last_wdc <<<4096, 256 / 2 >>>(device_eq);
+
+	checkCudaErrors(cudaMemcpy(solutions, &device_eq->edata.srealcont, (MAXREALSOLS * (512 * 4)) + 4, cudaMemcpyDeviceToHost));
+
+	//printf("T%d nsols: %u\n", thread_id, solutions->nsols);
+	//if (solutions->nsols > 9)
+	//	printf("missing sol, total: %u\n", solutions->nsols);
+
+	for (u32 s = 0; (s < solutions->nsols) && (s < MAXREALSOLS); s++)
+	{
+		// remove dups on CPU (dup removal on GPU is not fully exact and can pass on some invalid solutions)
+		if (duped(solutions->sols[s])) continue;
+
+		// perform sort of pairs
+		for (uint32_t level = 0; level < 9; level++)
+			for (uint32_t i = 0; i < (1 << 9); i += (2 << level))
+				sort_pair(&solutions->sols[s][i], 1 << level);
+
+		std::vector<uint32_t> index_vector(PROOFSIZE);
+		for (u32 i = 0; i < PROOFSIZE; i++) {
+			index_vector[i] = solutions->sols[s][i];
+		}
+
+		solutionf(thread_id, index_vector, DIGITBITS, nullptr);
+	}
+
+	// ccminer: only use hashdonef if no solutions...
+	if (!solutions->nsols)
+		hashdonef(thread_id);
+}
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__
+void eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::freemem()
+{
+	if (solutions)
+		free(solutions);
+
+	if (device_eq) {
+		cudaFree(device_eq);
+		device_eq = NULL;
+	}
+
+	if (pctx) {
+		// non primary thread, destroy context
+#ifdef WIN32
+		checkCudaDriverErrors(_cuCtxDestroy(pctx));
+#else
+		checkCudaDriverErrors(cuCtxDestroy(pctx));
+#endif
+	} else {
+		checkCudaErrors(cudaDeviceReset());
+		dev_init_done[device_id] = 0;
+	}
+}
+
+// destructor
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__
+eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::~eq_cuda_context()
+{
+	freemem();
+}
+
+#ifdef CONFIG_MODE_1
+template class eq_cuda_context<CONFIG_MODE_1>;
+#endif
+
+#ifdef CONFIG_MODE_2
+template class eq_cuda_context<CONFIG_MODE_2>;
+#endif
+
+#ifdef CONFIG_MODE_3
+template class eq_cuda_context<CONFIG_MODE_3>;
+#endif
diff --git a/equi/equi-stratum.cpp b/equi/equi-stratum.cpp
index a643381ea9..fa15fa7b1e 100644
--- a/equi/equi-stratum.cpp
+++ b/equi/equi-stratum.cpp
@@ -118,7 +118,7 @@ bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params)
 bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params)
 {
-	const char *job_id, *version, *prevhash, *coinb1, *coinb2, *nbits, *stime;
+	const char *job_id, *version, *prevhash, *coinb1, *coinb2, *nbits, *stime, *hash_version = NULL;
 	size_t coinb1_size, coinb2_size;
 	bool clean, ret = false;
 	int ntime, i, p=0;
@@ -130,6 +130,7 @@ bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	stime = json_string_value(json_array_get(params, p++));
 	nbits = json_string_value(json_array_get(params, p++));
 	clean = json_is_true(json_array_get(params, p)); p++;
+	hash_version = json_string_value(json_array_get(params, p++));
 	if (!job_id || !prevhash || !coinb1 || !coinb2 ||
 		!version || !nbits || !stime ||
 		strlen(prevhash) != 64 || strlen(version) != 8 ||
@@ -138,7 +139,7 @@
 		applog(LOG_ERR, "Stratum notify: invalid parameters");
 		goto out;
 	}
-
+	if (hash_version) hex2bin((uchar *)&sctx->job.hash_ver, hash_version, 1);
 	/* store stratum server time diff */
 	hex2bin((uchar *)&ntime, stime, 4);
 	ntime = ntime - (int) time(0);
diff --git a/miner.h b/miner.h
index 6375fe7393..b67aaae301 100644
--- a/miner.h
+++ b/miner.h
@@ 
-308,24 +308,7 @@ struct cgpu_info { uint32_t rejected; double khashes; int has_monitoring; - float gpu_temp; - uint16_t gpu_fan; - uint16_t gpu_fan_rpm; - uint16_t gpu_arch; - uint32_t gpu_clock; - uint32_t gpu_memclock; - uint64_t gpu_mem; - uint64_t gpu_memfree; - uint32_t gpu_power; - uint32_t gpu_plimit; - double gpu_vddc; - int16_t gpu_pstate; - int16_t gpu_bus; - uint16_t gpu_vid; - uint16_t gpu_pid; - - int8_t nvml_id; - int8_t nvapi_id; + char gpu_sn[64]; char gpu_desc[64] = "cpu"; @@ -442,7 +425,7 @@ extern uint64_t net_hashrate; extern double net_diff; extern double stratum_diff; -#define MAX_GPUS 80 +#define MAX_GPUS 140 //#define MAX_THREADS 32 todo extern char* device_name[MAX_GPUS]; extern short device_map[MAX_GPUS]; @@ -554,6 +537,7 @@ struct stratum_job { uint32_t height; uint32_t shares_count; double diff; + int hash_ver; }; struct stratum_ctx { @@ -627,6 +611,7 @@ struct work { struct tx txs[POK_MAX_TXS]; // zec solution uint8_t extra[1388]; + int hash_ver; }; #define POK_BOOL_MASK 0x00008000 @@ -770,71 +755,7 @@ void applog_hash64(void *hash); void applog_compare_hash(void *hash, void *hash_ref); void print_hash_tests(void); -void allium_hash(void *state, const void *input); -void bastionhash(void* output, const unsigned char* input); -void blake256hash(void *output, const void *input, int8_t rounds); -void blake2b_hash(void *output, const void *input); -void blake2s_hash(void *output, const void *input); -void bmw_hash(void *state, const void *input); -void c11hash(void *output, const void *input); -void cryptolight_hash_variant(void* output, const void* input, int len, int variant); -void cryptolight_hash(void* output, const void* input); -void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); -void cryptonight_hash(void* output, const void* input); -void monero_hash(void* output, const void* input); -void stellite_hash(void* output, const void* input); -void decred_hash(void *state, const void *input); -void deephash(void *state, const void *input); -void luffa_hash(void *state, const void *input); -void fresh_hash(void *state, const void *input); -void fugue256_hash(unsigned char* output, const unsigned char* input, int len); -void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); -void hmq17hash(void *output, const void *input); -void hsr_hash(void *output, const void *input); -void keccak256_hash(void *state, const void *input); -void jackpothash(void *state, const void *input); -void groestlhash(void *state, const void *input); -void jha_hash(void *output, const void *input); -void lbry_hash(void *output, const void *input); -void lyra2re_hash(void *state, const void *input); -void lyra2v2_hash(void *state, const void *input); -void lyra2Z_hash(void *state, const void *input); -void myriadhash(void *state, const void *input); -void neoscrypt(uchar *output, const uchar *input, uint32_t profile); -void nist5hash(void *state, const void *input); -void pentablakehash(void *output, const void *input); -void phi_hash(void *output, const void *input); -void phi2_hash(void *output, const void *input); -void polytimos_hash(void *output, const void *input); -void quarkhash(void *state, const void *input); -void qubithash(void *state, const void *input); -void scrypthash(void* output, const void* input); -void scryptjane_hash(void* output, const void* input); -void sha256d_hash(void *output, const void *input); -void sha256t_hash(void *output, const void *input); -void sibhash(void *output, const void *input); 
-void skeincoinhash(void *output, const void *input); -void skein2hash(void *output, const void *input); -void skunk_hash(void *state, const void *input); -void s3hash(void *output, const void *input); -void timetravel_hash(void *output, const void *input); -void bitcore_hash(void *output, const void *input); -void tribus_hash(void *output, const void *input); -void veltorhash(void *output, const void *input); -void wcoinhash(void *state, const void *input); -void whirlxHash(void *state, const void *input); -void x11evo_hash(void *output, const void *input); -void x11hash(void *output, const void *input); -void x12hash(void *output, const void *input); -void x13hash(void *output, const void *input); -void x14hash(void *output, const void *input); -void x15hash(void *output, const void *input); -void x16r_hash(void *output, const void *input); -void x16s_hash(void *output, const void *input); -void x17hash(void *output, const void *input); -void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize); -void zr5hash(void *output, const void *input); -void zr5hash_pok(void *output, uint32_t *pdata); + #ifdef __cplusplus } diff --git a/verus/verus_clhash.cpp b/verus/verus_clhash.cpp index 93b7a6e17b..48f277073f 100644 --- a/verus/verus_clhash.cpp +++ b/verus/verus_clhash.cpp @@ -34,65 +34,62 @@ int __cpuverusoptimized = 0x80; // multiply the length and the some key, no modulo -static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) { +__m128i lazyLengthHash(uint64_t keylength, uint64_t length) { const __m128i lengthvector = _mm_set_epi64x(keylength,length); const __m128i clprod1 = _mm_clmulepi64_si128( lengthvector, lengthvector, 0x10); return clprod1; } // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64 -static inline __m128i precompReduction64_si128( __m128i A) { + __m128i precompReduction64_si128( __m128i A) { //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. 
(64,4,3,1,0)
 	const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
-	__m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01);
-	__m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
+	const __m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01);
+	const __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
 		_mm_srli_si128(Q2,8));
-	__m128i Q4 = _mm_xor_si128(Q2,A);
+	const __m128i Q4 = _mm_xor_si128(Q2,A);
 	const __m128i final = _mm_xor_si128(Q3,Q4);
 	return final;/// WARNING: HIGH 64 BITS CONTAIN GARBAGE
 }

-static inline uint64_t precompReduction64( __m128i A) {
+ uint64_t precompReduction64( __m128i A) {
 	return _mm_cvtsi128_si64(precompReduction64_si128(A));
 }

 // verus intermediate hash extra
-static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask,
+ __m128i __verusclmulwithoutreduction64alignedrepeatv2_1(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask,
 	uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex)
 {
-	__m128i *pbuf;
-	__m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] };
+	const __m128i *pbuf;
+	const __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] };

 	// divide key mask by 16 from bytes to __m128i
-	keyMask >>= 4;
+	// keyMask >>= 4;
+
+	__m128i acc = _mm_load_si128(randomsource + 513);

-	__m128i acc = _mm_load_si128(randomsource + (keyMask + 2));
-
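+	/* For reference: the byte table {0, 27, 54, 45, ...} fed to
+	 * _mm_shuffle_epi8 in precompReduction64_si128() above is the carry-less
+	 * product of each 4-bit index with 0x1B = 27, the low half of the
+	 * reduction polynomial x^64 + x^4 + x^3 + x + 1. An illustrative sketch
+	 * that reproduces it (helper name is ours, not part of the codebase):
+	 *
+	 *   static uint8_t gf2_mul_0x1b(uint8_t a) // valid for a < 16
+	 *   {
+	 *       uint8_t r = 0;
+	 *       for (int bit = 0; bit < 5; bit++)  // 0x1B has bits 0, 1, 3, 4 set
+	 *           if (0x1B & (1 << bit))
+	 *               r ^= (uint8_t)(a << bit);
+	 *       return r;
+	 *   }
+	 *
+	 * gf2_mul_0x1b(i) for i = 0..15 gives 0, 27, 54, 45, 108, 119, 90, 65,
+	 * 216, 195, 238, 245, 180, 175, 130, 153, matching the constants above.
+	 */
 	// the random buffer must have at least 32 16 byte dwords after the keymask to work with this
 	// algorithm. 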
we take the value from the last element inside the keyMask + 2, as that will never // be used to xor into the accumulator before it is hashed with other values first -#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0) -#define LIONELK_FETCH_DIST 0 +//#pragma unroll 32 -#pragma unroll 32 - - for (uint64_t i = 0; i < 32; i++) + for (uint32_t i = 0; i < 32; i++) { const uint64_t selector = _mm_cvtsi128_si64(acc); + uint32_t prand_idx = (selector >> 5) & keyMask; + uint32_t prandex_idx = (selector >>32) & keyMask; // get two random locations in the key, which will be mutated and swapped - __m128i *prand = randomsource + ((selector >> 5) & keyMask); - __m128i *prandex = randomsource + ((selector >> 32) & keyMask); + __m128i *prand = randomsource + prand_idx; + __m128i *prandex = randomsource + prandex_idx; // select random start and order of pbuf processing pbuf = pbuf_copy + (selector & 3); - uint32_t prand_idx = (selector >> 5) & keyMask; - uint32_t prandex_idx = (selector >>32) & keyMask; - g_prand[i] = prand[0]; - g_prandex[i] = prandex[0]; + _mm_store_si128(&g_prand[i] , prand[0]); + _mm_store_si128(&g_prandex[i], prandex[0]); fixrand[i] = prand_idx; fixrandex[i] = prandex_idx; @@ -101,140 +98,136 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource case 0: { - const __m128i temp1 = _mm_load_si128(prandex); - const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - const __m128i temp12 = _mm_load_si128(prand); + const __m128i temp12 = _mm_load_si128(prand); _mm_store_si128(prand, tempa2); - const __m128i temp22 = _mm_load_si128(pbuf); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); acc = _mm_xor_si128(clprod12, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 4: { - const __m128i temp1 = _mm_load_si128(prand); - const __m128i temp2 = _mm_load_si128(pbuf); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); - const __m128i clprod2 = _mm_clmulepi64_si128(temp2, 
temp2, 0x10); + __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10); acc = _mm_xor_si128(clprod2, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - const __m128i temp12 = _mm_load_si128(prandex); + const __m128i temp12 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa2); - const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); acc = _mm_xor_si128(add12, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prand, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 8: { - const __m128i temp1 = _mm_load_si128(prandex); - const __m128i temp2 = _mm_load_si128(pbuf); - const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); acc = _mm_xor_si128(add1, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - const __m128i temp12 = _mm_load_si128(prand); + const __m128i temp12 = _mm_load_si128(prand); _mm_store_si128(prand, tempa2); - const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); acc = _mm_xor_si128(clprod12, acc); - const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); acc = _mm_xor_si128(clprod22, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 0xc: { - const __m128i temp1 = _mm_load_si128(prand); - const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); // cannot be zero here - const int32_t divisor = (uint32_t)selector; + int32_t divisor = (uint32_t)selector; acc = _mm_xor_si128(add1, acc); - const int64_t dividend = _mm_cvtsi128_si64(acc); - const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); + int64_t dividend = 
_mm_cvtsi128_si64(acc); + const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); acc = _mm_xor_si128(modulo, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); if (dividend & 1) { - const __m128i temp12 = _mm_load_si128(prandex); + const __m128i temp12 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa2); - const __m128i temp22 = _mm_load_si128(pbuf); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); acc = _mm_xor_si128(clprod12, acc); - const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); acc = _mm_xor_si128(clprod22, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prand, tempb2); } else { - const __m128i tempb3 = _mm_load_si128(prandex); + + const __m128i tempb3 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa2); _mm_store_si128(prand, tempb3); } - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 0x10: { // a few AES operations - const __m128i *rc = prand; - __m128i tmp; + const __m128i *rc = prand; + __m128i tmp; - __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - __m128i temp2 = _mm_load_si128(pbuf); + __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + __m128i temp2 = _mm_load_si128(pbuf); AES2(temp1, temp2, 0); @@ -248,28 +241,27 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc)); - const __m128i tempa1 = _mm_load_si128(prand); - const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); - const __m128i tempa4 = _mm_load_si128(prandex); + const __m128i tempa4 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa3); _mm_store_si128(prand, tempa4); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + break; } case 0x14: { // we'll just call this one the monkins loop, inspired by Chris - const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); - __m128i tmp; // used by MIX2 + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + __m128i tmp; // used by MIX2 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times - __m128i *rc = prand; + const __m128i *rc = prand; uint64_t aesroundoffset = 0,loop_c; - __m128i onekey; + __m128i onekey; do { @@ -277,15 +269,15 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource if (loop_c) { onekey = _mm_load_si128(rc++); - const __m128i temp2 = _mm_load_si128(rounds & 1 ? 
pbuf : buftmp); - const __m128i add1 = _mm_xor_si128(onekey, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); + const __m128i add1 = _mm_xor_si128(onekey, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); } else { onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); + __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); AES2(onekey, temp2, aesroundoffset); @@ -298,21 +290,20 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource } while (rounds--); - const __m128i tempa1 = _mm_load_si128(prand); - const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); - const __m128i tempa4 = _mm_load_si128(prandex); + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + + const __m128i tempa4 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa3); _mm_store_si128(prand, tempa4); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 0x18: { - __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); __m128i tmp; // used by MIX2 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times @@ -352,16 +343,16 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource } case 0x1c: { - const __m128i temp1 = _mm_load_si128(pbuf); - const __m128i temp2 = _mm_load_si128(prandex); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i temp1 = _mm_load_si128(pbuf); + const __m128i temp2 = _mm_load_si128(prandex); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); - const __m128i tempa3 = _mm_load_si128(prand); + const __m128i tempa3 = _mm_load_si128(prand); #ifdef VERUSHASHDEBUGo printf("[cpu] tempa1 : "); @@ -379,12 +370,11 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource acc = _mm_xor_si128(tempa3, acc); - const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); - const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); + const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } @@ -397,179 +387,314 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource return acc; } +__m128i __verusclmulwithoutreduction64alignedrepeatv2_2(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, + uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex) +{ + const __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] }; + const __m128i *pbuf; + + // divide key mask 
by 16 from bytes to __m128i + keyMask >>= 4; + + // the random buffer must have at least 32 16 byte dwords after the keymask to work with this + // algorithm. we take the value from the last element inside the keyMask + 2, as that will never + // be used to xor into the accumulator before it is hashed with other values first + __m128i acc = _mm_load_si128(randomsource + (keyMask + 2)); + + for (int64_t i = 0; i < 32; i++) + { + const uint64_t selector = _mm_cvtsi128_si64(acc); + + uint32_t prand_idx = (selector >> 5) & keyMask; + uint32_t prandex_idx = (selector >> 32) & keyMask; + // get two random locations in the key, which will be mutated and swapped + __m128i *prand = randomsource + prand_idx; + __m128i *prandex = randomsource + prandex_idx; + + // select random start and order of pbuf processing + pbuf = pbuf_copy + (selector & 3); + _mm_store_si128(&g_prand[i], prand[0]); + _mm_store_si128(&g_prandex[i], prandex[0]); + fixrand[i] = prand_idx; + fixrandex[i] = prandex_idx; + + switch (selector & 0x1c) + { + case 0: + { + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128(prand); + _mm_store_si128(prand, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + acc = _mm_xor_si128(clprod12, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prandex, tempb2); + break; + } + case 4: + { + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + const __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10); + acc = _mm_xor_si128(clprod2, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + acc = _mm_xor_si128(add12, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prand, tempb2); + break; + } + case 8: + { + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + acc = _mm_xor_si128(add1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128(prand); + _mm_store_si128(prand, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + acc = _mm_xor_si128(clprod12, acc); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + acc = _mm_xor_si128(clprod22, acc); + + const __m128i tempb1 
= _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prandex, tempb2); + break; + } + case 0xc: + { + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + + // cannot be zero here + const int32_t divisor = (uint32_t)selector; + + acc = _mm_xor_si128(add1, acc); + + const int64_t dividend = _mm_cvtsi128_si64(acc); + const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); + acc = _mm_xor_si128(modulo, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + if (dividend & 1) + { + const __m128i temp12 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + acc = _mm_xor_si128(clprod12, acc); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + acc = _mm_xor_si128(clprod22, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prand, tempb2); + } + else + { + const __m128i tempb3 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa2); + _mm_store_si128(prand, tempb3); + const __m128i tempb4 = _mm_load_si128(pbuf); + acc = _mm_xor_si128(tempb4, acc); + } + break; + } + case 0x10: + { + // a few AES operations + const __m128i *rc = prand; + __m128i tmp; + + __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + __m128i temp2 = _mm_load_si128(pbuf); + + AES2(temp1, temp2, 0); + MIX2(temp1, temp2); + + AES2(temp1, temp2, 4); + MIX2(temp1, temp2); + + AES2(temp1, temp2, 8); + MIX2(temp1, temp2); + + acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc)); + + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + + const __m128i tempa4 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa3); + _mm_store_si128(prand, tempa4); + break; + } + case 0x14: + { + // we'll just call this one the monkins loop, inspired by Chris - modified to cast to uint64_t on shift for more variability in the loop + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + __m128i tmp; // used by MIX2 + + uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times + __m128i *rc = prand; + uint64_t aesroundoffset = 0; + __m128i onekey; + + do + { + if (selector & (((uint64_t)0x10000000) << rounds)) + { + onekey = _mm_load_si128(rc++); + const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); + const __m128i add1 = _mm_xor_si128(onekey, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + } + else + { + onekey = _mm_load_si128(rc++); + __m128i temp2 = _mm_load_si128(rounds & 1 ? 
buftmp : pbuf); + AES2(onekey, temp2, aesroundoffset); + aesroundoffset += 4; + MIX2(onekey, temp2); + acc = _mm_xor_si128(onekey, acc); + acc = _mm_xor_si128(temp2, acc); + } + } while (rounds--); + + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + + const __m128i tempa4 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa3); + _mm_store_si128(prand, tempa4); + break; + } + case 0x18: + { + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + __m128i tmp; // used by MIX2 + + uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times + __m128i *rc = prand; + __m128i onekey; + + do + { + if (selector & (((uint64_t)0x10000000) << rounds)) + { + onekey = _mm_load_si128(rc++); + const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); + onekey = _mm_xor_si128(onekey, temp2); + // cannot be zero here, may be negative + const int32_t divisor = (uint32_t)selector; + const int64_t dividend = _mm_cvtsi128_si64(onekey); + const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); + acc = _mm_xor_si128(modulo, acc); + } + else + { + onekey = _mm_load_si128(rc++); + __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); + const __m128i add1 = _mm_xor_si128(onekey, temp2); + onekey = _mm_clmulepi64_si128(add1, add1, 0x10); + const __m128i clprod2 = _mm_mulhrs_epi16(acc, onekey); + acc = _mm_xor_si128(clprod2, acc); + } + } while (rounds--); + + const __m128i tempa3 = _mm_load_si128(prandex); + const __m128i tempa4 = _mm_xor_si128(tempa3, acc); + + _mm_store_si128(prandex, onekey); + _mm_store_si128(prand, tempa4); + break; + } + case 0x1c: + { + const __m128i temp1 = _mm_load_si128(pbuf); + const __m128i temp2 = _mm_load_si128(prandex); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); + + const __m128i tempa3 = _mm_load_si128(prand); + _mm_store_si128(prand, tempa2); + + acc = _mm_xor_si128(tempa3, acc); + const __m128i temp4 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + acc = _mm_xor_si128(temp4, acc); + const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); + const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); + _mm_store_si128(prandex, tempb2); + break; + } + } + } + return acc; +} + // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times, // returning a 64 bit hash value -uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, +uint64_t verusclhashv2_1(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex) { - __m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, fixrand, fixrandex, g_prand, g_prandex); + __m128i acc = __verusclmulwithoutreduction64alignedrepeatv2_1((__m128i *)random, (const __m128i *)buf, 511, fixrand, fixrandex, g_prand, g_prandex); acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64)); return precompReduction64(acc); } -inline void haraka512_keyed_local(unsigned char *out, const unsigned char *in, const u128 *rc) { - u128 s[4], tmp; - - s[0] = LOAD(in); - s[1] = LOAD(in + 16); - s[2] = LOAD(in + 32); - s[3] = LOAD(in + 48); - - AES4(s[0], s[1], 
s[2], s[3], 0); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 8); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 16); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 24); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 32); - MIX4(s[0], s[1], s[2], s[3]); - - s[0] = _mm_xor_si128(s[0], LOAD(in)); - s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); - s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); - s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); - - // TRUNCSTORE(out, s[0], s[1], s[2], s[3]); -} -/* -void cpu_verushash::solve_verus_v2_opt(CBlockHeader &bh, - arith_uint256 &target, - std::function cancelf, - std::function&, size_t, const unsigned char*)> solutionf, - std::function hashdonef, - cpu_verushash &device_context) -{ - CVerusHashV2bWriter &vhw = *(device_context.pVHW2b); - CVerusHashV2 &vh = vhw.GetState(); - verusclhasher &vclh = vh.vclh; - - alignas(32) uint256 curHash, curTarget = ArithToUint256(target); - - const uint64_t *compResult = (uint64_t *)&curHash; - const uint64_t *compTarget = (uint64_t *)&curTarget; - - u128 *hashKey = (u128 *)verusclhasher_key.get(); - verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get(); - void *hasherrefresh = ((unsigned char *)hashKey) + pdesc->keySizeInBytes; - const int keyrefreshsize = vclh.keyrefreshsize(); // number of 256 bit blocks - - bh.nSolution = std::vector(1344); - bh.nSolution[0] = VERUSHHASH_SOLUTION_VERSION; // earliest VerusHash 2.0 solution version - - // prepare the hash state - vhw.Reset(); - vhw << bh; - - int64_t *extraPtr = vhw.xI64p(); - unsigned char *curBuf = vh.CurBuffer(); - - // skip keygen if it is the current key - if (pdesc->seed != *((uint256 *)curBuf)) - { - // generate a new key by chain hashing with Haraka256 from the last curbuf - // assume 256 bit boundary - int n256blks = pdesc->keySizeInBytes >> 5; - unsigned char *pkey = ((unsigned char *)hashKey); - unsigned char *psrc = curBuf; - for (int i = 0; i < n256blks; i++) - { - haraka256(pkey, psrc); - psrc = pkey; - pkey += 32; - } - pdesc->seed = *((uint256 *)curBuf); - memcpy(hasherrefresh, hashKey, pdesc->keySizeInBytes); - } - - const __m128i shuf1 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0); - const __m128i fill1 = _mm_shuffle_epi8(_mm_load_si128((u128 *)curBuf), shuf1); - const __m128i shuf2 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0); - unsigned char ch = curBuf[0]; - - // loop the requested number of times or until canceled. determine if we - // found a winner, and send all winners found as solutions. count only one hash. 
- // hashrate is determined by multiplying hash by VERUSHASHES_PER_SOLVE, with VerusHash, only - // hashrate and sharerate are valid, solutionrate will equal sharerate - for (int64_t i = 0; i < VERUSHASHES_PER_SOLVE; i++) - { - *extraPtr = i; - - // prepare the buffer - _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill1); - curBuf[32 + 15] = ch; - - // run verusclhash on the buffer - const uint64_t intermediate = vclh(curBuf, hashKey); - - // fill buffer to the end with the result and final hash - __m128i fill2 = _mm_shuffle_epi8(_mm_loadl_epi64((u128 *)&intermediate), shuf2); - _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill2); - curBuf[32 + 15] = *((unsigned char *)&intermediate); - - haraka512_keyed_local((unsigned char *)&curHash, curBuf, hashKey + vh.IntermediateTo128Offset(intermediate)); - - if (compResult[3] > compTarget[3] || (compResult[3] == compTarget[3] && compResult[2] > compTarget[2]) || - (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] > compTarget[1]) || - (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] == compTarget[1] && compResult[0] > compTarget[0])) - { - // refresh the key - memcpy(hashKey, hasherrefresh, keyrefreshsize); - continue; - } - - std::vector solution = bh.nSolution; - int extraSpace = (solution.size() % 32) + 15; - assert(solution.size() > 32); - *((int64_t *)&(solution.data()[solution.size() - extraSpace])) = i; - - solutionf(std::vector(0), solution.size(), solution.data()); - if (cancelf()) return; - - // refresh the key - memcpy(hashKey, hasherrefresh, keyrefreshsize); - } - hashdonef(); -} - - -void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) { - u128 s[4], tmp; - - s[0] = LOAD(in); - s[1] = LOAD(in + 16); - s[2] = LOAD(in + 32); - s[3] = LOAD(in + 48); - - AES4(s[0], s[1], s[2], s[3], 0); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 8); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 16); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 24); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 32); - MIX4(s[0], s[1], s[2], s[3]); +uint64_t verusclhashv2_2(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, + u128 *g_prand, u128 *g_prandex) { + __m128i acc = __verusclmulwithoutreduction64alignedrepeatv2_2((__m128i *)random, (const __m128i *)buf, 511, fixrand, fixrandex, g_prand, g_prandex); + acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64)); - s[0] = _mm_xor_si128(s[0], LOAD(in)); - s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); - s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); - s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); - TRUNCSTORE(out, s[0], s[1], s[2], s[3]); + return precompReduction64(acc); } -*/ #ifdef _WIN32 diff --git a/verus/verus_clhash.h b/verus/verus_clhash.h index ea3e94b3a2..5a5a2723f9 100644 --- a/verus/verus_clhash.h +++ b/verus/verus_clhash.h @@ -123,7 +123,9 @@ inline void ForceCPUVerusOptimized(bool trueorfalse) __cpuverusoptimized = trueorfalse; }; -uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, +uint64_t verusclhashv2_1(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, + u128 *g_prand, u128 *g_prandex); +uint64_t verusclhashv2_2(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex); uint64_t verusclhash_port(void * 
random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex); diff --git a/verus/verus_hash.cpp b/verus/verus_hash.cpp index 93b7a6e17b..8b153e83cd 100644 --- a/verus/verus_hash.cpp +++ b/verus/verus_hash.cpp @@ -1,590 +1,181 @@ -/* - * This uses veriations of the clhash algorithm for Verus Coin, licensed - * with the Apache-2.0 open source license. - * - * Copyright (c) 2018 Michael Toutonghi - * Distributed under the Apache 2.0 software license, available in the original form for clhash - * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a - * - * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser - * Faster 64-bit universal hashing - * using carry-less multiplications, Journal of Cryptographic Engineering (to appear) - * - * Best used on recent x64 processors (Haswell or better). - * - * This implements an intermediate step in the last part of a Verus block hash. The intent of this step - * is to more effectively equalize FPGAs over GPUs and CPUs. - * - **/ - - -#include "verus_clhash.h" +// (C) 2018 The Verus Developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. - -#include +/* +This provides the PoW hash function for Verus, a CPU-optimized hash +function with a Haraka V2 core. Unlike Haraka, which is made for short +inputs only, Verus Hash takes any length of input and produces a 256 +bit output. +*/ #include -//#include -//#include "cpu_verushash.hpp" - -#ifdef _WIN32 -#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) -#endif +//#include "common.h" +#include "verus_hash.h" +void (*CVerusHash::haraka512Function)(unsigned char *out, const unsigned char *in); -int __cpuverusoptimized = 0x80; - -// multiply the length and the some key, no modulo -static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) { - const __m128i lengthvector = _mm_set_epi64x(keylength,length); - const __m128i clprod1 = _mm_clmulepi64_si128( lengthvector, lengthvector, 0x10); - return clprod1; -} - -// modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64 -static inline __m128i precompReduction64_si128( __m128i A) { - - //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. 
(64,4,3,1,0) - const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); - __m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01); - __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153), - _mm_srli_si128(Q2,8)); - __m128i Q4 = _mm_xor_si128(Q2,A); - const __m128i final = _mm_xor_si128(Q3,Q4); - return final;/// WARNING: HIGH 64 BITS CONTAIN GARBAGE -} +void CVerusHash::Hash(void *result, const void *data, size_t _len) +{ + unsigned char buf[128]; + unsigned char *bufPtr = buf; + int nextOffset = 64; + uint32_t pos = 0, len = _len; + unsigned char *bufPtr2 = bufPtr + nextOffset; + unsigned char *ptr = (unsigned char *)data; + + // put our last result or zero at beginning of buffer each time + memset(bufPtr, 0, 32); + + // digest up to 32 bytes at a time + for ( ; pos < len; pos += 32) + { + if (len - pos >= 32) + { + memcpy(bufPtr + 32, ptr + pos, 32); + } + else + { + int i = (int)(len - pos); + memcpy(bufPtr + 32, ptr + pos, i); + memset(bufPtr + 32 + i, 0, 32 - i); + } + (*haraka512Function)(bufPtr2, bufPtr); + bufPtr2 = bufPtr; + bufPtr += nextOffset; + nextOffset *= -1; + } + memcpy(result, bufPtr, 32); +}; -static inline uint64_t precompReduction64( __m128i A) { - return _mm_cvtsi128_si64(precompReduction64_si128(A)); +void CVerusHash::init() +{ + + haraka512Function = &haraka512_port_zero; + } -// verus intermediate hash extra -static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, - uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex) +CVerusHash &CVerusHash::Write(const unsigned char *data, size_t _len) { - __m128i *pbuf; - __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] }; - - // divide key mask by 16 from bytes to __m128i - keyMask >>= 4; - - __m128i acc = _mm_load_si128(randomsource + (keyMask + 2)); - - // the random buffer must have at least 32 16 byte dwords after the keymask to work with this - // algorithm. 
we take the value from the last element inside the keyMask + 2, as that will never - // be used to xor into the accumulator before it is hashed with other values first -#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0) + unsigned char *tmp; + uint32_t pos, len = _len; -#define LIONELK_FETCH_DIST 0 - -#pragma unroll 32 - - for (uint64_t i = 0; i < 32; i++) + // digest up to 32 bytes at a time + for ( pos = 0; pos < len; ) { - - const uint64_t selector = _mm_cvtsi128_si64(acc); - - // get two random locations in the key, which will be mutated and swapped - __m128i *prand = randomsource + ((selector >> 5) & keyMask); - __m128i *prandex = randomsource + ((selector >> 32) & keyMask); - - // select random start and order of pbuf processing - pbuf = pbuf_copy + (selector & 3); - uint32_t prand_idx = (selector >> 5) & keyMask; - uint32_t prandex_idx = (selector >>32) & keyMask; - g_prand[i] = prand[0]; - g_prandex[i] = prandex[0]; - fixrand[i] = prand_idx; - fixrandex[i] = prandex_idx; + uint32_t room = 32 - curPos; - switch (selector & 0x1c) + if (len - pos >= room) { - - case 0: - { - const __m128i temp1 = _mm_load_si128(prandex); - const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - const __m128i temp12 = _mm_load_si128(prand); - _mm_store_si128(prand, tempa2); - - const __m128i temp22 = _mm_load_si128(pbuf); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); - acc = _mm_xor_si128(clprod12, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 4: - { - const __m128i temp1 = _mm_load_si128(prand); - const __m128i temp2 = _mm_load_si128(pbuf); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - const __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10); - acc = _mm_xor_si128(clprod2, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - const __m128i temp12 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - - const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - acc = _mm_xor_si128(add12, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prand, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 8: - { - const __m128i temp1 = _mm_load_si128(prandex); - const __m128i temp2 = _mm_load_si128(pbuf); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - acc = _mm_xor_si128(add1, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - const __m128i temp12 = 
_mm_load_si128(prand); - _mm_store_si128(prand, tempa2); - - const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); - acc = _mm_xor_si128(clprod12, acc); - const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); - acc = _mm_xor_si128(clprod22, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prandex, tempb2); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 0xc: - { - const __m128i temp1 = _mm_load_si128(prand); - const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - - // cannot be zero here - const int32_t divisor = (uint32_t)selector; - - acc = _mm_xor_si128(add1, acc); - - const int64_t dividend = _mm_cvtsi128_si64(acc); - const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); - acc = _mm_xor_si128(modulo, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - if (dividend & 1) - { - const __m128i temp12 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - - const __m128i temp22 = _mm_load_si128(pbuf); - const __m128i add12 = _mm_xor_si128(temp12, temp22); - const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); - acc = _mm_xor_si128(clprod12, acc); - const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); - acc = _mm_xor_si128(clprod22, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prand, tempb2); - } - else - { - const __m128i tempb3 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - _mm_store_si128(prand, tempb3); - } - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 0x10: - { - // a few AES operations - const __m128i *rc = prand; - __m128i tmp; - - __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - __m128i temp2 = _mm_load_si128(pbuf); - - AES2(temp1, temp2, 0); - - MIX2(temp1, temp2); - - AES2(temp1, temp2, 4); - MIX2(temp1, temp2); - - AES2(temp1, temp2, 8); - MIX2(temp1, temp2); - - acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc)); - - const __m128i tempa1 = _mm_load_si128(prand); - const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); - - const __m128i tempa4 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa3); - _mm_store_si128(prand, tempa4); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 0x14: - { - // we'll just call this one the monkins loop, inspired by Chris - const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); - __m128i tmp; // used by MIX2 - - uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times - __m128i *rc = prand; - uint64_t aesroundoffset = 0,loop_c; - __m128i onekey; - - do - { - loop_c = selector & (((uint64_t)0x10000000) << rounds); - if (loop_c) - { - onekey = _mm_load_si128(rc++); - const __m128i temp2 = _mm_load_si128(rounds & 
1 ? pbuf : buftmp); - const __m128i add1 = _mm_xor_si128(onekey, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - } - else - { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); - - AES2(onekey, temp2, aesroundoffset); - - aesroundoffset += 4; - MIX2(onekey, temp2); - - acc = _mm_xor_si128(onekey, acc); - acc = _mm_xor_si128(temp2, acc); - } - - } while (rounds--); - - const __m128i tempa1 = _mm_load_si128(prand); - const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); - - const __m128i tempa4 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa3); - _mm_store_si128(prand, tempa4); - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } - case 0x18: - { - __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); - __m128i tmp; // used by MIX2 - - uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times - __m128i *rc = prand; - uint64_t aesroundoffset = 0; - __m128i onekey; - - do - { - if (selector & (((uint64_t)0x10000000) << rounds)) - { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); - __m128i add1 = _mm_xor_si128(onekey, temp2); - // cannot be zero here, may be negative - int32_t divisor = (uint32_t)selector; - int64_t dividend = _mm_cvtsi128_si64(add1); - __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); - acc = _mm_xor_si128(modulo, acc); - } - else - { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); - __m128i add1 = _mm_xor_si128(onekey, temp2); - __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - __m128i clprod2 = _mm_mulhrs_epi16(acc, clprod1); - acc = _mm_xor_si128(clprod2, acc); - } - } while (rounds--); - - __m128i tempa3 = _mm_load_si128(prandex); - __m128i tempa4 = _mm_xor_si128(tempa3, acc); - _mm_store_si128(prandex, tempa4); - _mm_store_si128(prand, onekey); - break; - } - case 0x1c: - { - const __m128i temp1 = _mm_load_si128(pbuf); - const __m128i temp2 = _mm_load_si128(prandex); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); - - const __m128i tempa3 = _mm_load_si128(prand); -#ifdef VERUSHASHDEBUGo - - printf("[cpu] tempa1 : "); - printf("%016llx%016llx", ((uint64_t*)&tempa1)[0], ((uint64_t*)&tempa1)[1]); - printf("\n"); - printf("[cpu] tempa2 : "); - printf("%016llx%016llx", ((uint64_t*)&tempa2)[0], ((uint64_t*)&tempa2)[1]); - printf("\n"); - printf("[cpu] tempa3 : "); - printf("%016llx%016llx", ((uint64_t*)&tempa3)[0], ((uint64_t*)&tempa3)[1]); - printf("\n"); - -#endif - _mm_store_si128(prand, tempa2); - - acc = _mm_xor_si128(tempa3, acc); - - const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); - const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); - _mm_store_si128(prandex, tempb2); - - PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); - PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); - - break; - } + memcpy(curBuf + 32 + curPos, data + pos, room); + (*haraka512Function)(result, curBuf); + tmp = curBuf; + curBuf = result; + result = tmp; + pos += room; + curPos = 0; + } + else + { + 
memcpy(curBuf + 32 + curPos, data + pos, len - pos); + curPos += len - pos; + pos = len; } - - - } - - return acc; + return *this; } -// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times, -// returning a 64 bit hash value -uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, - u128 *g_prand, u128 *g_prandex) { - __m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, fixrand, fixrandex, g_prand, g_prandex); - acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64)); - - - return precompReduction64(acc); +// to be declared and accessed from C +void verus_hash(void *result, const void *data, size_t len) +{ + return CVerusHash::Hash(result, data, len); } -inline void haraka512_keyed_local(unsigned char *out, const unsigned char *in, const u128 *rc) { - u128 s[4], tmp; - - s[0] = LOAD(in); - s[1] = LOAD(in + 16); - s[2] = LOAD(in + 32); - s[3] = LOAD(in + 48); - - AES4(s[0], s[1], s[2], s[3], 0); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 8); - MIX4(s[0], s[1], s[2], s[3]); +void (*CVerusHashV2::haraka512Function)(unsigned char *out, const unsigned char *in); +void (*CVerusHashV2::haraka512KeyedFunction)(unsigned char *out, const unsigned char *in, const u128 *rc); +void (*CVerusHashV2::haraka256Function)(unsigned char *out, const unsigned char *in); - AES4(s[0], s[1], s[2], s[3], 16); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 24); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 32); - MIX4(s[0], s[1], s[2], s[3]); - - s[0] = _mm_xor_si128(s[0], LOAD(in)); - s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); - s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); - s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); - - // TRUNCSTORE(out, s[0], s[1], s[2], s[3]); -} -/* -void cpu_verushash::solve_verus_v2_opt(CBlockHeader &bh, - arith_uint256 &target, - std::function cancelf, - std::function&, size_t, const unsigned char*)> solutionf, - std::function hashdonef, - cpu_verushash &device_context) +void CVerusHashV2::init() { - CVerusHashV2bWriter &vhw = *(device_context.pVHW2b); - CVerusHashV2 &vh = vhw.GetState(); - verusclhasher &vclh = vh.vclh; - - alignas(32) uint256 curHash, curTarget = ArithToUint256(target); - - const uint64_t *compResult = (uint64_t *)&curHash; - const uint64_t *compTarget = (uint64_t *)&curTarget; - - u128 *hashKey = (u128 *)verusclhasher_key.get(); - verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get(); - void *hasherrefresh = ((unsigned char *)hashKey) + pdesc->keySizeInBytes; - const int keyrefreshsize = vclh.keyrefreshsize(); // number of 256 bit blocks - - bh.nSolution = std::vector(1344); - bh.nSolution[0] = VERUSHHASH_SOLUTION_VERSION; // earliest VerusHash 2.0 solution version + if (IsCPUVerusOptimized()) + { + load_constants(); + haraka512Function = &haraka512; + haraka512KeyedFunction = &haraka512_keyed; + haraka256Function = &haraka256; + } + else + { + // load the haraka constants + load_constants_port(); + haraka512Function = &haraka512_port; + haraka512KeyedFunction = &haraka512_port_keyed; + haraka256Function = &haraka256_port; + } +} - // prepare the hash state - vhw.Reset(); - vhw << bh; +void CVerusHashV2::Hash(void *result, const void *data, size_t len) +{ + unsigned char buf[128]; + unsigned char *bufPtr = buf; + int pos = 0, nextOffset = 64; + unsigned char *bufPtr2 = bufPtr + nextOffset; + unsigned char *ptr = (unsigned 
-    int64_t *extraPtr = vhw.xI64p();
-    unsigned char *curBuf = vh.CurBuffer();
+    // put our last result or zero at beginning of buffer each time
+    memset(bufPtr, 0, 32);
-    // skip keygen if it is the current key
-    if (pdesc->seed != *((uint256 *)curBuf))
+    // digest up to 32 bytes at a time
+    for ( ; pos < len; pos += 32)
     {
-        // generate a new key by chain hashing with Haraka256 from the last curbuf
-        // assume 256 bit boundary
-        int n256blks = pdesc->keySizeInBytes >> 5;
-        unsigned char *pkey = ((unsigned char *)hashKey);
-        unsigned char *psrc = curBuf;
-        for (int i = 0; i < n256blks; i++)
+        if (len - pos >= 32)
         {
-            haraka256(pkey, psrc);
-            psrc = pkey;
-            pkey += 32;
+            memcpy(bufPtr + 32, ptr + pos, 32);
         }
-        pdesc->seed = *((uint256 *)curBuf);
-        memcpy(hasherrefresh, hashKey, pdesc->keySizeInBytes);
+        else
+        {
+            int i = (int)(len - pos);
+            memcpy(bufPtr + 32, ptr + pos, i);
+            memset(bufPtr + 32 + i, 0, 32 - i);
+        }
+        (*haraka512Function)(bufPtr2, bufPtr);
+        bufPtr2 = bufPtr;
+        bufPtr += nextOffset;
+        nextOffset *= -1;
     }
+    memcpy(result, bufPtr, 32);
+};
-    const __m128i shuf1 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0);
-    const __m128i fill1 = _mm_shuffle_epi8(_mm_load_si128((u128 *)curBuf), shuf1);
-    const __m128i shuf2 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0);
-    unsigned char ch = curBuf[0];
-
-    // loop the requested number of times or until canceled. determine if we
-    // found a winner, and send all winners found as solutions. count only one hash.
-    // hashrate is determined by multiplying hash by VERUSHASHES_PER_SOLVE, with VerusHash, only
-    // hashrate and sharerate are valid, solutionrate will equal sharerate
-    for (int64_t i = 0; i < VERUSHASHES_PER_SOLVE; i++)
-    {
-        *extraPtr = i;
-
-        // prepare the buffer
-        _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill1);
-        curBuf[32 + 15] = ch;
-
-        // run verusclhash on the buffer
-        const uint64_t intermediate = vclh(curBuf, hashKey);
-
-        // fill buffer to the end with the result and final hash
-        __m128i fill2 = _mm_shuffle_epi8(_mm_loadl_epi64((u128 *)&intermediate), shuf2);
-        _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill2);
-        curBuf[32 + 15] = *((unsigned char *)&intermediate);
+CVerusHashV2 &CVerusHashV2::Write(const unsigned char *data, size_t len)
+{
+    unsigned char *tmp;
-        haraka512_keyed_local((unsigned char *)&curHash, curBuf, hashKey + vh.IntermediateTo128Offset(intermediate));
+    // digest up to 32 bytes at a time
+    for ( int pos = 0; pos < len; )
+    {
+        int room = 32 - curPos;
-        if (compResult[3] > compTarget[3] || (compResult[3] == compTarget[3] && compResult[2] > compTarget[2]) ||
-            (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] > compTarget[1]) ||
-            (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] == compTarget[1] && compResult[0] > compTarget[0]))
+        if (len - pos >= room)
         {
-            // refresh the key
-            memcpy(hashKey, hasherrefresh, keyrefreshsize);
-            continue;
+            memcpy(curBuf + 32 + curPos, data + pos, room);
+            (*haraka512Function)(result, curBuf);
+            tmp = curBuf;
+            curBuf = result;
+            result = tmp;
+            pos += room;
+            curPos = 0;
         }
-
-        std::vector<unsigned char> solution = bh.nSolution;
-        int extraSpace = (solution.size() % 32) + 15;
-        assert(solution.size() > 32);
-        *((int64_t *)&(solution.data()[solution.size() - extraSpace])) = i;
-
-        solutionf(std::vector<uint32_t>(0), solution.size(), solution.data());
-        if (cancelf()) return;
-
-        // refresh the key
-        memcpy(hashKey, hasherrefresh, keyrefreshsize);
-    }
-    hashdonef();
-}
-
-
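The CVerusHashV2::Hash routine added above absorbs the input 32 bytes per round: each round compresses a 64-byte block (32-byte chain value followed by a 32-byte message chunk) and ping-pongs between the two halves of a single 128-byte buffer, so the output of one compression sits in place as the chain value of the next. A self-contained sketch of that choreography, where toy_compress512 is a made-up stand-in for the real (*haraka512Function):

    // Sketch only: toy_compress512 is NOT Haraka512, just a placeholder
    // 64 -> 32 byte mix so the buffer choreography can run standalone.
    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    static void toy_compress512(unsigned char *out, const unsigned char *in)
    {
        for (int i = 0; i < 32; i++)
            out[i] = (unsigned char)(in[i] * 31 + in[i + 32] * 131 + i);
    }

    static void toy_hash(void *result, const void *data, size_t len)
    {
        unsigned char buf[128];
        unsigned char *bufPtr  = buf;       // active block: chain value + message chunk
        unsigned char *bufPtr2 = buf + 64;  // the other half; receives each compression
        int nextOffset = 64;
        const unsigned char *ptr = (const unsigned char *)data;

        memset(bufPtr, 0, 32);              // initial chain value is all zero
        for (size_t pos = 0; pos < len; pos += 32)
        {
            size_t n = (len - pos >= 32) ? 32 : len - pos;
            memcpy(bufPtr + 32, ptr + pos, n);   // next 32 message bytes
            memset(bufPtr + 32 + n, 0, 32 - n);  // zero-pad the final chunk
            toy_compress512(bufPtr2, bufPtr);    // compress into the other half,
            bufPtr2 = bufPtr;                    // then swap roles: the output is the
            bufPtr += nextOffset;                // chain value of the next round
            nextOffset *= -1;
        }
        memcpy(result, bufPtr, 32);              // first 32 bytes of the last output
    }

    int main()
    {
        unsigned char digest[32];
        const char msg[] = "ping-pong buffering demo";
        toy_hash(digest, msg, sizeof(msg));
        for (int i = 0; i < 32; i++) printf("%02x", digest[i]);
        printf("\n");
        return 0;
    }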
-void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) {
-    u128 s[4], tmp;
-
-    s[0] = LOAD(in);
-    s[1] = LOAD(in + 16);
-    s[2] = LOAD(in + 32);
-    s[3] = LOAD(in + 48);
-
-    AES4(s[0], s[1], s[2], s[3], 0);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    AES4(s[0], s[1], s[2], s[3], 8);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    AES4(s[0], s[1], s[2], s[3], 16);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    AES4(s[0], s[1], s[2], s[3], 24);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    AES4(s[0], s[1], s[2], s[3], 32);
-    MIX4(s[0], s[1], s[2], s[3]);
-
-    s[0] = _mm_xor_si128(s[0], LOAD(in));
-    s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
-    s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
-    s[3] = _mm_xor_si128(s[3], LOAD(in + 48));
-
-    TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
+        else
+        {
+            memcpy(curBuf + 32 + curPos, data + pos, len - pos);
+            curPos += len - pos;
+            pos = len;
+        }
+    }
+    return *this;
 }
-*/
-
-#ifdef _WIN32
-#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
-#endif
-
-void *alloc_aligned_buffer(uint64_t bufSize)
+// to be declared and accessed from C
+void verus_hash_v2(void *result, const void *data, size_t len)
 {
-    void *answer = NULL;
-    if (posix_memalign(&answer, sizeof(__m256i), bufSize))
-    {
-        return NULL;
-    }
-    else
-    {
-        return answer;
-    }
+    return CVerusHashV2::Hash(result, data, len);
 }
diff --git a/verus/verusscan.cpp b/verus/verusscan.cpp
index 4d5c2d536c..f8dae57bf5 100644
--- a/verus/verusscan.cpp
+++ b/verus/verusscan.cpp
@@ -130,20 +130,27 @@ extern "C" void VerusHashHalf(void *result2, unsigned char *data, size_t len)
 extern "C" void Verus2hash(unsigned char *hash, unsigned char *curBuf, uint32_t nonce,
-    u128 *data_key, uint8_t *gpu_init, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex)
+    u128 *data_key, uint8_t *gpu_init, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand,
+    u128 *g_prandex, int version)
 {
     //uint64_t mask = VERUS_KEY_SIZE128; //552
-
-    memcpy(curBuf + 47, curBuf, 16);
-    memcpy(curBuf + 63, curBuf, 1);
+    static const __m128i shuf1 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0);
+    const __m128i fill1 = _mm_shuffle_epi8(_mm_load_si128((u128 *)curBuf), shuf1);
+    static const __m128i shuf2 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0);
+    unsigned char ch = curBuf[0];
+    _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill1);
+    curBuf[32 + 15] = ch;
     // FillExtra((u128 *)curBuf);
-
+    uint64_t intermediate;
     ((uint32_t*)&curBuf[0])[8] = nonce;
-    uint64_t intermediate = verusclhash(data_key, curBuf, 8191, fixrand, fixrandex, g_prand, g_prandex);
+    if (version == 3)
+        intermediate = verusclhashv2_1(data_key, curBuf, 511, fixrand, fixrandex, g_prand, g_prandex);
+    else
+        intermediate = verusclhashv2_2(data_key, curBuf, 511, fixrand, fixrandex, g_prand, g_prandex);
     //FillExtra
-    memcpy(curBuf + 47, &intermediate, 8);
-    memcpy(curBuf + 55, &intermediate, 8);
-    memcpy(curBuf + 63, &intermediate, 1);
+    __m128i fill2 = _mm_shuffle_epi8(_mm_loadl_epi64((u128 *)&intermediate), shuf2);
+    _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill2);
+    curBuf[32 + 15] = *((unsigned char *)&intermediate);
     intermediate &= 511;
     haraka512_keyed(hash, curBuf, data_key + intermediate);
     FixKey(fixrand, fixrandex, data_key, g_prand, g_prandex);
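The hunk above replaces the three byte-wise memcpy() fills at offsets 47/55/63 with a single byte store plus one shuffled 16-byte store at curBuf[48]. A standalone sanity check (not part of the patch; little-endian x86, compile with -mssse3) showing that the two write patterns leave identical buffer contents:

    #include <emmintrin.h>
    #include <tmmintrin.h>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        uint64_t intermediate = 0x1122334455667788ULL;
        unsigned char a[64] = { 0 }, b[64] = { 0 };

        // removed path: repeat the 8 result bytes from offset 47 to the block end
        memcpy(a + 47, &intermediate, 8);
        memcpy(a + 55, &intermediate, 8);
        memcpy(a + 63, &intermediate, 1);

        // added path: one byte store plus one shuffled 16-byte store
        const __m128i shuf2 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0);
        __m128i fill2 = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i *)&intermediate), shuf2);
        _mm_storeu_si128((__m128i *)&b[32 + 16], fill2);   // unaligned store here; the
        b[32 + 15] = *((unsigned char *)&intermediate);    // patch uses an aligned buffer

        printf("FillExtra write patterns %s\n", memcmp(a, b, 64) == 0 ? "match" : "differ");
        return 0;
    }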
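The hunk that follows folds the two 32-entry scratch tables (data_key_prand / data_key_prandex), previously stack arrays, into the tail of the single data_key heap allocation. The extra 1024 bytes are exactly two tables of 32 sixteen-byte u128 entries. A minimal sketch of the layout arithmetic; u128_t is a stand-in for the real u128 vector type, and the 552-word key size is taken from the "//552" comments above:

    #include <cstdint>
    #include <cstdlib>

    typedef struct { uint64_t lo, hi; } u128_t;  // 16 bytes, like u128

    #define VERUS_KEY_SIZE128 552                // key length in 128-bit words
    #define VERUS_KEY_SIZE (VERUS_KEY_SIZE128 * sizeof(u128_t))

    int main()
    {
        static_assert(2 * 32 * sizeof(u128_t) == 1024, "scratch area is 1024 bytes");

        // one block: the key, then 32 prand entries, then 32 prandex entries
        u128_t *data_key = (u128_t *)malloc(VERUS_KEY_SIZE + 1024);
        if (!data_key) return 1;
        u128_t *data_key_prand   = data_key + VERUS_KEY_SIZE128;       // key end .. +512 bytes
        u128_t *data_key_prandex = data_key + VERUS_KEY_SIZE128 + 32;  // +512 .. +1024 bytes
        (void)data_key_prand; (void)data_key_prandex;
        free(data_key);
        return 0;
    }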
@@ -158,30 +165,31 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce,
 //    unsigned char data[] = { 0x04, 0x00, 0x01, /* ~1.5 KB of commented-out test-block bytes, nearly all 0x00 apart from the 0xfd, 0x40, 0x05, 0x01 solution marker, elided */ };
-    uint32_t _ALIGN(64) endiandata[35];
+    uint32_t _ALIGN(4) endiandata[35];
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
-    uint8_t blockhash_half[64] = { 0 };
+    uint8_t _ALIGN(4) blockhash_half[64] = { 0 };
     uint8_t gpuinit = 0;
     struct timeval tv_start, tv_end, diff;
-    double secs, solps;
-    u128 *data_key = (u128*)malloc(VERUS_KEY_SIZE);
+    //double secs, solps;
+    u128 *data_key = (u128*)malloc(VERUS_KEY_SIZE + 1024);
     //u128 *data_key_master = NULL;
     // posix_memalign((void**)&data_key, sizeof(__m128i), VERUS_KEY_SIZE);
-    u128 data_key_prand[32];
-    u128 data_key_prandex[32];
+    u128 *data_key_prand = data_key + VERUS_KEY_SIZE128;
+    u128 *data_key_prandex = data_key + VERUS_KEY_SIZE128 + 32;
     //u128 data_key[VERUS_KEY_SIZE128] = { 0 }; // 552 required
     //u128 data_key_master[VERUS_KEY_SIZE128] = { 0 };
     uint32_t nonce_buf = 0;
-    uint32_t fixrand[32];
-    uint32_t fixrandex[32];
+    uint32_t _ALIGN(4) fixrand[32];
+    uint32_t _ALIGN(4) fixrandex[32];
-    unsigned char block_41970[] = { 0xfd, 0x40, 0x05, 0x03 };
-    uint8_t _ALIGN(64) full_data[140 + 3 + 1344] = { 0 };
+    uint8_t version = work->hash_ver;
+    unsigned char block_41970[4] = { 0xfd, 0x40, 0x05 };
+    block_41970[3] = version;
+    uint8_t _ALIGN(4) full_data[140 + 3 + 1344] = { 0 };
     uint8_t* sol_data = &full_data[140];
-    memcpy(endiandata, pdata, 140);
     memcpy(sol_data, block_41970, 4);
     memcpy(full_data, endiandata, 140);
@@ -202,7 +210,7 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce,
     *hashes_done = nonce_buf + throughput;
     Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, data_key,
-        &gpuinit, fixrand, fixrandex, data_key_prand, data_key_prandex);
+        &gpuinit, fixrand, fixrandex, data_key_prand, data_key_prandex, version);
     if (vhash[7] <= Htarg)
     {
@@ -230,10 +238,10 @@ Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, data_key,
 out:
-//    gettimeofday(&tv_end, NULL);
-//    timeval_subtract(&diff, &tv_end, &tv_start);
-//    secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec);
-//    solps = (double)nonce_buf / secs;
+    gettimeofday(&tv_end, NULL);
+    //timeval_subtract(&diff, &tv_end, &tv_start);
+    //secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec);
+    //solps = (double)nonce_buf / secs;
     pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
     free(data_key);
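Tying the two hunks together: work->hash_ver supplies both the solution-version byte and the round-function selection. In scanhash_verus it becomes block_41970[3], the first byte of the 1344-byte solution that follows the 0xfd 0x40 0x05 compact-size marker (0x0540 = 1344), and inside Verus2hash the same value picks verusclhashv2_1 for version 3 and verusclhashv2_2 otherwise. A hedged end-to-end sketch, with a string standing in for the real dispatch:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static const char *clhash_variant(int version)
    {
        // mirrors the dispatch in Verus2hash: solution version 3 selects the
        // VerusHash v2.1 round function, anything else the v2.2 one
        return (version == 3) ? "verusclhashv2_1" : "verusclhashv2_2";
    }

    int main()
    {
        uint8_t hash_ver = 3;                           // as read from work->hash_ver
        unsigned char block_41970[4] = { 0xfd, 0x40, 0x05 };
        block_41970[3] = hash_ver;                      // compact-size marker + version byte

        uint8_t full_data[140 + 3 + 1344] = { 0 };
        uint8_t *sol_data = &full_data[140];            // solution sits after the 140-byte header
        memcpy(sol_data, block_41970, 4);

        printf("version %u -> %s, solution prefix %02x %02x %02x %02x\n",
               (unsigned)block_41970[3], clhash_variant(block_41970[3]),
               sol_data[0], sol_data[1], sol_data[2], sol_data[3]);
        return 0;
    }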