diff --git a/Makefile.am b/Makefile.am
index e8a35ca218..d6edb3479c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,7 +18,7 @@ bin_PROGRAMS = ccminer
ccminer_SOURCES = elist.h miner.h compat.h \
compat/inttypes.h compat/stdbool.h compat/unistd.h \
compat/sys/time.h compat/getopt/getopt.h \
- crc32.c hefty1.c scrypt.c \
+ crc32.c hefty1.c \
ccminer.cpp util.cpp \
api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \
heavy/heavy.cu \
@@ -57,6 +57,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \
x11/s3.cu
+# scrypt
+ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \
+ scrypt/blake.cu scrypt/keccak.cu scrypt/sha256.cu \
+ scrypt/salsa_kernel.cu scrypt/test_kernel.cu \
+ scrypt/fermi_kernel.cu scrypt/kepler_kernel.cu \
+ scrypt/nv_kernel.cu scrypt/nv_kernel2.cu scrypt/titan_kernel.cu
+
if HAVE_NVML
nvml_defs = -DUSE_WRAPNVML
nvml_libs = -ldl
@@ -118,6 +125,10 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
+# This kernel also needs an older SM target to be able to autotune kernels
+scrypt/salsa_kernel.o: scrypt/salsa_kernel.cu
+ $(NVCC) $(nvcc_FLAGS) -gencode=arch=compute_20,code=\"sm_21,compute_20\" --maxrregcount=80 -o $@ -c $<
+
skein.o: skein.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
diff --git a/ccminer.cpp b/ccminer.cpp
index def142087b..d3203253ff 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -102,6 +102,8 @@ enum sha_algos {
ALGO_PLUCK,
ALGO_QUARK,
ALGO_QUBIT,
+ ALGO_SCRYPT,
+ ALGO_SCRYPT_JANE,
ALGO_SKEIN,
ALGO_SKEIN2,
ALGO_S3,
@@ -137,6 +139,8 @@ static const char *algo_names[] = {
"pluck",
"quark",
"qubit",
+ "scrypt",
+ "scrypt-jane",
"skein",
"skein2",
"s3",
@@ -184,6 +188,20 @@ char * device_name[MAX_GPUS];
short device_map[MAX_GPUS] = { 0 };
long device_sm[MAX_GPUS] = { 0 };
uint32_t gpus_intensity[MAX_GPUS] = { 0 };
+
+int device_interactive[MAX_GPUS] = { 0 };
+int device_batchsize[MAX_GPUS] = { 0 };
+int device_backoff[MAX_GPUS] = { 0 };
+int device_lookup_gap[MAX_GPUS] = { 0 };
+int device_texturecache[MAX_GPUS] = { 0 };
+int device_singlememory[MAX_GPUS] = { 0 };
+char *device_config[MAX_GPUS] = { 0 };
+int opt_nfactor = 0;
+int parallel = 2;
+bool autotune = true;
+bool abort_flag = false;
+char *jane_params = NULL;
+
char *rpc_user = NULL;
static char *rpc_pass;
static char *rpc_userpass = NULL;
@@ -255,6 +273,8 @@ Options:\n\
pluck SupCoin\n\
quark Quark\n\
qubit Qubit\n\
+ scrypt Scrypt\n\
+ scrypt-jane Scrypt-jane Chacha\n\
skein Skein SHA2 (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
s3 S3 (1Coin)\n\
@@ -439,6 +459,7 @@ void get_currentalgo(char* buf, int sz)
*/
void proper_exit(int reason)
{
+ abort_flag = true;
cuda_devicereset();
if (check_dups)
@@ -1173,6 +1194,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
switch (opt_algo) {
case ALGO_JACKPOT:
case ALGO_PLUCK:
+ case ALGO_SCRYPT:
+ case ALGO_SCRYPT_JANE:
diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
break;
case ALGO_DMD_GR:
@@ -1386,6 +1409,8 @@ static void *miner_thread(void *userdata)
minmax = 0x400000;
break;
case ALGO_LYRA2:
+ case ALGO_SCRYPT:
+ case ALGO_SCRYPT_JANE:
minmax = 0x100000;
break;
case ALGO_PLUCK:
@@ -1526,6 +1551,16 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
+ case ALGO_SCRYPT:
+ rc = scanhash_scrypt(thr_id, work.data, work.target, NULL,
+ max_nonce, &hashes_done, &tv_start, &tv_end);
+ break;
+
+ case ALGO_SCRYPT_JANE:
+ rc = scanhash_scrypt_jane(thr_id, work.data, work.target, NULL,
+ max_nonce, &hashes_done, &tv_start, &tv_end);
+ break;
+
case ALGO_SKEIN:
rc = scanhash_skeincoin(thr_id, work.data, work.target,
max_nonce, &hashes_done);
@@ -1942,15 +1977,29 @@ void parse_arg(int key, char *arg)
switch(key) {
case 'a':
+ p = strstr(arg, ":"); // optional factor
+ if (p) *p = '\0';
for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
- if (algo_names[i] &&
- !strcmp(arg, algo_names[i])) {
+ if (algo_names[i] && !strcasecmp(arg, algo_names[i])) {
opt_algo = (enum sha_algos)i;
break;
}
}
if (i == ARRAY_SIZE(algo_names))
show_usage_and_exit(1);
+ if (p) {
+ opt_nfactor = atoi(p + 1);
+ if (opt_algo == ALGO_SCRYPT_JANE) {
+ free(jane_params);
+ jane_params = strdup(p+1);
+ }
+ }
+ if (!opt_nfactor) {
+ switch (opt_algo) {
+ case ALGO_SCRYPT: opt_nfactor = 9; break;
+ case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
+ }
+ }
break;
case 'b':
p = strstr(arg, ":");
@@ -2404,6 +2453,8 @@ int main(int argc, char *argv[])
rpc_pass = strdup("");
rpc_url = strdup("");
+ jane_params = strdup("");
+
pthread_mutex_init(&applog_lock, NULL);
// number of cpus for thread affinity
@@ -2423,9 +2474,17 @@ int main(int argc, char *argv[])
if (num_cpus < 1)
num_cpus = 1;
- // default thread to device map
for (i = 0; i < MAX_GPUS; i++) {
device_map[i] = i;
+ device_name[i] = NULL;
+ // for future use, maybe
+ device_interactive[i] = -1;
+ device_batchsize[i] = 1024;
+ device_backoff[i] = is_windows() ? 12 : 2;
+ device_lookup_gap[i] = 1;
+ device_texturecache[i] = -1;
+ device_singlememory[i] = -1;
+ device_config[i] = NULL;
}
// number of gpus
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 528d5c07c6..f8b3a6c319 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -250,6 +250,8 @@
false
Full
+
+
@@ -261,10 +263,6 @@
-
- Full
- /Tp %(AdditionalOptions)
-
@@ -322,6 +320,7 @@
+
@@ -352,6 +351,22 @@
+
+
+ compute_20,sm_21;compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52
+
+
+
+
+
+ compute_35,sm_35;compute_50,sm_50;compute_52,sm_52
+
+
+ compute_20,sm_21
+
+
+
+
@@ -510,4 +525,4 @@
-
+
\ No newline at end of file
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 3d1fd916fd..9f62c54e14 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -73,6 +73,9 @@
{f5117ccb-a70d-411a-b7ea-d6faed230bc7}
+
+ {c26f5b02-37b5-4420-a4e8-ee1ad517dc95}
+
@@ -111,9 +114,6 @@
Source Files
-
- Source Files
-
Source Files
@@ -225,6 +225,12 @@
Source Files\sph
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
@@ -377,6 +383,9 @@
Ressources
+
+ Source Files\CUDA\scrypt
+
@@ -580,6 +589,36 @@
Source Files\CUDA
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
@@ -596,4 +635,4 @@
Ressources
-
+
\ No newline at end of file
diff --git a/miner.h b/miner.h
index 71bf153f62..2b8a7b4fe6 100644
--- a/miner.h
+++ b/miner.h
@@ -272,8 +272,6 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
-extern unsigned char *scrypt_buffer_alloc();
-
extern int scanhash_deep(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
@@ -343,8 +341,12 @@ extern int scanhash_qubit(int thr_id, uint32_t *pdata,
unsigned long *hashes_done);
extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
- unsigned char *scratchbuf, const uint32_t *ptarget,
- uint32_t max_nonce, unsigned long *hashes_done);
+ const uint32_t *ptarget, unsigned char *scratchbuf, uint32_t max_nonce,
+ unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end);
+
+extern int scanhash_scrypt_jane(int thr_id, uint32_t *pdata,
+ const uint32_t *ptarget, unsigned char *scratchbuf, uint32_t max_nonce,
+ unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end);
extern int scanhash_skeincoin(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
@@ -683,6 +685,7 @@ void pentablakehash(void *output, const void *input);
void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N);
void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input);
+void scrypthash(void* output, const void* input);
void skeincoinhash(void *output, const void *input);
void skein2hash(void *output, const void *input);
void s3hash(void *output, const void *input);
diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp
new file mode 100644
index 0000000000..ce21ea2de5
--- /dev/null
+++ b/scrypt-jane.cpp
@@ -0,0 +1,626 @@
+/*
+ scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
+
+ Public Domain or MIT License, whichever is easier
+*/
+
+#include "miner.h"
+
+#include "scrypt/scrypt-jane.h"
+#include "scrypt/code/scrypt-jane-portable.h"
+#include "scrypt/code/scrypt-jane-romix.h"
+#include "scrypt/keccak.h"
+
+#include "scrypt/salsa_kernel.h"
+
+#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */
+#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
+#define scrypt_maxr scrypt_r_32kb /* 32kb */
+#define scrypt_maxp 25 /* (1 << 25) = ~33 million */
+
+// ---------------------------- BEGIN keccak functions ------------------------------------
+
+#define SCRYPT_HASH "Keccak-512"
+#define SCRYPT_HASH_DIGEST_SIZE 64
+#define SCRYPT_KECCAK_F 1600
+#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */
+#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */
+#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8)
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+typedef struct scrypt_hash_state_t {
+ uint64_t state[SCRYPT_KECCAK_F / 64];
+ uint32_t leftover;
+ uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+static const uint64_t keccak_round_constants[24] = {
+ 0x0000000000000001ull, 0x0000000000008082ull,
+ 0x800000000000808aull, 0x8000000080008000ull,
+ 0x000000000000808bull, 0x0000000080000001ull,
+ 0x8000000080008081ull, 0x8000000000008009ull,
+ 0x000000000000008aull, 0x0000000000000088ull,
+ 0x0000000080008009ull, 0x000000008000000aull,
+ 0x000000008000808bull, 0x800000000000008bull,
+ 0x8000000000008089ull, 0x8000000000008003ull,
+ 0x8000000000008002ull, 0x8000000000000080ull,
+ 0x000000000000800aull, 0x800000008000000aull,
+ 0x8000000080008081ull, 0x8000000000008080ull,
+ 0x0000000080000001ull, 0x8000000080008008ull
+};
+
+static void
+keccak_block(scrypt_hash_state *S, const uint8_t *in) {
+ size_t i;
+ uint64_t *s = S->state, t[5], u[5], v, w;
+
+ /* absorb input */
+ for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8)
+ s[i] ^= U8TO64_LE(in);
+
+ for (i = 0; i < 24; i++) {
+ /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+ t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
+ t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
+ t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
+ t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
+ t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
+
+ /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+ u[0] = t[4] ^ ROTL64(t[1], 1);
+ u[1] = t[0] ^ ROTL64(t[2], 1);
+ u[2] = t[1] ^ ROTL64(t[3], 1);
+ u[3] = t[2] ^ ROTL64(t[4], 1);
+ u[4] = t[3] ^ ROTL64(t[0], 1);
+
+ /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+ s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
+ s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
+ s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
+ s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
+ s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
+
+ /* rho pi: b[..] = rotl(a[..], ..) */
+ v = s[ 1];
+ s[ 1] = ROTL64(s[ 6], 44);
+ s[ 6] = ROTL64(s[ 9], 20);
+ s[ 9] = ROTL64(s[22], 61);
+ s[22] = ROTL64(s[14], 39);
+ s[14] = ROTL64(s[20], 18);
+ s[20] = ROTL64(s[ 2], 62);
+ s[ 2] = ROTL64(s[12], 43);
+ s[12] = ROTL64(s[13], 25);
+ s[13] = ROTL64(s[19], 8);
+ s[19] = ROTL64(s[23], 56);
+ s[23] = ROTL64(s[15], 41);
+ s[15] = ROTL64(s[ 4], 27);
+ s[ 4] = ROTL64(s[24], 14);
+ s[24] = ROTL64(s[21], 2);
+ s[21] = ROTL64(s[ 8], 55);
+ s[ 8] = ROTL64(s[16], 45);
+ s[16] = ROTL64(s[ 5], 36);
+ s[ 5] = ROTL64(s[ 3], 28);
+ s[ 3] = ROTL64(s[18], 21);
+ s[18] = ROTL64(s[17], 15);
+ s[17] = ROTL64(s[11], 10);
+ s[11] = ROTL64(s[ 7], 6);
+ s[ 7] = ROTL64(s[10], 3);
+ s[10] = ROTL64( v, 1);
+
+ /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+ v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
+ v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
+ v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
+ v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
+ v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
+
+ /* iota: a[0,0] ^= round constant */
+ s[0] ^= keccak_round_constants[i];
+ }
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+ memset(S, 0, sizeof(*S));
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+ size_t want;
+
+ /* handle the previous data */
+ if (S->leftover) {
+ want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+ want = (want < inlen) ? want : inlen;
+ memcpy(S->buffer + S->leftover, in, want);
+ S->leftover += (uint32_t)want;
+ if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+ return;
+ in += want;
+ inlen -= want;
+ keccak_block(S, S->buffer);
+ }
+
+ /* handle the current data */
+ while (inlen >= SCRYPT_HASH_BLOCK_SIZE) {
+ keccak_block(S, in);
+ in += SCRYPT_HASH_BLOCK_SIZE;
+ inlen -= SCRYPT_HASH_BLOCK_SIZE;
+ }
+
+ /* handle leftover data */
+ S->leftover = (uint32_t)inlen;
+ if (S->leftover)
+ memcpy(S->buffer, in, S->leftover);
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+ size_t i;
+
+ S->buffer[S->leftover] = 0x01;
+ memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1));
+ S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80;
+ keccak_block(S, S->buffer);
+
+ for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) {
+ U64TO8_LE(&hash[i], S->state[i / 8]);
+ }
+}
+
+// ---------------------------- END keccak functions ------------------------------------
+
+// ---------------------------- BEGIN PBKDF2 functions ------------------------------------
+
+typedef struct scrypt_hmac_state_t {
+ scrypt_hash_state inner, outer;
+} scrypt_hmac_state;
+
+
+static void
+scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
+ scrypt_hash_state st;
+ scrypt_hash_init(&st);
+ scrypt_hash_update(&st, m, mlen);
+ scrypt_hash_finish(&st, hash);
+}
+
+/* hmac */
+static void
+scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
+ uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
+ size_t i;
+
+ scrypt_hash_init(&st->inner);
+ scrypt_hash_init(&st->outer);
+
+ if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
+ /* use the key directly if it's <= blocksize bytes */
+ memcpy(pad, key, keylen);
+ } else {
+ /* if it's > blocksize bytes, hash it */
+ scrypt_hash(pad, key, keylen);
+ }
+
+ /* inner = (key ^ 0x36) */
+ /* h(inner || ...) */
+ for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
+ pad[i] ^= 0x36;
+ scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);
+
+ /* outer = (key ^ 0x5c) */
+ /* h(outer || ...) */
+ for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
+ pad[i] ^= (0x5c ^ 0x36);
+ scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
+}
+
+static void
+scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
+ /* h(inner || m...) */
+ scrypt_hash_update(&st->inner, m, mlen);
+}
+
+static void
+scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
+ /* h(inner || m) */
+ scrypt_hash_digest innerhash;
+ scrypt_hash_finish(&st->inner, innerhash);
+
+ /* h(outer || h(inner || m)) */
+ scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
+ scrypt_hash_finish(&st->outer, mac);
+}
+
+/*
+ * Special version where N = 1
+ * - mikaelh
+ */
+static void
+scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) {
+ scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
+ scrypt_hash_digest ti, u;
+ uint8_t be[4];
+ uint32_t i, /*j,*/ blocks;
+// uint64_t c;
+
+ /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
+
+ /* hmac(password, ...) */
+ scrypt_hmac_init(&hmac_pw, password, password_len);
+
+ /* hmac(password, salt...) */
+ hmac_pw_salt = hmac_pw;
+ scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
+
+ blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
+ for (i = 1; i <= blocks; i++) {
+ /* U1 = hmac(password, salt || be(i)) */
+ U32TO8_BE(be, i);
+ work = hmac_pw_salt;
+ scrypt_hmac_update(&work, be, 4);
+ scrypt_hmac_finish(&work, ti);
+ memcpy(u, ti, sizeof(u));
+
+ memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
+ out += SCRYPT_HASH_DIGEST_SIZE;
+ bytes -= SCRYPT_HASH_DIGEST_SIZE;
+ }
+}
+
+// ---------------------------- END PBKDF2 functions ------------------------------------
+
+static void
+scrypt_fatal_error_default(const char *msg) {
+ fprintf(stderr, "%s\n", msg);
+ exit(1);
+}
+
+static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
+
+void
+scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) {
+ scrypt_fatal_error = fn;
+}
+
+typedef struct scrypt_aligned_alloc_t {
+ uint8_t *mem, *ptr;
+} scrypt_aligned_alloc;
+
+#if defined(SCRYPT_TEST_SPEED)
+static uint8_t *mem_base = (uint8_t *)0;
+static size_t mem_bump = 0;
+
+/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
+static scrypt_aligned_alloc
+scrypt_alloc(uint64_t size) {
+ scrypt_aligned_alloc aa;
+ if (!mem_base) {
+ mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));
+ if (!mem_base)
+ scrypt_fatal_error("scrypt: out of memory");
+ mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
+ }
+ aa.mem = mem_base + mem_bump;
+ aa.ptr = aa.mem;
+ mem_bump += (size_t)size;
+ return aa;
+}
+
+static void
+scrypt_free(scrypt_aligned_alloc *aa) {
+ mem_bump = 0;
+}
+#else
+static scrypt_aligned_alloc
+scrypt_alloc(uint64_t size) {
+ static const size_t max_alloc = (size_t)-1;
+ scrypt_aligned_alloc aa;
+ size += (SCRYPT_BLOCK_BYTES - 1);
+ if (size > max_alloc)
+ scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
+ aa.mem = (uint8_t *)malloc((size_t)size);
+ aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
+ if (!aa.mem)
+ scrypt_fatal_error("scrypt: out of memory");
+ return aa;
+}
+
+static void
+scrypt_free(scrypt_aligned_alloc *aa) {
+ free(aa->mem);
+}
+#endif
+
+
+// yacoin: increasing Nfactor gradually
+unsigned char GetNfactor(unsigned int nTimestamp) {
+ int l = 0;
+
+ unsigned int Nfactor = 0;
+
+ // Yacoin defaults
+ unsigned int Ntimestamp = 1367991200;
+ unsigned int minN = 4;
+ unsigned int maxN = 30;
+
+ if (strlen(jane_params) > 0) {
+ if (!strcmp(jane_params, "YAC") || !strcasecmp(jane_params, "Yacoin")) {} // No-Op
+ //
+ // NO WARRANTY FOR CORRECTNESS. Look for the int64 nChainStartTime constant
+ // in the src/main.cpp file of the official wallet clients as well as the
+ // const unsigned char minNfactor and const unsigned char maxNfactor
+ //
+ else if (!strcmp(jane_params, "YBC") || !strcasecmp(jane_params, "YBCoin")) {
+ // YBCoin: 1372386273, minN: 4, maxN: 30
+ Ntimestamp = 1372386273; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "ZZC") || !strcasecmp(jane_params, "ZZCoin")) {
+ // ZZCoin: 1375817223, minN: 12, maxN: 30
+ Ntimestamp = 1375817223; minN= 12; maxN= 30;
+ } else if (!strcmp(jane_params, "FEC") || !strcasecmp(jane_params, "FreeCoin")) {
+ // FreeCoin: 1375801200, minN: 6, maxN: 32
+ Ntimestamp = 1375801200; minN= 6; maxN= 32;
+ } else if (!strcmp(jane_params, "ONC") || !strcasecmp(jane_params, "OneCoin")) {
+ // OneCoin: 1371119462, minN: 6, maxN: 30
+ Ntimestamp = 1371119462; minN= 6; maxN= 30;
+ } else if (!strcmp(jane_params, "QQC") || !strcasecmp(jane_params, "QQCoin")) {
+ // QQCoin: 1387769316, minN: 4, maxN: 30
+ Ntimestamp = 1387769316; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "GPL") || !strcasecmp(jane_params, "GoldPressedLatinum")) {
+ // GoldPressedLatinum:1377557832, minN: 4, maxN: 30
+ Ntimestamp = 1377557832; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "MRC") || !strcasecmp(jane_params, "MicroCoin")) {
+ // MicroCoin:1389028879, minN: 4, maxN: 30
+ Ntimestamp = 1389028879; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "APC") || !strcasecmp(jane_params, "AppleCoin")) {
+ // AppleCoin:1384720832, minN: 4, maxN: 30
+ Ntimestamp = 1384720832; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "CPR") || !strcasecmp(jane_params, "Copperbars")) {
+ // Copperbars:1376184687, minN: 4, maxN: 30
+ Ntimestamp = 1376184687; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "CACH") || !strcasecmp(jane_params, "CacheCoin")) {
+ // CacheCoin:1388949883, minN: 4, maxN: 30
+ Ntimestamp = 1388949883; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "UTC") || !strcasecmp(jane_params, "UltraCoin")) {
+ // UltraCoin:1388361600, minN: 4, maxN: 30
+ Ntimestamp = 1388361600; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "VEL") || !strcasecmp(jane_params, "VelocityCoin")) {
+ // VelocityCoin:1387769316, minN: 4, maxN: 30
+ Ntimestamp = 1387769316; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "ITC") || !strcasecmp(jane_params, "InternetCoin")) {
+ // InternetCoin:1388385602, minN: 4, maxN: 30
+ Ntimestamp = 1388385602; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "RAD") || !strcasecmp(jane_params, "RadioactiveCoin")) {
+ // RadioactiveCoin:1389196388, minN: 4, maxN: 30
+ Ntimestamp = 1389196388; minN= 4; maxN= 30;
+ } else {
+ if (sscanf(jane_params, "%u,%u,%u", &Ntimestamp, &minN, &maxN) != 3)
+ if (sscanf(jane_params, "%u", &Nfactor) == 1) return Nfactor; // skip bounding against minN, maxN
+ else applog(LOG_INFO, "Unable to parse scrypt-jane parameters: '%s'. Defaulting to Yacoin.", jane_params);
+ }
+ }
+ // determination based on the constants determined above
+ if (nTimestamp <= Ntimestamp)
+ return minN;
+
+ unsigned long int s = nTimestamp - Ntimestamp;
+ while ((s >> 1) > 3) {
+ l += 1;
+ s >>= 1;
+ }
+
+ s &= 3;
+
+ int n = (l * 170 + s * 25 - 2320) / 100;
+
+ if (n < 0) n = 0;
+
+ if (n > 255)
+ printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n);
+
+ Nfactor = n;
+ if (Nfactor < minN) return minN;
+ if (Nfactor > maxN) return maxN;
+ return Nfactor;
+}
+
+#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
+ | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+static int s_Nfactor = 0;
+
+int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf,
+ uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end)
+{
+ const uint32_t Htarg = ptarget[7];
+
+ if (s_Nfactor == 0 && strlen(jane_params) > 0)
+ applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params);
+
+ int Nfactor = GetNfactor(bswap_32x4(pdata[17]));
+ if (Nfactor > scrypt_maxN) {
+ scrypt_fatal_error("scrypt: N out of range");
+ }
+
+ if (Nfactor != s_Nfactor)
+ {
+ // all of this isn't very thread-safe...
+ opt_nfactor = (1 << (Nfactor + 1));
+
+ applog(LOG_INFO, "Nfactor is %d (N=%d)!", Nfactor, opt_nfactor);
+
+ if (s_Nfactor != 0) {
+ // handle N-factor increase at runtime
+ // by adjusting the lookup_gap by factor 2
+ if (s_Nfactor == Nfactor-1)
+ for (int i=0; i < 8; ++i)
+ device_lookup_gap[i] *= 2;
+ }
+ s_Nfactor = Nfactor;
+ }
+
+ int throughput = cuda_throughput(thr_id);
+
+ if(throughput == 0)
+ return -1;
+
+ gettimeofday(tv_start, NULL);
+
+ uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] };
+ uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) };
+
+ uint32_t n = pdata[19];
+
+ /* byte swap pdata into data[0]/[1] arrays */
+ for (int k=0; k<2; ++k) {
+ for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]);
+ for(int i=1;i 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput);
+ }
+#endif
+ } else {
+ n += throughput;
+
+ cuda_scrypt_serialize(thr_id, nxt);
+ pre_keccak512(thr_id, nxt, nonce[nxt], throughput);
+ cuda_scrypt_core(thr_id, nxt, opt_nfactor);
+
+ cuda_scrypt_flush(thr_id, nxt);
+
+ post_keccak512(thr_id, nxt, nonce[nxt], throughput);
+ cuda_scrypt_done(thr_id, nxt);
+
+ cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
+
+ if(!cuda_scrypt_sync(thr_id, cur))
+ {
+ return -1;
+ }
+ }
+
+ if(iteration > 0)
+ {
+ for(int i=0;i
-#include <stdlib.h>
-#include <string.h>
-
-static const uint32_t keypad[12] = {
- 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
-};
-static const uint32_t innerpad[11] = {
- 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
-};
-static const uint32_t outerpad[8] = {
- 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
-};
-static const uint32_t finalblk[16] = {
- 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
-};
-
-static inline void HMAC_SHA256_80_init(const uint32_t *key,
- uint32_t *tstate, uint32_t *ostate)
-{
- uint32_t ihash[8];
- uint32_t pad[16];
- int i;
-
- /* tstate is assumed to contain the midstate of key */
- memcpy(pad, key + 16, 16);
- memcpy(pad + 4, keypad, 48);
- sha256_transform(tstate, pad, 0);
- memcpy(ihash, tstate, 32);
-
- sha256_init(ostate);
- for (i = 0; i < 8; i++)
- pad[i] = ihash[i] ^ 0x5c5c5c5c;
- for (; i < 16; i++)
- pad[i] = 0x5c5c5c5c;
- sha256_transform(ostate, pad, 0);
-
- sha256_init(tstate);
- for (i = 0; i < 8; i++)
- pad[i] = ihash[i] ^ 0x36363636;
- for (; i < 16; i++)
- pad[i] = 0x36363636;
- sha256_transform(tstate, pad, 0);
-}
-
-static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
- const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t istate[8], ostate2[8];
- uint32_t ibuf[16], obuf[16];
- int i, j;
-
- memcpy(istate, tstate, 32);
- sha256_transform(istate, salt, 0);
-
- memcpy(ibuf, salt + 16, 16);
- memcpy(ibuf + 5, innerpad, 44);
- memcpy(obuf + 8, outerpad, 32);
-
- for (i = 0; i < 4; i++) {
- memcpy(obuf, istate, 32);
- ibuf[4] = i + 1;
- sha256_transform(obuf, ibuf, 0);
-
- memcpy(ostate2, ostate, 32);
- sha256_transform(ostate2, obuf, 0);
- for (j = 0; j < 8; j++)
- output[8 * i + j] = swab32(ostate2[j]);
- }
-}
-
-static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
- const uint32_t *salt, uint32_t *output)
-{
- uint32_t buf[16];
- int i;
-
- sha256_transform(tstate, salt, 1);
- sha256_transform(tstate, salt + 16, 1);
- sha256_transform(tstate, finalblk, 0);
- memcpy(buf, tstate, 32);
- memcpy(buf + 8, outerpad, 32);
-
- sha256_transform(ostate, buf, 0);
- for (i = 0; i < 8; i++)
- output[i] = swab32(ostate[i]);
-}
-
-
-#if HAVE_SHA256_4WAY
-
-static const uint32_t keypad_4way[4 * 12] = {
- 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000280, 0x00000280, 0x00000280, 0x00000280
-};
-static const uint32_t innerpad_4way[4 * 11] = {
- 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
-};
-static const uint32_t outerpad_4way[4 * 8] = {
- 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000300, 0x00000300, 0x00000300, 0x00000300
-};
-static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = {
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000620, 0x00000620, 0x00000620, 0x00000620
-};
-
-static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
- uint32_t *tstate, uint32_t *ostate)
-{
- uint32_t ihash[4 * 8] __attribute__((aligned(16)));
- uint32_t pad[4 * 16] __attribute__((aligned(16)));
- int i;
-
- /* tstate is assumed to contain the midstate of key */
- memcpy(pad, key + 4 * 16, 4 * 16);
- memcpy(pad + 4 * 4, keypad_4way, 4 * 48);
- sha256_transform_4way(tstate, pad, 0);
- memcpy(ihash, tstate, 4 * 32);
-
- sha256_init_4way(ostate);
- for (i = 0; i < 4 * 8; i++)
- pad[i] = ihash[i] ^ 0x5c5c5c5c;
- for (; i < 4 * 16; i++)
- pad[i] = 0x5c5c5c5c;
- sha256_transform_4way(ostate, pad, 0);
-
- sha256_init_4way(tstate);
- for (i = 0; i < 4 * 8; i++)
- pad[i] = ihash[i] ^ 0x36363636;
- for (; i < 4 * 16; i++)
- pad[i] = 0x36363636;
- sha256_transform_4way(tstate, pad, 0);
-}
-
-static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
- const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t istate[4 * 8] __attribute__((aligned(16)));
- uint32_t ostate2[4 * 8] __attribute__((aligned(16)));
- uint32_t ibuf[4 * 16] __attribute__((aligned(16)));
- uint32_t obuf[4 * 16] __attribute__((aligned(16)));
- int i, j;
-
- memcpy(istate, tstate, 4 * 32);
- sha256_transform_4way(istate, salt, 0);
-
- memcpy(ibuf, salt + 4 * 16, 4 * 16);
- memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44);
- memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32);
-
- for (i = 0; i < 4; i++) {
- memcpy(obuf, istate, 4 * 32);
- ibuf[4 * 4 + 0] = i + 1;
- ibuf[4 * 4 + 1] = i + 1;
- ibuf[4 * 4 + 2] = i + 1;
- ibuf[4 * 4 + 3] = i + 1;
- sha256_transform_4way(obuf, ibuf, 0);
-
- memcpy(ostate2, ostate, 4 * 32);
- sha256_transform_4way(ostate2, obuf, 0);
- for (j = 0; j < 4 * 8; j++)
- output[4 * 8 * i + j] = swab32(ostate2[j]);
- }
-}
-
-static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
- uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t buf[4 * 16] __attribute__((aligned(16)));
- int i;
-
- sha256_transform_4way(tstate, salt, 1);
- sha256_transform_4way(tstate, salt + 4 * 16, 1);
- sha256_transform_4way(tstate, finalblk_4way, 0);
- memcpy(buf, tstate, 4 * 32);
- memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
-
- sha256_transform_4way(ostate, buf, 0);
- for (i = 0; i < 4 * 8; i++)
- output[i] = swab32(ostate[i]);
-}
-
-#endif /* HAVE_SHA256_4WAY */
-
-
-#if HAVE_SHA256_8WAY
-
-static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = {
- 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620
-};
-
-static inline void HMAC_SHA256_80_init_8way(const uint32_t *key,
- uint32_t *tstate, uint32_t *ostate)
-{
- uint32_t ihash[8 * 8] __attribute__((aligned(32)));
- uint32_t pad[8 * 16] __attribute__((aligned(32)));
- int i;
-
- /* tstate is assumed to contain the midstate of key */
- memcpy(pad, key + 8 * 16, 8 * 16);
- for (i = 0; i < 8; i++)
- pad[8 * 4 + i] = 0x80000000;
- memset(pad + 8 * 5, 0x00, 8 * 40);
- for (i = 0; i < 8; i++)
- pad[8 * 15 + i] = 0x00000280;
- sha256_transform_8way(tstate, pad, 0);
- memcpy(ihash, tstate, 8 * 32);
-
- sha256_init_8way(ostate);
- for (i = 0; i < 8 * 8; i++)
- pad[i] = ihash[i] ^ 0x5c5c5c5c;
- for (; i < 8 * 16; i++)
- pad[i] = 0x5c5c5c5c;
- sha256_transform_8way(ostate, pad, 0);
-
- sha256_init_8way(tstate);
- for (i = 0; i < 8 * 8; i++)
- pad[i] = ihash[i] ^ 0x36363636;
- for (; i < 8 * 16; i++)
- pad[i] = 0x36363636;
- sha256_transform_8way(tstate, pad, 0);
-}
-
-static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate,
- const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t istate[8 * 8] __attribute__((aligned(32)));
- uint32_t ostate2[8 * 8] __attribute__((aligned(32)));
- uint32_t ibuf[8 * 16] __attribute__((aligned(32)));
- uint32_t obuf[8 * 16] __attribute__((aligned(32)));
- int i, j;
-
- memcpy(istate, tstate, 8 * 32);
- sha256_transform_8way(istate, salt, 0);
-
- memcpy(ibuf, salt + 8 * 16, 8 * 16);
- for (i = 0; i < 8; i++)
- ibuf[8 * 5 + i] = 0x80000000;
- memset(ibuf + 8 * 6, 0x00, 8 * 36);
- for (i = 0; i < 8; i++)
- ibuf[8 * 15 + i] = 0x000004a0;
-
- for (i = 0; i < 8; i++)
- obuf[8 * 8 + i] = 0x80000000;
- memset(obuf + 8 * 9, 0x00, 8 * 24);
- for (i = 0; i < 8; i++)
- obuf[8 * 15 + i] = 0x00000300;
-
- for (i = 0; i < 4; i++) {
- memcpy(obuf, istate, 8 * 32);
- ibuf[8 * 4 + 0] = i + 1;
- ibuf[8 * 4 + 1] = i + 1;
- ibuf[8 * 4 + 2] = i + 1;
- ibuf[8 * 4 + 3] = i + 1;
- ibuf[8 * 4 + 4] = i + 1;
- ibuf[8 * 4 + 5] = i + 1;
- ibuf[8 * 4 + 6] = i + 1;
- ibuf[8 * 4 + 7] = i + 1;
- sha256_transform_8way(obuf, ibuf, 0);
-
- memcpy(ostate2, ostate, 8 * 32);
- sha256_transform_8way(ostate2, obuf, 0);
- for (j = 0; j < 8 * 8; j++)
- output[8 * 8 * i + j] = swab32(ostate2[j]);
- }
-}
-
-static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
- uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t buf[8 * 16] __attribute__((aligned(32)));
- int i;
-
- sha256_transform_8way(tstate, salt, 1);
- sha256_transform_8way(tstate, salt + 8 * 16, 1);
- sha256_transform_8way(tstate, finalblk_8way, 0);
-
- memcpy(buf, tstate, 8 * 32);
- for (i = 0; i < 8; i++)
- buf[8 * 8 + i] = 0x80000000;
- memset(buf + 8 * 9, 0x00, 8 * 24);
- for (i = 0; i < 8; i++)
- buf[8 * 15 + i] = 0x00000300;
- sha256_transform_8way(ostate, buf, 0);
-
- for (i = 0; i < 8 * 8; i++)
- output[i] = swab32(ostate[i]);
-}
-
-#endif /* HAVE_SHA256_8WAY */
-
-
-#if defined(__x86_64__)
-
-#define SCRYPT_MAX_WAYS 1
-#define HAVE_SCRYPT_3WAY 0
-#define scrypt_best_throughput() 1
-static void scrypt_core(uint32_t *X, uint32_t *V);
-void scrypt_core_3way(uint32_t *X, uint32_t *V);
-#if defined(USE_AVX2)
-#undef SCRYPT_MAX_WAYS
-#define SCRYPT_MAX_WAYS 21
-#define HAVE_SCRYPT_6WAY 0
-void scrypt_core_6way(uint32_t *X, uint32_t *V);
-#endif
-
-#elif defined(__i386__)
-
-#define SCRYPT_MAX_WAYS 1
-#define scrypt_best_throughput() 1
-static void scrypt_core(uint32_t *X, uint32_t *V);
-
-#elif defined(__arm__) && defined(__APCS_32__)
-
-static void scrypt_core(uint32_t *X, uint32_t *V);
-#if defined(__ARM_NEON__)
-#undef HAVE_SHA256_4WAY
-#define SCRYPT_MAX_WAYS 1
-#define HAVE_SCRYPT_3WAY 0
-#define scrypt_best_throughput() 1
-void scrypt_core_3way(uint32_t *X, uint32_t *V);
-#endif
-
-#endif
-
-static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
-{
- uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
- int i;
-
- x00 = (B[ 0] ^= Bx[ 0]);
- x01 = (B[ 1] ^= Bx[ 1]);
- x02 = (B[ 2] ^= Bx[ 2]);
- x03 = (B[ 3] ^= Bx[ 3]);
- x04 = (B[ 4] ^= Bx[ 4]);
- x05 = (B[ 5] ^= Bx[ 5]);
- x06 = (B[ 6] ^= Bx[ 6]);
- x07 = (B[ 7] ^= Bx[ 7]);
- x08 = (B[ 8] ^= Bx[ 8]);
- x09 = (B[ 9] ^= Bx[ 9]);
- x10 = (B[10] ^= Bx[10]);
- x11 = (B[11] ^= Bx[11]);
- x12 = (B[12] ^= Bx[12]);
- x13 = (B[13] ^= Bx[13]);
- x14 = (B[14] ^= Bx[14]);
- x15 = (B[15] ^= Bx[15]);
- for (i = 0; i < 8; i += 2) {
-#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
- /* Operate on columns. */
- x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7);
- x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7);
-
- x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9);
- x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9);
-
- x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13);
- x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13);
-
- x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18);
- x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18);
-
- /* Operate on rows. */
- x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7);
- x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7);
-
- x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9);
- x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9);
-
- x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13);
- x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13);
-
- x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18);
- x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18);
-#undef R
- }
- B[ 0] += x00;
- B[ 1] += x01;
- B[ 2] += x02;
- B[ 3] += x03;
- B[ 4] += x04;
- B[ 5] += x05;
- B[ 6] += x06;
- B[ 7] += x07;
- B[ 8] += x08;
- B[ 9] += x09;
- B[10] += x10;
- B[11] += x11;
- B[12] += x12;
- B[13] += x13;
- B[14] += x14;
- B[15] += x15;
-}
-
-static inline void scrypt_core(uint32_t *X, uint32_t *V)
-{
- uint32_t i, j, k;
-
- for (i = 0; i < 1024; i++) {
- memcpy(&V[i * 32], X, 128);
- xor_salsa8(&X[0], &X[16]);
- xor_salsa8(&X[16], &X[0]);
- }
- for (i = 0; i < 1024; i++) {
- j = 32 * (X[16] & 1023);
- for (k = 0; k < 32; k++)
- X[k] ^= V[j + k];
- xor_salsa8(&X[0], &X[16]);
- xor_salsa8(&X[16], &X[0]);
- }
-}
-
-#ifndef SCRYPT_MAX_WAYS
-#define SCRYPT_MAX_WAYS 1
-#define scrypt_best_throughput() 1
-#endif
-
-#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)
-
-unsigned char *scrypt_buffer_alloc()
-{
- return (unsigned char *)malloc(SCRYPT_BUFFER_SIZE);
-}
-
-static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
- uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[8], ostate[8];
- uint32_t X[32];
- uint32_t *V;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- memcpy(tstate, midstate, 32);
- HMAC_SHA256_80_init(input, tstate, ostate);
- PBKDF2_SHA256_80_128(tstate, ostate, input, X);
-
- scrypt_core(X, V);
-
- PBKDF2_SHA256_128_32(tstate, ostate, X, output);
-}
-
-#if HAVE_SHA256_4WAY
-static void scrypt_1024_1_1_256_4way(const uint32_t *input,
- uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[4 * 8] __attribute__((aligned(128)));
- uint32_t ostate[4 * 8] __attribute__((aligned(128)));
- uint32_t W[4 * 32] __attribute__((aligned(128)));
- uint32_t X[4 * 32] __attribute__((aligned(128)));
- uint32_t *V;
- int i, k;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- for (i = 0; i < 20; i++)
- for (k = 0; k < 4; k++)
- W[4 * i + k] = input[k * 20 + i];
- for (i = 0; i < 8; i++)
- for (k = 0; k < 4; k++)
- tstate[4 * i + k] = midstate[i];
- HMAC_SHA256_80_init_4way(W, tstate, ostate);
- PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
- for (i = 0; i < 32; i++)
- for (k = 0; k < 4; k++)
- X[k * 32 + i] = W[4 * i + k];
- scrypt_core(X + 0 * 32, V);
- scrypt_core(X + 1 * 32, V);
- scrypt_core(X + 2 * 32, V);
- scrypt_core(X + 3 * 32, V);
- for (i = 0; i < 32; i++)
- for (k = 0; k < 4; k++)
- W[4 * i + k] = X[k * 32 + i];
- PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
- for (i = 0; i < 8; i++)
- for (k = 0; k < 4; k++)
- output[k * 8 + i] = W[4 * i + k];
-}
-#endif /* HAVE_SHA256_4WAY */
-
-#if HAVE_SCRYPT_3WAY
-
-static void scrypt_1024_1_1_256_3way(const uint32_t *input,
- uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[3 * 8], ostate[3 * 8];
- uint32_t X[3 * 32] __attribute__((aligned(64)));
- uint32_t *V;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- memcpy(tstate + 0, midstate, 32);
- memcpy(tstate + 8, midstate, 32);
- memcpy(tstate + 16, midstate, 32);
- HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0);
- HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8);
- HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16);
- PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0);
- PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32);
- PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64);
-
- scrypt_core_3way(X, V);
-
- PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0);
- PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8);
- PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16);
-}
-
-#if HAVE_SHA256_4WAY
-static void scrypt_1024_1_1_256_12way(const uint32_t *input,
- uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[12 * 8] __attribute__((aligned(128)));
- uint32_t ostate[12 * 8] __attribute__((aligned(128)));
- uint32_t W[12 * 32] __attribute__((aligned(128)));
- uint32_t X[12 * 32] __attribute__((aligned(128)));
- uint32_t *V;
- int i, j, k;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- for (j = 0; j < 3; j++)
- for (i = 0; i < 20; i++)
- for (k = 0; k < 4; k++)
- W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i];
- for (j = 0; j < 3; j++)
- for (i = 0; i < 8; i++)
- for (k = 0; k < 4; k++)
- tstate[32 * j + 4 * i + k] = midstate[i];
- HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0);
- HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32);
- HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64);
- PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0);
- PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128);
- PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 32; i++)
- for (k = 0; k < 4; k++)
- X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];
- scrypt_core_3way(X + 0 * 96, V);
- scrypt_core_3way(X + 1 * 96, V);
- scrypt_core_3way(X + 2 * 96, V);
- scrypt_core_3way(X + 3 * 96, V);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 32; i++)
- for (k = 0; k < 4; k++)
- W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i];
- PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0);
- PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128);
- PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 8; i++)
- for (k = 0; k < 4; k++)
- output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k];
-}
-#endif /* HAVE_SHA256_4WAY */
-
-#endif /* HAVE_SCRYPT_3WAY */
-
-#if HAVE_SCRYPT_6WAY
-static void scrypt_1024_1_1_256_24way(const uint32_t *input,
- uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[24 * 8] __attribute__((aligned(128)));
- uint32_t ostate[24 * 8] __attribute__((aligned(128)));
- uint32_t W[24 * 32] __attribute__((aligned(128)));
- uint32_t X[24 * 32] __attribute__((aligned(128)));
- uint32_t *V;
- int i, j, k;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- for (j = 0; j < 3; j++)
- for (i = 0; i < 20; i++)
- for (k = 0; k < 8; k++)
- W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i];
- for (j = 0; j < 3; j++)
- for (i = 0; i < 8; i++)
- for (k = 0; k < 8; k++)
- tstate[8 * 8 * j + 8 * i + k] = midstate[i];
- HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0);
- HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64);
- HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128);
- PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0);
- PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256);
- PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 32; i++)
- for (k = 0; k < 8; k++)
- X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
- scrypt_core_6way(X + 0 * 32, V);
- scrypt_core_6way(X + 6 * 32, V);
- scrypt_core_6way(X + 12 * 32, V);
- scrypt_core_6way(X + 18 * 32, V);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 32; i++)
- for (k = 0; k < 8; k++)
- W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i];
- PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0);
- PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256);
- PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 8; i++)
- for (k = 0; k < 8; k++)
- output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k];
-}
-#endif /* HAVE_SCRYPT_6WAY */
-
-int scanhash_scrypt(int thr_id, uint32_t *pdata,
- unsigned char *scratchbuf, const uint32_t *ptarget,
- uint32_t max_nonce, unsigned long *hashes_done)
-{
- uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
- uint32_t midstate[8];
- uint32_t n = pdata[19] - 1;
- const uint32_t Htarg = ptarget[7];
- uint32_t throughput = scrypt_best_throughput();
- uint32_t i;
-
-#if HAVE_SHA256_4WAY
- if (sha256_use_4way())
- throughput *= 4;
-#endif
-
- for (i = 0; i < throughput; i++)
- memcpy(data + i * 20, pdata, 80);
-
- sha256_init(midstate);
- sha256_transform(midstate, data, 0);
-
- do {
- for (i = 0; i < throughput; i++)
- data[i * 20 + 19] = ++n;
-
-#if HAVE_SHA256_4WAY
- if (throughput == 4)
- scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf);
- else
-#endif
-#if HAVE_SCRYPT_3WAY && HAVE_SHA256_4WAY
- if (throughput == 12)
- scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
- else
-#endif
-#if HAVE_SCRYPT_6WAY
- if (throughput == 24)
- scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
- else
-#endif
-#if HAVE_SCRYPT_3WAY
- if (throughput == 3)
- scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
- else
-#endif
- scrypt_1024_1_1_256(data, hash, midstate, scratchbuf);
-
- for (i = 0; i < throughput; i++) {
- if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {
- *hashes_done = n - pdata[19] + 1;
- pdata[19] = data[i * 20 + 19];
- return 1;
- }
- }
- } while (n < max_nonce && !work_restart[thr_id].restart);
-
- *hashes_done = n - pdata[19] + 1;
- pdata[19] = n;
- return 0;
-}
diff --git a/scrypt.cpp b/scrypt.cpp
new file mode 100644
index 0000000000..f7a3422c32
--- /dev/null
+++ b/scrypt.cpp
@@ -0,0 +1,1097 @@
+/*
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#ifdef WIN32
+#include
+using namespace Concurrency;
+#else
+#include
+#endif
+
+#include "miner.h"
+#include "scrypt/salsa_kernel.h"
+#include "scrypt/sha256.h"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+// A thin wrapper around the builtin __m128i type
+class uint32x4_t
+{
+public:
+#if WIN32
+ void * operator new(size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); }
+ void operator delete(void *p) { _aligned_free(p); }
+ void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); }
+ void operator delete[](void *p) { _aligned_free(p); }
+#else
+ void * operator new(size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
+ void operator delete(void *p) { free(p); }
+ void * operator new[](size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
+ void operator delete[](void *p) { free(p); }
+#endif
+ uint32x4_t() { };
+ uint32x4_t(const __m128i init) { val = init; }
+ uint32x4_t(const uint32_t init) { val = _mm_set1_epi32((int)init); }
+ uint32x4_t(const uint32_t a, const uint32_t b, const uint32_t c, const uint32_t d) { val = _mm_setr_epi32((int)a,(int)b,(int)c,(int)d); }
+ inline operator const __m128i() const { return val; }
+ inline const uint32x4_t operator+(const uint32x4_t &other) const { return _mm_add_epi32(val, other); }
+ inline const uint32x4_t operator+(const uint32_t other) const { return _mm_add_epi32(val, _mm_set1_epi32((int)other)); }
+ inline uint32x4_t& operator+=(const uint32x4_t other) { val = _mm_add_epi32(val, other); return *this; }
+ inline uint32x4_t& operator+=(const uint32_t other) { val = _mm_add_epi32(val, _mm_set1_epi32((int)other)); return *this; }
+ inline const uint32x4_t operator&(const uint32_t other) const { return _mm_and_si128(val, _mm_set1_epi32((int)other)); }
+ inline const uint32x4_t operator&(const uint32x4_t &other) const { return _mm_and_si128(val, other); }
+ inline const uint32x4_t operator|(const uint32x4_t &other) const { return _mm_or_si128(val, other); }
+ inline const uint32x4_t operator^(const uint32x4_t &other) const { return _mm_xor_si128(val, other); }
+ inline const uint32x4_t operator<<(const int num) const { return _mm_slli_epi32(val, num); }
+ inline const uint32x4_t operator>>(const int num) const { return _mm_srli_epi32(val, num); }
+ inline const uint32_t operator[](const int num) const { return ((uint32_t*)&val)[num]; }
+ protected:
+ __m128i val;
+};
+
+// non-member overload
+inline const uint32x4_t operator+(const uint32_t left, const uint32x4_t &right) { return _mm_add_epi32(_mm_set1_epi32((int)left), right); }
+
+
+//
+// Code taken from sha2.cpp and vectorized, with minimal changes where required
+// Not all subroutines are actually used.
+//
+
+#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
+ | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+static __inline uint32x4_t swab32x4(const uint32x4_t &v)
+{
+ return bswap_32x4(v);
+}
+
+static const uint32_t sha256_h[8] = {
+ 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+};
+
+static const uint32_t sha256_k[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+void sha256_initx4(uint32x4_t *statex4)
+{
+ for (int i=0; i<8; ++i)
+ statex4[i] = sha256_h[i];
+}
+
+/* Elementary functions used by SHA256 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
+#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
+#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))
+
+/* SHA256 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+ do { \
+ t0 = h + S1(e) + Ch(e, f, g) + k; \
+ t1 = S0(a) + Maj(a, b, c); \
+ d += t0; \
+ h = t0 + t1; \
+ } while (0)
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i) \
+ RND(S[(64 - i) % 8], S[(65 - i) % 8], \
+ S[(66 - i) % 8], S[(67 - i) % 8], \
+ S[(68 - i) % 8], S[(69 - i) % 8], \
+ S[(70 - i) % 8], S[(71 - i) % 8], \
+ W[i] + sha256_k[i])
+
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+void sha256_transformx4(uint32x4_t *state, const uint32x4_t *block, int swap)
+{
+ uint32x4_t W[64];
+ uint32x4_t S[8];
+ uint32x4_t t0, t1;
+ int i;
+
+ /* 1. Prepare message schedule W. */
+ if (swap) {
+ for (i = 0; i < 16; i++)
+ W[i] = swab32x4(block[i]);
+ } else
+ memcpy(W, block, 4*64);
+ for (i = 16; i < 64; i += 2) {
+ W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+ W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
+ }
+
+ /* 2. Initialize working variables. */
+ memcpy(S, state, 4*32);
+
+ /* 3. Mix. */
+ RNDr(S, W, 0);
+ RNDr(S, W, 1);
+ RNDr(S, W, 2);
+ RNDr(S, W, 3);
+ RNDr(S, W, 4);
+ RNDr(S, W, 5);
+ RNDr(S, W, 6);
+ RNDr(S, W, 7);
+ RNDr(S, W, 8);
+ RNDr(S, W, 9);
+ RNDr(S, W, 10);
+ RNDr(S, W, 11);
+ RNDr(S, W, 12);
+ RNDr(S, W, 13);
+ RNDr(S, W, 14);
+ RNDr(S, W, 15);
+ RNDr(S, W, 16);
+ RNDr(S, W, 17);
+ RNDr(S, W, 18);
+ RNDr(S, W, 19);
+ RNDr(S, W, 20);
+ RNDr(S, W, 21);
+ RNDr(S, W, 22);
+ RNDr(S, W, 23);
+ RNDr(S, W, 24);
+ RNDr(S, W, 25);
+ RNDr(S, W, 26);
+ RNDr(S, W, 27);
+ RNDr(S, W, 28);
+ RNDr(S, W, 29);
+ RNDr(S, W, 30);
+ RNDr(S, W, 31);
+ RNDr(S, W, 32);
+ RNDr(S, W, 33);
+ RNDr(S, W, 34);
+ RNDr(S, W, 35);
+ RNDr(S, W, 36);
+ RNDr(S, W, 37);
+ RNDr(S, W, 38);
+ RNDr(S, W, 39);
+ RNDr(S, W, 40);
+ RNDr(S, W, 41);
+ RNDr(S, W, 42);
+ RNDr(S, W, 43);
+ RNDr(S, W, 44);
+ RNDr(S, W, 45);
+ RNDr(S, W, 46);
+ RNDr(S, W, 47);
+ RNDr(S, W, 48);
+ RNDr(S, W, 49);
+ RNDr(S, W, 50);
+ RNDr(S, W, 51);
+ RNDr(S, W, 52);
+ RNDr(S, W, 53);
+ RNDr(S, W, 54);
+ RNDr(S, W, 55);
+ RNDr(S, W, 56);
+ RNDr(S, W, 57);
+ RNDr(S, W, 58);
+ RNDr(S, W, 59);
+ RNDr(S, W, 60);
+ RNDr(S, W, 61);
+ RNDr(S, W, 62);
+ RNDr(S, W, 63);
+
+ /* 4. Mix local working variables into global state */
+ for (i = 0; i < 8; i++)
+ state[i] += S[i];
+}
+
+static const uint32_t sha256d_hash1[16] = {
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x80000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000100
+};
+
+static void sha256dx4(uint32x4_t *hash, uint32x4_t *data)
+{
+ uint32x4_t S[16];
+
+ sha256_initx4(S);
+ sha256_transformx4(S, data, 0);
+ sha256_transformx4(S, data + 16, 0);
+ for (int i=8; i<16; ++i)
+ S[i] = sha256d_hash1[i];
+ sha256_initx4(hash);
+ sha256_transformx4(hash, S, 0);
+}
+
+static inline void sha256d_preextendx4(uint32x4_t *W)
+{
+ W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
+ W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
+ W[18] = s1(W[16]) + W[11] + W[ 2];
+ W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
+ W[20] = W[13] + s0(W[ 5]) + W[ 4];
+ W[21] = W[14] + s0(W[ 6]) + W[ 5];
+ W[22] = W[15] + s0(W[ 7]) + W[ 6];
+ W[23] = W[16] + s0(W[ 8]) + W[ 7];
+ W[24] = W[17] + s0(W[ 9]) + W[ 8];
+ W[25] = s0(W[10]) + W[ 9];
+ W[26] = s0(W[11]) + W[10];
+ W[27] = s0(W[12]) + W[11];
+ W[28] = s0(W[13]) + W[12];
+ W[29] = s0(W[14]) + W[13];
+ W[30] = s0(W[15]) + W[14];
+ W[31] = s0(W[16]) + W[15];
+}
+
+static inline void sha256d_prehashx4(uint32x4_t *S, const uint32x4_t *W)
+{
+ uint32x4_t t0, t1;
+ RNDr(S, W, 0);
+ RNDr(S, W, 1);
+ RNDr(S, W, 2);
+}
+
+static inline void sha256d_msx4(uint32x4_t *hash, uint32x4_t *W,
+ const uint32_t *midstate, const uint32_t *prehash)
+{
+ uint32x4_t S[64];
+ uint32x4_t t0, t1;
+ int i;
+
+ S[18] = W[18];
+ S[19] = W[19];
+ S[20] = W[20];
+ S[22] = W[22];
+ S[23] = W[23];
+ S[24] = W[24];
+ S[30] = W[30];
+ S[31] = W[31];
+
+ W[18] += s0(W[3]);
+ W[19] += W[3];
+ W[20] += s1(W[18]);
+ W[21] = s1(W[19]);
+ W[22] += s1(W[20]);
+ W[23] += s1(W[21]);
+ W[24] += s1(W[22]);
+ W[25] = s1(W[23]) + W[18];
+ W[26] = s1(W[24]) + W[19];
+ W[27] = s1(W[25]) + W[20];
+ W[28] = s1(W[26]) + W[21];
+ W[29] = s1(W[27]) + W[22];
+ W[30] += s1(W[28]) + W[23];
+ W[31] += s1(W[29]) + W[24];
+ for (i = 32; i < 64; i += 2) {
+ W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+ W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
+ }
+
+ for (i=0; i<8; ++i)
+ S[i] = prehash[i];
+
+ RNDr(S, W, 3);
+ RNDr(S, W, 4);
+ RNDr(S, W, 5);
+ RNDr(S, W, 6);
+ RNDr(S, W, 7);
+ RNDr(S, W, 8);
+ RNDr(S, W, 9);
+ RNDr(S, W, 10);
+ RNDr(S, W, 11);
+ RNDr(S, W, 12);
+ RNDr(S, W, 13);
+ RNDr(S, W, 14);
+ RNDr(S, W, 15);
+ RNDr(S, W, 16);
+ RNDr(S, W, 17);
+ RNDr(S, W, 18);
+ RNDr(S, W, 19);
+ RNDr(S, W, 20);
+ RNDr(S, W, 21);
+ RNDr(S, W, 22);
+ RNDr(S, W, 23);
+ RNDr(S, W, 24);
+ RNDr(S, W, 25);
+ RNDr(S, W, 26);
+ RNDr(S, W, 27);
+ RNDr(S, W, 28);
+ RNDr(S, W, 29);
+ RNDr(S, W, 30);
+ RNDr(S, W, 31);
+ RNDr(S, W, 32);
+ RNDr(S, W, 33);
+ RNDr(S, W, 34);
+ RNDr(S, W, 35);
+ RNDr(S, W, 36);
+ RNDr(S, W, 37);
+ RNDr(S, W, 38);
+ RNDr(S, W, 39);
+ RNDr(S, W, 40);
+ RNDr(S, W, 41);
+ RNDr(S, W, 42);
+ RNDr(S, W, 43);
+ RNDr(S, W, 44);
+ RNDr(S, W, 45);
+ RNDr(S, W, 46);
+ RNDr(S, W, 47);
+ RNDr(S, W, 48);
+ RNDr(S, W, 49);
+ RNDr(S, W, 50);
+ RNDr(S, W, 51);
+ RNDr(S, W, 52);
+ RNDr(S, W, 53);
+ RNDr(S, W, 54);
+ RNDr(S, W, 55);
+ RNDr(S, W, 56);
+ RNDr(S, W, 57);
+ RNDr(S, W, 58);
+ RNDr(S, W, 59);
+ RNDr(S, W, 60);
+ RNDr(S, W, 61);
+ RNDr(S, W, 62);
+ RNDr(S, W, 63);
+
+ for (i = 0; i < 8; i++)
+ S[i] += midstate[i];
+
+ W[18] = S[18];
+ W[19] = S[19];
+ W[20] = S[20];
+ W[22] = S[22];
+ W[23] = S[23];
+ W[24] = S[24];
+ W[30] = S[30];
+ W[31] = S[31];
+
+ for (i=8; i<16; ++i)
+ S[i] = sha256d_hash1[i];
+ S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0];
+ S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1];
+ S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2];
+ S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3];
+ S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4];
+ S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5];
+ S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6];
+ S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7];
+ S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8];
+ S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9];
+ S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10];
+ S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11];
+ S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12];
+ S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13];
+ S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14];
+ S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15];
+ for (i = 32; i < 60; i += 2) {
+ S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
+ S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15];
+ }
+ S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44];
+
+ sha256_initx4(hash);
+
+ RNDr(hash, S, 0);
+ RNDr(hash, S, 1);
+ RNDr(hash, S, 2);
+ RNDr(hash, S, 3);
+ RNDr(hash, S, 4);
+ RNDr(hash, S, 5);
+ RNDr(hash, S, 6);
+ RNDr(hash, S, 7);
+ RNDr(hash, S, 8);
+ RNDr(hash, S, 9);
+ RNDr(hash, S, 10);
+ RNDr(hash, S, 11);
+ RNDr(hash, S, 12);
+ RNDr(hash, S, 13);
+ RNDr(hash, S, 14);
+ RNDr(hash, S, 15);
+ RNDr(hash, S, 16);
+ RNDr(hash, S, 17);
+ RNDr(hash, S, 18);
+ RNDr(hash, S, 19);
+ RNDr(hash, S, 20);
+ RNDr(hash, S, 21);
+ RNDr(hash, S, 22);
+ RNDr(hash, S, 23);
+ RNDr(hash, S, 24);
+ RNDr(hash, S, 25);
+ RNDr(hash, S, 26);
+ RNDr(hash, S, 27);
+ RNDr(hash, S, 28);
+ RNDr(hash, S, 29);
+ RNDr(hash, S, 30);
+ RNDr(hash, S, 31);
+ RNDr(hash, S, 32);
+ RNDr(hash, S, 33);
+ RNDr(hash, S, 34);
+ RNDr(hash, S, 35);
+ RNDr(hash, S, 36);
+ RNDr(hash, S, 37);
+ RNDr(hash, S, 38);
+ RNDr(hash, S, 39);
+ RNDr(hash, S, 40);
+ RNDr(hash, S, 41);
+ RNDr(hash, S, 42);
+ RNDr(hash, S, 43);
+ RNDr(hash, S, 44);
+ RNDr(hash, S, 45);
+ RNDr(hash, S, 46);
+ RNDr(hash, S, 47);
+ RNDr(hash, S, 48);
+ RNDr(hash, S, 49);
+ RNDr(hash, S, 50);
+ RNDr(hash, S, 51);
+ RNDr(hash, S, 52);
+ RNDr(hash, S, 53);
+ RNDr(hash, S, 54);
+ RNDr(hash, S, 55);
+ RNDr(hash, S, 56);
+
+ hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5])
+ + S[57] + sha256_k[57];
+ hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4])
+ + S[58] + sha256_k[58];
+ hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3])
+ + S[59] + sha256_k[59];
+ hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2])
+ + S[60] + sha256_k[60]
+ + sha256_h[7];
+}
+
+//
+// Code taken from original scrypt.cpp and vectorized with minimal changes.
+//
+
+static const uint32x4_t keypadx4[12] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
+};
+static const uint32x4_t innerpadx4[11] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
+};
+static const uint32x4_t outerpadx4[8] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
+};
+static const uint32x4_t finalblkx4[16] = {
+ 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
+};
+
+static inline void HMAC_SHA256_80_initx4(const uint32x4_t *key,
+ uint32x4_t *tstate, uint32x4_t *ostate)
+{
+ uint32x4_t ihash[8];
+ uint32x4_t pad[16];
+ int i;
+
+ /* tstate is assumed to contain the midstate of key */
+ memcpy(pad, key + 16, 4*16);
+ memcpy(pad + 4, keypadx4, 4*48);
+ sha256_transformx4(tstate, pad, 0);
+ memcpy(ihash, tstate, 4*32);
+
+ sha256_initx4(ostate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x5c5c5c5c;
+ for (; i < 16; i++)
+ pad[i] = 0x5c5c5c5c;
+ sha256_transformx4(ostate, pad, 0);
+
+ sha256_initx4(tstate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x36363636;
+ for (; i < 16; i++)
+ pad[i] = 0x36363636;
+ sha256_transformx4(tstate, pad, 0);
+}
+
+static inline void PBKDF2_SHA256_80_128x4(const uint32x4_t *tstate,
+ const uint32x4_t *ostate, const uint32x4_t *salt, uint32x4_t *output)
+{
+ uint32x4_t istate[8], ostate2[8];
+ uint32x4_t ibuf[16], obuf[16];
+ int i, j;
+
+ memcpy(istate, tstate, 4*32);
+ sha256_transformx4(istate, salt, 0);
+
+ memcpy(ibuf, salt + 16, 4*16);
+ memcpy(ibuf + 5, innerpadx4, 4*44);
+ memcpy(obuf + 8, outerpadx4, 4*32);
+
+ for (i = 0; i < 4; i++) {
+ memcpy(obuf, istate, 4*32);
+ ibuf[4] = i + 1;
+ sha256_transformx4(obuf, ibuf, 0);
+
+ memcpy(ostate2, ostate, 4*32);
+ sha256_transformx4(ostate2, obuf, 0);
+ for (j = 0; j < 8; j++)
+ output[8 * i + j] = swab32x4(ostate2[j]);
+ }
+}
+
+static inline void PBKDF2_SHA256_128_32x4(uint32x4_t *tstate, uint32x4_t *ostate,
+ const uint32x4_t *salt, uint32x4_t *output)
+{
+ uint32x4_t buf[16];
+ int i;
+
+ sha256_transformx4(tstate, salt, 1);
+ sha256_transformx4(tstate, salt + 16, 1);
+ sha256_transformx4(tstate, finalblkx4, 0);
+ memcpy(buf, tstate, 4*32);
+ memcpy(buf + 8, outerpadx4, 4*32);
+
+ sha256_transformx4(ostate, buf, 0);
+ for (i = 0; i < 8; i++)
+ output[i] = swab32x4(ostate[i]);
+}
+
+
+//
+// Original scrypt.cpp HMAC SHA256 functions
+//
+
+static const uint32_t keypad[12] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
+};
+static const uint32_t innerpad[11] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
+};
+static const uint32_t outerpad[8] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
+};
+static const uint32_t finalblk[16] = {
+ 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
+};
+
+static inline void HMAC_SHA256_80_init(const uint32_t *key,
+ uint32_t *tstate, uint32_t *ostate)
+{
+ uint32_t ihash[8];
+ uint32_t pad[16];
+ int i;
+
+ /* tstate is assumed to contain the midstate of key */
+ memcpy(pad, key + 16, 16);
+ memcpy(pad + 4, keypad, 48);
+ sha256_transform(tstate, pad, 0);
+ memcpy(ihash, tstate, 32);
+
+ sha256_init(ostate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x5c5c5c5c;
+ for (; i < 16; i++)
+ pad[i] = 0x5c5c5c5c;
+ sha256_transform(ostate, pad, 0);
+
+ sha256_init(tstate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x36363636;
+ for (; i < 16; i++)
+ pad[i] = 0x36363636;
+ sha256_transform(tstate, pad, 0);
+}
+
+static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
+ const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
+{
+ uint32_t istate[8], ostate2[8];
+ uint32_t ibuf[16], obuf[16];
+ int i, j;
+
+ memcpy(istate, tstate, 32);
+ sha256_transform(istate, salt, 0);
+
+ memcpy(ibuf, salt + 16, 16);
+ memcpy(ibuf + 5, innerpad, 44);
+ memcpy(obuf + 8, outerpad, 32);
+
+ for (i = 0; i < 4; i++) {
+ memcpy(obuf, istate, 32);
+ ibuf[4] = i + 1;
+ sha256_transform(obuf, ibuf, 0);
+
+ memcpy(ostate2, ostate, 32);
+ sha256_transform(ostate2, obuf, 0);
+ for (j = 0; j < 8; j++)
+ output[8 * i + j] = swab32(ostate2[j]);
+ }
+}
+
+static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
+ const uint32_t *salt, uint32_t *output)
+{
+ uint32_t buf[16];
+
+ sha256_transform(tstate, salt, 1);
+ sha256_transform(tstate, salt + 16, 1);
+ sha256_transform(tstate, finalblk, 0);
+ memcpy(buf, tstate, 32);
+ memcpy(buf + 8, outerpad, 32);
+
+ sha256_transform(ostate, buf, 0);
+ for (int i = 0; i < 8; i++)
+ output[i] = swab32(ostate[i]);
+}
+
+static int lastFactor = 0;
+//
+// Scrypt proof of work algorithm
+// using SSE2 vectorized HMAC SHA256 on CPU and
+// a salsa core implementation on GPU with CUDA
+//
+
+int scanhash_scrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf,
+ uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end)
+{
+ int result = 0;
+ int throughput = cuda_throughput(thr_id);
+
+ if(throughput == 0)
+ return -1;
+
+ gettimeofday(tv_start, NULL);
+
+ uint32_t n = pdata[19];
+ const uint32_t Htarg = ptarget[7];
+
+ // no default set with --cputest
+ if (opt_nfactor == 0) opt_nfactor = 9;
+ uint32_t N = (1UL<<(opt_nfactor+1));
+ uint32_t *scratch = new uint32_t[N*32]; // scratchbuffer for CPU based validation
+
+ uint32_t nonce[2];
+ uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) };
+ uint32_t* X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) };
+
+ bool sha_on_cpu = (parallel < 2);
+ bool sha_multithreaded = (parallel == 1);
+ uint32x4_t* datax4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 20] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 20] : NULL };
+ uint32x4_t* hashx4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL };
+ uint32x4_t* tstatex4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL };
+ uint32x4_t* ostatex4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL };
+ uint32x4_t* Xx4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 32] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 32] : NULL };
+
+ // log n-factor
+ if (!opt_quiet && lastFactor != opt_nfactor) {
+ applog(LOG_WARNING, "scrypt factor set to %d (%u)", opt_nfactor, N);
+ lastFactor = opt_nfactor;
+ }
+
+ uint32_t _ALIGN(64) midstate[8];
+ sha256_init(midstate);
+ sha256_transform(midstate, pdata, 0);
+
+ if (sha_on_cpu) {
+ for (int i = 0; i < throughput/4; ++i) {
+ for (int j = 0; j < 20; j++) {
+ datax4[0][20*i+j] = uint32x4_t(pdata[j]);
+ datax4[1][20*i+j] = uint32x4_t(pdata[j]);
+ }
+ }
+ }
+ else prepare_sha256(thr_id, pdata, midstate);
+
+ int cur = 1, nxt = 0;
+ int iteration = 0;
+	int num_shares = opt_n_threads > 0 ? 4*opt_n_threads : 1; // opt_n_threads can be 0 with --cputest
+ int share_workload = ((((throughput + num_shares-1) / num_shares) + 3) / 4) * 4;
+
+ do {
+ nonce[nxt] = n;
+
+ if (sha_on_cpu)
+ {
+ for (int i = 0; i < throughput/4; i++) {
+ datax4[nxt][i * 20 + 19] = uint32x4_t(n+0, n+1, n+2, n+3);
+ n += 4;
+ }
+ if (sha_multithreaded)
+ {
+#ifdef WIN32
+ parallel_for (0, num_shares, [&](int share) {
+ for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) {
+ for (int l = 0; l < 8; l++)
+ tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]);
+ HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]);
+ PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]);
+ }
+ } );
+#else
+ #pragma omp parallel for
+ for (int share = 0; share < num_shares; share++) {
+ for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) {
+ for (int l = 0; l < 8; l++)
+ tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]);
+ HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]);
+ PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]);
+ }
+ }
+#endif
+ }
+ else /* sha_multithreaded */
+ {
+ for (int k = 0; k < throughput/4; k++) {
+ for (int l = 0; l < 8; l++)
+ tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]);
+ HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]);
+ PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]);
+ }
+ }
+
+ for (int i = 0; i < throughput/4; i++) {
+ for (int j = 0; j < 32; j++) {
+ uint32x4_t &t = Xx4[nxt][i * 32 + j];
+ X[nxt][(4*i+0)*32+j] = t[0]; X[nxt][(4*i+1)*32+j] = t[1];
+ X[nxt][(4*i+2)*32+j] = t[2]; X[nxt][(4*i+3)*32+j] = t[3];
+ }
+ }
+
+ cuda_scrypt_serialize(thr_id, nxt);
+ cuda_scrypt_HtoD(thr_id, X[nxt], nxt);
+
+ cuda_scrypt_core(thr_id, nxt, N);
+ cuda_scrypt_done(thr_id, nxt);
+
+ cuda_scrypt_DtoH(thr_id, X[nxt], nxt, false);
+ cuda_scrypt_flush(thr_id, nxt);
+
+ if(!cuda_scrypt_sync(thr_id, cur))
+ {
+ result = -1;
+ break;
+ }
+
+ for (int i = 0; i < throughput/4; i++) {
+ for (int j = 0; j < 32; j++) {
+ Xx4[cur][i * 32 + j] = uint32x4_t(
+ X[cur][(4*i+0)*32+j], X[cur][(4*i+1)*32+j],
+ X[cur][(4*i+2)*32+j], X[cur][(4*i+3)*32+j]
+ );
+ }
+ }
+
+ if (sha_multithreaded)
+ {
+#ifdef WIN32
+ parallel_for (0, num_shares, [&](int share) {
+ for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) {
+ PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]);
+ }
+ } );
+#else
+ #pragma omp parallel for
+ for (int share = 0; share < num_shares; share++) {
+ for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) {
+ PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]);
+ }
+ }
+#endif
+ } else {
+
+ for (int k = 0; k < throughput/4; k++) {
+ PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]);
+ }
+ }
+
+ for (int i = 0; i < throughput/4; i++) {
+ for (int j = 0; j < 8; j++) {
+ uint32x4_t &t = hashx4[cur][i * 8 + j];
+ hash[cur][(4*i+0)*8+j] = t[0]; hash[cur][(4*i+1)*8+j] = t[1];
+ hash[cur][(4*i+2)*8+j] = t[2]; hash[cur][(4*i+3)*8+j] = t[3];
+ }
+ }
+ }
+ else /* sha_on_cpu */
+ {
+ n += throughput;
+
+ cuda_scrypt_serialize(thr_id, nxt);
+ pre_sha256(thr_id, nxt, nonce[nxt], throughput);
+
+ cuda_scrypt_core(thr_id, nxt, N);
+ cuda_scrypt_flush(thr_id, nxt); // required here ?
+
+ post_sha256(thr_id, nxt, throughput);
+ cuda_scrypt_done(thr_id, nxt);
+
+ cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
+ cuda_scrypt_flush(thr_id, nxt); // required here ?
+
+ if (!cuda_scrypt_sync(thr_id, cur)) {
+ printf("error\n");
+ result = -1;
+ break;
+ }
+ }
+
+ if (iteration > 0 || opt_n_threads == 0)
+ {
+ for (int i = 0; i < throughput; i++)
+ {
+ if (hash[cur][i * 8 + 7] <= Htarg && fulltest(hash[cur] + i * 8, ptarget))
+ {
+ // CPU based validation to rule out GPU errors (scalar CPU code)
+ uint32_t _ALIGN(64) inp[32], ref[32], tstate[8], ostate[8], refhash[8], ldata[20];
+
+ memcpy(ldata, pdata, 80); ldata[19] = nonce[cur] + i;
+ memcpy(tstate, midstate, 32);
+ HMAC_SHA256_80_init(ldata, tstate, ostate);
+ PBKDF2_SHA256_80_128(tstate, ostate, ldata, inp);
+ computeGold(inp, ref, (uchar*)scratch);
+ bool good = true;
+
+ if (sha_on_cpu) {
+ if (memcmp(&X[cur][i * 32], ref, 32*sizeof(uint32_t)) != 0) good = false;
+ } else {
+ PBKDF2_SHA256_128_32(tstate, ostate, ref, refhash);
+ if (memcmp(&hash[cur][i * 8], refhash, 32) != 0) good = false;
+ }
+
+ if (!good)
+ applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur);
+ else {
+ *hashes_done = n - pdata[19];
+ pdata[19] = nonce[cur] + i;
+ result = 1;
+ goto byebye;
+ }
+ }
+ }
+ }
+
+ cur = (cur+1)&1;
+ nxt = (nxt+1)&1;
+ ++iteration;
+
+ //printf("n=%d, thr=%d, max=%d, rest=%d\n", n, throughput, max_nonce, work_restart[thr_id].restart);
+ } while (n <= max_nonce && !work_restart[thr_id].restart);
+
+ *hashes_done = n - pdata[19];
+ pdata[19] = n;
+byebye:
+ delete[] datax4[0]; delete[] datax4[1]; delete[] hashx4[0]; delete[] hashx4[1];
+ delete[] tstatex4[0]; delete[] tstatex4[1]; delete[] ostatex4[0]; delete[] ostatex4[1];
+ delete[] Xx4[0]; delete[] Xx4[1];
+ delete [] scratch;
+ gettimeofday(tv_end, NULL);
+ return result;
+}
+
+#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
+
+static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
+{
+ uint32_t x0 = (B[ 0] ^= C[ 0]), x1 = (B[ 1] ^= C[ 1]), x2 = (B[ 2] ^= C[ 2]), x3 = (B[ 3] ^= C[ 3]);
+ uint32_t x4 = (B[ 4] ^= C[ 4]), x5 = (B[ 5] ^= C[ 5]), x6 = (B[ 6] ^= C[ 6]), x7 = (B[ 7] ^= C[ 7]);
+ uint32_t x8 = (B[ 8] ^= C[ 8]), x9 = (B[ 9] ^= C[ 9]), xa = (B[10] ^= C[10]), xb = (B[11] ^= C[11]);
+ uint32_t xc = (B[12] ^= C[12]), xd = (B[13] ^= C[13]), xe = (B[14] ^= C[14]), xf = (B[15] ^= C[15]);
+
+ /* Operate on columns. */
+ x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7);
+ x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9);
+ xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13);
+ x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18);
+
+ /* Operate on rows. */
+ x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7);
+ x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9);
+ x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13);
+ x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18);
+
+ /* Operate on columns. */
+ x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7);
+ x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9);
+ xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13);
+ x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18);
+
+ /* Operate on rows. */
+ x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7);
+ x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9);
+ x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13);
+ x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18);
+
+ /* Operate on columns. */
+ x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7);
+ x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9);
+ xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13);
+ x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18);
+
+ /* Operate on rows. */
+ x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7);
+ x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9);
+ x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13);
+ x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18);
+
+ /* Operate on columns. */
+ x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7);
+ x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9);
+ xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13);
+ x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18);
+
+ /* Operate on rows. */
+ x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7);
+ x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9);
+ x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13);
+ x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18);
+
+ B[ 0] += x0; B[ 1] += x1; B[ 2] += x2; B[ 3] += x3; B[ 4] += x4; B[ 5] += x5; B[ 6] += x6; B[ 7] += x7;
+ B[ 8] += x8; B[ 9] += x9; B[10] += xa; B[11] += xb; B[12] += xc; B[13] += xd; B[14] += xe; B[15] += xf;
+}
+
+/**
+ * @param X input/output
+ * @param V scratch buffer
+ * @param N factor
+ */
+static void scrypt_core(uint32_t *X, uint32_t *V, int N)
+{
+ for (int i = 0; i < N; i++) {
+ memcpy(&V[i * 32], X, 128);
+ xor_salsa8(&X[0], &X[16]);
+ xor_salsa8(&X[16], &X[0]);
+ }
+ for (int i = 0; i < N; i++) {
+ uint32_t j = 32 * (X[16] & (N - 1));
+ for (uint8_t k = 0; k < 32; k++)
+ X[k] ^= V[j + k];
+ xor_salsa8(&X[0], &X[16]);
+ xor_salsa8(&X[16], &X[0]);
+ }
+}
+
+/**
+ * Compute reference data set on the CPU
+ * @param input input data as provided to device
+ * @param reference reference data, computed but preallocated
+ * @param scratchpad scrypt scratchpad
+ **/
+void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad)
+{
+ uint32_t X[32] = { 0 };
+ uint32_t *V = (uint32_t*) scratchpad;
+ int N = (1<<(opt_nfactor+1)); // default 9 = 1024
+
+ for (int k = 0; k < 32; k++)
+ X[k] = input[k];
+
+ scrypt_core(X, V, N);
+
+ for (int k = 0; k < 32; k++)
+ reference[k] = X[k];
+}
+
+static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
+ uint32_t *midstate, unsigned char *scratchpad, int N)
+{
+ uint32_t tstate[8], ostate[8];
+ uint32_t X[32] = { 0 };
+ uint32_t *V = (uint32_t *) scratchpad;
+
+ memcpy(tstate, midstate, 32);
+ HMAC_SHA256_80_init(input, tstate, ostate);
+ PBKDF2_SHA256_80_128(tstate, ostate, input, X);
+
+ scrypt_core(X, V, N);
+
+ PBKDF2_SHA256_128_32(tstate, ostate, X, output);
+}
+
+/* cputest */
+void scrypthash(void* output, const void* input)
+{
+ uint32_t _ALIGN(64) X[32], ref[32] = { 0 }, tstate[8], ostate[8], midstate[8];
+ uint32_t _ALIGN(64) data[20];
+ uchar *scratchbuf = (uchar *) calloc(4 * 128 + 63, 1024);
+
+ // no default set with --cputest
+ if (opt_nfactor == 0) opt_nfactor = 9;
+
+ memcpy(data, input, 80);
+
+ sha256_init(midstate);
+ sha256_transform(midstate, data, 0); /* ok */
+
+ memcpy(tstate, midstate, 32);
+ HMAC_SHA256_80_init(data, tstate, ostate);
+ PBKDF2_SHA256_80_128(tstate, ostate, data, X); /* ok */
+
+ if (scratchbuf) {
+ computeGold(X, ref, scratchbuf);
+ PBKDF2_SHA256_128_32(tstate, ostate, ref, (uint32_t*) output);
+ } else {
+ memset(output, 0, 32);
+ }
+
+ free(scratchbuf);
+}
+
+#define SCRYPT_MAX_WAYS 1
+/* cputest */
+void scrypthash2(void* output, const void* input)
+{
+ uint32_t midstate[8] = { 0 };
+ uint32_t data[SCRYPT_MAX_WAYS * 20] = { 0 };
+ uint32_t hash[SCRYPT_MAX_WAYS * 8] = { 0 };
+ uint32_t N = 1U << ((opt_nfactor ? opt_nfactor : 9) + 1); // default 1024
+
+ uchar* scratch = (uchar*) calloc(4 * 128 + 63, N); // scrypt_buffer_alloc(N);
+
+ memcpy(data, input, 80);
+
+ sha256_init(midstate);
+ sha256_transform(midstate, data, 0);
+
+ scrypt_1024_1_1_256(data, hash, midstate, scratch, N);
+
+ memcpy(output, hash, 32);
+
+ free(scratch);
+}
diff --git a/scrypt/blake.cu b/scrypt/blake.cu
new file mode 100644
index 0000000000..bcaa965806
--- /dev/null
+++ b/scrypt/blake.cu
@@ -0,0 +1,454 @@
+//
+// =============== BLAKE part on nVidia GPU ======================
+//
+// This is the generic "default" implementation when no architecture
+// specific implementation is available in the kernel.
+//
+// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64
+//
+// TODO: CUDA porting work remains to be done.
+//
+
+#include