diff --git a/Makefile.am b/Makefile.am
index e8a35ca218..d6edb3479c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,7 +18,7 @@ bin_PROGRAMS = ccminer
ccminer_SOURCES = elist.h miner.h compat.h \
compat/inttypes.h compat/stdbool.h compat/unistd.h \
compat/sys/time.h compat/getopt/getopt.h \
- crc32.c hefty1.c scrypt.c \
+ crc32.c hefty1.c \
ccminer.cpp util.cpp \
api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \
heavy/heavy.cu \
@@ -57,6 +57,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \
x11/s3.cu
+# scrypt
+ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \
+ scrypt/blake.cu scrypt/keccak.cu scrypt/sha256.cu \
+ scrypt/salsa_kernel.cu scrypt/test_kernel.cu \
+ scrypt/fermi_kernel.cu scrypt/kepler_kernel.cu \
+ scrypt/nv_kernel.cu scrypt/nv_kernel2.cu scrypt/titan_kernel.cu
+
if HAVE_NVML
nvml_defs = -DUSE_WRAPNVML
nvml_libs = -ldl
@@ -118,6 +125,10 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
+# This kernel also needs an older SM target to be able to autotune kernels
+scrypt/salsa_kernel.o: scrypt/salsa_kernel.cu
+ $(NVCC) $(nvcc_FLAGS) -gencode=arch=compute_20,code=\"sm_21,compute_20\" --maxrregcount=80 -o $@ -c $<
+
skein.o: skein.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
diff --git a/ccminer.cpp b/ccminer.cpp
index def142087b..d3203253ff 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -102,6 +102,8 @@ enum sha_algos {
ALGO_PLUCK,
ALGO_QUARK,
ALGO_QUBIT,
+ ALGO_SCRYPT,
+ ALGO_SCRYPT_JANE,
ALGO_SKEIN,
ALGO_SKEIN2,
ALGO_S3,
@@ -137,6 +139,8 @@ static const char *algo_names[] = {
"pluck",
"quark",
"qubit",
+ "scrypt",
+ "scrypt-jane",
"skein",
"skein2",
"s3",
@@ -184,6 +188,20 @@ char * device_name[MAX_GPUS];
short device_map[MAX_GPUS] = { 0 };
long device_sm[MAX_GPUS] = { 0 };
uint32_t gpus_intensity[MAX_GPUS] = { 0 };
+
+int device_interactive[MAX_GPUS] = { 0 };
+int device_batchsize[MAX_GPUS] = { 0 };
+int device_backoff[MAX_GPUS] = { 0 };
+int device_lookup_gap[MAX_GPUS] = { 0 };
+int device_texturecache[MAX_GPUS] = { 0 };
+int device_singlememory[MAX_GPUS] = { 0 };
+char *device_config[MAX_GPUS] = { 0 };
+int opt_nfactor = 0;
+int parallel = 2;
+bool autotune = true;
+bool abort_flag = false;
+char *jane_params = NULL;
+
char *rpc_user = NULL;
static char *rpc_pass;
static char *rpc_userpass = NULL;
@@ -255,6 +273,8 @@ Options:\n\
pluck SupCoin\n\
quark Quark\n\
qubit Qubit\n\
+ scrypt Scrypt\n\
+ scrypt-jane Scrypt-jane Chacha\n\
skein Skein SHA2 (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
s3 S3 (1Coin)\n\
@@ -439,6 +459,7 @@ void get_currentalgo(char* buf, int sz)
*/
void proper_exit(int reason)
{
+ abort_flag = true;
cuda_devicereset();
if (check_dups)
@@ -1173,6 +1194,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
switch (opt_algo) {
case ALGO_JACKPOT:
case ALGO_PLUCK:
+ case ALGO_SCRYPT:
+ case ALGO_SCRYPT_JANE:
diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
break;
case ALGO_DMD_GR:
@@ -1386,6 +1409,8 @@ static void *miner_thread(void *userdata)
minmax = 0x400000;
break;
case ALGO_LYRA2:
+ case ALGO_SCRYPT:
+ case ALGO_SCRYPT_JANE:
minmax = 0x100000;
break;
case ALGO_PLUCK:
@@ -1526,6 +1551,16 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
+ case ALGO_SCRYPT:
+ rc = scanhash_scrypt(thr_id, work.data, work.target, NULL,
+ max_nonce, &hashes_done, &tv_start, &tv_end);
+ break;
+
+ case ALGO_SCRYPT_JANE:
+ rc = scanhash_scrypt_jane(thr_id, work.data, work.target, NULL,
+ max_nonce, &hashes_done, &tv_start, &tv_end);
+ break;
+
case ALGO_SKEIN:
rc = scanhash_skeincoin(thr_id, work.data, work.target,
max_nonce, &hashes_done);
@@ -1942,15 +1977,29 @@ void parse_arg(int key, char *arg)
switch(key) {
case 'a':
+ p = strstr(arg, ":"); // optional factor
+ if (p) *p = '\0';
for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
- if (algo_names[i] &&
- !strcmp(arg, algo_names[i])) {
+ if (algo_names[i] && !strcasecmp(arg, algo_names[i])) {
opt_algo = (enum sha_algos)i;
break;
}
}
if (i == ARRAY_SIZE(algo_names))
show_usage_and_exit(1);
+ if (p) {
+ opt_nfactor = atoi(p + 1);
+ if (opt_algo == ALGO_SCRYPT_JANE) {
+ free(jane_params);
+ jane_params = strdup(p+1);
+ }
+ }
+ if (!opt_nfactor) {
+ switch (opt_algo) {
+ case ALGO_SCRYPT: opt_nfactor = 9; break;
+ case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
+ }
+ }
break;
case 'b':
p = strstr(arg, ":");
@@ -2404,6 +2453,8 @@ int main(int argc, char *argv[])
rpc_pass = strdup("");
rpc_url = strdup("");
+ jane_params = strdup("");
+
pthread_mutex_init(&applog_lock, NULL);
// number of cpus for thread affinity
@@ -2423,9 +2474,17 @@ int main(int argc, char *argv[])
if (num_cpus < 1)
num_cpus = 1;
- // default thread to device map
for (i = 0; i < MAX_GPUS; i++) {
device_map[i] = i;
+ device_name[i] = NULL;
+ // for future use, maybe
+ device_interactive[i] = -1;
+ device_batchsize[i] = 1024;
+ device_backoff[i] = is_windows() ? 12 : 2;
+ device_lookup_gap[i] = 1;
+ device_texturecache[i] = -1;
+ device_singlememory[i] = -1;
+ device_config[i] = NULL;
}
// number of gpus
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 528d5c07c6..f8b3a6c319 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -250,6 +250,8 @@
false
Full
+
+
@@ -261,10 +263,6 @@
-
- Full
- /Tp %(AdditionalOptions)
-
@@ -322,6 +320,7 @@
+
@@ -352,6 +351,22 @@
+
+
+ compute_20,sm_21;compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52
+
+
+
+
+
+ compute_35,sm_35;compute_50,sm_50;compute_52,sm_52
+
+
+ compute_20,sm_21
+
+
+
+
@@ -510,4 +525,4 @@
-
+
\ No newline at end of file
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 3d1fd916fd..9f62c54e14 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -73,6 +73,9 @@
{f5117ccb-a70d-411a-b7ea-d6faed230bc7}
+
+ {c26f5b02-37b5-4420-a4e8-ee1ad517dc95}
+
@@ -111,9 +114,6 @@
Source Files
-
- Source Files
-
Source Files
@@ -225,6 +225,12 @@
Source Files\sph
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
@@ -377,6 +383,9 @@
Ressources
+
+ Source Files\CUDA\scrypt
+
@@ -580,6 +589,36 @@
Source Files\CUDA
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
+
+ Source Files\CUDA\scrypt
+
@@ -596,4 +635,4 @@
Ressources
-
+
\ No newline at end of file
diff --git a/miner.h b/miner.h
index 71bf153f62..2b8a7b4fe6 100644
--- a/miner.h
+++ b/miner.h
@@ -272,8 +272,6 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
-extern unsigned char *scrypt_buffer_alloc();
-
extern int scanhash_deep(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
@@ -343,8 +341,12 @@ extern int scanhash_qubit(int thr_id, uint32_t *pdata,
unsigned long *hashes_done);
extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
- unsigned char *scratchbuf, const uint32_t *ptarget,
- uint32_t max_nonce, unsigned long *hashes_done);
+ const uint32_t *ptarget, unsigned char *scratchbuf, uint32_t max_nonce,
+ unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end);
+
+extern int scanhash_scrypt_jane(int thr_id, uint32_t *pdata,
+ const uint32_t *ptarget, unsigned char *scratchbuf, uint32_t max_nonce,
+ unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end);
extern int scanhash_skeincoin(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
@@ -683,6 +685,7 @@ void pentablakehash(void *output, const void *input);
void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N);
void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input);
+void scrypthash(void* output, const void* input);
void skeincoinhash(void *output, const void *input);
void skein2hash(void *output, const void *input);
void s3hash(void *output, const void *input);
diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp
new file mode 100644
index 0000000000..ce21ea2de5
--- /dev/null
+++ b/scrypt-jane.cpp
@@ -0,0 +1,626 @@
+/*
+ scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
+
+ Public Domain or MIT License, whichever is easier
+*/
+
+#include "miner.h"
+
+#include "scrypt/scrypt-jane.h"
+#include "scrypt/code/scrypt-jane-portable.h"
+#include "scrypt/code/scrypt-jane-romix.h"
+#include "scrypt/keccak.h"
+
+#include "scrypt/salsa_kernel.h"
+
+#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */
+#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
+#define scrypt_maxr scrypt_r_32kb /* 32kb */
+#define scrypt_maxp 25 /* (1 << 25) = ~33 million */
+
+// ---------------------------- BEGIN keccak functions ------------------------------------
+
+#define SCRYPT_HASH "Keccak-512"
+#define SCRYPT_HASH_DIGEST_SIZE 64
+#define SCRYPT_KECCAK_F 1600
+#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */
+#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */
+#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8)
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+typedef struct scrypt_hash_state_t {
+ uint64_t state[SCRYPT_KECCAK_F / 64];
+ uint32_t leftover;
+ uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+static const uint64_t keccak_round_constants[24] = {
+ 0x0000000000000001ull, 0x0000000000008082ull,
+ 0x800000000000808aull, 0x8000000080008000ull,
+ 0x000000000000808bull, 0x0000000080000001ull,
+ 0x8000000080008081ull, 0x8000000000008009ull,
+ 0x000000000000008aull, 0x0000000000000088ull,
+ 0x0000000080008009ull, 0x000000008000000aull,
+ 0x000000008000808bull, 0x800000000000008bull,
+ 0x8000000000008089ull, 0x8000000000008003ull,
+ 0x8000000000008002ull, 0x8000000000000080ull,
+ 0x000000000000800aull, 0x800000008000000aull,
+ 0x8000000080008081ull, 0x8000000000008080ull,
+ 0x0000000080000001ull, 0x8000000080008008ull
+};
+
+static void
+keccak_block(scrypt_hash_state *S, const uint8_t *in) {
+ size_t i;
+ uint64_t *s = S->state, t[5], u[5], v, w;
+
+ /* absorb input */
+ for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8)
+ s[i] ^= U8TO64_LE(in);
+
+ for (i = 0; i < 24; i++) {
+ /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+ t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
+ t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
+ t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
+ t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
+ t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
+
+ /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+ u[0] = t[4] ^ ROTL64(t[1], 1);
+ u[1] = t[0] ^ ROTL64(t[2], 1);
+ u[2] = t[1] ^ ROTL64(t[3], 1);
+ u[3] = t[2] ^ ROTL64(t[4], 1);
+ u[4] = t[3] ^ ROTL64(t[0], 1);
+
+ /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+ s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
+ s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
+ s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
+ s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
+ s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
+
+ /* rho pi: b[..] = rotl(a[..], ..) */
+ v = s[ 1];
+ s[ 1] = ROTL64(s[ 6], 44);
+ s[ 6] = ROTL64(s[ 9], 20);
+ s[ 9] = ROTL64(s[22], 61);
+ s[22] = ROTL64(s[14], 39);
+ s[14] = ROTL64(s[20], 18);
+ s[20] = ROTL64(s[ 2], 62);
+ s[ 2] = ROTL64(s[12], 43);
+ s[12] = ROTL64(s[13], 25);
+ s[13] = ROTL64(s[19], 8);
+ s[19] = ROTL64(s[23], 56);
+ s[23] = ROTL64(s[15], 41);
+ s[15] = ROTL64(s[ 4], 27);
+ s[ 4] = ROTL64(s[24], 14);
+ s[24] = ROTL64(s[21], 2);
+ s[21] = ROTL64(s[ 8], 55);
+ s[ 8] = ROTL64(s[16], 45);
+ s[16] = ROTL64(s[ 5], 36);
+ s[ 5] = ROTL64(s[ 3], 28);
+ s[ 3] = ROTL64(s[18], 21);
+ s[18] = ROTL64(s[17], 15);
+ s[17] = ROTL64(s[11], 10);
+ s[11] = ROTL64(s[ 7], 6);
+ s[ 7] = ROTL64(s[10], 3);
+ s[10] = ROTL64( v, 1);
+
+ /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+ v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
+ v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
+ v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
+ v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
+ v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
+
+ /* iota: a[0,0] ^= round constant */
+ s[0] ^= keccak_round_constants[i];
+ }
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+ memset(S, 0, sizeof(*S));
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+ size_t want;
+
+ /* handle the previous data */
+ if (S->leftover) {
+ want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+ want = (want < inlen) ? want : inlen;
+ memcpy(S->buffer + S->leftover, in, want);
+ S->leftover += (uint32_t)want;
+ if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+ return;
+ in += want;
+ inlen -= want;
+ keccak_block(S, S->buffer);
+ }
+
+ /* handle the current data */
+ while (inlen >= SCRYPT_HASH_BLOCK_SIZE) {
+ keccak_block(S, in);
+ in += SCRYPT_HASH_BLOCK_SIZE;
+ inlen -= SCRYPT_HASH_BLOCK_SIZE;
+ }
+
+ /* handle leftover data */
+ S->leftover = (uint32_t)inlen;
+ if (S->leftover)
+ memcpy(S->buffer, in, S->leftover);
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+ size_t i;
+
+ S->buffer[S->leftover] = 0x01;
+ memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1));
+ S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80;
+ keccak_block(S, S->buffer);
+
+ for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) {
+ U64TO8_LE(&hash[i], S->state[i / 8]);
+ }
+}
+
+// ---------------------------- END keccak functions ------------------------------------
+
+// ---------------------------- BEGIN PBKDF2 functions ------------------------------------
+
+typedef struct scrypt_hmac_state_t {
+ scrypt_hash_state inner, outer;
+} scrypt_hmac_state;
+
+
+static void
+scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
+ scrypt_hash_state st;
+ scrypt_hash_init(&st);
+ scrypt_hash_update(&st, m, mlen);
+ scrypt_hash_finish(&st, hash);
+}
+
+/* hmac */
+static void
+scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
+ uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
+ size_t i;
+
+ scrypt_hash_init(&st->inner);
+ scrypt_hash_init(&st->outer);
+
+ if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
+ /* use the key directly if it's <= blocksize bytes */
+ memcpy(pad, key, keylen);
+ } else {
+ /* if it's > blocksize bytes, hash it */
+ scrypt_hash(pad, key, keylen);
+ }
+
+ /* inner = (key ^ 0x36) */
+ /* h(inner || ...) */
+ for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
+ pad[i] ^= 0x36;
+ scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);
+
+ /* outer = (key ^ 0x5c) */
+ /* h(outer || ...) */
+ for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
+ pad[i] ^= (0x5c ^ 0x36);
+ scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
+}
+
+static void
+scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
+ /* h(inner || m...) */
+ scrypt_hash_update(&st->inner, m, mlen);
+}
+
+static void
+scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
+ /* h(inner || m) */
+ scrypt_hash_digest innerhash;
+ scrypt_hash_finish(&st->inner, innerhash);
+
+ /* h(outer || h(inner || m)) */
+ scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
+ scrypt_hash_finish(&st->outer, mac);
+}
+
+/*
+ * Special version where N = 1
+ * - mikaelh
+ */
+static void
+scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) {
+ scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
+ scrypt_hash_digest ti, u;
+ uint8_t be[4];
+ uint32_t i, /*j,*/ blocks;
+// uint64_t c;
+
+ /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
+
+ /* hmac(password, ...) */
+ scrypt_hmac_init(&hmac_pw, password, password_len);
+
+ /* hmac(password, salt...) */
+ hmac_pw_salt = hmac_pw;
+ scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
+
+ blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
+ for (i = 1; i <= blocks; i++) {
+ /* U1 = hmac(password, salt || be(i)) */
+ U32TO8_BE(be, i);
+ work = hmac_pw_salt;
+ scrypt_hmac_update(&work, be, 4);
+ scrypt_hmac_finish(&work, ti);
+ memcpy(u, ti, sizeof(u));
+
+ memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
+ out += SCRYPT_HASH_DIGEST_SIZE;
+ bytes -= SCRYPT_HASH_DIGEST_SIZE;
+ }
+}
+
+// ---------------------------- END PBKDF2 functions ------------------------------------
+
+static void
+scrypt_fatal_error_default(const char *msg) {
+ fprintf(stderr, "%s\n", msg);
+ exit(1);
+}
+
+static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
+
+void
+scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) {
+ scrypt_fatal_error = fn;
+}
+
+typedef struct scrypt_aligned_alloc_t {
+ uint8_t *mem, *ptr;
+} scrypt_aligned_alloc;
+
+#if defined(SCRYPT_TEST_SPEED)
+static uint8_t *mem_base = (uint8_t *)0;
+static size_t mem_bump = 0;
+
+/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
+static scrypt_aligned_alloc
+scrypt_alloc(uint64_t size) {
+ scrypt_aligned_alloc aa;
+ if (!mem_base) {
+ mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));
+ if (!mem_base)
+ scrypt_fatal_error("scrypt: out of memory");
+ mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
+ }
+ aa.mem = mem_base + mem_bump;
+ aa.ptr = aa.mem;
+ mem_bump += (size_t)size;
+ return aa;
+}
+
+static void
+scrypt_free(scrypt_aligned_alloc *aa) {
+ mem_bump = 0;
+}
+#else
+static scrypt_aligned_alloc
+scrypt_alloc(uint64_t size) {
+ static const size_t max_alloc = (size_t)-1;
+ scrypt_aligned_alloc aa;
+ size += (SCRYPT_BLOCK_BYTES - 1);
+ if (size > max_alloc)
+ scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
+ aa.mem = (uint8_t *)malloc((size_t)size);
+ aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
+ if (!aa.mem)
+ scrypt_fatal_error("scrypt: out of memory");
+ return aa;
+}
+
+static void
+scrypt_free(scrypt_aligned_alloc *aa) {
+ free(aa->mem);
+}
+#endif
+
+
+// yacoin: increasing Nfactor gradually
+unsigned char GetNfactor(unsigned int nTimestamp) {
+ int l = 0;
+
+ unsigned int Nfactor = 0;
+
+ // Yacoin defaults
+ unsigned int Ntimestamp = 1367991200;
+ unsigned int minN = 4;
+ unsigned int maxN = 30;
+
+ if (strlen(jane_params) > 0) {
+ if (!strcmp(jane_params, "YAC") || !strcasecmp(jane_params, "Yacoin")) {} // No-Op
+ //
+ // NO WARRANTY FOR CORRECTNESS. Look for the int64 nChainStartTime constant
+ // in the src/main.cpp file of the official wallet clients as well as the
+ // const unsigned char minNfactor and const unsigned char maxNfactor
+ //
+ else if (!strcmp(jane_params, "YBC") || !strcasecmp(jane_params, "YBCoin")) {
+ // YBCoin: 1372386273, minN: 4, maxN: 30
+ Ntimestamp = 1372386273; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "ZZC") || !strcasecmp(jane_params, "ZZCoin")) {
+ // ZZCoin: 1375817223, minN: 12, maxN: 30
+ Ntimestamp = 1375817223; minN= 12; maxN= 30;
+ } else if (!strcmp(jane_params, "FEC") || !strcasecmp(jane_params, "FreeCoin")) {
+ // FreeCoin: 1375801200, minN: 6, maxN: 32
+ Ntimestamp = 1375801200; minN= 6; maxN= 32;
+ } else if (!strcmp(jane_params, "ONC") || !strcasecmp(jane_params, "OneCoin")) {
+ // OneCoin: 1371119462, minN: 6, maxN: 30
+ Ntimestamp = 1371119462; minN= 6; maxN= 30;
+ } else if (!strcmp(jane_params, "QQC") || !strcasecmp(jane_params, "QQCoin")) {
+ // QQCoin: 1387769316, minN: 4, maxN: 30
+ Ntimestamp = 1387769316; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "GPL") || !strcasecmp(jane_params, "GoldPressedLatinum")) {
+ // GoldPressedLatinum:1377557832, minN: 4, maxN: 30
+ Ntimestamp = 1377557832; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "MRC") || !strcasecmp(jane_params, "MicroCoin")) {
+ // MicroCoin:1389028879, minN: 4, maxN: 30
+ Ntimestamp = 1389028879; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "APC") || !strcasecmp(jane_params, "AppleCoin")) {
+ // AppleCoin:1384720832, minN: 4, maxN: 30
+ Ntimestamp = 1384720832; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "CPR") || !strcasecmp(jane_params, "Copperbars")) {
+ // Copperbars:1376184687, minN: 4, maxN: 30
+ Ntimestamp = 1376184687; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "CACH") || !strcasecmp(jane_params, "CacheCoin")) {
+ // CacheCoin:1388949883, minN: 4, maxN: 30
+ Ntimestamp = 1388949883; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "UTC") || !strcasecmp(jane_params, "UltraCoin")) {
+ // UltraCoin:1388361600, minN: 4, maxN: 30
+ Ntimestamp = 1388361600; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "VEL") || !strcasecmp(jane_params, "VelocityCoin")) {
+ // VelocityCoin:1387769316, minN: 4, maxN: 30
+ Ntimestamp = 1387769316; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "ITC") || !strcasecmp(jane_params, "InternetCoin")) {
+ // InternetCoin:1388385602, minN: 4, maxN: 30
+ Ntimestamp = 1388385602; minN= 4; maxN= 30;
+ } else if (!strcmp(jane_params, "RAD") || !strcasecmp(jane_params, "RadioactiveCoin")) {
+ // RadioactiveCoin:1389196388, minN: 4, maxN: 30
+ Ntimestamp = 1389196388; minN= 4; maxN= 30;
+ } else {
+ if (sscanf(jane_params, "%u,%u,%u", &Ntimestamp, &minN, &maxN) != 3)
+ if (sscanf(jane_params, "%u", &Nfactor) == 1) return Nfactor; // skip bounding against minN, maxN
+ else applog(LOG_INFO, "Unable to parse scrypt-jane parameters: '%s'. Defaulting to Yacoin.", jane_params);
+ }
+ }
+ // determination based on the constants determined above
+ if (nTimestamp <= Ntimestamp)
+ return minN;
+
+ unsigned long int s = nTimestamp - Ntimestamp;
+ while ((s >> 1) > 3) {
+ l += 1;
+ s >>= 1;
+ }
+
+ s &= 3;
+
+ int n = (l * 170 + s * 25 - 2320) / 100;
+
+ if (n < 0) n = 0;
+
+ if (n > 255)
+ printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n);
+
+ Nfactor = n;
+ if (Nfactor < minN) return minN;
+ if (Nfactor > maxN) return maxN;
+ return Nfactor;
+}
+
+#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
+ | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+static int s_Nfactor = 0;
+
+int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf,
+ uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end)
+{
+ const uint32_t Htarg = ptarget[7];
+
+ if (s_Nfactor == 0 && strlen(jane_params) > 0)
+ applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params);
+
+ int Nfactor = GetNfactor(bswap_32x4(pdata[17]));
+ if (Nfactor > scrypt_maxN) {
+ scrypt_fatal_error("scrypt: N out of range");
+ }
+
+ if (Nfactor != s_Nfactor)
+ {
+ // all of this isn't very thread-safe...
+ opt_nfactor = (1 << (Nfactor + 1));
+
+ applog(LOG_INFO, "Nfactor is %d (N=%d)!", Nfactor, opt_nfactor);
+
+ if (s_Nfactor != 0) {
+ // handle N-factor increase at runtime
+ // by adjusting the lookup_gap by factor 2
+ if (s_Nfactor == Nfactor-1)
+ for (int i=0; i < 8; ++i)
+ device_lookup_gap[i] *= 2;
+ }
+ s_Nfactor = Nfactor;
+ }
+
+ int throughput = cuda_throughput(thr_id);
+
+ if(throughput == 0)
+ return -1;
+
+ gettimeofday(tv_start, NULL);
+
+ uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] };
+ uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) };
+
+ uint32_t n = pdata[19];
+
+ /* byte swap pdata into data[0]/[1] arrays */
+ for (int k=0; k<2; ++k) {
+ for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]);
+ for(int i=1;i 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput);
+ }
+#endif
+ } else {
+ n += throughput;
+
+ cuda_scrypt_serialize(thr_id, nxt);
+ pre_keccak512(thr_id, nxt, nonce[nxt], throughput);
+ cuda_scrypt_core(thr_id, nxt, opt_nfactor);
+
+ cuda_scrypt_flush(thr_id, nxt);
+
+ post_keccak512(thr_id, nxt, nonce[nxt], throughput);
+ cuda_scrypt_done(thr_id, nxt);
+
+ cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
+
+ if(!cuda_scrypt_sync(thr_id, cur))
+ {
+ return -1;
+ }
+ }
+
+ if(iteration > 0)
+ {
+ for(int i=0;i
-#include <stdlib.h>
-#include <string.h>
-
-static const uint32_t keypad[12] = {
- 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
-};
-static const uint32_t innerpad[11] = {
- 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
-};
-static const uint32_t outerpad[8] = {
- 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
-};
-static const uint32_t finalblk[16] = {
- 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
-};
-
-static inline void HMAC_SHA256_80_init(const uint32_t *key,
- uint32_t *tstate, uint32_t *ostate)
-{
- uint32_t ihash[8];
- uint32_t pad[16];
- int i;
-
- /* tstate is assumed to contain the midstate of key */
- memcpy(pad, key + 16, 16);
- memcpy(pad + 4, keypad, 48);
- sha256_transform(tstate, pad, 0);
- memcpy(ihash, tstate, 32);
-
- sha256_init(ostate);
- for (i = 0; i < 8; i++)
- pad[i] = ihash[i] ^ 0x5c5c5c5c;
- for (; i < 16; i++)
- pad[i] = 0x5c5c5c5c;
- sha256_transform(ostate, pad, 0);
-
- sha256_init(tstate);
- for (i = 0; i < 8; i++)
- pad[i] = ihash[i] ^ 0x36363636;
- for (; i < 16; i++)
- pad[i] = 0x36363636;
- sha256_transform(tstate, pad, 0);
-}
-
-static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
- const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t istate[8], ostate2[8];
- uint32_t ibuf[16], obuf[16];
- int i, j;
-
- memcpy(istate, tstate, 32);
- sha256_transform(istate, salt, 0);
-
- memcpy(ibuf, salt + 16, 16);
- memcpy(ibuf + 5, innerpad, 44);
- memcpy(obuf + 8, outerpad, 32);
-
- for (i = 0; i < 4; i++) {
- memcpy(obuf, istate, 32);
- ibuf[4] = i + 1;
- sha256_transform(obuf, ibuf, 0);
-
- memcpy(ostate2, ostate, 32);
- sha256_transform(ostate2, obuf, 0);
- for (j = 0; j < 8; j++)
- output[8 * i + j] = swab32(ostate2[j]);
- }
-}
-
-static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
- const uint32_t *salt, uint32_t *output)
-{
- uint32_t buf[16];
- int i;
-
- sha256_transform(tstate, salt, 1);
- sha256_transform(tstate, salt + 16, 1);
- sha256_transform(tstate, finalblk, 0);
- memcpy(buf, tstate, 32);
- memcpy(buf + 8, outerpad, 32);
-
- sha256_transform(ostate, buf, 0);
- for (i = 0; i < 8; i++)
- output[i] = swab32(ostate[i]);
-}
-
-
-#if HAVE_SHA256_4WAY
-
-static const uint32_t keypad_4way[4 * 12] = {
- 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000280, 0x00000280, 0x00000280, 0x00000280
-};
-static const uint32_t innerpad_4way[4 * 11] = {
- 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
-};
-static const uint32_t outerpad_4way[4 * 8] = {
- 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000300, 0x00000300, 0x00000300, 0x00000300
-};
-static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = {
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000620, 0x00000620, 0x00000620, 0x00000620
-};
-
-static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
- uint32_t *tstate, uint32_t *ostate)
-{
- uint32_t ihash[4 * 8] __attribute__((aligned(16)));
- uint32_t pad[4 * 16] __attribute__((aligned(16)));
- int i;
-
- /* tstate is assumed to contain the midstate of key */
- memcpy(pad, key + 4 * 16, 4 * 16);
- memcpy(pad + 4 * 4, keypad_4way, 4 * 48);
- sha256_transform_4way(tstate, pad, 0);
- memcpy(ihash, tstate, 4 * 32);
-
- sha256_init_4way(ostate);
- for (i = 0; i < 4 * 8; i++)
- pad[i] = ihash[i] ^ 0x5c5c5c5c;
- for (; i < 4 * 16; i++)
- pad[i] = 0x5c5c5c5c;
- sha256_transform_4way(ostate, pad, 0);
-
- sha256_init_4way(tstate);
- for (i = 0; i < 4 * 8; i++)
- pad[i] = ihash[i] ^ 0x36363636;
- for (; i < 4 * 16; i++)
- pad[i] = 0x36363636;
- sha256_transform_4way(tstate, pad, 0);
-}
-
-static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
- const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t istate[4 * 8] __attribute__((aligned(16)));
- uint32_t ostate2[4 * 8] __attribute__((aligned(16)));
- uint32_t ibuf[4 * 16] __attribute__((aligned(16)));
- uint32_t obuf[4 * 16] __attribute__((aligned(16)));
- int i, j;
-
- memcpy(istate, tstate, 4 * 32);
- sha256_transform_4way(istate, salt, 0);
-
- memcpy(ibuf, salt + 4 * 16, 4 * 16);
- memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44);
- memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32);
-
- for (i = 0; i < 4; i++) {
- memcpy(obuf, istate, 4 * 32);
- ibuf[4 * 4 + 0] = i + 1;
- ibuf[4 * 4 + 1] = i + 1;
- ibuf[4 * 4 + 2] = i + 1;
- ibuf[4 * 4 + 3] = i + 1;
- sha256_transform_4way(obuf, ibuf, 0);
-
- memcpy(ostate2, ostate, 4 * 32);
- sha256_transform_4way(ostate2, obuf, 0);
- for (j = 0; j < 4 * 8; j++)
- output[4 * 8 * i + j] = swab32(ostate2[j]);
- }
-}
-
-static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
- uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t buf[4 * 16] __attribute__((aligned(16)));
- int i;
-
- sha256_transform_4way(tstate, salt, 1);
- sha256_transform_4way(tstate, salt + 4 * 16, 1);
- sha256_transform_4way(tstate, finalblk_4way, 0);
- memcpy(buf, tstate, 4 * 32);
- memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
-
- sha256_transform_4way(ostate, buf, 0);
- for (i = 0; i < 4 * 8; i++)
- output[i] = swab32(ostate[i]);
-}
-
-#endif /* HAVE_SHA256_4WAY */
-
-
-#if HAVE_SHA256_8WAY
-
-static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = {
- 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620
-};
-
-static inline void HMAC_SHA256_80_init_8way(const uint32_t *key,
- uint32_t *tstate, uint32_t *ostate)
-{
- uint32_t ihash[8 * 8] __attribute__((aligned(32)));
- uint32_t pad[8 * 16] __attribute__((aligned(32)));
- int i;
-
- /* tstate is assumed to contain the midstate of key */
- memcpy(pad, key + 8 * 16, 8 * 16);
- for (i = 0; i < 8; i++)
- pad[8 * 4 + i] = 0x80000000;
- memset(pad + 8 * 5, 0x00, 8 * 40);
- for (i = 0; i < 8; i++)
- pad[8 * 15 + i] = 0x00000280;
- sha256_transform_8way(tstate, pad, 0);
- memcpy(ihash, tstate, 8 * 32);
-
- sha256_init_8way(ostate);
- for (i = 0; i < 8 * 8; i++)
- pad[i] = ihash[i] ^ 0x5c5c5c5c;
- for (; i < 8 * 16; i++)
- pad[i] = 0x5c5c5c5c;
- sha256_transform_8way(ostate, pad, 0);
-
- sha256_init_8way(tstate);
- for (i = 0; i < 8 * 8; i++)
- pad[i] = ihash[i] ^ 0x36363636;
- for (; i < 8 * 16; i++)
- pad[i] = 0x36363636;
- sha256_transform_8way(tstate, pad, 0);
-}
-
-static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate,
- const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t istate[8 * 8] __attribute__((aligned(32)));
- uint32_t ostate2[8 * 8] __attribute__((aligned(32)));
- uint32_t ibuf[8 * 16] __attribute__((aligned(32)));
- uint32_t obuf[8 * 16] __attribute__((aligned(32)));
- int i, j;
-
- memcpy(istate, tstate, 8 * 32);
- sha256_transform_8way(istate, salt, 0);
-
- memcpy(ibuf, salt + 8 * 16, 8 * 16);
- for (i = 0; i < 8; i++)
- ibuf[8 * 5 + i] = 0x80000000;
- memset(ibuf + 8 * 6, 0x00, 8 * 36);
- for (i = 0; i < 8; i++)
- ibuf[8 * 15 + i] = 0x000004a0;
-
- for (i = 0; i < 8; i++)
- obuf[8 * 8 + i] = 0x80000000;
- memset(obuf + 8 * 9, 0x00, 8 * 24);
- for (i = 0; i < 8; i++)
- obuf[8 * 15 + i] = 0x00000300;
-
- for (i = 0; i < 4; i++) {
- memcpy(obuf, istate, 8 * 32);
- ibuf[8 * 4 + 0] = i + 1;
- ibuf[8 * 4 + 1] = i + 1;
- ibuf[8 * 4 + 2] = i + 1;
- ibuf[8 * 4 + 3] = i + 1;
- ibuf[8 * 4 + 4] = i + 1;
- ibuf[8 * 4 + 5] = i + 1;
- ibuf[8 * 4 + 6] = i + 1;
- ibuf[8 * 4 + 7] = i + 1;
- sha256_transform_8way(obuf, ibuf, 0);
-
- memcpy(ostate2, ostate, 8 * 32);
- sha256_transform_8way(ostate2, obuf, 0);
- for (j = 0; j < 8 * 8; j++)
- output[8 * 8 * i + j] = swab32(ostate2[j]);
- }
-}
-
-static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
- uint32_t *ostate, const uint32_t *salt, uint32_t *output)
-{
- uint32_t buf[8 * 16] __attribute__((aligned(32)));
- int i;
-
- sha256_transform_8way(tstate, salt, 1);
- sha256_transform_8way(tstate, salt + 8 * 16, 1);
- sha256_transform_8way(tstate, finalblk_8way, 0);
-
- memcpy(buf, tstate, 8 * 32);
- for (i = 0; i < 8; i++)
- buf[8 * 8 + i] = 0x80000000;
- memset(buf + 8 * 9, 0x00, 8 * 24);
- for (i = 0; i < 8; i++)
- buf[8 * 15 + i] = 0x00000300;
- sha256_transform_8way(ostate, buf, 0);
-
- for (i = 0; i < 8 * 8; i++)
- output[i] = swab32(ostate[i]);
-}
-
-#endif /* HAVE_SHA256_8WAY */
-
-
-#if defined(__x86_64__)
-
-#define SCRYPT_MAX_WAYS 1
-#define HAVE_SCRYPT_3WAY 0
-#define scrypt_best_throughput() 1
-static void scrypt_core(uint32_t *X, uint32_t *V);
-void scrypt_core_3way(uint32_t *X, uint32_t *V);
-#if defined(USE_AVX2)
-#undef SCRYPT_MAX_WAYS
-#define SCRYPT_MAX_WAYS 21
-#define HAVE_SCRYPT_6WAY 0
-void scrypt_core_6way(uint32_t *X, uint32_t *V);
-#endif
-
-#elif defined(__i386__)
-
-#define SCRYPT_MAX_WAYS 1
-#define scrypt_best_throughput() 1
-static void scrypt_core(uint32_t *X, uint32_t *V);
-
-#elif defined(__arm__) && defined(__APCS_32__)
-
-static void scrypt_core(uint32_t *X, uint32_t *V);
-#if defined(__ARM_NEON__)
-#undef HAVE_SHA256_4WAY
-#define SCRYPT_MAX_WAYS 1
-#define HAVE_SCRYPT_3WAY 0
-#define scrypt_best_throughput() 1
-void scrypt_core_3way(uint32_t *X, uint32_t *V);
-#endif
-
-#endif
-
-static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
-{
- uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
- int i;
-
- x00 = (B[ 0] ^= Bx[ 0]);
- x01 = (B[ 1] ^= Bx[ 1]);
- x02 = (B[ 2] ^= Bx[ 2]);
- x03 = (B[ 3] ^= Bx[ 3]);
- x04 = (B[ 4] ^= Bx[ 4]);
- x05 = (B[ 5] ^= Bx[ 5]);
- x06 = (B[ 6] ^= Bx[ 6]);
- x07 = (B[ 7] ^= Bx[ 7]);
- x08 = (B[ 8] ^= Bx[ 8]);
- x09 = (B[ 9] ^= Bx[ 9]);
- x10 = (B[10] ^= Bx[10]);
- x11 = (B[11] ^= Bx[11]);
- x12 = (B[12] ^= Bx[12]);
- x13 = (B[13] ^= Bx[13]);
- x14 = (B[14] ^= Bx[14]);
- x15 = (B[15] ^= Bx[15]);
- for (i = 0; i < 8; i += 2) {
-#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
- /* Operate on columns. */
- x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7);
- x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7);
-
- x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9);
- x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9);
-
- x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13);
- x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13);
-
- x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18);
- x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18);
-
- /* Operate on rows. */
- x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7);
- x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7);
-
- x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9);
- x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9);
-
- x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13);
- x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13);
-
- x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18);
- x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18);
-#undef R
- }
- B[ 0] += x00;
- B[ 1] += x01;
- B[ 2] += x02;
- B[ 3] += x03;
- B[ 4] += x04;
- B[ 5] += x05;
- B[ 6] += x06;
- B[ 7] += x07;
- B[ 8] += x08;
- B[ 9] += x09;
- B[10] += x10;
- B[11] += x11;
- B[12] += x12;
- B[13] += x13;
- B[14] += x14;
- B[15] += x15;
-}
-
-static inline void scrypt_core(uint32_t *X, uint32_t *V)
-{
- uint32_t i, j, k;
-
- for (i = 0; i < 1024; i++) {
- memcpy(&V[i * 32], X, 128);
- xor_salsa8(&X[0], &X[16]);
- xor_salsa8(&X[16], &X[0]);
- }
- for (i = 0; i < 1024; i++) {
- j = 32 * (X[16] & 1023);
- for (k = 0; k < 32; k++)
- X[k] ^= V[j + k];
- xor_salsa8(&X[0], &X[16]);
- xor_salsa8(&X[16], &X[0]);
- }
-}
-
-#ifndef SCRYPT_MAX_WAYS
-#define SCRYPT_MAX_WAYS 1
-#define scrypt_best_throughput() 1
-#endif
-
-#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)
-
-unsigned char *scrypt_buffer_alloc()
-{
- return (unsigned char *)malloc(SCRYPT_BUFFER_SIZE);
-}
-
-static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
- uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[8], ostate[8];
- uint32_t X[32];
- uint32_t *V;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- memcpy(tstate, midstate, 32);
- HMAC_SHA256_80_init(input, tstate, ostate);
- PBKDF2_SHA256_80_128(tstate, ostate, input, X);
-
- scrypt_core(X, V);
-
- PBKDF2_SHA256_128_32(tstate, ostate, X, output);
-}
-
-#if HAVE_SHA256_4WAY
-static void scrypt_1024_1_1_256_4way(const uint32_t *input,
- uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[4 * 8] __attribute__((aligned(128)));
- uint32_t ostate[4 * 8] __attribute__((aligned(128)));
- uint32_t W[4 * 32] __attribute__((aligned(128)));
- uint32_t X[4 * 32] __attribute__((aligned(128)));
- uint32_t *V;
- int i, k;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- for (i = 0; i < 20; i++)
- for (k = 0; k < 4; k++)
- W[4 * i + k] = input[k * 20 + i];
- for (i = 0; i < 8; i++)
- for (k = 0; k < 4; k++)
- tstate[4 * i + k] = midstate[i];
- HMAC_SHA256_80_init_4way(W, tstate, ostate);
- PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
- for (i = 0; i < 32; i++)
- for (k = 0; k < 4; k++)
- X[k * 32 + i] = W[4 * i + k];
- scrypt_core(X + 0 * 32, V);
- scrypt_core(X + 1 * 32, V);
- scrypt_core(X + 2 * 32, V);
- scrypt_core(X + 3 * 32, V);
- for (i = 0; i < 32; i++)
- for (k = 0; k < 4; k++)
- W[4 * i + k] = X[k * 32 + i];
- PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
- for (i = 0; i < 8; i++)
- for (k = 0; k < 4; k++)
- output[k * 8 + i] = W[4 * i + k];
-}
-#endif /* HAVE_SHA256_4WAY */
-
-#if HAVE_SCRYPT_3WAY
-
-static void scrypt_1024_1_1_256_3way(const uint32_t *input,
- uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[3 * 8], ostate[3 * 8];
- uint32_t X[3 * 32] __attribute__((aligned(64)));
- uint32_t *V;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- memcpy(tstate + 0, midstate, 32);
- memcpy(tstate + 8, midstate, 32);
- memcpy(tstate + 16, midstate, 32);
- HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0);
- HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8);
- HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16);
- PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0);
- PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32);
- PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64);
-
- scrypt_core_3way(X, V);
-
- PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0);
- PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8);
- PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16);
-}
-
-#if HAVE_SHA256_4WAY
-static void scrypt_1024_1_1_256_12way(const uint32_t *input,
- uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[12 * 8] __attribute__((aligned(128)));
- uint32_t ostate[12 * 8] __attribute__((aligned(128)));
- uint32_t W[12 * 32] __attribute__((aligned(128)));
- uint32_t X[12 * 32] __attribute__((aligned(128)));
- uint32_t *V;
- int i, j, k;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- for (j = 0; j < 3; j++)
- for (i = 0; i < 20; i++)
- for (k = 0; k < 4; k++)
- W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i];
- for (j = 0; j < 3; j++)
- for (i = 0; i < 8; i++)
- for (k = 0; k < 4; k++)
- tstate[32 * j + 4 * i + k] = midstate[i];
- HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0);
- HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32);
- HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64);
- PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0);
- PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128);
- PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 32; i++)
- for (k = 0; k < 4; k++)
- X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];
- scrypt_core_3way(X + 0 * 96, V);
- scrypt_core_3way(X + 1 * 96, V);
- scrypt_core_3way(X + 2 * 96, V);
- scrypt_core_3way(X + 3 * 96, V);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 32; i++)
- for (k = 0; k < 4; k++)
- W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i];
- PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0);
- PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128);
- PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 8; i++)
- for (k = 0; k < 4; k++)
- output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k];
-}
-#endif /* HAVE_SHA256_4WAY */
-
-#endif /* HAVE_SCRYPT_3WAY */
-
-#if HAVE_SCRYPT_6WAY
-static void scrypt_1024_1_1_256_24way(const uint32_t *input,
- uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
-{
- uint32_t tstate[24 * 8] __attribute__((aligned(128)));
- uint32_t ostate[24 * 8] __attribute__((aligned(128)));
- uint32_t W[24 * 32] __attribute__((aligned(128)));
- uint32_t X[24 * 32] __attribute__((aligned(128)));
- uint32_t *V;
- int i, j, k;
-
- V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
- for (j = 0; j < 3; j++)
- for (i = 0; i < 20; i++)
- for (k = 0; k < 8; k++)
- W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i];
- for (j = 0; j < 3; j++)
- for (i = 0; i < 8; i++)
- for (k = 0; k < 8; k++)
- tstate[8 * 8 * j + 8 * i + k] = midstate[i];
- HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0);
- HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64);
- HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128);
- PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0);
- PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256);
- PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 32; i++)
- for (k = 0; k < 8; k++)
- X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
- scrypt_core_6way(X + 0 * 32, V);
- scrypt_core_6way(X + 6 * 32, V);
- scrypt_core_6way(X + 12 * 32, V);
- scrypt_core_6way(X + 18 * 32, V);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 32; i++)
- for (k = 0; k < 8; k++)
- W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i];
- PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0);
- PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256);
- PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512);
- for (j = 0; j < 3; j++)
- for (i = 0; i < 8; i++)
- for (k = 0; k < 8; k++)
- output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k];
-}
-#endif /* HAVE_SCRYPT_6WAY */
-
-int scanhash_scrypt(int thr_id, uint32_t *pdata,
- unsigned char *scratchbuf, const uint32_t *ptarget,
- uint32_t max_nonce, unsigned long *hashes_done)
-{
- uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
- uint32_t midstate[8];
- uint32_t n = pdata[19] - 1;
- const uint32_t Htarg = ptarget[7];
- uint32_t throughput = scrypt_best_throughput();
- uint32_t i;
-
-#if HAVE_SHA256_4WAY
- if (sha256_use_4way())
- throughput *= 4;
-#endif
-
- for (i = 0; i < throughput; i++)
- memcpy(data + i * 20, pdata, 80);
-
- sha256_init(midstate);
- sha256_transform(midstate, data, 0);
-
- do {
- for (i = 0; i < throughput; i++)
- data[i * 20 + 19] = ++n;
-
-#if HAVE_SHA256_4WAY
- if (throughput == 4)
- scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf);
- else
-#endif
-#if HAVE_SCRYPT_3WAY && HAVE_SHA256_4WAY
- if (throughput == 12)
- scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
- else
-#endif
-#if HAVE_SCRYPT_6WAY
- if (throughput == 24)
- scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
- else
-#endif
-#if HAVE_SCRYPT_3WAY
- if (throughput == 3)
- scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
- else
-#endif
- scrypt_1024_1_1_256(data, hash, midstate, scratchbuf);
-
- for (i = 0; i < throughput; i++) {
- if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {
- *hashes_done = n - pdata[19] + 1;
- pdata[19] = data[i * 20 + 19];
- return 1;
- }
- }
- } while (n < max_nonce && !work_restart[thr_id].restart);
-
- *hashes_done = n - pdata[19] + 1;
- pdata[19] = n;
- return 0;
-}
diff --git a/scrypt.cpp b/scrypt.cpp
new file mode 100644
index 0000000000..f7a3422c32
--- /dev/null
+++ b/scrypt.cpp
@@ -0,0 +1,1097 @@
+/*
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#ifdef WIN32
+#include
+using namespace Concurrency;
+#else
+#include
+#endif
+
+#include "miner.h"
+#include "scrypt/salsa_kernel.h"
+#include "scrypt/sha256.h"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+// A thin wrapper around the builtin __m128i type
+class uint32x4_t
+{
+public:
+#if WIN32
+ void * operator new(size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); }
+ void operator delete(void *p) { _aligned_free(p); }
+ void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); }
+ void operator delete[](void *p) { _aligned_free(p); }
+#else
+ void * operator new(size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
+ void operator delete(void *p) { free(p); }
+ void * operator new[](size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
+ void operator delete[](void *p) { free(p); }
+#endif
+ uint32x4_t() { };
+ uint32x4_t(const __m128i init) { val = init; }
+ uint32x4_t(const uint32_t init) { val = _mm_set1_epi32((int)init); }
+ uint32x4_t(const uint32_t a, const uint32_t b, const uint32_t c, const uint32_t d) { val = _mm_setr_epi32((int)a,(int)b,(int)c,(int)d); }
+ inline operator const __m128i() const { return val; }
+ inline const uint32x4_t operator+(const uint32x4_t &other) const { return _mm_add_epi32(val, other); }
+ inline const uint32x4_t operator+(const uint32_t other) const { return _mm_add_epi32(val, _mm_set1_epi32((int)other)); }
+ inline uint32x4_t& operator+=(const uint32x4_t other) { val = _mm_add_epi32(val, other); return *this; }
+ inline uint32x4_t& operator+=(const uint32_t other) { val = _mm_add_epi32(val, _mm_set1_epi32((int)other)); return *this; }
+ inline const uint32x4_t operator&(const uint32_t other) const { return _mm_and_si128(val, _mm_set1_epi32((int)other)); }
+ inline const uint32x4_t operator&(const uint32x4_t &other) const { return _mm_and_si128(val, other); }
+ inline const uint32x4_t operator|(const uint32x4_t &other) const { return _mm_or_si128(val, other); }
+ inline const uint32x4_t operator^(const uint32x4_t &other) const { return _mm_xor_si128(val, other); }
+ inline const uint32x4_t operator<<(const int num) const { return _mm_slli_epi32(val, num); }
+ inline const uint32x4_t operator>>(const int num) const { return _mm_srli_epi32(val, num); }
+ inline const uint32_t operator[](const int num) const { return ((uint32_t*)&val)[num]; }
+ protected:
+ __m128i val;
+};
+
+// non-member overload
+inline const uint32x4_t operator+(const uint32_t left, const uint32x4_t &right) { return _mm_add_epi32(_mm_set1_epi32((int)left), right); }
+
+
+//
+// Code taken from sha2.cpp and vectorized, with minimal changes where required
+// Not all subroutines are actually used.
+//
+
+#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
+ | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+static __inline uint32x4_t swab32x4(const uint32x4_t &v)
+{
+ return bswap_32x4(v);
+}
+
+static const uint32_t sha256_h[8] = {
+ 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+};
+
+static const uint32_t sha256_k[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+void sha256_initx4(uint32x4_t *statex4)
+{
+ for (int i=0; i<8; ++i)
+ statex4[i] = sha256_h[i];
+}
+
+/* Elementary functions used by SHA256 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
+#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
+#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))
+
+/* SHA256 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+ do { \
+ t0 = h + S1(e) + Ch(e, f, g) + k; \
+ t1 = S0(a) + Maj(a, b, c); \
+ d += t0; \
+ h = t0 + t1; \
+ } while (0)
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i) \
+ RND(S[(64 - i) % 8], S[(65 - i) % 8], \
+ S[(66 - i) % 8], S[(67 - i) % 8], \
+ S[(68 - i) % 8], S[(69 - i) % 8], \
+ S[(70 - i) % 8], S[(71 - i) % 8], \
+ W[i] + sha256_k[i])
+
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+void sha256_transformx4(uint32x4_t *state, const uint32x4_t *block, int swap)
+{
+ uint32x4_t W[64];
+ uint32x4_t S[8];
+ uint32x4_t t0, t1;
+ int i;
+
+ /* 1. Prepare message schedule W. */
+ if (swap) {
+ for (i = 0; i < 16; i++)
+ W[i] = swab32x4(block[i]);
+ } else
+ memcpy(W, block, 4*64);
+ for (i = 16; i < 64; i += 2) {
+ W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+ W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
+ }
+
+ /* 2. Initialize working variables. */
+ memcpy(S, state, 4*32);
+
+ /* 3. Mix. */
+ RNDr(S, W, 0);
+ RNDr(S, W, 1);
+ RNDr(S, W, 2);
+ RNDr(S, W, 3);
+ RNDr(S, W, 4);
+ RNDr(S, W, 5);
+ RNDr(S, W, 6);
+ RNDr(S, W, 7);
+ RNDr(S, W, 8);
+ RNDr(S, W, 9);
+ RNDr(S, W, 10);
+ RNDr(S, W, 11);
+ RNDr(S, W, 12);
+ RNDr(S, W, 13);
+ RNDr(S, W, 14);
+ RNDr(S, W, 15);
+ RNDr(S, W, 16);
+ RNDr(S, W, 17);
+ RNDr(S, W, 18);
+ RNDr(S, W, 19);
+ RNDr(S, W, 20);
+ RNDr(S, W, 21);
+ RNDr(S, W, 22);
+ RNDr(S, W, 23);
+ RNDr(S, W, 24);
+ RNDr(S, W, 25);
+ RNDr(S, W, 26);
+ RNDr(S, W, 27);
+ RNDr(S, W, 28);
+ RNDr(S, W, 29);
+ RNDr(S, W, 30);
+ RNDr(S, W, 31);
+ RNDr(S, W, 32);
+ RNDr(S, W, 33);
+ RNDr(S, W, 34);
+ RNDr(S, W, 35);
+ RNDr(S, W, 36);
+ RNDr(S, W, 37);
+ RNDr(S, W, 38);
+ RNDr(S, W, 39);
+ RNDr(S, W, 40);
+ RNDr(S, W, 41);
+ RNDr(S, W, 42);
+ RNDr(S, W, 43);
+ RNDr(S, W, 44);
+ RNDr(S, W, 45);
+ RNDr(S, W, 46);
+ RNDr(S, W, 47);
+ RNDr(S, W, 48);
+ RNDr(S, W, 49);
+ RNDr(S, W, 50);
+ RNDr(S, W, 51);
+ RNDr(S, W, 52);
+ RNDr(S, W, 53);
+ RNDr(S, W, 54);
+ RNDr(S, W, 55);
+ RNDr(S, W, 56);
+ RNDr(S, W, 57);
+ RNDr(S, W, 58);
+ RNDr(S, W, 59);
+ RNDr(S, W, 60);
+ RNDr(S, W, 61);
+ RNDr(S, W, 62);
+ RNDr(S, W, 63);
+
+ /* 4. Mix local working variables into global state */
+ for (i = 0; i < 8; i++)
+ state[i] += S[i];
+}
+
+static const uint32_t sha256d_hash1[16] = {
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x80000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000100
+};
+
+static void sha256dx4(uint32x4_t *hash, uint32x4_t *data)
+{
+ uint32x4_t S[16];
+
+ sha256_initx4(S);
+ sha256_transformx4(S, data, 0);
+ sha256_transformx4(S, data + 16, 0);
+ for (int i=8; i<16; ++i)
+ S[i] = sha256d_hash1[i];
+ sha256_initx4(hash);
+ sha256_transformx4(hash, S, 0);
+}
+
+static inline void sha256d_preextendx4(uint32x4_t *W)
+{
+ W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
+ W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
+ W[18] = s1(W[16]) + W[11] + W[ 2];
+ W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
+ W[20] = W[13] + s0(W[ 5]) + W[ 4];
+ W[21] = W[14] + s0(W[ 6]) + W[ 5];
+ W[22] = W[15] + s0(W[ 7]) + W[ 6];
+ W[23] = W[16] + s0(W[ 8]) + W[ 7];
+ W[24] = W[17] + s0(W[ 9]) + W[ 8];
+ W[25] = s0(W[10]) + W[ 9];
+ W[26] = s0(W[11]) + W[10];
+ W[27] = s0(W[12]) + W[11];
+ W[28] = s0(W[13]) + W[12];
+ W[29] = s0(W[14]) + W[13];
+ W[30] = s0(W[15]) + W[14];
+ W[31] = s0(W[16]) + W[15];
+}
+
+static inline void sha256d_prehashx4(uint32x4_t *S, const uint32x4_t *W)
+{
+ uint32x4_t t0, t1;
+ RNDr(S, W, 0);
+ RNDr(S, W, 1);
+ RNDr(S, W, 2);
+}
+
+static inline void sha256d_msx4(uint32x4_t *hash, uint32x4_t *W,
+ const uint32_t *midstate, const uint32_t *prehash)
+{
+ uint32x4_t S[64];
+ uint32x4_t t0, t1;
+ int i;
+
+ S[18] = W[18];
+ S[19] = W[19];
+ S[20] = W[20];
+ S[22] = W[22];
+ S[23] = W[23];
+ S[24] = W[24];
+ S[30] = W[30];
+ S[31] = W[31];
+
+ W[18] += s0(W[3]);
+ W[19] += W[3];
+ W[20] += s1(W[18]);
+ W[21] = s1(W[19]);
+ W[22] += s1(W[20]);
+ W[23] += s1(W[21]);
+ W[24] += s1(W[22]);
+ W[25] = s1(W[23]) + W[18];
+ W[26] = s1(W[24]) + W[19];
+ W[27] = s1(W[25]) + W[20];
+ W[28] = s1(W[26]) + W[21];
+ W[29] = s1(W[27]) + W[22];
+ W[30] += s1(W[28]) + W[23];
+ W[31] += s1(W[29]) + W[24];
+ for (i = 32; i < 64; i += 2) {
+ W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+ W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
+ }
+
+ for (i=0; i<8; ++i)
+ S[i] = prehash[i];
+
+ RNDr(S, W, 3);
+ RNDr(S, W, 4);
+ RNDr(S, W, 5);
+ RNDr(S, W, 6);
+ RNDr(S, W, 7);
+ RNDr(S, W, 8);
+ RNDr(S, W, 9);
+ RNDr(S, W, 10);
+ RNDr(S, W, 11);
+ RNDr(S, W, 12);
+ RNDr(S, W, 13);
+ RNDr(S, W, 14);
+ RNDr(S, W, 15);
+ RNDr(S, W, 16);
+ RNDr(S, W, 17);
+ RNDr(S, W, 18);
+ RNDr(S, W, 19);
+ RNDr(S, W, 20);
+ RNDr(S, W, 21);
+ RNDr(S, W, 22);
+ RNDr(S, W, 23);
+ RNDr(S, W, 24);
+ RNDr(S, W, 25);
+ RNDr(S, W, 26);
+ RNDr(S, W, 27);
+ RNDr(S, W, 28);
+ RNDr(S, W, 29);
+ RNDr(S, W, 30);
+ RNDr(S, W, 31);
+ RNDr(S, W, 32);
+ RNDr(S, W, 33);
+ RNDr(S, W, 34);
+ RNDr(S, W, 35);
+ RNDr(S, W, 36);
+ RNDr(S, W, 37);
+ RNDr(S, W, 38);
+ RNDr(S, W, 39);
+ RNDr(S, W, 40);
+ RNDr(S, W, 41);
+ RNDr(S, W, 42);
+ RNDr(S, W, 43);
+ RNDr(S, W, 44);
+ RNDr(S, W, 45);
+ RNDr(S, W, 46);
+ RNDr(S, W, 47);
+ RNDr(S, W, 48);
+ RNDr(S, W, 49);
+ RNDr(S, W, 50);
+ RNDr(S, W, 51);
+ RNDr(S, W, 52);
+ RNDr(S, W, 53);
+ RNDr(S, W, 54);
+ RNDr(S, W, 55);
+ RNDr(S, W, 56);
+ RNDr(S, W, 57);
+ RNDr(S, W, 58);
+ RNDr(S, W, 59);
+ RNDr(S, W, 60);
+ RNDr(S, W, 61);
+ RNDr(S, W, 62);
+ RNDr(S, W, 63);
+
+ for (i = 0; i < 8; i++)
+ S[i] += midstate[i];
+
+ W[18] = S[18];
+ W[19] = S[19];
+ W[20] = S[20];
+ W[22] = S[22];
+ W[23] = S[23];
+ W[24] = S[24];
+ W[30] = S[30];
+ W[31] = S[31];
+
+ for (i=8; i<16; ++i)
+ S[i] = sha256d_hash1[i];
+ S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0];
+ S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1];
+ S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2];
+ S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3];
+ S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4];
+ S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5];
+ S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6];
+ S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7];
+ S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8];
+ S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9];
+ S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10];
+ S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11];
+ S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12];
+ S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13];
+ S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14];
+ S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15];
+ for (i = 32; i < 60; i += 2) {
+ S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
+ S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15];
+ }
+ S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44];
+
+ sha256_initx4(hash);
+
+ RNDr(hash, S, 0);
+ RNDr(hash, S, 1);
+ RNDr(hash, S, 2);
+ RNDr(hash, S, 3);
+ RNDr(hash, S, 4);
+ RNDr(hash, S, 5);
+ RNDr(hash, S, 6);
+ RNDr(hash, S, 7);
+ RNDr(hash, S, 8);
+ RNDr(hash, S, 9);
+ RNDr(hash, S, 10);
+ RNDr(hash, S, 11);
+ RNDr(hash, S, 12);
+ RNDr(hash, S, 13);
+ RNDr(hash, S, 14);
+ RNDr(hash, S, 15);
+ RNDr(hash, S, 16);
+ RNDr(hash, S, 17);
+ RNDr(hash, S, 18);
+ RNDr(hash, S, 19);
+ RNDr(hash, S, 20);
+ RNDr(hash, S, 21);
+ RNDr(hash, S, 22);
+ RNDr(hash, S, 23);
+ RNDr(hash, S, 24);
+ RNDr(hash, S, 25);
+ RNDr(hash, S, 26);
+ RNDr(hash, S, 27);
+ RNDr(hash, S, 28);
+ RNDr(hash, S, 29);
+ RNDr(hash, S, 30);
+ RNDr(hash, S, 31);
+ RNDr(hash, S, 32);
+ RNDr(hash, S, 33);
+ RNDr(hash, S, 34);
+ RNDr(hash, S, 35);
+ RNDr(hash, S, 36);
+ RNDr(hash, S, 37);
+ RNDr(hash, S, 38);
+ RNDr(hash, S, 39);
+ RNDr(hash, S, 40);
+ RNDr(hash, S, 41);
+ RNDr(hash, S, 42);
+ RNDr(hash, S, 43);
+ RNDr(hash, S, 44);
+ RNDr(hash, S, 45);
+ RNDr(hash, S, 46);
+ RNDr(hash, S, 47);
+ RNDr(hash, S, 48);
+ RNDr(hash, S, 49);
+ RNDr(hash, S, 50);
+ RNDr(hash, S, 51);
+ RNDr(hash, S, 52);
+ RNDr(hash, S, 53);
+ RNDr(hash, S, 54);
+ RNDr(hash, S, 55);
+ RNDr(hash, S, 56);
+
+ hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5])
+ + S[57] + sha256_k[57];
+ hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4])
+ + S[58] + sha256_k[58];
+ hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3])
+ + S[59] + sha256_k[59];
+ hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2])
+ + S[60] + sha256_k[60]
+ + sha256_h[7];
+}
+
+//
+// Code taken from original scrypt.cpp and vectorized with minimal changes.
+//
+
+static const uint32x4_t keypadx4[12] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
+};
+static const uint32x4_t innerpadx4[11] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
+};
+static const uint32x4_t outerpadx4[8] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
+};
+static const uint32x4_t finalblkx4[16] = {
+ 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
+};
+
+static inline void HMAC_SHA256_80_initx4(const uint32x4_t *key,
+ uint32x4_t *tstate, uint32x4_t *ostate)
+{
+ uint32x4_t ihash[8];
+ uint32x4_t pad[16];
+ int i;
+
+ /* tstate is assumed to contain the midstate of key */
+ memcpy(pad, key + 16, 4*16);
+ memcpy(pad + 4, keypadx4, 4*48);
+ sha256_transformx4(tstate, pad, 0);
+ memcpy(ihash, tstate, 4*32);
+
+ sha256_initx4(ostate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x5c5c5c5c;
+ for (; i < 16; i++)
+ pad[i] = 0x5c5c5c5c;
+ sha256_transformx4(ostate, pad, 0);
+
+ sha256_initx4(tstate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x36363636;
+ for (; i < 16; i++)
+ pad[i] = 0x36363636;
+ sha256_transformx4(tstate, pad, 0);
+}
+
+static inline void PBKDF2_SHA256_80_128x4(const uint32x4_t *tstate,
+ const uint32x4_t *ostate, const uint32x4_t *salt, uint32x4_t *output)
+{
+ uint32x4_t istate[8], ostate2[8];
+ uint32x4_t ibuf[16], obuf[16];
+ int i, j;
+
+ memcpy(istate, tstate, 4*32);
+ sha256_transformx4(istate, salt, 0);
+
+ memcpy(ibuf, salt + 16, 4*16);
+ memcpy(ibuf + 5, innerpadx4, 4*44);
+ memcpy(obuf + 8, outerpadx4, 4*32);
+
+ for (i = 0; i < 4; i++) {
+ memcpy(obuf, istate, 4*32);
+ ibuf[4] = i + 1;
+ sha256_transformx4(obuf, ibuf, 0);
+
+ memcpy(ostate2, ostate, 4*32);
+ sha256_transformx4(ostate2, obuf, 0);
+ for (j = 0; j < 8; j++)
+ output[8 * i + j] = swab32x4(ostate2[j]);
+ }
+}
+
+static inline void PBKDF2_SHA256_128_32x4(uint32x4_t *tstate, uint32x4_t *ostate,
+ const uint32x4_t *salt, uint32x4_t *output)
+{
+ uint32x4_t buf[16];
+ int i;
+
+ sha256_transformx4(tstate, salt, 1);
+ sha256_transformx4(tstate, salt + 16, 1);
+ sha256_transformx4(tstate, finalblkx4, 0);
+ memcpy(buf, tstate, 4*32);
+ memcpy(buf + 8, outerpadx4, 4*32);
+
+ sha256_transformx4(ostate, buf, 0);
+ for (i = 0; i < 8; i++)
+ output[i] = swab32x4(ostate[i]);
+}
+
+
+//
+// Original scrypt.cpp HMAC SHA256 functions
+//
+
+static const uint32_t keypad[12] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
+};
+static const uint32_t innerpad[11] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
+};
+static const uint32_t outerpad[8] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
+};
+static const uint32_t finalblk[16] = {
+ 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
+};
+
+static inline void HMAC_SHA256_80_init(const uint32_t *key,
+ uint32_t *tstate, uint32_t *ostate)
+{
+ uint32_t ihash[8];
+ uint32_t pad[16];
+ int i;
+
+ /* tstate is assumed to contain the midstate of key */
+ memcpy(pad, key + 16, 16);
+ memcpy(pad + 4, keypad, 48);
+ sha256_transform(tstate, pad, 0);
+ memcpy(ihash, tstate, 32);
+
+ sha256_init(ostate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x5c5c5c5c;
+ for (; i < 16; i++)
+ pad[i] = 0x5c5c5c5c;
+ sha256_transform(ostate, pad, 0);
+
+ sha256_init(tstate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x36363636;
+ for (; i < 16; i++)
+ pad[i] = 0x36363636;
+ sha256_transform(tstate, pad, 0);
+}
+
+static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
+ const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
+{
+ uint32_t istate[8], ostate2[8];
+ uint32_t ibuf[16], obuf[16];
+ int i, j;
+
+ memcpy(istate, tstate, 32);
+ sha256_transform(istate, salt, 0);
+
+ memcpy(ibuf, salt + 16, 16);
+ memcpy(ibuf + 5, innerpad, 44);
+ memcpy(obuf + 8, outerpad, 32);
+
+ for (i = 0; i < 4; i++) {
+ memcpy(obuf, istate, 32);
+ ibuf[4] = i + 1;
+ sha256_transform(obuf, ibuf, 0);
+
+ memcpy(ostate2, ostate, 32);
+ sha256_transform(ostate2, obuf, 0);
+ for (j = 0; j < 8; j++)
+ output[8 * i + j] = swab32(ostate2[j]);
+ }
+}
+
+static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
+ const uint32_t *salt, uint32_t *output)
+{
+ uint32_t buf[16];
+
+ sha256_transform(tstate, salt, 1);
+ sha256_transform(tstate, salt + 16, 1);
+ sha256_transform(tstate, finalblk, 0);
+ memcpy(buf, tstate, 32);
+ memcpy(buf + 8, outerpad, 32);
+
+ sha256_transform(ostate, buf, 0);
+ for (int i = 0; i < 8; i++)
+ output[i] = swab32(ostate[i]);
+}
+
+static int lastFactor = 0;
+//
+// Scrypt proof of work algorithm
+// using SSE2 vectorized HMAC SHA256 on CPU and
+// a salsa core implementation on GPU with CUDA
+//
+
+int scanhash_scrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf,
+ uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end)
+{
+ int result = 0;
+ int throughput = cuda_throughput(thr_id);
+
+ if(throughput == 0)
+ return -1;
+
+ gettimeofday(tv_start, NULL);
+
+ uint32_t n = pdata[19];
+ const uint32_t Htarg = ptarget[7];
+
+ // no default set with --cputest
+ if (opt_nfactor == 0) opt_nfactor = 9;
+ uint32_t N = (1UL<<(opt_nfactor+1));
+ uint32_t *scratch = new uint32_t[N*32]; // scratchbuffer for CPU based validation
+
+ uint32_t nonce[2];
+ uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) };
+ uint32_t* X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) };
+
+ bool sha_on_cpu = (parallel < 2);
+ bool sha_multithreaded = (parallel == 1);
+ uint32x4_t* datax4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 20] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 20] : NULL };
+ uint32x4_t* hashx4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL };
+ uint32x4_t* tstatex4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL };
+ uint32x4_t* ostatex4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL };
+ uint32x4_t* Xx4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 32] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 32] : NULL };
+
+ // log n-factor
+ if (!opt_quiet && lastFactor != opt_nfactor) {
+ applog(LOG_WARNING, "scrypt factor set to %d (%u)", opt_nfactor, N);
+ lastFactor = opt_nfactor;
+ }
+
+ uint32_t _ALIGN(64) midstate[8];
+ sha256_init(midstate);
+ sha256_transform(midstate, pdata, 0);
+
+ if (sha_on_cpu) {
+ for (int i = 0; i < throughput/4; ++i) {
+ for (int j = 0; j < 20; j++) {
+ datax4[0][20*i+j] = uint32x4_t(pdata[j]);
+ datax4[1][20*i+j] = uint32x4_t(pdata[j]);
+ }
+ }
+ }
+ else prepare_sha256(thr_id, pdata, midstate);
+
+ int cur = 1, nxt = 0;
+ int iteration = 0;
+	int num_shares = opt_n_threads > 0 ? 4*opt_n_threads : 1; // opt_n_threads can be 0 with --cputest
+ int share_workload = ((((throughput + num_shares-1) / num_shares) + 3) / 4) * 4;
+
+ do {
+ nonce[nxt] = n;
+
+ if (sha_on_cpu)
+ {
+ for (int i = 0; i < throughput/4; i++) {
+ datax4[nxt][i * 20 + 19] = uint32x4_t(n+0, n+1, n+2, n+3);
+ n += 4;
+ }
+ if (sha_multithreaded)
+ {
+#ifdef WIN32
+ parallel_for (0, num_shares, [&](int share) {
+ for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) {
+ for (int l = 0; l < 8; l++)
+ tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]);
+ HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]);
+ PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]);
+ }
+ } );
+#else
+ #pragma omp parallel for
+ for (int share = 0; share < num_shares; share++) {
+ for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) {
+ for (int l = 0; l < 8; l++)
+ tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]);
+ HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]);
+ PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]);
+ }
+ }
+#endif
+ }
+ else /* sha_multithreaded */
+ {
+ for (int k = 0; k < throughput/4; k++) {
+ for (int l = 0; l < 8; l++)
+ tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]);
+ HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]);
+ PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]);
+ }
+ }
+
+ for (int i = 0; i < throughput/4; i++) {
+ for (int j = 0; j < 32; j++) {
+ uint32x4_t &t = Xx4[nxt][i * 32 + j];
+ X[nxt][(4*i+0)*32+j] = t[0]; X[nxt][(4*i+1)*32+j] = t[1];
+ X[nxt][(4*i+2)*32+j] = t[2]; X[nxt][(4*i+3)*32+j] = t[3];
+ }
+ }
+
+ cuda_scrypt_serialize(thr_id, nxt);
+ cuda_scrypt_HtoD(thr_id, X[nxt], nxt);
+
+ cuda_scrypt_core(thr_id, nxt, N);
+ cuda_scrypt_done(thr_id, nxt);
+
+ cuda_scrypt_DtoH(thr_id, X[nxt], nxt, false);
+ cuda_scrypt_flush(thr_id, nxt);
+
+ if(!cuda_scrypt_sync(thr_id, cur))
+ {
+ result = -1;
+ break;
+ }
+
+ for (int i = 0; i < throughput/4; i++) {
+ for (int j = 0; j < 32; j++) {
+ Xx4[cur][i * 32 + j] = uint32x4_t(
+ X[cur][(4*i+0)*32+j], X[cur][(4*i+1)*32+j],
+ X[cur][(4*i+2)*32+j], X[cur][(4*i+3)*32+j]
+ );
+ }
+ }
+
+ if (sha_multithreaded)
+ {
+#ifdef WIN32
+ parallel_for (0, num_shares, [&](int share) {
+ for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) {
+ PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]);
+ }
+ } );
+#else
+ #pragma omp parallel for
+ for (int share = 0; share < num_shares; share++) {
+ for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) {
+ PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]);
+ }
+ }
+#endif
+ } else {
+
+ for (int k = 0; k < throughput/4; k++) {
+ PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]);
+ }
+ }
+
+ for (int i = 0; i < throughput/4; i++) {
+ for (int j = 0; j < 8; j++) {
+ uint32x4_t &t = hashx4[cur][i * 8 + j];
+ hash[cur][(4*i+0)*8+j] = t[0]; hash[cur][(4*i+1)*8+j] = t[1];
+ hash[cur][(4*i+2)*8+j] = t[2]; hash[cur][(4*i+3)*8+j] = t[3];
+ }
+ }
+ }
+ else /* sha_on_cpu */
+ {
+ n += throughput;
+
+ cuda_scrypt_serialize(thr_id, nxt);
+ pre_sha256(thr_id, nxt, nonce[nxt], throughput);
+
+ cuda_scrypt_core(thr_id, nxt, N);
+ cuda_scrypt_flush(thr_id, nxt); // required here ?
+
+ post_sha256(thr_id, nxt, throughput);
+ cuda_scrypt_done(thr_id, nxt);
+
+ cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
+ cuda_scrypt_flush(thr_id, nxt); // required here ?
+
+ if (!cuda_scrypt_sync(thr_id, cur)) {
+ printf("error\n");
+ result = -1;
+ break;
+ }
+ }
+
+ if (iteration > 0 || opt_n_threads == 0)
+ {
+ for (int i = 0; i < throughput; i++)
+ {
+ if (hash[cur][i * 8 + 7] <= Htarg && fulltest(hash[cur] + i * 8, ptarget))
+ {
+ // CPU based validation to rule out GPU errors (scalar CPU code)
+ uint32_t _ALIGN(64) inp[32], ref[32], tstate[8], ostate[8], refhash[8], ldata[20];
+
+ memcpy(ldata, pdata, 80); ldata[19] = nonce[cur] + i;
+ memcpy(tstate, midstate, 32);
+ HMAC_SHA256_80_init(ldata, tstate, ostate);
+ PBKDF2_SHA256_80_128(tstate, ostate, ldata, inp);
+ computeGold(inp, ref, (uchar*)scratch);
+ bool good = true;
+
+ if (sha_on_cpu) {
+ if (memcmp(&X[cur][i * 32], ref, 32*sizeof(uint32_t)) != 0) good = false;
+ } else {
+ PBKDF2_SHA256_128_32(tstate, ostate, ref, refhash);
+ if (memcmp(&hash[cur][i * 8], refhash, 32) != 0) good = false;
+ }
+
+ if (!good)
+ applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur);
+ else {
+ *hashes_done = n - pdata[19];
+ pdata[19] = nonce[cur] + i;
+ result = 1;
+ goto byebye;
+ }
+ }
+ }
+ }
+
+ cur = (cur+1)&1;
+ nxt = (nxt+1)&1;
+ ++iteration;
+
+ //printf("n=%d, thr=%d, max=%d, rest=%d\n", n, throughput, max_nonce, work_restart[thr_id].restart);
+ } while (n <= max_nonce && !work_restart[thr_id].restart);
+
+ *hashes_done = n - pdata[19];
+ pdata[19] = n;
+byebye:
+ delete[] datax4[0]; delete[] datax4[1]; delete[] hashx4[0]; delete[] hashx4[1];
+ delete[] tstatex4[0]; delete[] tstatex4[1]; delete[] ostatex4[0]; delete[] ostatex4[1];
+ delete[] Xx4[0]; delete[] Xx4[1];
+ delete [] scratch;
+ gettimeofday(tv_end, NULL);
+ return result;
+}
+
+#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
+
+static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
+{
+ uint32_t x0 = (B[ 0] ^= C[ 0]), x1 = (B[ 1] ^= C[ 1]), x2 = (B[ 2] ^= C[ 2]), x3 = (B[ 3] ^= C[ 3]);
+ uint32_t x4 = (B[ 4] ^= C[ 4]), x5 = (B[ 5] ^= C[ 5]), x6 = (B[ 6] ^= C[ 6]), x7 = (B[ 7] ^= C[ 7]);
+ uint32_t x8 = (B[ 8] ^= C[ 8]), x9 = (B[ 9] ^= C[ 9]), xa = (B[10] ^= C[10]), xb = (B[11] ^= C[11]);
+ uint32_t xc = (B[12] ^= C[12]), xd = (B[13] ^= C[13]), xe = (B[14] ^= C[14]), xf = (B[15] ^= C[15]);
+
+ /* Operate on columns. */
+ x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7);
+ x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9);
+ xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13);
+ x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18);
+
+ /* Operate on rows. */
+ x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7);
+ x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9);
+ x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13);
+ x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18);
+
+ /* Operate on columns. */
+ x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7);
+ x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9);
+ xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13);
+ x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18);
+
+ /* Operate on rows. */
+ x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7);
+ x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9);
+ x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13);
+ x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18);
+
+ /* Operate on columns. */
+ x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7);
+ x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9);
+ xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13);
+ x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18);
+
+ /* Operate on rows. */
+ x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7);
+ x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9);
+ x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13);
+ x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18);
+
+ /* Operate on columns. */
+ x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7);
+ x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9);
+ xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13);
+ x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18);
+
+ /* Operate on rows. */
+ x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7);
+ x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9);
+ x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13);
+ x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18);
+
+ B[ 0] += x0; B[ 1] += x1; B[ 2] += x2; B[ 3] += x3; B[ 4] += x4; B[ 5] += x5; B[ 6] += x6; B[ 7] += x7;
+ B[ 8] += x8; B[ 9] += x9; B[10] += xa; B[11] += xb; B[12] += xc; B[13] += xd; B[14] += xe; B[15] += xf;
+}
+
+/**
+ * @param X input/output
+ * @param V scratch buffer
+ * @param N factor
+ */
+static void scrypt_core(uint32_t *X, uint32_t *V, int N)
+{
+ for (int i = 0; i < N; i++) {
+ memcpy(&V[i * 32], X, 128);
+ xor_salsa8(&X[0], &X[16]);
+ xor_salsa8(&X[16], &X[0]);
+ }
+ for (int i = 0; i < N; i++) {
+ uint32_t j = 32 * (X[16] & (N - 1));
+ for (uint8_t k = 0; k < 32; k++)
+ X[k] ^= V[j + k];
+ xor_salsa8(&X[0], &X[16]);
+ xor_salsa8(&X[16], &X[0]);
+ }
+}
+
+/**
+ * Compute reference data set on the CPU
+ * @param input input data as provided to device
+ * @param reference reference data, computed but preallocated
+ * @param scratchpad scrypt scratchpad
+ **/
+void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad)
+{
+ uint32_t X[32] = { 0 };
+ uint32_t *V = (uint32_t*) scratchpad;
+ int N = (1<<(opt_nfactor+1)); // default 9 = 1024
+
+ for (int k = 0; k < 32; k++)
+ X[k] = input[k];
+
+ scrypt_core(X, V, N);
+
+ for (int k = 0; k < 32; k++)
+ reference[k] = X[k];
+}
+
+static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
+ uint32_t *midstate, unsigned char *scratchpad, int N)
+{
+ uint32_t tstate[8], ostate[8];
+ uint32_t X[32] = { 0 };
+ uint32_t *V = (uint32_t *) scratchpad;
+
+ memcpy(tstate, midstate, 32);
+ HMAC_SHA256_80_init(input, tstate, ostate);
+ PBKDF2_SHA256_80_128(tstate, ostate, input, X);
+
+ scrypt_core(X, V, N);
+
+ PBKDF2_SHA256_128_32(tstate, ostate, X, output);
+}
+
+/* cputest */
+void scrypthash(void* output, const void* input)
+{
+ uint32_t _ALIGN(64) X[32], ref[32] = { 0 }, tstate[8], ostate[8], midstate[8];
+ uint32_t _ALIGN(64) data[20];
+ uchar *scratchbuf = (uchar *) calloc(4 * 128 + 63, 1024);
+
+ // no default set with --cputest
+ if (opt_nfactor == 0) opt_nfactor = 9;
+
+ memcpy(data, input, 80);
+
+ sha256_init(midstate);
+ sha256_transform(midstate, data, 0); /* ok */
+
+ memcpy(tstate, midstate, 32);
+ HMAC_SHA256_80_init(data, tstate, ostate);
+ PBKDF2_SHA256_80_128(tstate, ostate, data, X); /* ok */
+
+ if (scratchbuf) {
+ computeGold(X, ref, scratchbuf);
+ PBKDF2_SHA256_128_32(tstate, ostate, ref, (uint32_t*) output);
+ } else {
+ memset(output, 0, 32);
+ }
+
+ free(scratchbuf);
+}
+
+#define SCRYPT_MAX_WAYS 1
+/* cputest */
+void scrypthash2(void* output, const void* input)
+{
+ uint32_t midstate[8] = { 0 };
+ uint32_t data[SCRYPT_MAX_WAYS * 20] = { 0 };
+ uint32_t hash[SCRYPT_MAX_WAYS * 8] = { 0 };
+ uint32_t N = 1U << ((opt_nfactor ? opt_nfactor : 9) + 1); // default 1024
+
+ uchar* scratch = (uchar*) calloc(4 * 128 + 63, N); // scrypt_buffer_alloc(N);
+
+ memcpy(data, input, 80);
+
+ sha256_init(midstate);
+ sha256_transform(midstate, data, 0);
+
+ scrypt_1024_1_1_256(data, hash, midstate, scratch, N);
+
+ memcpy(output, hash, 32);
+
+ free(scratch);
+}
diff --git a/scrypt/blake.cu b/scrypt/blake.cu
new file mode 100644
index 0000000000..bcaa965806
--- /dev/null
+++ b/scrypt/blake.cu
@@ -0,0 +1,454 @@
+//
+// =============== BLAKE part on nVidia GPU ======================
+//
+// This is the generic "default" implementation when no architecture
+// specific implementation is available in the kernel.
+//
+// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64
+//
+// TODO: CUDA porting work remains to be done.
+//
+
+#include