Skip to content

Commit

Permalink
cubehash: improve a bit and cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
tpruvot committed Jul 27, 2017
1 parent 08a3f34 commit 9c05304
Showing 1 changed file with 64 additions and 110 deletions.
174 changes: 64 additions & 110 deletions x11/cuda_x11_cubehash512.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include "cuda_helper.h"

typedef unsigned char BitSequence;
#include <cuda_helper.h>
#include <cuda_vectors.h>

#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
Expand All @@ -18,17 +17,14 @@ typedef unsigned char BitSequence;

__device__ __constant__
static const uint32_t c_IV_512[32] = {
0x2AEA2A61, 0x50F494D4, 0x2D538B8B,
0x4167D83E, 0x3FEE2313, 0xC701CF8C,
0xCC39968E, 0x50AC5695, 0x4D42C787,
0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33,
0xA23911AE, 0xFCD398D9, 0x148FE485,
0x1B017BEF, 0xB6444532, 0x6A536159,
0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456,
0xBC796576, 0x1921C8F7, 0xE7989AF1,
0x7795D246, 0xD43E3B44
0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
};

__device__ __forceinline__
Expand Down Expand Up @@ -149,107 +145,68 @@ static void rrounds(uint32_t x[2][2][2][2][2])
}

__device__ __forceinline__
static void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2])
{
int k;
int l;
int m;
uint32_t *in = block;

#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][0][k][l][m] ^= *in++;
// read 32 bytes input from global mem with uint2 chunks
AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]);
AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]);
AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]);
AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]);
}

__device__ __forceinline__
static void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2])
{
int j;
int k;
int l;
int m;
uint32_t *out = hash;

#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
*out++ = x[0][j][k][l][m];
// used to write final hash to global mem
AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]);
AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]);
AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]);
AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]);
AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]);
AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]);
AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]);
AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]);
}

__device__
void Init(uint32_t x[2][2][2][2][2])
{
int i,j,k,l,m;
#if 0
/* "the first three state words x_00000, x_00001, x_00010" */
/* "are set to the integers h/8, b, r respectively." */
/* "the remaining state words are set to 0." */
#pragma unroll 2
for (i = 0;i < 2;++i)
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[i][j][k][l][m] = 0;
x[0][0][0][0][0] = 512/8;
x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES;
x[0][0][0][1][0] = CUBEHASH_ROUNDS;

/* "the state is then transformed invertibly through 10r identical rounds */
for (i = 0;i < 10;++i) rrounds(x);
#else
const uint32_t *iv = c_IV_512;

#pragma unroll 2
for (i = 0;i < 2;++i)
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[i][j][k][l][m] = *iv++;
#endif
}
#define Init(x) \
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]);

__device__ __forceinline__
static void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data)
{
/* "xor the block into the first b bytes of the state" */
/* "and then transform the state invertibly through r identical rounds" */
block_tox((uint32_t*)data, x);
rrounds(x);
/* "xor the block into the first b bytes of the state" */
block_tox(data, x);
/* "and then transform the state invertibly through r identical rounds" */
rrounds(x);
}

__device__ __forceinline__
static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
{
int i;
/* "the integer 1 is xored into the last state word x_11111" */
x[1][1][1][1][1] ^= 1;

/* "the integer 1 is xored into the last state word x_11111" */
x[1][1][1][1][1] ^= 1;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
for (int i = 0; i < 10; i++) rrounds(x);

/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
for (i = 0;i < 10;++i) rrounds(x);

/* "output the first h/8 bytes of the state" */
hash_fromx((uint32_t*)hashval, x);
/* "output the first h/8 bytes of the state" */
hash_fromx(hashval, x);
}


Expand All @@ -269,20 +226,17 @@ void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_
uint32_t x[2][2][2][2][2];
Init(x);

// erste Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)Hash);

// zweite Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)(Hash+8));
Update32(x, &Hash[0]);
Update32(x, &Hash[8]);

// Padding Block
uint32_t last[8];
last[0] = 0x80;
#pragma unroll 7
for (int i=1; i < 8; i++) last[i] = 0;
Update32(x, (const BitSequence*)last);
Update32(x, last);

Final(x, (BitSequence*)Hash);
Final(x, Hash);
}
}

Expand Down Expand Up @@ -332,12 +286,12 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
// first 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]);
Update32(x, (const BitSequence*)message);
Update32(x, message);

// second 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[8]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]);
Update32(x, (const BitSequence*)message);
Update32(x, message);

// last 16 bytes + Padding
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]);
Expand All @@ -346,9 +300,9 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
message[5] = 0;
message[6] = 0;
message[7] = 0;
Update32(x, (const BitSequence*)message);
Update32(x, message);

BitSequence* output = (BitSequence*) (&g_outhash[(size_t)8 * thread]);
uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]);
Final(x, output);
}
}
Expand Down

0 comments on commit 9c05304

Please sign in to comment.