Skip to content

Commit

Permalink
Some work on data alignment
Browse files Browse the repository at this point in the history
linux: add -march=native (we build it ourselves) and some other flags

+ remove unused vars (seen with -Wall)
  • Loading branch information
tpruvot committed Nov 3, 2014
1 parent 93bb428 commit 5bc969f
Show file tree
Hide file tree
Showing 10 changed files with 65 additions and 29 deletions.
5 changes: 4 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \
x11/s3.cu

if HAVE_WINDOWS
ccminer_SOURCES += compat/winansi.c
endif

ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME

nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
#nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
Expand Down
6 changes: 3 additions & 3 deletions blake32.cu
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ static void blake256mid(uint32_t *output, const uint32_t *input, int8_t rounds =
__host__
void blake256_cpu_setBlock_16(uint32_t *penddata, const uint32_t *midstate, const uint32_t *ptarget)
{
uint32_t data[11];
uint32_t _ALIGN(64) data[11];
memcpy(data, midstate, 32);
data[8] = penddata[0];
data[9] = penddata[1];
Expand All @@ -402,9 +402,9 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
const uint32_t first_nonce = pdata[19];
static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
uint64_t targetHigh = ((uint64_t*)ptarget)[3]; // 0x00000000.0fffffff
uint32_t endiandata[20];
uint32_t _ALIGN(64) endiandata[20];
#if PRECALC64
uint32_t midstate[8];
uint32_t _ALIGN(64) midstate[8];
#else
uint32_t crcsum;
#endif
Expand Down
4 changes: 3 additions & 1 deletion configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,7 @@

#--ptxas-options=\"-v -dlcm=cg\""

CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda
extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16"

CUDA_CFLAGS="-O3 -Xcompiler -Wall" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda

20 changes: 9 additions & 11 deletions cpu-miner.c
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ struct work {
uint32_t scanned_to;
};

static struct work g_work;
static struct work _ALIGN(64) g_work;
static time_t g_work_time;
static pthread_mutex_t g_work_lock;

Expand Down Expand Up @@ -484,11 +484,10 @@ static int share_result(int result, const char *reason)
{
char s[345];
double hashrate;
int i, ret = 0;

hashrate = 0.;
pthread_mutex_lock(&stats_lock);
for (i = 0; i < opt_n_threads; i++)
for (int i = 0; i < opt_n_threads; i++)
hashrate += thr_hashrates[i];
result ? accepted_count++ : rejected_count++;
pthread_mutex_unlock(&stats_lock);
Expand Down Expand Up @@ -651,8 +650,8 @@ static bool get_upstream_work(CURL *curl, struct work *work)
if (opt_protocol && rc) {
timeval_subtract(&diff, &tv_end, &tv_start);
/* show time because curl can be slower against versions/config */
applog(LOG_DEBUG, "got new work in %u µs",
diff.tv_sec * 1000000 + diff.tv_usec);
applog(LOG_DEBUG, "got new work in %.2f ms",
(1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec));
}

json_decref(val);
Expand All @@ -667,7 +666,7 @@ static void workio_cmd_free(struct workio_cmd *wc)

switch (wc->cmd) {
case WC_SUBMIT_WORK:
free(wc->u.work);
aligned_free(wc->u.work);
break;
default: /* do nothing */
break;
Expand All @@ -682,15 +681,15 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
struct work *ret_work;
int failures = 0;

ret_work = (struct work*)calloc(1, sizeof(*ret_work));
ret_work = (struct work*)aligned_calloc(sizeof(*ret_work));
if (!ret_work)
return false;

/* obtain new work from bitcoin via JSON-RPC */
while (!get_upstream_work(curl, ret_work)) {
if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) {
applog(LOG_ERR, "json_rpc_call failed, terminating workio thread");
free(ret_work);
aligned_free(ret_work);
return false;
}

Expand All @@ -702,7 +701,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)

/* send work to requesting thread */
if (!tq_push(wc->thr->q, ret_work))
free(ret_work);
aligned_free(ret_work);

return true;
}
Expand Down Expand Up @@ -822,7 +821,7 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in)
if (!wc)
return false;

wc->u.work = (struct work *)malloc(sizeof(*work_in));
wc->u.work = (struct work *)aligned_calloc(sizeof(*work_in));
if (!wc->u.work)
goto err_out;

Expand Down Expand Up @@ -946,7 +945,6 @@ static void *miner_thread(void *userdata)
struct work work;
uint32_t max_nonce;
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1);
unsigned char *scratchbuf = NULL;
bool work_done = false;
bool extrajob = false;
char s[16];
Expand Down
2 changes: 1 addition & 1 deletion cuda_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>

#if defined(_MSC_VER)
#if defined(__INTELLISENSE__)
/* reduce warnings */
#include <device_functions.h>
#include <device_launch_parameters.h>
Expand Down
9 changes: 2 additions & 7 deletions groestlcoin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,8 @@ void sha256func(unsigned char *hash, const unsigned char *data, int len)

extern "C" void groestlhash(void *state, const void *input)
{
// Tryout GPU-groestl

sph_groestl512_context ctx_groestl[2];
static unsigned char pblank[1];
uint32_t mask = 8;
uint32_t zero = 0;

// CPU-groestl
sph_groestl512_context ctx_groestl[2];

//these uint512 in the c++ source of the client are backed by an array of uint32
uint32_t hashA[16], hashB[16];
Expand Down
5 changes: 2 additions & 3 deletions hashlog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,10 +219,9 @@ extern "C" void hashlog_purge_all(void)
extern "C" void hashlog_dump_job(char* jobid)
{
if (opt_debug) {
int deleted = 0;
uint64_t njobid = hextouint(jobid);
uint64_t keypfx = (njobid << 32);
uint32_t sz = tlastshares.size();
// uint32_t sz = tlastshares.size();
std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
while (i != tlastshares.end()) {
if ((keypfx & i->first) == keypfx) {
Expand All @@ -235,4 +234,4 @@ extern "C" void hashlog_dump_job(char* jobid)
i++;
}
}
}
}
12 changes: 12 additions & 0 deletions miner.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ void *alloca (size_t);
# endif
#endif

/*
 * _ALIGN(x): request x-byte alignment for a variable or array, spelled in
 * whichever syntax the current compiler pass understands:
 *   - CUDA device compilation (__CUDA_ARCH__ defined and > 0): __align__(x)
 *   - Microsoft compiler (_MSC_VER non-zero):                  __declspec(align(x))
 *   - anything else (GCC-style attribute syntax):              __attribute__((aligned(x)))
 */
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0
# define _ALIGN(x) __align__(x)
#elif _MSC_VER
# define _ALIGN(x) __declspec(align(x))
#else
# define _ALIGN(x) __attribute__ ((aligned(x)))
#endif

#ifdef HAVE_SYSLOG_H
#include <syslog.h>
#define LOG_BLUE 0x10 /* unique value */
Expand Down Expand Up @@ -200,6 +208,10 @@ static inline void le16enc(void *pp, uint16_t x)
}
#endif

/*
 * Aligned heap helpers, used for struct work.
 *
 * aligned_calloc() returns a zero-filled buffer of `size` bytes whose address
 * is a multiple of 64. The returned pointer is not the address obtained from
 * the underlying allocator, so it must be released with aligned_free() and
 * never with plain free().
 */
void *aligned_calloc(int size);
void aligned_free(void *ptr);

#if JANSSON_MAJOR_VERSION >= 2
#define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr))
#else
Expand Down
2 changes: 1 addition & 1 deletion sph/haval_helper.c
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _close)(sph_haval_context *sc,
unsigned ub, unsigned n, void *dst)
{
unsigned current,j;
unsigned current;
DSTATE;

#if SPH_64
Expand Down
29 changes: 28 additions & 1 deletion util.c
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ json_t *json_rpc_call(CURL *curl, const char *url,
curl_easy_setopt(curl, CURLOPT_POST, 1);

if (opt_protocol)
applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);
applog(LOG_DEBUG, "JSON protocol request:\n%s", rpc_req);

upload_data.buf = rpc_req;
upload_data.len = strlen(rpc_req);
Expand Down Expand Up @@ -481,6 +481,33 @@ json_t *json_rpc_call(CURL *curl, const char *url,
return NULL;
}

/**
 * Allocate a zero-filled buffer aligned on a 64-byte (cache line) boundary.
 *
 * Unlike malloc, calloc sets the memory to zero; this helper keeps that
 * guarantee on every platform.
 *
 * @param size  number of usable bytes requested; negative values are rejected
 * @return      pointer to `size` zeroed bytes whose address is a multiple of
 *              64, or NULL if size is negative or the allocation failed.
 *              The result must be released with aligned_free(), not free().
 */
void *aligned_calloc(int size)
{
	const size_t ALIGN = 64; // cache line
#ifdef _MSC_VER
	void *res = (size < 0) ? NULL : _aligned_malloc((size_t)size, ALIGN);

	/* _aligned_malloc does not clear memory and may fail */
	if (res)
		memset(res, 0, (size_t)size);
	return res;
#else
	/*
	 * Over-allocate by ALIGN bytes of slack plus one pointer slot. The slot
	 * just below the aligned address stores the raw calloc() pointer so that
	 * aligned_free() can hand it back to free().
	 * The sum is computed in size_t to avoid int overflow.
	 */
	void *mem = (size < 0) ? NULL
	                       : calloc(1, (size_t)size + ALIGN + sizeof(void*));
	void **ptr;

	if (!mem)
		return NULL;

	/* round (mem + ALIGN + sizeof(void*)) down to a multiple of ALIGN;
	 * done on the integer value, since arithmetic on void* is not ISO C */
	ptr = (void**)(((size_t)mem + ALIGN + sizeof(void*)) & ~(ALIGN - 1));
	ptr[-1] = mem;
	return ptr;
#endif
}

/**
 * Release a buffer obtained from aligned_calloc().
 *
 * @param ptr  pointer previously returned by aligned_calloc(), or NULL.
 *             NULL is a no-op, matching free(), so callers that used to call
 *             free() on a possibly-NULL pointer stay safe.
 */
void aligned_free(void *ptr)
{
	if (!ptr)
		return;
#ifdef _MSC_VER
	_aligned_free(ptr);
#else
	/* the raw calloc() pointer is stored in the slot just below the buffer */
	free(((void**)ptr)[-1]);
#endif
}

void cbin2hex(char *out, const char *in, size_t len)
{
if (out) {
Expand Down

0 comments on commit 5bc969f

Please sign in to comment.