-
Notifications
You must be signed in to change notification settings - Fork 2
/
CRC_polynomial_cuda_wrapper.h
85 lines (68 loc) · 2.27 KB
/
CRC_polynomial_cuda_wrapper.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#include <algorithm>
#include <cassert>
#include <functional>
#include <iostream>
#include <memory>
#include <thread>
#include <utility>
#include <vector>
#include <stdint.h>
#include "cuda.cu"
// Standard-library names used unqualified by the definitions below.
// NOTE(review): these using-declarations sit at file scope in a header, so they are
// also visible in every translation unit that includes it.
using std::min;
using std::vector;
using std::thread;
using std::cout;
using std::endl;
using std::ref;
// Include guard for the declarations that follow.
// NOTE(review): the #include lines and using-declarations above are outside the guard.
#ifndef CRC_polynomial_cuda_wrapper
#define CRC_polynomial_cuda_wrapper
// Per-worker work description and result buffer shared between the launching
// wrapper and one host worker thread.
class params {
public:
    // Candidates reported by the worker. Stored as 64-bit values because the worker
    // appends 64-bit candidates (k + j) and the wrapper returns vector<uint64_t>;
    // a narrower element type would silently truncate them.
    std::vector<uint64_t> solutions;
    // Half-open candidate range [start, end) assigned to this worker.
    uint64_t start, end;
    // block_dim: threads per block used for the kernel launch.
    // id: index of the worker, used in its log messages.
    int block_dim, id;
};
// Host worker: walks the candidate range [p.start, p.end) in batches, launches the
// CRC_polynomial_cuda_t2<da,dc> kernel (defined outside this file) for each batch on a
// private stream, copies the per-candidate flags back and appends every flagged
// candidate to p.solutions.
//
// Template parameters:
//   da, dc - forwarded to the kernel; their sum also divides the grid-size cap.
// Parameters:
//   p - in/out. Reads start, end, block_dim and id; appends to solutions.
//   n - number of host workers; divides the grid-size cap, and the worker whose
//       id + 1 equals n logs the launch geometry.
//
// Any failing CUDA call is reported on std::cerr and ends the scan early. Solutions
// from batches that completed before the failure are kept.
template<int da, int dc>
void CRC_polynomial_cuda_t2_wrapper_thread(params& p, const size_t n) {
    static_assert(da + dc > 0, "da + dc must be positive: it divides the grid-size cap");
    assert(p.block_dim > 0);

    // Reports a failed CUDA call and returns whether the call succeeded.
    const auto ok = [&p](cudaError_t status, const char* what) {
        if (status == cudaSuccess) return true;
        std::cerr << "CUDA error in thread " << p.id << " (" << what << "): "
                  << cudaGetErrorString(status) << endl;
        return false;
    };

    // 64-bit arithmetic throughout, so the cap does not depend on the width of long.
    const uint64_t grid_cap = (uint64_t{1} << 31) / n / (da + dc);
    uint64_t grid_dim = min<uint64_t>((p.end - p.start) / p.block_dim + 1, grid_cap);
    if (grid_dim == 0) grid_dim = 1;  // a zero batch size would never advance the loop below
    const uint64_t total_threads = p.block_dim * grid_dim;

    cout << "Started thread: " << p.id << endl;
    if (static_cast<size_t>(p.id + 1) == n)
        cout << "(blockDim, gridDim): (" << p.block_dim << ", " << grid_dim << ")" << endl;

    cudaStream_t stream;
    if (ok(cudaStreamCreate(&stream), "cudaStreamCreate")) {
        bool* r = nullptr;  // device-side flags, one per kernel thread of a batch
        if (ok(cudaMalloc(&r, sizeof(bool) * total_threads), "cudaMalloc")) {
            // Host-side copy of the flags; zero-initialised and released automatically.
            std::unique_ptr<bool[]> rh(new bool[total_threads]());

            for (uint64_t k = p.start; k < p.end; k += total_threads) {
                // Only the first `live` slots map to candidates below p.end. Slots past
                // that are not trusted: they may still hold flags from an earlier batch.
                const uint64_t live = min<uint64_t>(total_threads, p.end - k);

                CRC_polynomial_cuda_t2<da,dc><<<grid_dim, p.block_dim, 0, stream>>>(k, p.end, r);
                if (!ok(cudaGetLastError(), "kernel launch")) break;
                if (!ok(cudaMemcpyAsync(rh.get(), r, sizeof(bool) * live,
                                        cudaMemcpyDeviceToHost, stream),
                        "cudaMemcpyAsync")) break;
                // The copy is asynchronous: wait for it before reading rh on the host.
                if (!ok(cudaStreamSynchronize(stream), "cudaStreamSynchronize")) break;

                for (uint64_t j = 0; j < live; ++j) {
                    if (rh[j]) p.solutions.push_back(k + j);
                }
            }
            ok(cudaFree(r), "cudaFree");
        }
        ok(cudaStreamDestroy(stream), "cudaStreamDestroy");
    }
    cout << "terminated thread: " << p.id << endl;
}
// Splits the candidate space [0, 2^(dc+1)) into one contiguous slice per host worker
// thread, runs CRC_polynomial_cuda_t2_wrapper_thread<da,dc> on each slice concurrently
// and returns all reported solutions, concatenated in worker order.
//
// Template parameters:
//   da, dc - forwarded to the worker; dc also fixes the size of the candidate space.
// Returns:
//   every candidate reported by any worker.
template<int da, int dc>
vector<uint64_t> CRC_polynomial_cuda_t2_wrapper() {
    // 2^(dc+1) must be representable in 64 bits; shifting by 64 or more is undefined.
    static_assert(dc + 1 >= 0 && dc + 1 < 64, "dc + 1 must be a valid shift for a 64-bit value");
    constexpr int blockDim{1<<7}; // threads per block
    constexpr uint64_t space = uint64_t{1} << (dc + 1);

    // hardware_concurrency() may return 0 when the value is not computable; fall back
    // to a single worker so the slice size below never divides by zero.
    const size_t detected = std::thread::hardware_concurrency();
    const size_t n = detected != 0 ? detected : 1;
    cout << "Allocating resources for: " << n << " threads." << endl;

    // p is fully sized before any thread starts, so the references handed to the
    // workers stay valid for their whole lifetime.
    vector<params> p(n);
    vector<thread> threads;
    threads.reserve(n);
    const uint64_t slice = space / n;
    for (size_t i = 0; i < n; ++i) {
        p[i].id = static_cast<int>(i);
        p[i].start = i * slice;
        // The last worker also takes the remainder left over by the integer division.
        p[i].end = (i + 1 == n) ? space : (p[i].start + slice);
        p[i].block_dim = blockDim;
        threads.emplace_back(CRC_polynomial_cuda_t2_wrapper_thread<da,dc>, ref(p[i]), n);
    }
    for (thread& t : threads) t.join();

    size_t found = 0;
    for (const params& param : p) found += param.solutions.size();
    vector<uint64_t> Solutions;
    Solutions.reserve(found);
    for (const params& param : p)
        Solutions.insert(Solutions.end(), param.solutions.begin(), param.solutions.end());
    return Solutions; // plain return: lets the compiler elide or move the result
}
#endif