forked from QINZHAOYU/CudaSteps
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
266 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,237 @@ | ||
#include "../common/error.cuh" | ||
#include "../common/floats.hpp" | ||
#include "../common/clock.cuh" | ||
#include <fstream> | ||
#include <regex> | ||
#include <string> | ||
#include <vector> | ||
|
||
|
||
// Reads whitespace/comma separated "x y" pairs from the file `fstr` into x and y.
void read_data(const std::string &fstr, std::vector<real> &x, std::vector<real> &y);

// Writes one line per point: the point index followed by the non-negative
// entries of its row NL[i*M .. i*M + M - 1].
void write_data(const std::string &fstr, const int *NL, const int N, const int M);

// CPU reference: NN[i] receives the neighbour count of point i and the
// neighbour indices are stored row by row at NL[i*M + k].
void find_neighbor(int *NN, int *NL, const real *x, const real *y,
                   const int N, const int M,
                   const real minDis);

// One thread per point; neighbour k of point i is stored at NL[k*N + i].
__global__ void find_neighbor_gpu (int *NN, int *NL, const real *x, const real *y,
                                   const int N, const int M,
                                   const real minDis);

// One thread per point, pairs (i, j > i) recorded on both sides with atomicAdd;
// neighbour indices are stored row by row at NL[i*M + k].
__global__ void find_neighbor_atomic(int *NN, int *NL, const real *x, const real *y,
                                     const int N, const int M,
                                     const real minDis);
|
||
|
||
/**
 * Builds the neighbour list of the 2D points stored in "xy.txt" three times
 * (CPU reference, atomic GPU kernel, one-thread-per-point GPU kernel) and
 * writes the result of the last pass to "result.txt".
 *
 * Returns 0 on success and 1 when no input point could be read.
 */
int main()
{
    // Print FLOAT_PREC (presumably the precision selected for `real`).
    std::cout << FLOAT_PREC << std::endl;

    const std::string fstr = "xy.txt";
    const std::string fout = "result.txt";
    std::vector<real> x, y;
    read_data(fstr, x, y);

    // With no points there is nothing to compute, and a launch with zero
    // blocks would be an invalid configuration.
    if (x.empty())
    {
        std::cerr << "no points read from " << fstr << ".\n";
        return 1;
    }

    const int N = static_cast<int>(x.size());
    const int M = 10;               // capacity of each point's neighbour list
    const real minDis = 1.9 * 1.9;  // squared cutoff, compared against squared distances

    const size_t nn_bytes = static_cast<size_t>(N) * sizeof(int);
    const size_t nl_bytes = static_cast<size_t>(N) * M * sizeof(int);
    const size_t xy_bytes = static_cast<size_t>(N) * sizeof(real);

    // Host buffers. -1 marks an unused neighbour slot; write_data skips
    // negative entries.
    std::vector<int> NN(N, 0);
    std::vector<int> NL(static_cast<size_t>(N) * M, -1);

    int *d_NN, *d_NL;
    CHECK(cudaMalloc(&d_NN, nn_bytes));
    CHECK(cudaMalloc(&d_NL, nl_bytes));
    real *d_x, *d_y;
    CHECK(cudaMalloc(&d_x, xy_bytes));
    CHECK(cudaMalloc(&d_y, xy_bytes));

    // Timing macros below are used exactly as in the original (presumably
    // provided by ../common/clock.cuh).
    cppClockStart

    find_neighbor(NN.data(), NL.data(), x.data(), y.data(), N, M, minDis);
    // write_data(fout, NL.data(), N, M);
    cppClockCurr

    cudaClockStart

    CHECK(cudaMemcpy(d_x, x.data(), xy_bytes, cudaMemcpyDefault));
    CHECK(cudaMemcpy(d_y, y.data(), xy_bytes, cudaMemcpyDefault));

    // cudaMalloc does not clear memory. The atomic kernel needs its counters
    // at zero before any thread runs, and every slot must start at -1
    // (a byte pattern of 0xFF yields the int value -1).
    CHECK(cudaMemset(d_NN, 0, nn_bytes));
    CHECK(cudaMemset(d_NL, 0xFF, nl_bytes));

    const int block_size = 128;
    const int grid_size = (N + block_size - 1) / block_size;
    find_neighbor_atomic<<<grid_size, block_size>>>(d_NN, d_NL, d_x, d_y, N, M, minDis);
    CHECK(cudaGetLastError());  // launch errors are not reported by the launch itself

    CHECK(cudaMemcpy(NN.data(), d_NN, nn_bytes, cudaMemcpyDefault));
    CHECK(cudaMemcpy(NL.data(), d_NL, nl_bytes, cudaMemcpyDefault));
    // write_data(fout, NL.data(), N, M);

    cudaClockCurr

    CHECK(cudaMemcpy(d_x, x.data(), xy_bytes, cudaMemcpyDefault));
    CHECK(cudaMemcpy(d_y, y.data(), xy_bytes, cudaMemcpyDefault));

    // Discard the previous kernel's row-major output before the next kernel
    // fills the buffer with a different layout.
    CHECK(cudaMemset(d_NL, 0xFF, nl_bytes));
    find_neighbor_gpu<<<grid_size, block_size>>>(d_NN, d_NL, d_x, d_y, N, M, minDis);
    CHECK(cudaGetLastError());

    CHECK(cudaMemcpy(NN.data(), d_NN, nn_bytes, cudaMemcpyDefault));
    CHECK(cudaMemcpy(NL.data(), d_NL, nl_bytes, cudaMemcpyDefault));

    cudaClockCurr

    // find_neighbor_gpu stores neighbour k of point i at NL[k*N + i], whereas
    // write_data reads NL[i*M + k]; transpose before writing.
    std::vector<int> NL_rows(NL.size(), -1);
    for (int i = 0; i < N; ++i)
    {
        for (int k = 0; k < M; ++k)
        {
            NL_rows[static_cast<size_t>(i) * M + k] = NL[static_cast<size_t>(k) * N + i];
        }
    }
    write_data(fout, NL_rows.data(), N, M);

    CHECK(cudaFree(d_NN));
    CHECK(cudaFree(d_NL));
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));

    return 0;
}

/**
 * CPU reference implementation of the neighbour search.
 *
 * Two points are neighbours when their squared distance is below minDis.
 *
 * @param NN      out, N entries: number of neighbours found for each point.
 *                The value may exceed M; only min(NN[i], M) slots are valid.
 * @param NL      in/out, N*M entries: neighbour k of point i is written to
 *                NL[i*M + k]. Slots that are not written keep their old value.
 * @param x, y    point coordinates, N entries each.
 * @param N       number of points.
 * @param M       capacity of each point's neighbour list.
 * @param minDis  squared cutoff distance.
 */
void find_neighbor(int *NN, int *NL, const real *x, const real *y,
                   const int N, const int M,
                   const real minDis)
{
    for (int i = 0; i < N; ++i)
    {
        NN[i] = 0;
    }

    for (int i = 0; i < N; ++i)
    {
        const real xi = x[i];
        const real yi = y[i];

        for (int j = i + 1; j < N; ++j)
        {
            const real dx = x[j] - xi;
            const real dy = y[j] - yi;
            const real dis = dx * dx + dy * dy;
            if (dis < minDis)  // squared distances are compared, no sqrt needed
            {
                // The relation is symmetric, so each pair is visited once
                // (j > i) and recorded on both sides. Entries beyond the
                // capacity M are counted but not stored.
                if (NN[i] < M)
                {
                    NL[i * M + NN[i]] = j;
                }
                NN[i]++;
                if (NN[j] < M)
                {
                    NL[j * M + NN[j]] = i;
                }
                NN[j]++;
            }
        }
    }
}

/**
 * GPU neighbour search, one thread per point, no atomics.
 *
 * Thread i scans all N points and keeps its own counter, so no other thread
 * writes to the entries owned by point i.
 *
 * Launch layout: 1D grid and 1D blocks with at least N threads in total.
 *
 * @param NN      out, N entries: number of neighbours found for each point.
 *                The value may exceed M; only min(NN[i], M) slots are valid.
 * @param NL      in/out, N*M entries: neighbour k of point i is written to
 *                NL[k*N + i], so threads with consecutive i write consecutive
 *                addresses. Slots that are not written keep their old value.
 * @param x, y    point coordinates, N entries each.
 * @param N       number of points.
 * @param M       capacity of each point's neighbour list.
 * @param minDis  squared cutoff distance.
 */
__global__ void find_neighbor_gpu (int *NN, int *NL, const real *x, const real *y,
                                   const int N, const int M,
                                   const real minDis)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N)
    {
        return;
    }

    const real xi = x[i];
    const real yi = y[i];

    int count = 0;  // local counter; NN is written once after the loop
    for (int j = 0; j < N; ++j)
    {
        const real dx = x[j] - xi;
        const real dy = y[j] - yi;
        const real dis = dx * dx + dy * dy;

        // The distance test comes first because it fails for most pairs.
        if (dis < minDis && i != j)
        {
            // Entries beyond the capacity M are counted but not stored.
            if (count < M)
            {
                NL[count * N + i] = j;
            }
            ++count;
        }
    }

    NN[i] = count;
}

/**
 * GPU neighbour search that mirrors the CPU version: thread i visits the
 * pairs (i, j > i) and records each match on both sides using atomicAdd.
 *
 * Precondition: every entry of NN must be zero when the kernel is launched.
 * The counters cannot be cleared inside the kernel, because a thread with a
 * smaller index may already have incremented NN[i] before thread i runs.
 *
 * The order of the entries within one row of NL depends on the order in which
 * the atomic operations complete and may differ between runs.
 *
 * Launch layout: 1D grid and 1D blocks with at least N threads in total.
 *
 * @param NN      in/out, N entries, zeroed by the caller: number of neighbours
 *                found for each point. The value may exceed M; only
 *                min(NN[i], M) slots are valid.
 * @param NL      in/out, N*M entries: neighbours of point i are written to
 *                NL[i*M + k]. Slots that are not written keep their old value.
 * @param x, y    point coordinates, N entries each.
 * @param N       number of points.
 * @param M       capacity of each point's neighbour list.
 * @param minDis  squared cutoff distance.
 */
__global__ void find_neighbor_atomic(int *NN, int *NL, const real *x, const real *y,
                                     const int N, const int M,
                                     const real minDis)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N)
    {
        return;
    }

    const real xi = x[i];
    const real yi = y[i];

    for (int j = i + 1; j < N; ++j)
    {
        const real dx = x[j] - xi;
        const real dy = y[j] - yi;
        const real dis = dx * dx + dy * dy;
        if (dis < minDis)
        {
            // atomicAdd returns the previous value, which is the slot reserved
            // for this entry. Slots at or beyond M are counted but not stored.
            const int slot_i = atomicAdd(&NN[i], 1);
            if (slot_i < M)
            {
                NL[i * M + slot_i] = j;
            }
            const int slot_j = atomicAdd(&NN[j], 1);
            if (slot_j < M)
            {
                NL[j * M + slot_j] = i;
            }
        }
    }
}
|
||
/**
 * Reads 2D points from a text file.
 *
 * Each line is split on runs of whitespace and/or commas. The first two
 * fields are parsed as the x and y coordinate. Lines with fewer than two
 * fields, lines whose first field contains '#', and lines whose coordinates
 * cannot be parsed are skipped.
 *
 * @param fstr  path of the input file.
 * @param x     out: x coordinates; cleared before reading.
 * @param y     out: y coordinates; cleared before reading. Always the same
 *              length as x.
 *
 * If the file cannot be opened, a message is printed to std::cerr and both
 * vectors are left empty.
 */
void read_data(const std::string &fstr, std::vector<real> &x, std::vector<real> &y)
{
    x.clear();
    y.clear();

    std::fstream reader(fstr, std::ios::in);
    if (!reader.is_open())
    {
        std::cerr << "data file open failed.\n";
        return;
    }

    const std::regex re{"[\\s,]+"};
    std::string line;
    while (std::getline(reader, line))
    {
        // Splitting a line that starts with a delimiter produces an empty
        // first token; drop empty tokens so indented lines parse correctly.
        std::vector<std::string> arr;
        for (std::sregex_token_iterator it(line.begin(), line.end(), re, -1), last;
             it != last; ++it)
        {
            if (it->length() > 0)
            {
                arr.push_back(it->str());
            }
        }

        if (arr.size() < 2 || arr[0].find("#") != std::string::npos)
        {
            continue;
        }

        try
        {
            // Parse both values before storing either, so x and y never
            // get out of step when the second field is malformed.
            const real xv = static_cast<real>(std::stod(arr[0]));
            const real yv = static_cast<real>(std::stod(arr[1]));
            x.push_back(xv);
            y.push_back(yv);
        }
        catch (const std::exception &)
        {
            // std::stod throws on non-numeric or out-of-range input.
            std::cerr << "skipping malformed line: " << line << "\n";
        }
    }
}
|
||
/**
 * Writes a neighbour list to a text file, one line per point.
 *
 * Line i holds the point index i followed by every non-negative entry of
 * NL[i*M .. i*M + M - 1]; all fields, including the last one, are followed
 * by a tab. Negative entries are treated as unused slots and omitted.
 *
 * @param fstr  path of the output file.
 * @param NL    neighbour list, N*M entries, row i starting at NL[i*M].
 * @param N     number of points (rows).
 * @param M     number of slots per point (columns).
 *
 * If the file cannot be opened, a message is printed to std::cerr and
 * nothing is written.
 */
void write_data(const std::string &fstr, const int *NL, const int N, const int M)
{
    std::fstream writer(fstr, std::ios::out);
    if (!writer.is_open())
    {
        std::cerr << "result file open failed.\n";
        return;
    }

    for (int row = 0; row < N; ++row)
    {
        const int *slots = NL + row * M;  // first slot of this point's row

        writer << row << "\t";
        for (int k = 0; k < M; ++k)
        {
            const int neighbour = slots[k];
            if (neighbour < 0)
            {
                continue;  // unused slot
            }
            writer << neighbour << "\t";
        }

        writer << std::endl;
    }
}
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|