add capter9; has unknown code;

kounowa · Dec 1, 2021 · 1557fe9 · 1557fe9
1 parent 2f2190f
commit 1557fe9
Show file tree

Hide file tree

Showing 3 changed files with 266 additions and 0 deletions.
diff --git a/ReadMe.md b/ReadMe.md
@@ -12,6 +12,7 @@ CUDA gpu 编程学习，基于 《CUDA 编程——基础与实践》（樊哲
 6. [CUDA 内存组织](./capter6/ReadMe.md)
 7. [全局内存的合理使用](./capter7/ReadMe.md)
 8. [共享内存的合理使用](./capter8/ReadMe.md)
+9. [原子函数的合理使用](./capter9/ReadMe.md)
 
 
 CUDA 官方文档：  

diff --git a/capter9/ReadMe.md b/capter9/ReadMe.md
@@ -11,9 +11,37 @@ cuda 中，一个线程的原子操作可以在不受其他线程的任何操作
 1. 用另一个核函数将较短的数组进一步归约；
 2. 在核函数末尾利用原子函数进行归约。
 
+在代码实现中：
+1. 原子函数 `atomicAdd(·)`执行数组的一次完整的读-写操作；
+2. 传给 `cudaMemcpy(·)` 的主机内存可以是栈内存，也可以是堆内存；
+3. 主机函数可以和设备函数同名，但要遵循重载原则（参数列表不一致）。
 
+------
+
+## 原子函数
+
+原子函数对其第一个参数指向的数据进行一次“读-写-改”的原子操作，是不可分割的操作。  
+第一个参数可以指向全局内存，也可以指向共享内存。  
+
+对所有参与的线程来说，原子操作是一个线程一个线程轮流进行的，没有明确的次序。  
+原子函数没有同步功能。
+
+原子函数的返回值为所指地址的旧值。
+
++ 加法： `T atomicAdd(T *address, T val)`；
++ 减法： `T atomicSub(T *address, T val)`；
++ 交换： `T atomicExch(T *address, T val)`;
++ 最小值： `T atomicMin(T *address, T val)`；
++ 最大值： `T atomicMax(T *address, T val)`；
++ 自增： `T atomicInc(T *address, T val)`；
++ 自减： `T atomicDec(T *address, T val)`；
++ 比较交换： `T atomicCAS(T *address, T compare, T val)`；  
 
 ------
 
+## 邻居列表
 
+两个粒子互为邻居的判断：他们的距离不大于一个给定的截断距离 rc。  
+基本算法： 对每一个给定的粒子，通过比较它与所有其他粒子的距离来判断相应粒子对是否互为邻居。
 
+------
diff --git a/capter9/neighbor.cu b/capter9/neighbor.cu
@@ -0,0 +1,237 @@
+#include "../common/error.cuh"
+#include "../common/floats.hpp"
+#include "../common/clock.cuh"
+#include <fstream>
+#include <regex>
+#include <string>
+#include <vector>
+
+
+void read_data(const std::string &fstr, std::vector<real> &x, std::vector<real> &y);
+void write_data(const std::string &fstr, const int *NL, const int N, const int M);
+void find_neighbor(int *NN, int *NL, const real *x, const real *y, 
+    const int N, const int M, 
+    const real minDis);
+__global__ void find_neighbor_gpu (int *NN, int *NL, const real *x, const real *y, 
+    const int N, const int M, 
+    const real mindDis);
+__global__ void find_neighbor_atomic(int *NN, int *NL, const real *x, const real *y, 
+    const int N, const int M, 
+    const real minDis);
+
+
+int main()
+{
+    cout << FLOAT_PREC << endl;
+
+    std::string fstr = "xy.txt";
+    std::string fout = "result.txt";
+    std::vector<real> x, y;
+    read_data(fstr, x, y);
+
+    int N = x.size(), M = 10;
+    real minDis = 1.9*1.9;
+
+    int *NN = new int[N];
+    int *NL = new int[N*M];
+    for (int i = 0; i < N; ++i)
+    {
+        NN[i] = 0;
+        for (int j = 0; j < M; ++j)
+        {
+            NL[i*M + j] = -1;
+        }
+    }
+
+    int *d_NN, *d_NL;
+    CHECK(cudaMalloc(&d_NN, N*sizeof(int)));
+    CHECK(cudaMalloc(&d_NL, N*M*sizeof(int)));
+    real *d_x, *d_y;
+    CHECK(cudaMalloc(&d_x, N*sizeof(real)));
+    CHECK(cudaMalloc(&d_y, N*sizeof(real)));
+
+    cppClockStart
+
+    find_neighbor(NN, NL, x.data(), y.data(), N, M, minDis);
+    // write_data(fout, NL, N, M);
+    cppClockCurr
+
+    cudaClockStart
+
+    CHECK(cudaMemcpy(d_x, x.data(), N*sizeof(real), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(d_y, y.data(), N*sizeof(real), cudaMemcpyDefault));
+
+    int block_size = 128;
+    int grid_size = (N + block_size - 1)/block_size;
+    find_neighbor_atomic<<<grid_size, block_size>>>(d_NN, d_NL, d_x, d_y, N, M, minDis);
+
+    CHECK(cudaMemcpy(NN, d_NN, N*sizeof(int), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(NL, d_NL, N*M*sizeof(int), cudaMemcpyDefault));
+    // write_data(fout, NL, N, M);
+
+    cudaClockCurr
+
+    CHECK(cudaMemcpy(d_x, x.data(), N*sizeof(real), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(d_y, y.data(), N*sizeof(real), cudaMemcpyDefault));
+    find_neighbor_gpu<<<grid_size, block_size>>>(d_NN, d_NL, d_x, d_y, N, M, minDis);
+    CHECK(cudaMemcpy(NN, d_NN, N*sizeof(int), cudaMemcpyDefault));
+    CHECK(cudaMemcpy(NL, d_NL, N*M*sizeof(int), cudaMemcpyDefault));
+
+    cudaClockCurr    
+
+    write_data(fout, NL, N, M);
+
+    delete[] NN;
+    delete[] NL;
+    CHECK(cudaFree(d_NN));
+    CHECK(cudaFree(d_NL));
+    CHECK(cudaFree(d_x));
+    CHECK(cudaFree(d_y));
+
+    return 0;
+}
+
+
+void find_neighbor(int *NN, int *NL, const real *x, const real *y, 
+    const int N, const int M, 
+    const real minDis)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        NN[i] = 0;
+    }
+
+    for (int i = 0; i < N; ++i)
+    {
+        for (int j = i + 1; j < N; ++j)
+        {
+            real dx = x[j] - x[i];
+            real dy = y[j] - y[i];
+            real dis = dx * dx + dy * dy;
+            if (dis < minDis) // 比较平方，减少计算量。
+            {
+                NL[i*M + NN[i]] = j;  // 一维数组存放二维数据。
+                NN[i] ++;
+                NL[j*M + NN[j]] = i;  // 省去一般的判断。
+                NN[j]++;
+            }
+        }
+    }
+}
+
+__global__ void find_neighbor_gpu (int *NN, int *NL, const real *x, const real *y, 
+    const int N, const int M, 
+    const real minDis)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < N)
+    {
+        int count = 0;  // 寄存器变量，减少对全局变量NN的访问。
+        for (int j = 0; j < N; ++j)  // 访问次数 N*N，性能降低。
+        {
+            real dx = x[j] - x[i];
+            real dy = y[j] - y[i];
+            real dis = dx * dx + dy * dy;
+
+            if (dis < minDis && i != j) // 距离判断优先，提高“假”的命中率。
+            {
+                // 修改了全局内存NL的数据排列方式，实现合并访问（i 与 threadIdx.x的变化步调一致）。
+                // ???
+                NL[(count++) * N + i] = j;
+            }
+        }
+
+        NN[i] = count;
+    }
+}
+
+__global__ void find_neighbor_atomic(int *NN, int *NL, const real *x, const real *y, 
+    const int N, const int M, 
+    const real minDis)
+{
+    // 将 cpu 版本的第一层循环展开，一个线程对应一个原子操作。
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < N)
+    {
+        NN[i] = 0;
+
+        for (int j = i + 1; j < N; ++j)
+        {
+            real dx = x[j] - x[i];
+            real dy = y[j] - y[i];
+            real dis = dx * dx + dy*dy;
+            if (dis < minDis)
+            {
+                // 原子函数提高的性能，但是在NL中产生了一定的随机性，不便于后期调试。
+                int old_i_num = atomicAdd(&NN[i], 1);  // 返回值为旧值，当前线程对应点的邻居数
+                NL[i*M + old_i_num] = j;  // 当前线程对应点的新邻居
+                int old_j_num = atomicAdd(&NN[j], 1);  // 返回值为旧值，当前邻居点的邻居数
+                NL[j*M + old_j_num] = i;  // 当前邻居点的新邻居
+            }
+        }
+    }
+}
+
+void read_data(const std::string &fstr, std::vector<real> &x, std::vector<real> &y)
+{
+    x.clear();
+    y.clear();
+
+    std::fstream reader(fstr, std::ios::in);
+    if (!reader.is_open())
+    {
+        std::cerr << "data file open failed.\n";
+        return;
+    }
+
+    std::regex re{"[\\s,]+"};
+    std::string line;
+    while(std::getline(reader, line))
+    {
+       std::vector<std::string> arr{std::sregex_token_iterator(line.begin(), line.end(), re, -1), 
+           std::sregex_token_iterator()};
+
+		if (arr.size() < 2 || arr[0].find("#") != std::string::npos)
+		{
+			continue;
+		}
+
+		x.push_back(stod(arr[0]));
+		y.push_back(stod(arr[1]));        
+    }
+}
+
+void write_data(const std::string &fstr, const int *NL, const int N, const int M)
+{
+    std::fstream writer(fstr, std::ios::out);
+    if (!writer.is_open())
+    {
+        std::cerr << "result file open failed.\n";
+        return;
+    }
+
+    for (int i = 0; i < N; ++i)
+    {
+        writer << i << "\t";
+        for (int j = 0; j < M; ++j)
+        {
+            int ind = NL[i*M + j];
+            if (ind >= 0) 
+            {
+                writer << ind << "\t";
+            }
+        }
+
+        writer << endl;
+    }
+}
+
+
+
+
+
+
+
+