diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index df9440e..2035f93 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -16,4 +16,6 @@ jobs:
     - name: make clean
       run: make clean
     - name: make sequential
-      run: make sequential
\ No newline at end of file
+      run: make sequential
+      # - name: make parallel
+      #   run: make parallel
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0eb247a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.vscode
+.DS_Store
diff --git a/Makefile b/Makefile
index 2807b02..336c134 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
 CC := gcc
+NVCC := /usr/local/cuda-12.2/bin/nvcc
 BIN_FOLDER := bin
 SRC_FOLDER := src
 PARALLEL_FOLDER := parallel
@@ -12,14 +13,21 @@ SRC-CUDA := main.cu
 
 all: sequential cuda
 
+prova:
+	@mkdir -p prova
+	$(NVCC) $(SRC_FOLDER)/$(PARALLEL_FOLDER)/prova.cu
+	@mv a.out prova/prova
+
+
 sequential:
 	@mkdir -p $(BIN_FOLDER)
 	$(CC) $(SRC_FOLDER)/$(SRC-SEQ) $(SRC_FOLDER)/parser.c $(SRC_FOLDER)/sequential.c
 	@mv a.out $(BIN_FOLDER)/$(NN-SEQ)
 
 parallel:
-	/usr/local/cuda-10.0/bin/nvcc $(SRC_FOLDER)/$(PARALLEL_FOLDER)/$(SRC-CUDA) $(SRC_FOLDER)/parser.c $(SRC_FOLDER)/$(PARALLEL_FOLDER)/parallel.cu
+	@mkdir -p $(BIN_FOLDER)
+	$(NVCC) $(SRC_FOLDER)/$(PARALLEL_FOLDER)/$(SRC-CUDA) $(SRC_FOLDER)/parser.c $(SRC_FOLDER)/$(PARALLEL_FOLDER)/parallel.cu
 	@mv a.out $(BIN_FOLDER)/$(NN-CUDA)
 
 clean:
-	rm -rf $(BIN_FOLDER)
\ No newline at end of file
+	rm -rf $(BIN_FOLDER) prova
diff --git a/include/parser.h b/include/parser.h
index 10fe82a..bdc90eb 100644
--- a/include/parser.h
+++ b/include/parser.h
@@ -5,6 +5,7 @@
 #include <stdio.h>
 #include <stdlib.h>
+
 
 void read_matrix(int **row_ptr, int **col_ind, float **values, const char *filename, int *num_rows, int *num_cols, int *num_vals);
 
 #endif // PARSER_H
diff --git a/include/utils.h b/include/utils.h
new file mode 100644
index 0000000..a46e36f
--- /dev/null
+++ b/include/utils.h
@@ -0,0 +1,122 @@
+#pragma once
+
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include <chrono>
+
+///////////////////////////////////////////////////////////////
+// CUDA error check
+//////////////////////////////////////////////////////////////
+static void cuda_check_status(cudaError_t status)
+{
+    if (status != cudaSuccess)
+    {
+        std::cerr << "error : CUDA API call : "
+                  << cudaGetErrorString(status) << std::endl;
+        exit(1);
+    }
+}
+
+//////////////////////////////////////////////////////////////
+// memory allocation
+//////////////////////////////////////////////////////////////
+template <typename T>
+T* malloc_device(size_t n)
+{
+    void* p;
+    auto status = cudaMalloc(&p, n * sizeof(T));
+    cuda_check_status(status);
+    return (T*)p;
+}
+
+template <typename T>
+T* malloc_managed(size_t n, T value = T())
+{
+    T* p;
+    auto status = cudaMallocManaged(&p, n * sizeof(T));
+    cuda_check_status(status);
+    std::fill(p, p + n, value);
+    return p;
+}
+
+template <typename T>
+T* malloc_pinned(size_t n, T value = T())
+{
+    T* p = nullptr;
+    cudaHostAlloc((void**)&p, n * sizeof(T), 0);
+    std::fill(p, p + n, value);
+    return p;
+}
+
+
+///////////////////////////////////////////////////////////////////
+// CUDA memory copy
+//////////////////////////////////////////////////////////////////
+template <typename T>
+void copy_to_device(T* from, T* to, size_t n)
+{
+    cuda_check_status(cudaMemcpy(to, from, n * sizeof(T), cudaMemcpyHostToDevice));
+}
+
+template <typename T>
+void copy_to_host(T* from, T* to, size_t n)
+{
+    cuda_check_status(cudaMemcpy(to, from, n * sizeof(T), cudaMemcpyDeviceToHost));
+}
+
+template <typename T>
+void copy_to_device_async(const T* from, T* to, size_t n, cudaStream_t stream = NULL)
+{
+    auto status = cudaMemcpyAsync(to, from, n * sizeof(T), cudaMemcpyHostToDevice, stream);
+    cuda_check_status(status);
+}
+
+template <typename T>
+void copy_to_host_async(const T* from, T* to, size_t n, cudaStream_t stream = NULL)
+{
+    auto status = cudaMemcpyAsync(to, from, n * sizeof(T), cudaMemcpyDeviceToHost, stream);
+    cuda_check_status(status);
+}
+
+///////////////////////////////////////////////////////////////////
+// others
+//////////////////////////////////////////////////////////////////
+static size_t read_arg(int argc, char** argv, size_t index, int default_value)
+{
+    if (argc > index)
+    {
+        try{
+            auto n = std::stoi(argv[index]);
+            if (n < 0)
+            {
+                return default_value;
+            }
+            return n;
+        }catch(std::exception& e)
+        {
+            std::cerr << "error [invalid argument, expected a positive integer] | compiler says : "
+                      << e.what() << std::endl;
+            exit(1);
+        }
+    }
+    return default_value;
+}
+
+template <typename T>
+T* malloc_host(size_t n, T value = T())
+{
+    T* p = (T*)malloc(n * sizeof(T));
+    std::fill(p, p + n, value);
+    return p;
+}
+
+//aliases
+using clock_type = std::chrono::high_resolution_clock;
+using duration_type = std::chrono::duration<double>;
+
+static double get_time()
+{
+    static auto start_time = clock_type::now();
+    return duration_type(clock_type::now() - start_time).count();
+}
\ No newline at end of file
diff --git a/prova/prova b/prova/prova
new file mode 100755
index 0000000..e174597
Binary files /dev/null and b/prova/prova differ
diff --git a/src/main.c b/src/main.c
index 6c83139..2d7fa5c 100644
--- a/src/main.c
+++ b/src/main.c
@@ -18,7 +18,7 @@ int main(int argc, char **argv) {
     }
 
     int *row_ptr, *col_ind, num_rows, num_cols, num_vals;
-    float *values, elapsed_time;;
+    float *values, elapsed_time;
 
     int num_repeat = atoi(argv[1]);
     int print_mode = atoi(argv[2]);
diff --git a/src/parallel/main.cu b/src/parallel/main.cu
index 9b3291c..a619cc6 100644
--- a/src/parallel/main.cu
+++ b/src/parallel/main.cu
@@ -3,6 +3,9 @@
 #include <stdio.h>
 #include "../../include/parser.h"
+#include "../../include/parallel.h"
+#include "../../include/utils.h"
+
 
 int main(int argc, const char * argv[]) {
     fprintf(stdout, "============================\n");
@@ -26,8 +29,10 @@
     read_matrix(&row_ptr, &col_ind, &values, filename, &num_rows, &num_cols, &num_vals);
 
-    float *x = (float *) malloc(num_rows * sizeof(float));
-    float *y = (float *) malloc(num_rows * sizeof(float));
+    // float *x = (float *) malloc(num_rows * sizeof(float));
+    float *x = malloc_host<float>(num_rows);
+    // float *y = (float *) malloc(num_rows * sizeof(float));
+    float *y = malloc_host<float>(num_rows);
     for (int i = 0; i < num_rows; ++i) {
         x[i] = 1.0;
         y[i] = 0.0;
     }
@@ -73,10 +78,12 @@
     cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
 
     // Copy from host to device
+    auto s = get_time();
     cudaMemcpy(d_row_ptr, row_ptr, (num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(d_col_ind, col_ind, num_vals * sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(d_values, values, num_vals * sizeof(float), cudaMemcpyHostToDevice);
-    
+    auto time_H2D = get_time() - s;
+
     // Time the iterations
     float elapsed_time;
     cudaEvent_t start, stop;
diff --git a/src/parallel/prova.cu b/src/parallel/prova.cu
new file mode 100644
index 0000000..acfb1cd
--- /dev/null
+++ b/src/parallel/prova.cu
@@ -0,0 +1,15 @@
+#include <iostream>
+
+__global__
+void kernel()
+{
+    printf("Hello World from GPU! %d, %d\n", threadIdx.x, blockIdx.x);
+}
+
+int main(int argc, char** argv)
+{
+    std::cout << "Hello World from CPU!" << std::endl;
+    kernel<<<1, 1>>>();
+    cudaDeviceSynchronize();
+    return 0;
+}
\ No newline at end of file
diff --git a/src/parser.c b/src/parser.c
index dad1f21..98bf562 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -67,4 +67,4 @@ void read_matrix(int **row_ptr, int **col_ind, float **values, const char *filen
     *row_ptr = row_ptr_t;
     *col_ind = col_ind_t;
     *values = values_t;
-}
\ No newline at end of file
+}