diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index df9440e..2035f93 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -16,4 +16,6 @@ jobs:
     - name: make clean
       run: make clean
     - name: make sequential
-      run: make sequential
\ No newline at end of file
+      run: make sequential
+      # - name: make parallel
+      #   run: make parallel
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0eb247a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.vscode
+.DS_Store
diff --git a/Makefile b/Makefile
index 2807b02..336c134 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
 CC := gcc
+NVCC := /usr/local/cuda-12.2/bin/nvcc
 BIN_FOLDER := bin
 SRC_FOLDER := src
 PARALLEL_FOLDER := parallel
@@ -12,14 +13,21 @@ SRC-CUDA := main.cu
 
 all: sequential cuda
 
+prova:
+	@mkdir -p prova
+	$(NVCC) $(SRC_FOLDER)/$(PARALLEL_FOLDER)/prova.cu
+	@mv a.out prova/prova
+
+
 sequential:
 	@mkdir -p $(BIN_FOLDER)
 	$(CC) $(SRC_FOLDER)/$(SRC-SEQ) $(SRC_FOLDER)/parser.c $(SRC_FOLDER)/sequential.c
 	@mv a.out $(BIN_FOLDER)/$(NN-SEQ)
 
 parallel:
-	/usr/local/cuda-10.0/bin/nvcc $(SRC_FOLDER)/$(PARALLEL_FOLDER)/$(SRC-CUDA) $(SRC_FOLDER)/parser.c $(SRC_FOLDER)/$(PARALLEL_FOLDER)/parallel.cu
+	@mkdir -p $(BIN_FOLDER)
+	$(NVCC) $(SRC_FOLDER)/$(PARALLEL_FOLDER)/$(SRC-CUDA) $(SRC_FOLDER)/parser.c $(SRC_FOLDER)/$(PARALLEL_FOLDER)/parallel.cu
 	@mv a.out $(BIN_FOLDER)/$(NN-CUDA)
 
 clean:
-	rm -rf $(BIN_FOLDER)
\ No newline at end of file
+	rm -rf $(BIN_FOLDER) prova
diff --git a/include/parser.h b/include/parser.h
index 10fe82a..bdc90eb 100644
--- a/include/parser.h
+++ b/include/parser.h
@@ -5,6 +5,7 @@
 #include <stdio.h>
 #include <stdlib.h>
+
 
 void read_matrix(int **row_ptr, int **col_ind, float **values, const char *filename, int *num_rows, int *num_cols, int *num_vals);
 
 #endif // PARSER_H
diff --git a/include/utils.h b/include/utils.h
new file mode 100644
index 0000000..a46e36f
--- /dev/null
+++ b/include/utils.h
@@ -0,0 +1,122 @@
+#pragma once
+
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include <chrono>
+
+///////////////////////////////////////////////////////////////
+// CUDA error check
+//////////////////////////////////////////////////////////////
+static void cuda_check_status(cudaError_t status)
+{
+    if (status != cudaSuccess)
+    {
+        std::cerr << "error : CUDA API call : "
+                  << cudaGetErrorString(status) << std::endl;
+        exit(1);
+    }
+}
+
+//////////////////////////////////////////////////////////////
+// memory allocation
+//////////////////////////////////////////////////////////////
+template <typename T>
+T* malloc_device(size_t n)
+{
+    void* p;
+    auto status = cudaMalloc(&p, n * sizeof(T));
+    cuda_check_status(status);
+    return (T*)p;
+}
+
+template <typename T>
+T* malloc_managed(size_t n, T value = T())
+{
+    T* p;
+    auto status = cudaMallocManaged(&p, n * sizeof(T));
+    cuda_check_status(status);
+    std::fill(p, p + n, value);
+    return p;
+}
+
+template <typename T>
+T* malloc_pinned(size_t n, T value = T())
+{
+    T* p = nullptr;
+    cudaHostAlloc((void**)&p, n * sizeof(T), 0);
+    std::fill(p, p + n, value);
+    return p;
+}
+
+
+///////////////////////////////////////////////////////////////////
+// CUDA memory copy
+//////////////////////////////////////////////////////////////////
+template <typename T>
+void copy_to_device(T* from, T* to, size_t n)
+{
+    cuda_check_status(cudaMemcpy(to, from, n * sizeof(T), cudaMemcpyHostToDevice));
+}
+
+template <typename T>
+void copy_to_host(T* from, T* to, size_t n)
+{
+    cuda_check_status(cudaMemcpy(to, from, n * sizeof(T), cudaMemcpyDeviceToHost));
+}
+
+template <typename T>
+void copy_to_device_async(const T* from, T* to, size_t n, cudaStream_t stream = NULL)
+{
+    auto status = cudaMemcpyAsync(to, from, n * sizeof(T), cudaMemcpyHostToDevice, stream);
+    cuda_check_status(status);
+}
+
+template <typename T>
+void copy_to_host_async(const T* from, T* to, size_t n, cudaStream_t stream = NULL)
+{
+    auto status = cudaMemcpyAsync(to, from, n * sizeof(T), cudaMemcpyDeviceToHost, stream);
+    cuda_check_status(status);
+}
+
+///////////////////////////////////////////////////////////////////
+// others
+//////////////////////////////////////////////////////////////////
+static size_t read_arg(int argc, char** argv, size_t index, int default_value)
+{
+    if (argc > index)
+    {
+        try{
+            auto n = std::stoi(argv[index]);
+            if (n < 0)
+            {
+                return default_value;
+            }
+            return n;
+        }catch(std::exception& e)
+        {
+            std::cerr << "error [invalid argument, expected a positive integer] | compiler says : "
+                      << e.what() << std::endl;
+            exit(1);
+        }
+    }
+    return default_value;
+}
+
+template <typename T>
+T* malloc_host(size_t n, T value = T())
+{
+    T* p = (T*)malloc(n * sizeof(T));
+    std::fill(p, p + n, value);
+    return p;
+}
+
+//aliases
+using clock_type = std::chrono::high_resolution_clock;
+using duration_type = std::chrono::duration<double>;
+
+static double get_time()
+{
+    static auto start_time = clock_type::now();
+    return duration_type(clock_type::now() - start_time).count();
+}
\ No newline at end of file
diff --git a/prova/prova b/prova/prova
new file mode 100755
index 0000000..e174597
Binary files /dev/null and b/prova/prova differ
diff --git a/src/main.c b/src/main.c
index 6c83139..2d7fa5c 100644
--- a/src/main.c
+++ b/src/main.c
@@ -18,7 +18,7 @@ int main(int argc, char **argv) {
     }
 
     int *row_ptr, *col_ind, num_rows, num_cols, num_vals;
-    float *values, elapsed_time;;
+    float *values, elapsed_time;
 
     int num_repeat = atoi(argv[1]);
     int print_mode = atoi(argv[2]);
diff --git a/src/parallel/main.cu b/src/parallel/main.cu
index 9b3291c..a619cc6 100644
--- a/src/parallel/main.cu
+++ b/src/parallel/main.cu
@@ -3,6 +3,9 @@
 #include <stdio.h>
 #include "../../include/parser.h"
+#include "../../include/parallel.h"
+#include "../../include/utils.h"
+
 
 int main(int argc, const char * argv[]) {
     fprintf(stdout, "============================\n");
@@ -26,8 +29,10 @@
     read_matrix(&row_ptr, &col_ind, &values, filename, &num_rows, &num_cols, &num_vals);
 
-    float *x = (float *) malloc(num_rows * sizeof(float));
-    float *y = (float *) malloc(num_rows * sizeof(float));
+    // float *x = (float *) malloc(num_rows * sizeof(float));
+    float *x = malloc_host<float>(num_rows);
+    // float *y = (float *) malloc(num_rows * sizeof(float));
+    float *y = malloc_host<float>(num_rows);
     for (int i = 0; i < num_rows; ++i) {
         x[i] = 1.0;
         y[i] = 0.0;
     }
@@ -73,10 +78,12 @@
     cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
 
     // Copy from host to device
+    auto s = get_time();
     cudaMemcpy(d_row_ptr, row_ptr, (num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(d_col_ind, col_ind, num_vals * sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(d_values, values, num_vals * sizeof(float), cudaMemcpyHostToDevice);
-    
+    auto time_H2D = get_time() - s;
+
     // Time the iterations
     float elapsed_time;
     cudaEvent_t start, stop;
diff --git a/src/parallel/prova.cu b/src/parallel/prova.cu
new file mode 100644
index 0000000..acfb1cd
--- /dev/null
+++ b/src/parallel/prova.cu
@@ -0,0 +1,15 @@
+#include <iostream>
+
+__global__
+void kernel()
+{
+    printf("Hello World from GPU! %d, %d\n", threadIdx.x, blockIdx.x);
+}
+
+int main(int argc, char** argv)
+{
+    std::cout << "Hello World from CPU!" << std::endl;
+    kernel<<<1, 1>>>();
+    cudaDeviceSynchronize();
+    return 0;
+}
\ No newline at end of file
diff --git a/src/parser.c b/src/parser.c
index dad1f21..98bf562 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -67,4 +67,4 @@ void read_matrix(int **row_ptr, int **col_ind, float **values, const char *filen
     *row_ptr = row_ptr_t;
     *col_ind = col_ind_t;
     *values = values_t;
-}
\ No newline at end of file
+}