update curand sequence for ChASE

ChASE-library · Mar 18, 2023 · 670dad3 · 670dad3
1 parent d75ab35
commit 670dad3
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 32 deletions.
diff --git a/ChASE-MPI/CMakeLists.txt b/ChASE-MPI/CMakeLists.txt
@@ -94,7 +94,6 @@ if(SCALAPACK_FOUND)
 	)
 endif()
 
-
 if(OpenMP_FOUND)
     target_link_libraries( chase_mpi INTERFACE
             OpenMP::OpenMP_CXX

diff --git a/ChASE-MPI/chase_mpi.hpp b/ChASE-MPI/chase_mpi.hpp
@@ -507,14 +507,6 @@ class ChaseMpi : public chase::Chase<T>
 #ifdef USE_NSIGHT
         nvtxRangePop();
 #endif
-        /*
-        std::mt19937 gen(2342.0);
-        std::normal_distribution<> normal_distribution;
-
-        for (std::size_t k = 0; k < N_; ++k){
-          v1[k] = getRandomT<T>([&]() { return normal_distribution(gen); });
-        }
-        */
 #ifdef HAS_OMP
         char* omp_threads;
         omp_threads = getenv("OMP_NUM_THREADS");
@@ -587,7 +579,7 @@ class ChaseMpi : public chase::Chase<T>
 #endif
 #ifdef HAS_OMP
         omp_set_num_threads(num_threads);
-#endif
+#endif	
         delete[] ritzv;
         delete[] isuppz;
         delete[] d;
@@ -644,7 +636,7 @@ class ChaseMpi : public chase::Chase<T>
             num_threads = std::atoi(omp_threads);
         }
         omp_set_num_threads(1);
-#endif
+#endif	
         // ENSURE that v1 has one norm
 #ifdef USE_NSIGHT
         nvtxRangePushA("Lanczos: loop");
@@ -706,7 +698,7 @@ class ChaseMpi : public chase::Chase<T>
         }
 #ifdef HAS_OMP
         omp_set_num_threads(num_threads);
-#endif
+#endif	
         delete[] isuppz;
         delete[] d;
         delete[] e;

diff --git a/ChASE-MPI/impl/chase_mpidla_blaslapack.hpp b/ChASE-MPI/impl/chase_mpidla_blaslapack.hpp
@@ -275,7 +275,7 @@ class ChaseMpiDLABlaslapack : public ChaseMpiDLAInterface<T>
             num_threads = std::atoi(omp_threads);
         }
         omp_set_num_threads(1);
-#endif        
+#endif   	    
         for (auto i = 0; i < unconverged; i++)
         {
             T alpha = -ritzv[i];
@@ -286,7 +286,7 @@ class ChaseMpiDLABlaslapack : public ChaseMpiDLAInterface<T>
         }
 #ifdef HAS_OMP
         omp_set_num_threads(num_threads);
-#endif    
+#endif   	
     }
 
     //! - This function performs the local computation for ChaseMpiDLA::heevd()

diff --git a/ChASE-MPI/impl/chase_mpidla_mgpu.hpp b/ChASE-MPI/impl/chase_mpidla_mgpu.hpp
@@ -40,7 +40,7 @@
 //! generated numbers
 //! @param[in] stream_ an asynchronous CUDA stream which allows to run this
 //! function asynchronously
-void chase_rand_normal(unsigned long long seed, curandState* states, float* v,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, float* v,
                        int n, cudaStream_t stream_);
 //! generate `n` random double numbers in normal distribution on each GPU
 //! device.
@@ -51,7 +51,7 @@ void chase_rand_normal(unsigned long long seed, curandState* states, float* v,
 //! generated numbers
 //! @param[in] stream_ an asynchronous CUDA stream which allows to run this
 //! function asynchronously
-void chase_rand_normal(unsigned long long seed, curandState* states, double* v,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, double* v,
                        int n, cudaStream_t stream_);
 //! generate `n` random complex float numbers in normal distribution on each GPU
 //! device. The real part and the imaginary part of each individual random
@@ -63,7 +63,7 @@ void chase_rand_normal(unsigned long long seed, curandState* states, double* v,
 //! generated numbers
 //! @param[in] stream_ an asynchronous CUDA stream which allows to run this
 //! function asynchronously
-void chase_rand_normal(unsigned long long seed, curandState* states,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                        std::complex<float>* v, int n, cudaStream_t stream_);
 //! generate `n` random complex double numbers in normal distribution on each
 //! GPU device. The real part and the imaginary part of each individual random
@@ -75,7 +75,7 @@ void chase_rand_normal(unsigned long long seed, curandState* states,
 //! generated numbers
 //! @param[in] stream_ an asynchronous CUDA stream which allows to run this
 //! function asynchronously
-void chase_rand_normal(unsigned long long seed, curandState* states,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                        std::complex<double>* v, int n, cudaStream_t stream_);
 
 //! shift the diagonal of a `nxn` square matrix `A` in float real data type.
@@ -258,7 +258,7 @@ class ChaseMpiDLAMultiGPU : public ChaseMpiDLAInterface<T>
         cuda_exec(
             cudaMalloc((void**)&d_ritz_, sizeof(Base<T>) * (nev_ + nex_)));
         cuda_exec(
-            cudaMalloc((void**)&states_, sizeof(curandState) * (256 * 32)));
+            cudaMalloc((void**)&states_, sizeof(curandStatePhilox4_32_10_t) * (256 * 32)));
 
         cublasCreate(&cublasH_);
         cusolverDnCreate(&cusolverH_);
@@ -786,7 +786,8 @@ class ChaseMpiDLAMultiGPU : public ChaseMpiDLAInterface<T>
         stream1_; //!< CUDA stream for asynchronous execution of kernels
     cudaStream_t
         stream2_; //!< CUDA stream for asynchronous execution of kernels
-    curandState* states_ = NULL; //!< a pointer of `curandState` for the cuRAND
+    //curandState* states_ = NULL; //!< a pointer of `curandState` for the cuRAND
+    curandStatePhilox4_32_10_t *states_ = NULL;
     T* d_H_;  //!< a pointer to a local buffer of size `m_*n_` on GPU, which is
               //!< mapped to `H_`.
     T* d_C_;  //!< a pointer to a local buffer of size `m_*(nev_+nex_)` on GPU,

diff --git a/ChASE-MPI/kernels/shift.cu b/ChASE-MPI/kernels/shift.cu
@@ -16,11 +16,11 @@
 #define GRIDDIM 32
 
 // generate `n` random float numbers on GPU
-__global__ void s_normal_kernel(unsigned long long seed, curandState* states,
+__global__ void s_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                                 float* v, int n)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState* state = states + tid;
+    curandStatePhilox4_32_10_t* state = states + tid;
     curand_init(seed, tid, 0, state);
 
     int i;
@@ -33,11 +33,11 @@ __global__ void s_normal_kernel(unsigned long long seed, curandState* states,
 }
 
 // generate `n` random double numbers on GPU
-__global__ void d_normal_kernel(unsigned long long seed, curandState* states,
+__global__ void d_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                                 double* v, int n)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState* state = states + tid;
+    curandStatePhilox4_32_10_t* state = states + tid;
     curand_init(seed, tid, 0, state);
 
     int i;
@@ -49,11 +49,11 @@ __global__ void d_normal_kernel(unsigned long long seed, curandState* states,
     }
 }
 // generate `n` random complex single numbers on GPU
-__global__ void c_normal_kernel(unsigned long long seed, curandState* states,
+__global__ void c_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                                 cuComplex* v, int n)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState* state = states + tid;
+    curandStatePhilox4_32_10_t* state = states + tid;
     curand_init(seed, tid, 0, state);
 
     int i;
@@ -68,11 +68,11 @@ __global__ void c_normal_kernel(unsigned long long seed, curandState* states,
 }
 
 // generate `n` random complex double numbers on GPU
-__global__ void z_normal_kernel(unsigned long long seed, curandState* states,
+__global__ void z_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                                 cuDoubleComplex* v, int n)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState* state = states + tid;
+    curandStatePhilox4_32_10_t* state = states + tid;
     curand_init(seed, tid, 0, state);
 
     int i;
@@ -166,26 +166,26 @@ __global__ void zshift_mgpu_matrix(cuDoubleComplex* A, std::size_t* off_m,
     }
 }
 
-void chase_rand_normal(unsigned long long seed, curandState* states, float* v,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, float* v,
                        int n, cudaStream_t stream_)
 {
     s_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(seed, states, v, n);
 }
 
-void chase_rand_normal(unsigned long long seed, curandState* states, double* v,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, double* v,
                        int n, cudaStream_t stream_)
 {
     d_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(seed, states, v, n);
 }
 
-void chase_rand_normal(unsigned long long seed, curandState* states,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                        std::complex<float>* v, int n, cudaStream_t stream_)
 {
     c_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(
         seed, states, reinterpret_cast<cuComplex*>(v), n);
 }
 
-void chase_rand_normal(unsigned long long seed, curandState* states,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                        std::complex<double>* v, int n, cudaStream_t stream_)
 {
     z_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(
-Original file line number
+Diff line change
@@ Expand Up / @@ -94,7 +94,6 @@ if(SCALAPACK_FOUND) @@
     	)
     endif()
     if(OpenMP_FOUND)
         target_link_libraries( chase_mpi INTERFACE
                 OpenMP::OpenMP_CXX
@@ Expand Down @@