Skip to content

Commit

Permalink
update curand sequence for ChASE
Browse files Browse the repository at this point in the history
  • Loading branch information
brunowu committed Mar 18, 2023
1 parent d75ab35 commit 670dad3
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 32 deletions.
1 change: 0 additions & 1 deletion ChASE-MPI/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ if(SCALAPACK_FOUND)
)
endif()


if(OpenMP_FOUND)
target_link_libraries( chase_mpi INTERFACE
OpenMP::OpenMP_CXX
Expand Down
14 changes: 3 additions & 11 deletions ChASE-MPI/chase_mpi.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -507,14 +507,6 @@ class ChaseMpi : public chase::Chase<T>
#ifdef USE_NSIGHT
nvtxRangePop();
#endif
/*
std::mt19937 gen(2342.0);
std::normal_distribution<> normal_distribution;
for (std::size_t k = 0; k < N_; ++k){
v1[k] = getRandomT<T>([&]() { return normal_distribution(gen); });
}
*/
#ifdef HAS_OMP
char* omp_threads;
omp_threads = getenv("OMP_NUM_THREADS");
Expand Down Expand Up @@ -587,7 +579,7 @@ class ChaseMpi : public chase::Chase<T>
#endif
#ifdef HAS_OMP
omp_set_num_threads(num_threads);
#endif
#endif
delete[] ritzv;
delete[] isuppz;
delete[] d;
Expand Down Expand Up @@ -644,7 +636,7 @@ class ChaseMpi : public chase::Chase<T>
num_threads = std::atoi(omp_threads);
}
omp_set_num_threads(1);
#endif
#endif
// ENSURE that v1 has one norm
#ifdef USE_NSIGHT
nvtxRangePushA("Lanczos: loop");
Expand Down Expand Up @@ -706,7 +698,7 @@ class ChaseMpi : public chase::Chase<T>
}
#ifdef HAS_OMP
omp_set_num_threads(num_threads);
#endif
#endif
delete[] isuppz;
delete[] d;
delete[] e;
Expand Down
4 changes: 2 additions & 2 deletions ChASE-MPI/impl/chase_mpidla_blaslapack.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ class ChaseMpiDLABlaslapack : public ChaseMpiDLAInterface<T>
num_threads = std::atoi(omp_threads);
}
omp_set_num_threads(1);
#endif
#endif
for (auto i = 0; i < unconverged; i++)
{
T alpha = -ritzv[i];
Expand All @@ -286,7 +286,7 @@ class ChaseMpiDLABlaslapack : public ChaseMpiDLAInterface<T>
}
#ifdef HAS_OMP
omp_set_num_threads(num_threads);
#endif
#endif
}

//! - This function performs the local computation for ChaseMpiDLA::heevd()
Expand Down
13 changes: 7 additions & 6 deletions ChASE-MPI/impl/chase_mpidla_mgpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
//! generated numbers
//! @param[in] stream_ the CUDA stream on which the random-number kernel is
//! launched asynchronously
void chase_rand_normal(unsigned long long seed, curandState* states, float* v,
void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, float* v,
int n, cudaStream_t stream_);
//! generate `n` random double numbers in normal distribution on each GPU
//! device.
Expand All @@ -51,7 +51,7 @@ void chase_rand_normal(unsigned long long seed, curandState* states, float* v,
//! generated numbers
//! @param[in] stream_ the CUDA stream on which the random-number kernel is
//! launched asynchronously
void chase_rand_normal(unsigned long long seed, curandState* states, double* v,
void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, double* v,
int n, cudaStream_t stream_);
//! generate `n` random complex float numbers in normal distribution on each GPU
//! device. The real part and the imaginary part of each individual random
Expand All @@ -63,7 +63,7 @@ void chase_rand_normal(unsigned long long seed, curandState* states, double* v,
//! generated numbers
//! @param[in] stream_ the CUDA stream on which the random-number kernel is
//! launched asynchronously
void chase_rand_normal(unsigned long long seed, curandState* states,
void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
std::complex<float>* v, int n, cudaStream_t stream_);
//! generate `n` random complex double numbers in normal distribution on each
//! GPU device. The real part and the imaginary part of each individual random
Expand All @@ -75,7 +75,7 @@ void chase_rand_normal(unsigned long long seed, curandState* states,
//! generated numbers
//! @param[in] stream_ the CUDA stream on which the random-number kernel is
//! launched asynchronously
void chase_rand_normal(unsigned long long seed, curandState* states,
void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
std::complex<double>* v, int n, cudaStream_t stream_);

//! shift the diagonal of a `nxn` square matrix `A` in float real data type.
Expand Down Expand Up @@ -258,7 +258,7 @@ class ChaseMpiDLAMultiGPU : public ChaseMpiDLAInterface<T>
cuda_exec(
cudaMalloc((void**)&d_ritz_, sizeof(Base<T>) * (nev_ + nex_)));
cuda_exec(
cudaMalloc((void**)&states_, sizeof(curandState) * (256 * 32)));
cudaMalloc((void**)&states_, sizeof(curandStatePhilox4_32_10_t) * (256 * 32)));

cublasCreate(&cublasH_);
cusolverDnCreate(&cusolverH_);
Expand Down Expand Up @@ -786,7 +786,8 @@ class ChaseMpiDLAMultiGPU : public ChaseMpiDLAInterface<T>
stream1_; //!< CUDA stream for asynchronous execution of kernels
cudaStream_t
stream2_; //!< CUDA stream for asynchronous execution of kernels
curandState* states_ = NULL; //!< a pointer of `curandState` for the cuRAND
//curandState* states_ = NULL; //!< a pointer of `curandState` for the cuRAND
curandStatePhilox4_32_10_t *states_ = NULL;
T* d_H_; //!< a pointer to a local buffer of size `m_*n_` on GPU, which is
//!< mapped to `H_`.
T* d_C_; //!< a pointer to a local buffer of size `m_*(nev_+nex_)` on GPU,
Expand Down
24 changes: 12 additions & 12 deletions ChASE-MPI/kernels/shift.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
#define GRIDDIM 32

// generate `n` random float numbers on GPU
__global__ void s_normal_kernel(unsigned long long seed, curandState* states,
__global__ void s_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
float* v, int n)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
curandState* state = states + tid;
curandStatePhilox4_32_10_t* state = states + tid;
curand_init(seed, tid, 0, state);

int i;
Expand All @@ -33,11 +33,11 @@ __global__ void s_normal_kernel(unsigned long long seed, curandState* states,
}

// generate `n` random double numbers on GPU
__global__ void d_normal_kernel(unsigned long long seed, curandState* states,
__global__ void d_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
double* v, int n)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
curandState* state = states + tid;
curandStatePhilox4_32_10_t* state = states + tid;
curand_init(seed, tid, 0, state);

int i;
Expand All @@ -49,11 +49,11 @@ __global__ void d_normal_kernel(unsigned long long seed, curandState* states,
}
}
// generate `n` random complex single numbers on GPU
__global__ void c_normal_kernel(unsigned long long seed, curandState* states,
__global__ void c_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
cuComplex* v, int n)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
curandState* state = states + tid;
curandStatePhilox4_32_10_t* state = states + tid;
curand_init(seed, tid, 0, state);

int i;
Expand All @@ -68,11 +68,11 @@ __global__ void c_normal_kernel(unsigned long long seed, curandState* states,
}

// generate `n` random complex double numbers on GPU
__global__ void z_normal_kernel(unsigned long long seed, curandState* states,
__global__ void z_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
cuDoubleComplex* v, int n)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
curandState* state = states + tid;
curandStatePhilox4_32_10_t* state = states + tid;
curand_init(seed, tid, 0, state);

int i;
Expand Down Expand Up @@ -166,26 +166,26 @@ __global__ void zshift_mgpu_matrix(cuDoubleComplex* A, std::size_t* off_m,
}
}

// Host-side launcher for the `float` overload: enqueues s_normal_kernel on
// `stream_` with the fixed GRIDDIM x BLOCKDIM launch configuration, forwarding
// `seed`, `states`, `v` and `n` unchanged. Nothing else happens here: there is
// no synchronization and the launch status is not queried.
// NOTE(review): this is a diff view, so two signature lines appear below; the
// first uses the previous `curandState*` type, the second the
// `curandStatePhilox4_32_10_t*` type that this commit switches to.
// NOTE(review): the kernel indexes `states` by global thread id, so `states`
// presumably must hold at least GRIDDIM * BLOCKDIM entries -- confirm against
// the allocation made by the caller.
void chase_rand_normal(unsigned long long seed, curandState* states, float* v,
void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, float* v,
int n, cudaStream_t stream_)
{
s_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(seed, states, v, n);
}

// Host-side launcher for the `double` overload: enqueues d_normal_kernel on
// `stream_` with the fixed GRIDDIM x BLOCKDIM launch configuration, forwarding
// `seed`, `states`, `v` and `n` unchanged. Nothing else happens here: there is
// no synchronization and the launch status is not queried.
// NOTE(review): this is a diff view, so two signature lines appear below; the
// first uses the previous `curandState*` type, the second the
// `curandStatePhilox4_32_10_t*` type that this commit switches to.
// NOTE(review): the kernel indexes `states` by global thread id, so `states`
// presumably must hold at least GRIDDIM * BLOCKDIM entries -- confirm against
// the allocation made by the caller.
void chase_rand_normal(unsigned long long seed, curandState* states, double* v,
void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, double* v,
int n, cudaStream_t stream_)
{
d_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(seed, states, v, n);
}

// Host-side launcher for the `std::complex<float>` overload: enqueues
// c_normal_kernel on `stream_` with the fixed GRIDDIM x BLOCKDIM launch
// configuration. Nothing else happens here: there is no synchronization and
// the launch status is not queried.
// NOTE(review): this is a diff view, so two signature lines appear below; the
// first uses the previous `curandState*` type, the second the
// `curandStatePhilox4_32_10_t*` type that this commit switches to.
// NOTE(review): the kernel indexes `states` by global thread id, so `states`
// presumably must hold at least GRIDDIM * BLOCKDIM entries -- confirm against
// the allocation made by the caller.
void chase_rand_normal(unsigned long long seed, curandState* states,
void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
std::complex<float>* v, int n, cudaStream_t stream_)
{
c_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(
// The buffer is handed to the kernel as cuComplex*, the element type the
// kernel's signature expects.
seed, states, reinterpret_cast<cuComplex*>(v), n);
}

void chase_rand_normal(unsigned long long seed, curandState* states,
void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
std::complex<double>* v, int n, cudaStream_t stream_)
{
z_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(
Expand Down

0 comments on commit 670dad3

Please sign in to comment.