diff --git a/CMakeLists.txt b/CMakeLists.txt index fb1176e8f0..09835f051e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.18) -project(SIRIUS VERSION 7.4.2) +project(SIRIUS VERSION 7.4.3) # user variables set(CREATE_PYTHON_MODULE OFF CACHE BOOL "create sirius Python module") diff --git a/apps/tests/test_wf_fft.cpp b/apps/tests/test_wf_fft.cpp index 192a33003e..8c6b161fdd 100644 --- a/apps/tests/test_wf_fft.cpp +++ b/apps/tests/test_wf_fft.cpp @@ -7,7 +7,7 @@ using namespace sirius; void test_wf_fft() { //sddk::MPI_grid mpi_grid({2, 3}, sddk::Communicator::world()); - mpi::Grid mpi_grid({1, 1}, mpi::Communicator::world()); + mpi::Grid mpi_grid({2, 2}, mpi::Communicator::world()); /* creation of simple G+k vector set */ auto gkvec = fft::gkvec_factory(8.0, mpi_grid.communicator()); @@ -18,7 +18,8 @@ void test_wf_fft() /* get the FFT box boundaries */ auto fft_grid = fft::get_min_grid(8.0, gkvec->lattice_vectors()); - std::vector num_mt_coeffs({10, 20, 30, 10, 20}); + //std::vector num_mt_coeffs({10, 20, 30, 10, 20}); + std::vector num_mt_coeffs({1}); wf::Wave_functions wf(gkvec, num_mt_coeffs, wf::num_mag_dims(1), wf::num_bands(10), sddk::memory_t::host); wf::Wave_functions wf_ref(gkvec, num_mt_coeffs, wf::num_mag_dims(1), wf::num_bands(10), sddk::memory_t::host); @@ -32,7 +33,7 @@ void test_wf_fft() } } } - auto mg = wf.memory_guard(sddk::memory_t::device, wf::copy_to::device); + //auto mg = wf.memory_guard(sddk::memory_t::device, wf::copy_to::device); auto pu = sddk::device_t::CPU; @@ -61,10 +62,10 @@ void test_wf_fft() for (int ispn = 0; ispn < 2; ispn++) { wf::Wave_functions_fft wf_fft(gkvec_fft, wf, wf::spin_index(ispn), wf::band_range(0,10), - wf::shuffle_to::fft_layout | wf::shuffle_to::wf_layout); + wf::shuffle_to::wf_layout); for (int i = 0; i < wf_fft.num_wf_local(); i++) { - spfft_transform->backward(wf_fft.pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), spfft_pu); + spfft_transform->backward(wf1[ispn].pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), spfft_pu); spfft_transform->forward(spfft_pu, wf_fft.pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), SPFFT_FULL_SCALING); } } diff --git a/apps/unit_tests/test_rlm_deriv.cpp b/apps/unit_tests/test_rlm_deriv.cpp index 6657fe1239..7cb3ebafb1 100644 --- a/apps/unit_tests/test_rlm_deriv.cpp +++ b/apps/unit_tests/test_rlm_deriv.cpp @@ -482,7 +482,7 @@ int test3() 1 + 10 * utils::random(), 1 + 10 * utils::random()); - auto rtp = SHT::spherical_coordinates(v); + auto rtp = r3::spherical_coordinates(v); double dr = 1e-5 * rtp[0]; @@ -495,8 +495,8 @@ int test3() r3::vector v2 = v; v2[x] -= dr; - auto vs1 = SHT::spherical_coordinates(v1); - auto vs2 = SHT::spherical_coordinates(v2); + auto vs1 = r3::spherical_coordinates(v1); + auto vs2 = r3::spherical_coordinates(v2); std::vector rlm1(lmmax); std::vector rlm2(lmmax); diff --git a/apps/unit_tests/test_rot_ylm.cpp b/apps/unit_tests/test_rot_ylm.cpp index 6d30d40336..308dd28306 100644 --- a/apps/unit_tests/test_rot_ylm.cpp +++ b/apps/unit_tests/test_rot_ylm.cpp @@ -44,10 +44,10 @@ int run_test_impl(cmd_args& args) /* random Cartesian vector */ r3::vector coord(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX, double(rand()) / RAND_MAX); - auto scoord = SHT::spherical_coordinates(coord); + auto scoord = r3::spherical_coordinates(coord); /* rotated coordinates */ auto coord2 = dot(Rc, coord); - auto scoord2 = SHT::spherical_coordinates(coord2); + auto scoord2 = r3::spherical_coordinates(coord2); int lmax{10}; sddk::mdarray ylm(utils::lmmax(lmax)); diff --git a/doc/doxygen.cfg b/doc/doxygen.cfg index cc42f17a59..1a5e5bd4bb 100644 --- a/doc/doxygen.cfg +++ b/doc/doxygen.cfg @@ -39,7 +39,7 @@ PROJECT_NAME = "SIRIUS" # control system is used. -PROJECT_NUMBER = "7.4.2" +PROJECT_NUMBER = "7.4.3" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/src/SDDK/memory.hpp b/src/SDDK/memory.hpp index 5694d69e31..ba7a2ed4c8 100644 --- a/src/SDDK/memory.hpp +++ b/src/SDDK/memory.hpp @@ -678,7 +678,7 @@ sddk::memory_pool& get_memory_pool(sddk::memory_t M__); { \ if (!(condition__)) { \ std::stringstream _s; \ - _s << "Assertion (" << #condition__ << ") failed " \ + _s << "Assertion (" << #condition__ << ") failed " \ << "at line " << __LINE__ << " of file " << __FILE__ << std::endl \ << "array label: " << label_ << std::endl; \ for (int i = 0; i < N; i++) { \ @@ -755,6 +755,20 @@ class mdarray_index_descriptor { return size_; } + + inline bool check_range(index_type i__) const + { +#ifdef NDEBUG + return true; +#else + if (i__ < begin_ || i__ > end_) { + std::cout << "index " << i__ << " out of range [" << begin_ << ", " << end_ << "]" << std::endl; + return false; + } else { + return true; + } +#endif + } }; /// Multidimensional array with the column-major (Fortran) order. @@ -833,7 +847,8 @@ class mdarray std::array i = {args...}; for (int j = 0; j < N; j++) { - mdarray_assert(i[j] >= dims_[j].begin() && i[j] <= dims_[j].end()); + //mdarray_assert(dims_[j].check_range(i[j]) && i[j] >= dims_[j].begin() && i[j] <= dims_[j].end()); + mdarray_assert(dims_[j].check_range(i[j])); } size_t idx = offsets_[0] + i[0]; diff --git a/src/SDDK/wave_functions.hpp b/src/SDDK/wave_functions.hpp index 57b022f6cf..d48d25eee7 100644 --- a/src/SDDK/wave_functions.hpp +++ b/src/SDDK/wave_functions.hpp @@ -924,15 +924,31 @@ class Wave_functions_fft : public Wave_functions_base */ auto& comm_col = gkvec_fft_->comm_ortho_fft(); - auto ncol = sddk::splindex_base::block_size(b__.size(), comm_col.size()); - size_t sz = gkvec_fft_->count() * ncol; - sddk::mdarray, 1> send_recv_buf(sz, sddk::get_memory_pool(sddk::memory_t::host), "send_recv_buf"); + /* in full-potential case leading dimenstion is larger than the number of plane-wave + * coefficients, so we have to copy data into temporary storage with necessary leading + * dimension */ + sddk::mdarray, 2> wf_tmp; + if (wf_->ld() == wf_->num_pw_) { /* pure plane-wave coeffs */ + auto ptr = (wf_->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin()); + wf_tmp = sddk::mdarray, 2>(ptr, wf_->num_pw_, b__.size()); + } else { + wf_tmp = sddk::mdarray, 2>(wf_->num_pw_, b__.size(), sddk::get_memory_pool(sddk::memory_t::host)); + for (int i = 0; i < b__.size(); i++) { + auto in_ptr = wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin() + i); + std::copy(in_ptr, in_ptr + wf_->num_pw_, wf_tmp.at(sddk::memory_t::host, 0, i)); + } + } - auto& row_distr = gkvec_fft_->gvec_slab(); + auto* send_buf = (wf_tmp.ld() == 0) ? nullptr : wf_tmp.at(sddk::memory_t::host); /* local number of columns */ int n_loc = spl_num_wf_.local_size(); + sddk::mdarray, 1> recv_buf(gkvec_fft_->count() * n_loc, + sddk::get_memory_pool(sddk::memory_t::host), "recv_buf"); + + auto& row_distr = gkvec_fft_->gvec_slab(); + /* send and receive dimensions */ mpi::block_data_descriptor sd(comm_col.size()), rd(comm_col.size()); for (int j = 0; j < comm_col.size(); j++) { @@ -942,12 +958,8 @@ class Wave_functions_fft : public Wave_functions_base sd.calc_offsets(); rd.calc_offsets(); - auto send_buf = (this->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin()); - - { - comm_col.alltoall(send_buf, sd.counts.data(), sd.offsets.data(), send_recv_buf.at(sddk::memory_t::host), - rd.counts.data(), rd.offsets.data()); - } + comm_col.alltoall(send_buf, sd.counts.data(), sd.offsets.data(), recv_buf.at(sddk::memory_t::host), + rd.counts.data(), rd.offsets.data()); /* reorder received blocks */ #pragma omp parallel for @@ -956,7 +968,7 @@ class Wave_functions_fft : public Wave_functions_base int offset = row_distr.offsets[j]; int count = row_distr.counts[j]; if (count) { - auto from = &send_recv_buf[offset * n_loc + count * i]; + auto from = &recv_buf[offset * n_loc + count * i]; std::copy(from, from + count, this->data_[0].at(sddk::memory_t::host, offset, i)); } } @@ -989,15 +1001,15 @@ class Wave_functions_fft : public Wave_functions_base auto& comm_col = gkvec_fft_->comm_ortho_fft(); - auto ncol = sddk::splindex_base::block_size(b__.size(), comm_col.size()); - size_t sz = gkvec_fft_->count() * ncol; - sddk::mdarray, 1> send_recv_buf(sz, sddk::get_memory_pool(sddk::memory_t::host), "send_recv_buf"); - - auto& row_distr = gkvec_fft_->gvec_slab(); - /* local number of columns */ int n_loc = spl_num_wf_.local_size(); + /* send buffer */ + sddk::mdarray, 1> send_buf(gkvec_fft_->count() * n_loc, + sddk::get_memory_pool(sddk::memory_t::host), "send_buf"); + + auto& row_distr = gkvec_fft_->gvec_slab(); + /* reorder sending blocks */ #pragma omp parallel for for (int i = 0; i < n_loc; i++) { @@ -1006,7 +1018,7 @@ class Wave_functions_fft : public Wave_functions_base int count = row_distr.counts[j]; if (count) { auto from = this->data_[0].at(sddk::memory_t::host, offset, i); - std::copy(from, from + count, &send_recv_buf[offset * n_loc + count * i]); + std::copy(from, from + count, &send_buf[offset * n_loc + count * i]); } } } @@ -1018,10 +1030,43 @@ class Wave_functions_fft : public Wave_functions_base } sd.calc_offsets(); rd.calc_offsets(); - auto* recv_buf = (this->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin()); - comm_col.alltoall(send_recv_buf.at(sddk::memory_t::host), sd.counts.data(), sd.offsets.data(), recv_buf, +#if !defined(NDEBUG) + for (int i = 0; i < n_loc; i++) { + for (int j = 0; j < comm_col.size(); j++) { + int offset = row_distr.offsets[j]; + int count = row_distr.counts[j]; + for (int igg = 0; igg < count; igg++) { + if (send_buf[offset * n_loc + count * i + igg] != this->data_[0](offset + igg, i)) { + RTE_THROW("wrong packing of send buffer"); + } + } + } + } +#endif + /* full potential wave-functions have extra muffin-tin part; + * that makes the PW block of data not consecutive and thus we need to copy to a consecutive buffer + * for alltoall */ + sddk::mdarray, 2> wf_tmp; + if (wf_->ld() == wf_->num_pw_) { /* pure plane-wave coeffs */ + auto ptr = (wf_->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin()); + wf_tmp = sddk::mdarray, 2>(ptr, wf_->num_pw_, b__.size()); + } else { + wf_tmp = sddk::mdarray, 2>(wf_->num_pw_, b__.size(), sddk::get_memory_pool(sddk::memory_t::host)); + } + + auto* recv_buf = (wf_tmp.ld() == 0) ? nullptr : wf_tmp.at(sddk::memory_t::host); + + comm_col.alltoall(send_buf.at(sddk::memory_t::host), sd.counts.data(), sd.offsets.data(), recv_buf, rd.counts.data(), rd.offsets.data()); + + if (wf_->ld() != wf_->num_pw_) { + for (int i = 0; i < b__.size(); i++) { + auto out_ptr = wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin() + i); + std::copy(wf_tmp.at(sddk::memory_t::host, 0, i), + wf_tmp.at(sddk::memory_t::host, 0, i) + wf_->num_pw_, out_ptr); + } + } } if (pp && wf_->gkvec().comm().rank() == 0) { auto t = utils::time_interval(t0); diff --git a/src/api/sirius_api.cpp b/src/api/sirius_api.cpp index e8aa91f192..29c0867b70 100644 --- a/src/api/sirius_api.cpp +++ b/src/api/sirius_api.cpp @@ -2768,7 +2768,7 @@ sirius_find_eigen_states(void* const* gs_handler__, void* const* ks_handler__, b gs.potential().update_atomic_potential(); } if (precompute_rf__ && *precompute_rf__) { - const_cast(gs.ctx().unit_cell()).generate_radial_functions(); + const_cast(gs.ctx().unit_cell()).generate_radial_functions(gs.ctx().out()); } if (precompute_ri__ && *precompute_ri__) { const_cast(gs.ctx().unit_cell()).generate_radial_integrals(); @@ -4033,7 +4033,7 @@ sirius_get_gkvec_arrays(void* const* ks_handler__, int* ik__, int* num_gkvec__, gkvec(x, igk) = kp->gkvec().template gkvec(igk)[x]; gkvec_cart(x, igk) = gkc[x]; } - auto rtp = sirius::SHT::spherical_coordinates(gkc); + auto rtp = r3::spherical_coordinates(gkc); gkvec_len[igk] = rtp[0]; gkvec_tp(0, igk) = rtp[1]; gkvec_tp(1, igk) = rtp[2]; diff --git a/src/band/diag_full_potential.cpp b/src/band/diag_full_potential.cpp index d2830b0117..bf423fa14b 100644 --- a/src/band/diag_full_potential.cpp +++ b/src/band/diag_full_potential.cpp @@ -46,6 +46,8 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k& Hk__) con /* block size of scalapack 2d block-cyclic distribution */ int bs = ctx_.cyclic_block_size(); + auto pcs = env::print_checksum(); + la::dmatrix> h(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, get_memory_pool(solver.host_memory_t())); la::dmatrix> o(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, get_memory_pool(solver.host_memory_t())); @@ -76,7 +78,7 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k& Hk__) con } } - if (ctx_.print_checksum()) { + if (pcs) { auto z1 = h.checksum(ngklo, ngklo); auto z2 = o.checksum(ngklo, ngklo); utils::print_checksum("h_lapw", z1, ctx_.out()); @@ -107,11 +109,9 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k& Hk__) con } } - if (ctx_.print_checksum()) { + if (pcs) { auto z1 = kp.fv_eigen_vectors().checksum(kp.gklo_basis_size(), ctx_.num_fv_states()); - if (kp.comm().rank() == 0) { - utils::print_checksum("fv_eigen_vectors", z1, kp.out(1)); - } + utils::print_checksum("fv_eigen_vectors", z1, kp.out(1)); } /* remap to slab */ @@ -129,6 +129,12 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k& Hk__) con la::constant>::zero(), kp.comm().native()); } + if (pcs) { + auto z1 = kp.fv_eigen_vectors_slab().checksum_pw(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states())); + auto z2 = kp.fv_eigen_vectors_slab().checksum_mt(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states())); + utils::print_checksum("fv_eigen_vectors_slab", z1 + z2, kp.out(1)); + } + /* renormalize wave-functions */ if (ctx_.valence_relativity() == relativity_t::iora) { diff --git a/src/beta_projectors/beta_projectors.hpp b/src/beta_projectors/beta_projectors.hpp index cce2292ee7..5d9c83f357 100644 --- a/src/beta_projectors/beta_projectors.hpp +++ b/src/beta_projectors/beta_projectors.hpp @@ -57,7 +57,7 @@ class Beta_projectors : public Beta_projectors_base #pragma omp parallel for for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) { /* vs = {r, theta, phi} */ - auto vs = SHT::spherical_coordinates(this->gkvec_.template gkvec_cart(igkloc)); + auto vs = r3::spherical_coordinates(this->gkvec_.template gkvec_cart(igkloc)); /* compute real spherical harmonics for G+k vector */ std::vector gkvec_rlm(utils::lmmax(this->ctx_.unit_cell().lmax())); sf::spherical_harmonics(this->ctx_.unit_cell().lmax(), vs[1], vs[2], &gkvec_rlm[0]); diff --git a/src/beta_projectors/beta_projectors_strain_deriv.hpp b/src/beta_projectors/beta_projectors_strain_deriv.hpp index c4bea65427..f1215becb4 100644 --- a/src/beta_projectors/beta_projectors_strain_deriv.hpp +++ b/src/beta_projectors/beta_projectors_strain_deriv.hpp @@ -55,7 +55,7 @@ class Beta_projectors_strain_deriv : public Beta_projectors_base #pragma omp parallel for schedule(static) for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) { auto gvc = this->gkvec_.template gkvec_cart(igkloc); - auto rtp = SHT::spherical_coordinates(gvc); + auto rtp = r3::spherical_coordinates(gvc); double theta = rtp[1]; double phi = rtp[2]; @@ -70,7 +70,7 @@ class Beta_projectors_strain_deriv : public Beta_projectors_base for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) { auto gvc = this->gkvec_.template gkvec_cart(igkloc); /* vs = {r, theta, phi} */ - auto gvs = SHT::spherical_coordinates(gvc); + auto gvs = r3::spherical_coordinates(gvc); /* |G+k|=0 case */ if (gvs[0] < 1e-10) { diff --git a/src/context/config.hpp b/src/context/config.hpp index 677c4e5407..c9ccb9f1dd 100644 --- a/src/context/config.hpp +++ b/src/context/config.hpp @@ -943,6 +943,18 @@ class config_t } dict_["/control/output"_json_pointer] = output__; } + /// Split local G-vectors in chunks to reduce the GPU memory consumption of augmentation operator. + inline auto gvec_chunk_size() const + { + return dict_.at("/control/gvec_chunk_size"_json_pointer).get(); + } + inline void gvec_chunk_size(int gvec_chunk_size__) + { + if (dict_.contains("locked")) { + throw std::runtime_error(locked_msg); + } + dict_["/control/gvec_chunk_size"_json_pointer] = gvec_chunk_size__; + } private: nlohmann::json& dict_; }; diff --git a/src/context/input_schema.json b/src/context/input_schema.json index 0a62fdcc57..dedd5264d6 100644 --- a/src/context/input_schema.json +++ b/src/context/input_schema.json @@ -362,7 +362,7 @@ }, "spglib_tolerance" : { "type" : "number", - "default" : 1e-4, + "default" : 1e-6, "title" : "Tolerance of the spglib in finding crystal symmetries" }, "verbosity" : { @@ -411,36 +411,41 @@ "type" : "boolean", "default" : false, "title" : "If true then the atomic forces are printed at the end of SCF run." - }, - "print_timers" : { - "type" : "boolean", - "default" : true, - "title" : "If true then the timer statistics is printed at the end of SCF run." - }, - "print_neighbors" : { - "type" : "boolean", - "default" : false, - "title" : "If true then the list of nearest neighbours for each atom is printed to the standard output." - }, - "use_second_variation" : { - "type" : "boolean", - "default" : true, - "title" : "True if second-variational diagonalization is used in LAPW method." - }, - "beta_chunk_size" : { - "type" : "integer", - "default" : 256, - "title" : "Number of atoms in a chunk of beta-projectors." - }, - "ortho_rf" : { - "type" : "boolean", - "default" : false, - "title" : "Orthogonalize LAPW radial functions." - }, - "output" : { - "type" : "string", - "default" : "stdout:", - "title" : "Type of the output stream (stdout:, file:name)" + }, + "print_timers" : { + "type" : "boolean", + "default" : true, + "title" : "If true then the timer statistics is printed at the end of SCF run." + }, + "print_neighbors" : { + "type" : "boolean", + "default" : false, + "title" : "If true then the list of nearest neighbours for each atom is printed to the standard output." + }, + "use_second_variation" : { + "type" : "boolean", + "default" : true, + "title" : "True if second-variational diagonalization is used in LAPW method." + }, + "beta_chunk_size" : { + "type" : "integer", + "default" : 256, + "title" : "Number of atoms in a chunk of beta-projectors." + }, + "ortho_rf" : { + "type" : "boolean", + "default" : false, + "title" : "Orthogonalize LAPW radial functions." + }, + "output" : { + "type" : "string", + "default" : "stdout:", + "title" : "Type of the output stream (stdout:, file:name)" + }, + "gvec_chunk_size" : { + "type" : "integer", + "default" : 500000, + "title" : "Split local G-vectors in chunks to reduce the GPU memory consumption of augmentation operator." } } }, diff --git a/src/context/simulation_context.cpp b/src/context/simulation_context.cpp index 9270f2dfaf..e1799846d5 100644 --- a/src/context/simulation_context.cpp +++ b/src/context/simulation_context.cpp @@ -156,7 +156,7 @@ Simulation_context::generate_gvec_ylm(int lmax__) sddk::matrix> gvec_ylm(utils::lmmax(lmax__), gvec().count(), sddk::memory_t::host, "gvec_ylm"); #pragma omp parallel for schedule(static) for (int igloc = 0; igloc < gvec().count(); igloc++) { - auto rtp = SHT::spherical_coordinates(gvec().gvec_cart(igloc)); + auto rtp = r3::spherical_coordinates(gvec().gvec_cart(igloc)); sf::spherical_harmonics(lmax__, rtp[1], rtp[2], &gvec_ylm(0, igloc)); } return gvec_ylm; @@ -279,28 +279,6 @@ Simulation_context::ewald_lambda() const return lambda; } -sddk::splindex -Simulation_context::split_gvec_local() const -{ - /* local number of G-vectors for this MPI rank */ - int ngv_loc = gvec().count(); - /* estimate number of G-vectors in a block */ - int ld{-1}; - for (int iat = 0; iat < unit_cell().num_atom_types(); iat++) { - int nat = unit_cell().atom_type(iat).num_atoms(); - int nbf = unit_cell().atom_type(iat).mt_basis_size(); - - ld = std::max(ld, std::max(nbf * (nbf + 1) / 2, nat)); - } - /* limit the size of relevant array to ~1Gb */ - int ngv_b = (1 << 30) / sizeof(std::complex) / ld; - ngv_b = std::max(1, std::min(ngv_loc, ngv_b)); - /* number of blocks of G-vectors */ - int nb = ngv_loc / ngv_b; - /* split local number of G-vectors between blocks */ - return sddk::splindex(ngv_loc, nb, 0); -} - void Simulation_context::initialize() { @@ -361,22 +339,6 @@ Simulation_context::initialize() /* initialize MPI communicators */ init_comm(); - auto print_mpi_layout = env::print_mpi_layout(); - - if (verbosity() >= 3 || print_mpi_layout) { - mpi::pstdout pout(comm()); - if (comm().rank() == 0) { - pout << "MPI rank placement" << std::endl; - pout << "------------------" << std::endl; - } - pout << "rank: " << comm().rank() - << ", comm_band_rank: " << comm_band().rank() - << ", comm_k_rank: " << comm_k().rank() - << ", hostname: " << utils::hostname() - << ", mpi processor name: " << mpi::Communicator::processor_name() << std::endl; - rte::ostream(this->out(), "info") << pout.flush(0); - } - switch (processing_unit()) { case sddk::device_t::CPU: { host_memory_t_ = sddk::memory_t::host; @@ -573,6 +535,7 @@ Simulation_context::initialize() } } + /* environment variable has a highest priority */ auto ev_str = env::get_ev_solver(); if (ev_str.size()) { evsn[0] = ev_str; @@ -646,6 +609,30 @@ Simulation_context::initialize() this->cfg().control().print_checksum(true); } + auto print_mpi_layout = env::print_mpi_layout(); + + if (verbosity() >= 3 || print_mpi_layout) { + mpi::pstdout pout(comm()); + if (comm().rank() == 0) { + pout << "MPI rank placement" << std::endl; + pout << utils::hbar(136, '-') << std::endl; + pout << " | comm tot, band, k | comm fft, ortho | mpi_grid tot, row, col | blacs tot, row, col" << std::endl; + } + pout << std::setw(12) << utils::hostname() << " | " + << std::setw(6) << comm().rank() + << std::setw(6) << comm_band().rank() + << std::setw(6) << comm_k().rank() << " | " + << std::setw(6) << comm_fft_coarse().rank() + << std::setw(6) << comm_band_ortho_fft_coarse().rank() << " | " + << std::setw(6) << mpi_grid_->communicator(3).rank() + << std::setw(6) << mpi_grid_->communicator(1 << 0).rank() + << std::setw(6) << mpi_grid_->communicator(1 << 1).rank() << " | " + << std::setw(6) << blacs_grid().comm().rank() + << std::setw(6) << blacs_grid().comm_row().rank() + << std::setw(6) << blacs_grid().comm_col().rank() << std::endl; + rte::ostream(this->out(), "info") << pout.flush(0); + } + initialized_ = true; cfg().lock(); } @@ -709,7 +696,6 @@ Simulation_context::print_info(std::ostream& out__) const << " number of G-shells : " << gvecs[i]->num_shells() << std::endl << std::endl; } - os << "number of local G-vector blocks: " << split_gvec_local().num_ranks() << std::endl; os << std::endl; } { @@ -962,7 +948,8 @@ Simulation_context::update() the next time only reciprocal lattice of the G-vectors is updated */ if (!gvec_coarse_) { /* create list of coarse G-vectors */ - gvec_coarse_ = std::make_unique(rlv, 2 * gk_cutoff(), comm(), cfg().control().reduce_gvec()); + gvec_coarse_ = std::make_unique(rlv, 2 * gk_cutoff(), comm(), cfg().control().reduce_gvec(), + cfg().control().spglib_tolerance()); /* create FFT friendly partiton */ gvec_coarse_fft_ = std::make_shared(*gvec_coarse_, comm_fft_coarse(), comm_ortho_fft_coarse()); @@ -1141,21 +1128,12 @@ Simulation_context::update() } } - /* precompute some G-vector related arrays */ - gvec_tp_ = sddk::mdarray(gvec().count(), 2, sddk::memory_t::host, "gvec_tp_"); - #pragma omp parallel for schedule(static) - for (int igloc = 0; igloc < gvec().count(); igloc++) { - auto rtp = SHT::spherical_coordinates(gvec().gvec_cart(igloc)); - gvec_tp_(igloc, 0) = rtp[1]; - gvec_tp_(igloc, 1) = rtp[2]; - } - switch (this->processing_unit()) { case sddk::device_t::CPU: { break; } case sddk::device_t::GPU: { - gvec_tp_.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device); + gvec_->gvec_tp().allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device); break; } } @@ -1241,24 +1219,10 @@ Simulation_context::update() new Radial_integrals_atomic_wf(unit_cell(), new_gk_cutoff, 20, idxr_wf, ps_wf, ps_atomic_wf_ri_djl_callback_)); } - /* update augmentation operator */ - sddk::memory_pool* mp{nullptr}; - sddk::memory_pool* mpd{nullptr}; - switch (this->processing_unit()) { - case sddk::device_t::CPU: { - mp = &get_memory_pool(sddk::memory_t::host); - break; - } - case sddk::device_t::GPU: { - mp = &get_memory_pool(sddk::memory_t::host_pinned); - mpd = &get_memory_pool(sddk::memory_t::device); - break; - } - } for (int iat = 0; iat < unit_cell().num_atom_types(); iat++) { if (unit_cell().atom_type(iat).augment() && unit_cell().atom_type(iat).num_atoms() > 0) { - augmentation_op_[iat] = std::make_unique(unit_cell().atom_type(iat), gvec()); - augmentation_op_[iat]->generate_pw_coeffs(aug_ri(), gvec_tp_, *mp, mpd); + augmentation_op_[iat] = std::make_unique(unit_cell().atom_type(iat), gvec(), aug_ri(), aug_ri_djl()); + augmentation_op_[iat]->generate_pw_coeffs(); } else { augmentation_op_[iat] = nullptr; } @@ -1533,8 +1497,6 @@ Simulation_context::init_comm() /* create communicator, orthogonal to comm_fft_coarse */ comm_ortho_fft_coarse_ = comm().split(comm_fft_coarse().rank()); - - /* create communicator, orthogonal to comm_fft_coarse within a band communicator */ - comm_band_ortho_fft_coarse_ = comm_band().split(comm_fft_coarse().rank()); } + } // namespace sirius diff --git a/src/context/simulation_context.hpp b/src/context/simulation_context.hpp index cfc8823a0d..a610492aab 100644 --- a/src/context/simulation_context.hpp +++ b/src/context/simulation_context.hpp @@ -109,11 +109,6 @@ class Simulation_context : public Simulation_parameters /// Auxiliary communicator for the coarse-grid FFT transformation. mpi::Communicator comm_ortho_fft_coarse_; - /// Communicator, which is orthogonal to comm_fft_coarse within a band communicator. - /** This communicator is used in reshuffling the wave-functions for the FFT-friendly distribution. It will be - used to parallelize application of local Hamiltonian over bands. */ - mpi::Communicator comm_band_ortho_fft_coarse_; - /// Unit cell of the simulation. std::unique_ptr unit_cell_; @@ -177,9 +172,6 @@ class Simulation_context : public Simulation_parameters /// Lattice coordinats of G-vectors in a GPU-friendly ordering. sddk::mdarray gvec_coord_; - /// Theta and phi angles of G-vectors in GPU-friendly ordering. - sddk::mdarray gvec_tp_; - /// Volume of the initial unit cell. /** This is needed to estimate the new cutoff for radial integrals. */ double omega0_; @@ -509,18 +501,25 @@ class Simulation_context : public Simulation_parameters if (cfg().control().fft_mode() == "serial") { return mpi::Communicator::self(); } else { - return comm_band(); + return mpi_grid_->communicator(1 << 0); } } - auto const& comm_ortho_fft_coarse() const + /// Communicator, which is orthogonal to comm_fft_coarse within a band communicator. + /** This communicator is used in reshuffling the wave-functions for the FFT-friendly distribution. It will be + used to parallelize application of local Hamiltonian over bands. */ + auto const& comm_band_ortho_fft_coarse() const { - return comm_ortho_fft_coarse_; + if (cfg().control().fft_mode() == "serial") { + return comm_band(); + } else { + return mpi_grid_->communicator(1 << 1); + } } - auto const& comm_band_ortho_fft_coarse() const + auto const& comm_ortho_fft_coarse() const { - return comm_band_ortho_fft_coarse_; + return comm_ortho_fft_coarse_; } void create_storage_file() const; @@ -562,16 +561,11 @@ class Simulation_context : public Simulation_parameters return gvec_phase_factor(gvec().gvec(ig__), ia__); } - inline sddk::mdarray const& gvec_coord() const + inline auto const& gvec_coord() const { return gvec_coord_; } - inline sddk::mdarray const& gvec_tp() const - { - return gvec_tp_; - } - /// Generate phase factors \f$ e^{i {\bf G} {\bf r}_{\alpha}} \f$ for all atoms of a given type. void generate_phase_factors(int iat__, sddk::mdarray, 2>& phase_factors__) const; @@ -764,9 +758,6 @@ class Simulation_context : public Simulation_parameters return (this->processing_unit() == sddk::device_t::CPU) ? sddk::memory_t::host : sddk::memory_t::device; } - /// Split local set of G-vectors into chunks. - sddk::splindex split_gvec_local() const; - /// Set the size of the fine-grained FFT grid. void fft_grid_size(std::array fft_grid_size__) { diff --git a/src/density/augmentation_operator.cpp b/src/density/augmentation_operator.cpp index ab34b15414..9351f97522 100644 --- a/src/density/augmentation_operator.cpp +++ b/src/density/augmentation_operator.cpp @@ -26,25 +26,7 @@ namespace sirius { -#if defined(SIRIUS_GPU) -extern "C" void aug_op_pw_coeffs_gpu(int ngvec__, int const* gvec_shell__, int const* idx__, int idxmax__, - std::complex const* zilm__, int const* l_by_lm__, int lmmax__, - double const* gc__, int ld0__, int ld1__, - double const* gvec_rlm__, int ld2__, - double const* ri_values__, int ld3__, int ld4__, - double* q_pw__, int ld5__, double fourpi_omega__); -extern "C" void aug_op_pw_coeffs_deriv_gpu(int ngvec__, int const* gvec_shell__, double const* gvec_cart__, - int const* idx__, int idxmax__, - double const* gc__, int ld0__, int ld1__, - double const* rlm__, double const* rlm_dg__, int ld2__, - double const* ri_values__, double const* ri_dg_values__, int ld3__, int ld4__, - double* q_pw__, int ld5__, double fourpi__, int nu__, int lmax_q__); - -extern "C" void spherical_harmonics_rlm_gpu(int lmax__, int ntp__, double const* tp__, double* rlm__, int ld__); -#endif - -void Augmentation_operator::generate_pw_coeffs(Radial_integrals_aug const& radial_integrals__, - sddk::mdarray const& tp__, sddk::memory_pool& mp__, sddk::memory_pool* mpd__) +void Augmentation_operator::generate_pw_coeffs() { if (!atom_type_.augment()) { return; @@ -53,101 +35,42 @@ void Augmentation_operator::generate_pw_coeffs(Radial_integrals_aug const double fourpi_omega = fourpi / gvec_.omega(); + auto const& tp = gvec_.gvec_tp(); + /* maximum l of beta-projectors */ int lmax_beta = atom_type_.indexr().lmax(); int lmmax = utils::lmmax(2 * lmax_beta); - auto l_by_lm = utils::l_by_lm(2 * lmax_beta); - - sddk::mdarray, 1> zilm(lmmax); - for (int l = 0, lm = 0; l <= 2 * lmax_beta; l++) { - for (int m = -l; m <= l; m++, lm++) { - zilm[lm] = std::pow(std::complex(0, 1), l); - } - } - - /* Gaunt coefficients of three real spherical harmonics */ - Gaunt_coefficients gaunt_coefs(lmax_beta, 2 * lmax_beta, lmax_beta, SHT::gaunt_rrr); - - /* split G-vectors between ranks */ - int gvec_count = gvec_.count(); - - /* array of real spherical harmonics for each G-vector */ - sddk::mdarray gvec_rlm; - switch (atom_type_.parameters().processing_unit()) { - case sddk::device_t::CPU: { - gvec_rlm = sddk::mdarray(lmmax, gvec_count, mp__); - #pragma omp parallel for schedule(static) - for (int igloc = 0; igloc < gvec_count; igloc++) { - sf::spherical_harmonics(2 * lmax_beta, tp__(igloc, 0), tp__(igloc, 1), &gvec_rlm(0, igloc)); - } - break; - } - case sddk::device_t::GPU: { - gvec_rlm = sddk::mdarray(lmmax, gvec_count, *mpd__); -#if defined(SIRIUS_GPU) - spherical_harmonics_rlm_gpu(2 * lmax_beta, gvec_count, tp__.at(sddk::memory_t::device), - gvec_rlm.at(sddk::memory_t::device), gvec_rlm.ld()); -#endif - break; - } - } - - /* number of beta- radial functions */ - int nbrf = atom_type_.mt_radial_basis_size(); - - sddk::mdarray ri_values(nbrf * (nbrf + 1) / 2, 2 * lmax_beta + 1, gvec_.num_gvec_shells_local(), mp__); - #pragma omp parallel for - for (int j = 0; j < gvec_.num_gvec_shells_local(); j++) { - auto ri = radial_integrals__.values(atom_type_.id(), gvec_.gvec_shell_len_local(j)); - for (int l = 0; l <= 2 * lmax_beta; l++) { - for (int i = 0; i < nbrf * (nbrf + 1) / 2; i++) { - ri_values(i, l, j) = ri(i, l); - } - } - } - /* number of beta-projectors */ int nbf = atom_type_.mt_basis_size(); - - int idxmax = nbf * (nbf + 1) / 2; - - /* flatten the indices */ - sddk::mdarray idx(3, idxmax, mp__); - for (int xi2 = 0; xi2 < nbf; xi2++) { - int lm2 = atom_type_.indexb(xi2).lm; - int idxrf2 = atom_type_.indexb(xi2).idxrf; - - for (int xi1 = 0; xi1 <= xi2; xi1++) { - int lm1 = atom_type_.indexb(xi1).lm; - int idxrf1 = atom_type_.indexb(xi1).idxrf; - - /* packed orbital index */ - int idx12 = utils::packed_index(xi1, xi2); - /* packed radial-function index */ - int idxrf12 = utils::packed_index(idxrf1, idxrf2); - - idx(0, idx12) = lm1; - idx(1, idx12) = lm2; - idx(2, idx12) = idxrf12; - } - } - - /* array of plane-wave coefficients */ - q_pw_ = sddk::mdarray(nbf * (nbf + 1) / 2, 2 * gvec_count, mp__, "q_pw_"); - + /* only half of Q_{xi,xi'}(G) matrix is stored */ + int nqlm = nbf * (nbf + 1) / 2; + /* local number of G-vectors */ + int gvec_count = gvec_.count(); + + /* Info: + * After some tests, the current GPU implementation of generating aug. operator turns out to be slower than CPU. + * The reason is probaly the memory access pattern of G, lm and, idxrf indices. + * The current decision is to compute aug. operator on CPU once during the initialization and + * then copy the chunks of Q(G) to GPU when computing D-operator and augment charge density. + */ switch (atom_type_.parameters().processing_unit()) { - case sddk::device_t::CPU: { - #pragma omp parallel for schedule(static) + case sddk::device_t::CPU: + case sddk::device_t::GPU: { + /* Gaunt coefficients of three real spherical harmonics */ + Gaunt_coefficients gaunt_coefs(lmax_beta, 2 * lmax_beta, lmax_beta, SHT::gaunt_rrr); + #pragma omp parallel for for (int igloc = 0; igloc < gvec_count; igloc++) { + std::vector rlm(lmmax); + sf::spherical_harmonics(2 * lmax_beta, tp(igloc, 0), tp(igloc, 1), rlm.data()); std::vector> v(lmmax); - for (int idx12 = 0; idx12 < nbf * (nbf + 1) / 2; idx12++) { - int lm1 = idx(0, idx12); - int lm2 = idx(1, idx12); - int idxrf12 = idx(2, idx12); + for (int idx12 = 0; idx12 < nqlm; idx12++) { + int lm1 = idx_(0, idx12); + int lm2 = idx_(1, idx12); + int idxrf12 = idx_(2, idx12); for (int lm3 = 0; lm3 < lmmax; lm3++) { - v[lm3] = std::conj(zilm[lm3]) * gvec_rlm(lm3, igloc) * - ri_values(idxrf12, l_by_lm[lm3], gvec_.gvec_shell_idx_local(igloc)); + v[lm3] = std::conj(zilm_[lm3]) * rlm[lm3] * + ri_values_(idxrf12, l_by_lm_[lm3], gvec_.gvec_shell_idx_local(igloc)); } std::complex z = fourpi_omega * gaunt_coefs.sum_L3_gaunt(lm2, lm1, &v[0]); q_pw_(idx12, 2 * igloc) = z.real(); @@ -156,49 +79,28 @@ void Augmentation_operator::generate_pw_coeffs(Radial_integrals_aug const } break; } - case sddk::device_t::GPU: { - sddk::mdarray gvec_shell(gvec_count, mp__); - for (int igloc = 0; igloc < gvec_count; igloc++) { - gvec_shell(igloc) = gvec_.gvec_shell_idx_local(igloc); - } - gvec_shell.allocate(*mpd__).copy_to(sddk::memory_t::device); - - idx.allocate(*mpd__).copy_to(sddk::memory_t::device); - - zilm.allocate(*mpd__).copy_to(sddk::memory_t::device); - - sddk::mdarray l_by_lm_d(&l_by_lm[0], lmmax); - l_by_lm_d.allocate(*mpd__).copy_to(sddk::memory_t::device); - - auto gc = gaunt_coefs.get_full_set_L3(); - gc.allocate(*mpd__).copy_to(sddk::memory_t::device); - - ri_values.allocate(*mpd__).copy_to(sddk::memory_t::device); - - q_pw_.allocate(*mpd__); - -#if defined(SIRIUS_GPU) - int ld0 = static_cast(gc.size(0)); - int ld1 = static_cast(gc.size(1)); - aug_op_pw_coeffs_gpu(gvec_count, gvec_shell.at(sddk::memory_t::device), idx.at(sddk::memory_t::device), - idxmax, zilm.at(sddk::memory_t::device), l_by_lm_d.at(sddk::memory_t::device), lmmax, - gc.at(sddk::memory_t::device), ld0, ld1, gvec_rlm.at(sddk::memory_t::device), lmmax, - ri_values.at(sddk::memory_t::device), static_cast(ri_values.size(0)), static_cast(ri_values.size(1)), - q_pw_.at(sddk::memory_t::device), static_cast(q_pw_.size(0)), fourpi_omega); -#endif - q_pw_.copy_to(sddk::memory_t::host); - - q_pw_.deallocate(sddk::memory_t::device); - } - } - - sym_weight_ = sddk::mdarray(nbf * (nbf + 1) / 2, mp__, "sym_weight_"); - for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 <= xi2; xi1++) { - /* packed orbital index */ - int idx12 = utils::packed_index(xi1, xi2); - sym_weight_(idx12) = (xi1 == xi2) ? 1 : 2; - } +// case sddk::device_t::GPU: { +//#if defined(SIRIUS_GPU) +// auto spl_ngv_loc = utils::split_in_blocks(gvec_count, atom_type_.parameters().cfg().control().gvec_chunk_size()); +// auto& mpd = sddk::get_memory_pool(sddk::memory_t::device); +// /* allocate buffer for Rlm on GPUs */ +// sddk::mdarray gvec_rlm(lmmax, spl_ngv_loc[0], mpd, "gvec_rlm"); +// /* allocate buffer for Q(G) on GPUs */ +// sddk::mdarray qpw(nqlm, 2 * spl_ngv_loc[0], mpd, "qpw"); +// +// int g_begin{0}; +// /* loop over blocks of G-vectors */ +// for (auto ng : spl_ngv_loc) { +// /* generate Rlm spherical harmonics */ +// spherical_harmonics_rlm_gpu(2 * lmax_beta, ng, tp.at(sddk::memory_t::device, g_begin, 0), +// tp.at(sddk::memory_t::device, g_begin, 1), gvec_rlm.at(sddk::memory_t::device), gvec_rlm.ld()); +// this->generate_pw_coeffs_chunk_gpu(g_begin, ng, gvec_rlm.at(sddk::memory_t::device), gvec_rlm.ld(), qpw); +// acc::copyout(q_pw_.at(sddk::memory_t::host, 0, 2 * g_begin), qpw.at(sddk::memory_t::device), 2 * ng * nqlm); +// g_begin += ng; +// } +//#endif +// break; +// } } q_mtrx_ = sddk::mdarray(nbf, nbf); @@ -227,207 +129,60 @@ void Augmentation_operator::generate_pw_coeffs(Radial_integrals_aug const } } -Augmentation_operator_gvec_deriv::Augmentation_operator_gvec_deriv(Simulation_parameters const& param__, int lmax__, - fft::Gvec const& gvec__, sddk::mdarray const& tp__) - : gvec_(gvec__) -{ - PROFILE("sirius::Augmentation_operator_gvec_deriv"); - - int lmmax = utils::lmmax(2 * lmax__); - - /* Gaunt coefficients of three real spherical harmonics */ - gaunt_coefs_ = std::unique_ptr>( - new Gaunt_coefficients(lmax__, 2 * lmax__, lmax__, SHT::gaunt_rrr)); - - /* split G-vectors between ranks */ - int gvec_count = gvec__.count(); - - rlm_g_ = sddk::mdarray(lmmax, gvec_count); - rlm_dg_ = sddk::mdarray(lmmax, 3, gvec_count); - - /* array of real spherical harmonics and derivatives for each G-vector */ - #pragma omp parallel for schedule(static) - for (int igloc = 0; igloc < gvec_count; igloc++) { - /* compute Rlm */ - sf::spherical_harmonics(2 * lmax__, tp__(igloc, 0), tp__(igloc, 1), &rlm_g_(0, igloc)); - /* compute dRlm/dG */ - sddk::mdarray tmp(&rlm_dg_(0, 0, igloc), lmmax, 3); - auto gv = gvec__.gvec_cart(igloc); - sf::dRlm_dr(2 * lmax__, gv, tmp, false); - } - switch (param__.processing_unit()) { - case sddk::device_t::CPU: { - break; - } - case sddk::device_t::GPU: { - rlm_g_.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device); - rlm_dg_.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device); - } - } -} - -void Augmentation_operator_gvec_deriv::prepare(Atom_type const& atom_type__, - Radial_integrals_aug const& ri__, Radial_integrals_aug const& ri_dq__) +void Augmentation_operator::generate_pw_coeffs_gvec_deriv(int nu__) { - PROFILE("sirius::Augmentation_operator_gvec_deriv::prepare"); - - int lmax_beta = atom_type__.lmax_beta(); - - /* number of beta- radial functions */ - int nbrf = atom_type__.mt_radial_basis_size(); - - auto& mp = get_memory_pool(sddk::memory_t::host); - - ri_values_ = sddk::mdarray(2 * lmax_beta + 1, nbrf * (nbrf + 1) / 2, gvec_.num_gvec_shells_local(), mp); - ri_dg_values_ = sddk::mdarray(2 * lmax_beta + 1, nbrf * (nbrf + 1) / 2, gvec_.num_gvec_shells_local(), - mp); - #pragma omp parallel for - for (int j = 0; j < gvec_.num_gvec_shells_local(); j++) { - auto ri = ri__.values(atom_type__.id(), gvec_.gvec_shell_len_local(j)); - auto ri_dg = ri_dq__.values(atom_type__.id(), gvec_.gvec_shell_len_local(j)); - for (int l = 0; l <= 2 * lmax_beta; l++) { - for (int i = 0; i < nbrf * (nbrf + 1) / 2; i++) { - ri_values_(l, i, j) = ri(i, l); - ri_dg_values_(l, i, j) = ri_dg(i, l); - } - } - } - - /* number of beta-projectors */ - int nbf = atom_type__.mt_basis_size(); - - int idxmax = nbf * (nbf + 1) / 2; - - /* flatten the indices */ - idx_ = sddk::mdarray(3, idxmax, mp); - for (int xi2 = 0; xi2 < nbf; xi2++) { - int lm2 = atom_type__.indexb(xi2).lm; - int idxrf2 = atom_type__.indexb(xi2).idxrf; - - for (int xi1 = 0; xi1 <= xi2; xi1++) { - int lm1 = atom_type__.indexb(xi1).lm; - int idxrf1 = atom_type__.indexb(xi1).idxrf; - - /* packed orbital index */ - int idx12 = utils::packed_index(xi1, xi2); - /* packed radial-function index */ - int idxrf12 = utils::packed_index(idxrf1, idxrf2); - - idx_(0, idx12) = lm1; - idx_(1, idx12) = lm2; - idx_(2, idx12) = idxrf12; - } - } - - int gvec_count = gvec_.count(); - - gvec_shell_ = sddk::mdarray(gvec_count, get_memory_pool(sddk::memory_t::host)); - gvec_cart_ = sddk::mdarray(3, gvec_count, get_memory_pool(sddk::memory_t::host)); - for (int igloc = 0; igloc < gvec_count; igloc++) { - auto gvc = gvec_.gvec_cart(igloc); - gvec_shell_(igloc) = gvec_.gvec_shell_idx_local(igloc); - for (int x: {0, 1, 2}) { - gvec_cart_(x, igloc) = gvc[x]; - } - } - - sym_weight_ = sddk::mdarray(nbf * (nbf + 1) / 2, mp, "sym_weight_"); - for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 <= xi2; xi1++) { - /* packed orbital index */ - int idx12 = xi2 * (xi2 + 1) / 2 + xi1; - sym_weight_(idx12) = (xi1 == xi2) ? 1 : 2; - } - } - - switch (atom_type__.parameters().processing_unit()) { - case sddk::device_t::CPU: { - /* array of plane-wave coefficients */ - q_pw_ = sddk::mdarray(nbf * (nbf + 1) / 2, 2 * gvec_count, mp, "q_pw_dg_"); - break; - } - case sddk::device_t::GPU: { - auto& mpd = get_memory_pool(sddk::memory_t::device); - ri_values_.allocate(mpd).copy_to(sddk::memory_t::device); - ri_dg_values_.allocate(mpd).copy_to(sddk::memory_t::device); - idx_.allocate(mpd).copy_to(sddk::memory_t::device); - gvec_shell_.allocate(mpd).copy_to(sddk::memory_t::device); - gvec_cart_.allocate(mpd).copy_to(sddk::memory_t::device); - /* array of plane-wave coefficients */ - q_pw_ = sddk::mdarray(nbf * (nbf + 1) / 2, 2 * gvec_count, mpd, "q_pw_dg_"); - break; - } + if (!atom_type_.augment()) { + return; } -} + PROFILE("sirius::Augmentation_operator::generate_pw_coeffs_gvec_deriv"); -void Augmentation_operator_gvec_deriv::generate_pw_coeffs(Atom_type const& atom_type__, int nu__) -{ - PROFILE("sirius::Augmentation_operator_gvec_deriv::generate_pw_coeffs"); + auto const& tp = gvec_.gvec_tp(); /* maximum l of beta-projectors */ - int lmax_beta = atom_type__.indexr().lmax(); - - int lmax_q = 2 * lmax_beta; - - int lmmax = utils::lmmax(lmax_q); - - auto l_by_lm = utils::l_by_lm(lmax_q); - - sddk::mdarray, 1> zilm(lmmax); - for (int l = 0, lm = 0; l <= lmax_q; l++) { - for (int m = -l; m <= l; m++, lm++) { - zilm[lm] = std::pow(std::complex(0, 1), l); - } - } + int lmax_beta = atom_type_.indexr().lmax(); + int lmmax = utils::lmmax(2 * lmax_beta); /* number of beta-projectors */ - int nbf = atom_type__.mt_basis_size(); - - /* split G-vectors between ranks */ - int gvec_count = gvec_.count(); + int nbf = atom_type_.mt_basis_size(); + /* local number of G-vectors */ + int gvec_count = gvec_.count(); - switch (atom_type__.parameters().processing_unit()) { + switch (atom_type_.parameters().processing_unit()) { + case sddk::device_t::GPU: case sddk::device_t::CPU: { - #pragma omp parallel for schedule(static) + /* Gaunt coefficients of three real spherical harmonics */ + Gaunt_coefficients gaunt_coefs(lmax_beta, 2 * lmax_beta, lmax_beta, SHT::gaunt_rrr); + #pragma omp parallel for for (int igloc = 0; igloc < gvec_count; igloc++) { /* index of the G-vector shell */ int igsh = gvec_.gvec_shell_idx_local(igloc); + + auto gvc = gvec_.gvec_cart(igloc); + double gvc_nu = gvc[nu__]; + + std::vector rlm(lmmax); + sddk::mdarray rlm_dq(lmmax, 3); + sf::spherical_harmonics(2 * lmax_beta, tp(igloc, 0), tp(igloc, 1), rlm.data()); + const bool divide_by_r{false}; + sf::dRlm_dr(2 * lmax_beta, gvc, rlm_dq, divide_by_r); + std::vector> v(lmmax); - double gvc_nu = gvec_cart_(nu__, igloc); for (int idx12 = 0; idx12 < nbf * (nbf + 1) / 2; idx12++) { int lm1 = idx_(0, idx12); int lm2 = idx_(1, idx12); int idxrf12 = idx_(2, idx12); for (int lm3 = 0; lm3 < lmmax; lm3++) { - int l = l_by_lm[lm3]; - v[lm3] = std::conj(zilm[lm3]) * - (rlm_dg_(lm3, nu__, igloc) * ri_values_(l, idxrf12, igsh) + - rlm_g_(lm3, igloc) * ri_dg_values_(l, idxrf12, igsh) * gvc_nu); + int l = l_by_lm_[lm3]; + v[lm3] = std::conj(zilm_[lm3]) * (rlm_dq(lm3, nu__) * ri_values_(idxrf12, l, igsh) + + rlm[lm3] * ri_dq_values_(idxrf12, l, igsh) * gvc_nu); } - std::complex z = fourpi * gaunt_coefs_->sum_L3_gaunt(lm2, lm1, &v[0]); + std::complex z = fourpi * gaunt_coefs.sum_L3_gaunt(lm2, lm1, &v[0]); q_pw_(idx12, 2 * igloc) = z.real(); q_pw_(idx12, 2 * igloc + 1) = z.imag(); } } break; } - case sddk::device_t::GPU: { - auto& mpd = get_memory_pool(sddk::memory_t::device); - - auto gc = gaunt_coefs_->get_full_set_L3(); - gc.allocate(mpd).copy_to(sddk::memory_t::device); - -#if defined(SIRIUS_GPU) - aug_op_pw_coeffs_deriv_gpu(gvec_count, gvec_shell_.at(sddk::memory_t::device), gvec_cart_.at(sddk::memory_t::device), - idx_.at(sddk::memory_t::device), static_cast(idx_.size(1)), - gc.at(sddk::memory_t::device), static_cast(gc.size(0)), static_cast(gc.size(1)), - rlm_g_.at(sddk::memory_t::device), rlm_dg_.at(sddk::memory_t::device), static_cast(rlm_g_.size(0)), - ri_values_.at(sddk::memory_t::device), ri_dg_values_.at(sddk::memory_t::device), static_cast(ri_values_.size(0)), - static_cast(ri_values_.size(1)), q_pw_.at(sddk::memory_t::device), static_cast(q_pw_.size(0)), - fourpi, nu__, lmax_q); -#endif - break; - } } } diff --git a/src/density/augmentation_operator.hpp b/src/density/augmentation_operator.hpp index 8be3150df9..d6662203f6 100644 --- a/src/density/augmentation_operator.hpp +++ b/src/density/augmentation_operator.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2013-2019 Anton Kozhevnikov, Thomas Schulthess +// Copyright (c) 2013-2023 Anton Kozhevnikov, Thomas Schulthess // All rights reserved. // // Redistribution and use in source and binary forms, with or without modification, are permitted provided that @@ -28,8 +28,82 @@ #include "radial/radial_integrals.hpp" #include "fft/gvec.hpp" +#if defined(SIRIUS_GPU) +extern "C" { + +void +aug_op_pw_coeffs_gpu(int ngvec__, int const* gvec_shell__, int const* idx__, int idxmax__, + std::complex const* zilm__, int const* l_by_lm__, int lmmax__, double const* gc__, int ld0__, + int ld1__, double const* gvec_rlm__, int ld2__, double const* ri_values__, int ld3__, int ld4__, + double* q_pw__, int ld5__, double fourpi_omega__); + +void +aug_op_pw_coeffs_deriv_gpu(int ngvec__, int const* gvec_shell__, double const* gvec_cart__, int const* idx__, + int idxmax__, double const* gc__, int ld0__, int ld1__, double const* rlm__, double const* rlm_dg__, int ld2__, + double const* ri_values__, double const* ri_dg_values__, int ld3__, int ld4__, double* q_pw__, int ld5__, + double fourpi__, int nu__, int lmax_q__); + +void +spherical_harmonics_rlm_gpu(int lmax__, int ntp__, double const* theta__, double const* phi__, double* rlm__, int ld__); + +} +#endif + namespace sirius { +template +inline void iterate_aug_atom_types(Unit_cell const& uc__, F&& f__) +{ + for (int iat = 0; iat < uc__.num_atom_types(); iat++) { + auto& atom_type = uc__.atom_type(iat); + + if (!atom_type.augment() || atom_type.num_atoms() == 0) { + continue; + } + f__(atom_type); + } +} + +inline auto max_l_aug(Unit_cell const& uc__) +{ + int l{0}; + + iterate_aug_atom_types(uc__, + [&l](Atom_type const& type__) + { + l = std::max(l, type__.indexr().lmax()); + }); + + return l; +} + +inline auto max_na_aug(Unit_cell const& uc__) +{ + int na{0}; + + iterate_aug_atom_types(uc__, + [&na](Atom_type const& type__) + { + na = std::max(na, type__.num_atoms()); + }); + + return na; + +} + +inline auto max_nb_aug(Unit_cell const& uc__) +{ + int nb{0}; + + iterate_aug_atom_types(uc__, + [&nb](Atom_type const& type__) + { + nb = std::max(nb, type__.mt_basis_size()); + }); + + return nb; +} + /// Augmentation charge operator Q(r) of the ultrasoft pseudopotential formalism. /** This class generates and stores the plane-wave coefficients of the augmentation charge operator for a given atom type. */ @@ -42,43 +116,142 @@ class Augmentation_operator sddk::mdarray q_mtrx_; - mutable sddk::mdarray q_pw_; + sddk::mdarray q_pw_; + + sddk::mdarray sym_weight_; + + sddk::mdarray, 1> zilm_; + + sddk::mdarray ri_values_; + sddk::mdarray ri_dq_values_; + + sddk::mdarray idx_; - mutable sddk::mdarray sym_weight_; + sddk::mdarray l_by_lm_; public: - Augmentation_operator(Atom_type const& atom_type__, fft::Gvec const& gvec__) + /// Constructor. + /**\param [in] atom_type Atom type instance. + * \param [in] gvec G-vector instance. + * \param [in] radial_integrals Radial integrals of the Q(r) with spherical Bessel functions. + */ + Augmentation_operator(Atom_type const& atom_type__, fft::Gvec const& gvec__, + Radial_integrals_aug const& ri__, Radial_integrals_aug const& ri_dq__) : atom_type_(atom_type__) , gvec_(gvec__) { - } + int lmax_beta = atom_type_.indexr().lmax(); + int lmax = 2 * lmax_beta; + int lmmax = utils::lmmax(lmax); + + /* compute l of lm index */ + auto l_by_lm = utils::l_by_lm(lmax); + l_by_lm_ = sddk::mdarray(lmmax); + std::copy(l_by_lm.begin(), l_by_lm.end(), &l_by_lm_[0]); + + /* compute i^l array */ + zilm_ = sddk::mdarray, 1>(lmmax); + for (int l = 0, lm = 0; l <= lmax; l++) { + for (int m = -l; m <= l; m++, lm++) { + zilm_[lm] = std::pow(std::complex(0, 1), l); + } + } + + /* number of beta-projectors */ + int nbf = atom_type_.mt_basis_size(); + /* number of beta-projector radial functions */ + int nbrf = atom_type__.mt_radial_basis_size(); + /* only half of Q_{xi,xi'}(G) matrix is stored */ + int nqlm = nbf * (nbf + 1) / 2; + + /* flatten the indices */ + idx_ = sddk::mdarray(3, nqlm); + for (int xi2 = 0; xi2 < nbf; xi2++) { + int lm2 = atom_type_.indexb(xi2).lm; + int idxrf2 = atom_type_.indexb(xi2).idxrf; + + for (int xi1 = 0; xi1 <= xi2; xi1++) { + int lm1 = atom_type_.indexb(xi1).lm; + int idxrf1 = atom_type_.indexb(xi1).idxrf; + + /* packed orbital index */ + int idx12 = utils::packed_index(xi1, xi2); + /* packed radial-function index */ + int idxrf12 = utils::packed_index(idxrf1, idxrf2); + + idx_(0, idx12) = lm1; + idx_(1, idx12) = lm2; + idx_(2, idx12) = idxrf12; + } + } - void generate_pw_coeffs(Radial_integrals_aug const& radial_integrals__, sddk::mdarray const& tp__, - sddk::memory_pool& mp__, sddk::memory_pool* mpd__); + ri_values_ = sddk::mdarray(nbrf * (nbrf + 1) / 2, lmax + 1, gvec_.num_gvec_shells_local()); + ri_dq_values_ = sddk::mdarray(nbrf * (nbrf + 1) / 2, lmax + 1, gvec_.num_gvec_shells_local()); + #pragma omp parallel for + for (int j = 0; j < gvec_.num_gvec_shells_local(); j++) { + auto ri = ri__.values(atom_type_.id(), gvec_.gvec_shell_len_local(j)); + auto ri_dq = ri_dq__.values(atom_type__.id(), gvec_.gvec_shell_len_local(j)); + for (int l = 0; l <= lmax; l++) { + for (int i = 0; i < nbrf * (nbrf + 1) / 2; i++) { + ri_values_(i, l, j) = ri(i, l); + ri_dq_values_(i, l, j) = ri_dq(i, l); + } + } + } - void prepare(stream_id sid, sddk::memory_pool* mp__) const - { - if (atom_type_.parameters().processing_unit() == sddk::device_t::GPU && atom_type_.augment()) { - if (mp__) { - sym_weight_.allocate(*mp__); - q_pw_.allocate(*mp__); - } else { - sym_weight_.allocate(sddk::memory_t::device); - q_pw_.allocate(sddk::memory_t::device); + sym_weight_ = sddk::mdarray(nqlm); + for (int xi2 = 0; xi2 < nbf; xi2++) { + for (int xi1 = 0; xi1 <= xi2; xi1++) { + /* packed orbital index */ + int idx12 = utils::packed_index(xi1, xi2); + sym_weight_(idx12) = (xi1 == xi2) ? 1 : 2; } - sym_weight_.copy_to(sddk::memory_t::device, sid); - q_pw_.copy_to(sddk::memory_t::device, sid); } - } - void dismiss() const - { - if (atom_type_.parameters().processing_unit() == sddk::device_t::GPU && atom_type_.augment()) { - q_pw_.deallocate(sddk::memory_t::device); - sym_weight_.deallocate(sddk::memory_t::device); + if (atom_type_.parameters().processing_unit() == sddk::device_t::GPU) { + auto& mpd = sddk::get_memory_pool(sddk::memory_t::device); + sym_weight_.allocate(mpd).copy_to(sddk::memory_t::device); } + + /* allocate array of plane-wave coefficients */ + auto mt = (atom_type_.parameters().processing_unit() == sddk::device_t::CPU) ? sddk::memory_t::host : + sddk::memory_t::host_pinned; + q_pw_ = sddk::mdarray(nqlm, 2 * gvec_.count(), sddk::get_memory_pool(mt), "q_pw_"); + } +// TODO: not used at the moment, evaluate the possibility to remove in the future +// /// Generate chunk of plane-wave coefficients on the GPU. +// void generate_pw_coeffs_chunk_gpu(int g_begin__, int ng__, double const* gvec_rlm__, int ld__, +// sddk::mdarray& qpw__) const +// { +//#if defined(SIRIUS_GPU) +// double fourpi_omega = fourpi / gvec_.omega(); +// +// /* maximum l of beta-projectors */ +// int lmax_beta = atom_type_.indexr().lmax(); +// int lmmax = utils::lmmax(2 * lmax_beta); +// /* number of beta-projectors */ +// int nbf = atom_type_.mt_basis_size(); +// /* only half of Q_{xi,xi'}(G) matrix is stored */ +// int nqlm = nbf * (nbf + 1) / 2; +// /* generate Q(G) */ +// aug_op_pw_coeffs_gpu(ng__, gvec_shell_.at(sddk::memory_t::device, g_begin__), idx_.at(sddk::memory_t::device), +// nqlm, zilm_.at(sddk::memory_t::device), l_by_lm_.at(sddk::memory_t::device), lmmax, +// gaunt_coefs_.at(sddk::memory_t::device), static_cast(gaunt_coefs_.size(0)), +// static_cast(gaunt_coefs_.size(1)), gvec_rlm__, ld__, +// ri_values_.at(sddk::memory_t::device), static_cast(ri_values_.size(0)), +// static_cast(ri_values_.size(1)), qpw__.at(sddk::memory_t::device), static_cast(qpw__.size(0)), +// fourpi_omega); +//#endif +// } + + /// Generate Q_{xi,xi'}(G) plane wave coefficients. + void generate_pw_coeffs(); + + /// Generate G-vector derivative Q_{xi,xi'}(G)/dG of the plane-wave coefficients */ + void generate_pw_coeffs_gvec_deriv(int nu__); + auto const& q_pw() const { return q_pw_; @@ -89,17 +262,6 @@ class Augmentation_operator return q_pw_(i__, ig__); } - /// Set Q-matrix. - void q_mtrx(sddk::mdarray const& q_mtrx__) - { - int nbf = atom_type_.mt_basis_size(); - for (int i = 0; i < nbf; i++) { - for (int j = 0; j < nbf; j++) { - q_mtrx_(j, i) = q_mtrx__(j, i); - } - } - } - /// Get values of the Q-matrix. inline double q_mtrx(int xi1__, int xi2__) const { @@ -124,91 +286,6 @@ class Augmentation_operator } }; -// TODO: -// can't cache it in the simulation context because each time the lattice is updated, PW coefficients must be -// recomputed; so, the only way to accelerate it is to move to GPUs.. - -/// Derivative of augmentation operator PW coefficients with respect to the Cartesian component of G-vector. -class Augmentation_operator_gvec_deriv -{ - private: - fft::Gvec const& gvec_; - - sddk::mdarray q_pw_; - - sddk::mdarray sym_weight_; - - sddk::mdarray rlm_g_; - - sddk::mdarray rlm_dg_; - - sddk::mdarray ri_values_; - - sddk::mdarray ri_dg_values_; - - sddk::mdarray idx_; - - sddk::mdarray gvec_cart_; - - sddk::mdarray gvec_shell_; - - std::unique_ptr> gaunt_coefs_; - - public: - Augmentation_operator_gvec_deriv(Simulation_parameters const& param__, int lmax__, fft::Gvec const& gvec__, - sddk::mdarray const& tp__); - - void generate_pw_coeffs(Atom_type const& atom_type__, int nu__); - - void prepare(Atom_type const& atom_type__, Radial_integrals_aug const& ri__, - Radial_integrals_aug const& ri_dq__); - - //void prepare(int stream_id__) const - //{ - // #ifdef SIRIUS_GPU - // if (atom_type_.parameters().processing_unit() == GPU && atom_type_.pp_desc().augment) { - // sym_weight_.allocate(memory_t::device); - // sym_weight_.async_copy_to_device(stream_id__); - - // q_pw_.allocate(memory_t::device); - // q_pw_.async_copy_to_device(stream_id__); - // } - // #endif - //} - - //void dismiss() const - //{ - // #ifdef SIRIUS_GPU - // if (atom_type_.parameters().processing_unit() == GPU && atom_type_.pp_desc().augment) { - // q_pw_.deallocate_on_device(); - // sym_weight_.deallocate_on_device(); - // } - // #endif - //} - - auto const& q_pw() const - { - return q_pw_; - } - - auto& q_pw() - { - return q_pw_; - } - - double q_pw(int i__, int ig__) const - { - return q_pw_(i__, ig__); - } - - /// Weight of Q_{\xi,\xi'}. - /** 2 if off-diagonal (xi != xi'), 1 if diagonal (xi=xi') */ - inline double sym_weight(int idx__) const - { - return sym_weight_(idx__); - } -}; - } // namespace sirius #endif // __AUGMENTATION_OPERATOR_H__ diff --git a/src/density/density.cpp b/src/density/density.cpp index d69fbb1cb1..ee3f57beb0 100644 --- a/src/density/density.cpp +++ b/src/density/density.cpp @@ -1357,43 +1357,38 @@ Density::generate_rho_aug() { PROFILE("sirius::Density::generate_rho_aug"); - auto spl_ngv_loc = ctx_.split_gvec_local(); + /* local number of G-vectors */ + int gvec_count = ctx_.gvec().count(); + auto spl_ngv_loc = utils::split_in_blocks(gvec_count, ctx_.cfg().control().gvec_chunk_size()); + + auto& mph = get_memory_pool(sddk::memory_t::host); + auto& mpd = get_memory_pool(sddk::memory_t::device); + + sddk::mdarray, 2> rho_aug(gvec_count, ctx_.num_mag_dims() + 1, mph); - sddk::mdarray, 2> rho_aug(ctx_.gvec().count(), ctx_.num_mag_dims() + 1, - get_memory_pool(sddk::memory_t::host)); switch (ctx_.processing_unit()) { case sddk::device_t::CPU: { rho_aug.zero(sddk::memory_t::host); break; } case sddk::device_t::GPU: { - rho_aug.allocate(get_memory_pool(sddk::memory_t::device)).zero(sddk::memory_t::device); + rho_aug.allocate(mpd).zero(sddk::memory_t::device); break; } } - // TODO: the GPU memory consumption here is huge, rewrite this; split gloc in blocks and - // overlap transfer of Q(G) for two consequtive blocks within one atom type - - if (ctx_.unit_cell().atom_type(0).augment()) { - ctx_.augmentation_op(0).prepare(stream_id(0), &get_memory_pool(sddk::memory_t::device)); - } - + /* add contribution to Q(G) from atoms of all types */ for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) { auto& atom_type = unit_cell_.atom_type(iat); - if (ctx_.processing_unit() == sddk::device_t::GPU) { - acc::sync_stream(stream_id(0)); - if (iat + 1 != unit_cell_.num_atom_types() && ctx_.unit_cell().atom_type(iat + 1).augment()) { - ctx_.augmentation_op(iat + 1).prepare(stream_id(0), &get_memory_pool(sddk::memory_t::device)); - } - } - if (!atom_type.augment() || atom_type.num_atoms() == 0) { continue; } + /* number of beta-projector functions */ int nbf = atom_type.mt_basis_size(); + /* number of Q_{xi,xi'} components for each G */ + int nqlm = nbf * (nbf + 1) / 2; /* convert to real matrix */ auto dm = density_matrix_aux(this->density_matrix(), iat); @@ -1402,11 +1397,11 @@ Density::generate_rho_aug() auto cs = dm.checksum(); utils::print_checksum("density_matrix_aux", cs, ctx_.out()); } - /* treat auxiliary array as double with x2 size */ - sddk::mdarray dm_pw(nbf * (nbf + 1) / 2, spl_ngv_loc.local_size() * 2, - get_memory_pool(sddk::memory_t::host)); - sddk::mdarray phase_factors(atom_type.num_atoms(), spl_ngv_loc.local_size() * 2, - get_memory_pool(sddk::memory_t::host)); + + int ndm_pw = (ctx_.processing_unit() == sddk::device_t::CPU) ? 1 : ctx_.num_mag_dims() + 1; + /* treat auxiliary array as real with x2 size */ + sddk::mdarray dm_pw(nqlm, spl_ngv_loc[0] * 2, ndm_pw, mph); + sddk::mdarray phase_factors(atom_type.num_atoms(), spl_ngv_loc[0] * 2, mph); print_memory_usage(ctx_.out(), FILE_LINE); @@ -1415,51 +1410,60 @@ Density::generate_rho_aug() break; } case sddk::device_t::GPU: { - phase_factors.allocate(get_memory_pool(sddk::memory_t::device)); - dm_pw.allocate(get_memory_pool(sddk::memory_t::device)); - dm.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device); + phase_factors.allocate(mpd); + dm_pw.allocate(mpd); + dm.allocate(mpd).copy_to(sddk::memory_t::device); break; } } print_memory_usage(ctx_.out(), FILE_LINE); - for (int ib = 0; ib < spl_ngv_loc.num_ranks(); ib++) { - int g_begin = spl_ngv_loc.global_index(0, ib); - int g_end = g_begin + spl_ngv_loc.local_size(ib); + auto qpw = (ctx_.processing_unit() == sddk::device_t::CPU) ? sddk::mdarray() : + sddk::mdarray(nqlm, 2 * spl_ngv_loc[0], mpd, "qpw"); + + int g_begin{0}; + /* loop over blocks of G-vectors */ + for (auto ng : spl_ngv_loc) { + /* work on the block of the local G-vectors */ switch (ctx_.processing_unit()) { case sddk::device_t::CPU: { - #pragma omp parallel for schedule(static) - for (int igloc = g_begin; igloc < g_end; igloc++) { - int ig = ctx_.gvec().offset() + igloc; + /* generate phase factors */ + #pragma omp parallel for + for (int g = 0; g < ng; g++) { + int ig = ctx_.gvec().offset() + g_begin + g; for (int i = 0; i < atom_type.num_atoms(); i++) { - int ia = atom_type.atom_id(i); - std::complex z = std::conj(ctx_.gvec_phase_factor(ig, ia)); - phase_factors(i, 2 * (igloc - g_begin)) = z.real(); - phase_factors(i, 2 * (igloc - g_begin) + 1) = z.imag(); + int ia = atom_type.atom_id(i); + auto z = std::conj(ctx_.gvec_phase_factor(ig, ia)); + phase_factors(i, 2 * g) = z.real(); + phase_factors(i, 2 * g + 1) = z.imag(); } } for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) { PROFILE_START("sirius::Density::generate_rho_aug|gemm"); - la::wrap(la::lib_t::blas) - .gemm('N', 'N', nbf * (nbf + 1) / 2, 2 * spl_ngv_loc.local_size(ib), atom_type.num_atoms(), - &la::constant::one(), dm.at(sddk::memory_t::host, 0, 0, iv), dm.ld(), - phase_factors.at(sddk::memory_t::host), phase_factors.ld(), &la::constant::zero(), - dm_pw.at(sddk::memory_t::host, 0, 0), dm_pw.ld()); + la::wrap(la::lib_t::blas).gemm('N', 'N', nqlm, 2 * ng, atom_type.num_atoms(), + &la::constant::one(), + dm.at(sddk::memory_t::host, 0, 0, iv), dm.ld(), + phase_factors.at(sddk::memory_t::host), phase_factors.ld(), + &la::constant::zero(), + dm_pw.at(sddk::memory_t::host, 0, 0, 0), dm_pw.ld()); PROFILE_STOP("sirius::Density::generate_rho_aug|gemm"); PROFILE_START("sirius::Density::generate_rho_aug|sum"); #pragma omp parallel for - for (int igloc = g_begin; igloc < g_end; igloc++) { + for (int g = 0; g < ng; g++) { + int igloc = g_begin + g; std::complex zsum(0, 0); /* get contribution from non-diagonal terms */ - for (int i = 0; i < nbf * (nbf + 1) / 2; i++) { - std::complex z1 = std::complex(ctx_.augmentation_op(iat).q_pw(i, 2 * igloc), - ctx_.augmentation_op(iat).q_pw(i, 2 * igloc + 1)); - std::complex z2(dm_pw(i, 2 * (igloc - g_begin)), dm_pw(i, 2 * (igloc - g_begin) + 1)); + for (int i = 0; i < nqlm; i++) { + std::complex z1(ctx_.augmentation_op(iat).q_pw(i, 2 * igloc), + ctx_.augmentation_op(iat).q_pw(i, 2 * igloc + 1)); + std::complex z2(dm_pw(i, 2 * g, 0), + dm_pw(i, 2 * g + 1, 0)); zsum += z1 * z2 * ctx_.augmentation_op(iat).sym_weight(i); } + /* add contribution from atoms of a given type */ rho_aug(igloc, iv) += zsum; } PROFILE_STOP("sirius::Density::generate_rho_aug|sum"); @@ -1468,29 +1472,31 @@ Density::generate_rho_aug() } case sddk::device_t::GPU: { #if defined(SIRIUS_GPU) + acc::copyin(qpw.at(sddk::memory_t::device), + ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin), 2 * ng * nqlm); + for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) { - generate_dm_pw_gpu(atom_type.num_atoms(), spl_ngv_loc.local_size(ib), nbf, + generate_dm_pw_gpu(atom_type.num_atoms(), ng, nbf, ctx_.unit_cell().atom_coord(iat).at(sddk::memory_t::device), ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 0), ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 1), ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 2), phase_factors.at(sddk::memory_t::device), dm.at(sddk::memory_t::device, 0, 0, iv), - dm_pw.at(sddk::memory_t::device), 1); - sum_q_pw_dm_pw_gpu(spl_ngv_loc.local_size(ib), nbf, - ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::device, 0, 2 * g_begin), - dm_pw.at(sddk::memory_t::device), + dm_pw.at(sddk::memory_t::device, 0, 0, iv), 1 + iv); + sum_q_pw_dm_pw_gpu(ng, nbf, qpw.at(sddk::memory_t::device), qpw.ld(), + dm_pw.at(sddk::memory_t::device, 0, 0, iv), dm_pw.ld(), ctx_.augmentation_op(iat).sym_weight().at(sddk::memory_t::device), - rho_aug.at(sddk::memory_t::device, g_begin, iv), 1); + rho_aug.at(sddk::memory_t::device, g_begin, iv), 1 + iv); + } + for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) { + acc::sync_stream(stream_id(1 + iv)); } #endif break; } - } - } + } // switch (pu) - if (ctx_.processing_unit() == sddk::device_t::GPU) { - acc::sync_stream(stream_id(1)); - ctx_.augmentation_op(iat).dismiss(); + g_begin += ng; } } diff --git a/src/density/density.hpp b/src/density/density.hpp index 8e4f5d953c..483ebf5e75 100644 --- a/src/density/density.hpp +++ b/src/density/density.hpp @@ -63,7 +63,7 @@ generate_dm_pw_gpu(int num_atoms__, int num_gvec_loc__, int num_beta__, double c double const* dm__, double* dm_pw__, int stream_id__); void -sum_q_pw_dm_pw_gpu(int num_gvec_loc__, int nbf__, double const* q_pw__, double const* dm_pw__, +sum_q_pw_dm_pw_gpu(int num_gvec_loc__, int nbf__, double const* q_pw__, int ldq__, double const* dm_pw__, int ldd__, double const* sym_weight__, std::complex* rho_pw__, int stream_id__); } diff --git a/src/fft/gvec.cpp b/src/fft/gvec.cpp index 6b76f1997d..5768f15ee9 100644 --- a/src/fft/gvec.cpp +++ b/src/fft/gvec.cpp @@ -29,7 +29,7 @@ namespace fft { -r3::vector fft::Gvec::gvec_by_full_index(uint32_t idx__) const +r3::vector Gvec::gvec_by_full_index(uint32_t idx__) const { /* index of the z coordinate of G-vector: first 12 bits */ uint32_t j = idx__ & 0xFFF; @@ -43,7 +43,7 @@ r3::vector fft::Gvec::gvec_by_full_index(uint32_t idx__) const return r3::vector(x, y, z); } -void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__) +void Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__) { PROFILE("fft::Gvec::find_z_columns"); @@ -71,7 +71,7 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__) /* get z-coordinate of G-vector */ int k = fft_box__.freq_by_coord<2>(iz); /* take G+k */ - auto vgk = dot(lattice_vectors_, (r3::vector(i, j, k) + vk_)); + auto vgk = r3::dot(lattice_vectors_, (r3::vector(i, j, k) + vk_)); /* add z-coordinate of G-vector to the list */ if (vgk.length() <= Gmax__) { zcol.push_back(k); @@ -95,6 +95,7 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__) } }; + PROFILE_START("fft::Gvec::find_z_columns|add"); /* copy column order from previous G-vector set */ if (gvec_base_) { for (int icol = 0; icol < gvec_base_->num_zcol(); icol++) { @@ -110,6 +111,7 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__) add_new_column(i, j); } } + PROFILE_STOP("fft::Gvec::find_z_columns|add"); if (!gvec_base_) { /* put column with {x, y} = {0, 0} to the beginning */ @@ -121,14 +123,10 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__) } } - /* sort z-columns starting from the second or skip num_zcol of base distribution */ - int n = (gvec_base_) ? gvec_base_->num_zcol() : 1; - std::sort(z_columns_.begin() + n, z_columns_.end(), - [](z_column_descriptor const& a, z_column_descriptor const& b) { return a.z.size() > b.z.size(); }); - + PROFILE_START("fft::Gvec::find_z_columns|sym"); /* now we have to remove edge G-vectors that don't form a complete shell */ if (bare_gvec_) { - auto lat_sym = sirius::find_lat_sym(lattice_vectors_, 1e-6); + auto lat_sym = sirius::find_lat_sym(this->unit_cell_lattice_vectors(), this->sym_tol_); std::fill(non_zero_columns.at(sddk::memory_t::host), non_zero_columns.at(sddk::memory_t::host) + non_zero_columns.size(), -1); for (int i = 0; i < static_cast(z_columns_.size()); i++) { @@ -137,47 +135,72 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__) std::vector z_columns_tmp; - num_gvec_ = 0; - for (int i = 0; i < static_cast(z_columns_.size()); i++) { + auto remove_incomplete = [this, &z_columns_tmp, &lat_sym, &non_zero_columns](int i) // i - index of column + { + int z_min = (2 << 20); + int z_max = -(2 << 20); + for (int iz = 0; iz < static_cast(z_columns_[i].z.size()); iz++) { + z_min = std::min(z_min, z_columns_[i].z[iz]); + z_max = std::max(z_max, z_columns_[i].z[iz]); + } std::vector z; for (int iz = 0; iz < static_cast(z_columns_[i].z.size()); iz++) { - r3::vector G(z_columns_[i].x, z_columns_[i].y, z_columns_[i].z[iz]); bool found_for_all_sym{true}; - for (auto& R: lat_sym) { - /* apply lattice symmeetry operation to a G-vector */ - auto G1 = dot(R, G); - if (reduce_gvec_) { - if (G1[0] == 0 && G1[1] == 0) { - G1[2] = std::abs(G1[2]); + /* check only first or last z coordinate inside z column */ + if (z_columns_[i].z[iz] == z_min || z_columns_[i].z[iz] == z_max) { + r3::vector G(z_columns_[i].x, z_columns_[i].y, z_columns_[i].z[iz]); + for (auto& R: lat_sym) { + /* apply lattice symmeetry operation to a G-vector */ + auto G1 = r3::dot(G, R); + if (reduce_gvec_) { + if (G1[0] == 0 && G1[1] == 0) { + G1[2] = std::abs(G1[2]); + } } - } - int i1 = non_zero_columns(G1[0], G1[1]); - if (i1 == -1) { - G1 = G1 * (-1); - i1 = non_zero_columns(G1[0], G1[1]); + int i1 = non_zero_columns(G1[0], G1[1]); if (i1 == -1) { - std::stringstream s; - s << "index of z-column is not found" << std::endl - << " G : " << G << std::endl - << " G1 : " << G1; - RTE_THROW(s); + G1 = G1 * (-1); + i1 = non_zero_columns(G1[0], G1[1]); + if (i1 == -1) { + std::stringstream s; + s << "index of z-column is not found" << std::endl + << " G : " << G << std::endl + << " G1 : " << G1; + RTE_THROW(s); + } } - } - bool found = (std::find(z_columns_[i1].z.begin(), z_columns_[i1].z.end(), G1[2]) != std::end(z_columns_[i1].z)); - found_for_all_sym = found_for_all_sym && found; - } // R + bool found = (std::find(z_columns_[i1].z.begin(), z_columns_[i1].z.end(), G1[2]) != std::end(z_columns_[i1].z)); + found_for_all_sym = found_for_all_sym && found; + } // R + } if (found_for_all_sym) { z.push_back(z_columns_[i].z[iz]); } } // iz if (z.size()) { z_columns_tmp.push_back(z_column_descriptor(z_columns_[i].x, z_columns_[i].y, z)); - num_gvec_ += static_cast(z.size()); } + }; + + for (int i = 0; i < static_cast(z_columns_.size()); i++) { + remove_incomplete(i); } z_columns_ = z_columns_tmp; + + num_gvec_ = 0; + for (auto& zc : z_columns_) { + num_gvec_ += static_cast(zc.z.size()); + } } + PROFILE_STOP("fft::Gvec::find_z_columns|sym"); + + PROFILE_START("fft::Gvec::find_z_columns|sort"); + /* sort z-columns starting from the second or skip num_zcol of base distribution */ + int n = (gvec_base_) ? gvec_base_->num_zcol() : 1; + std::sort(z_columns_.begin() + n, z_columns_.end(), + [](z_column_descriptor const& a, z_column_descriptor const& b) { return a.z.size() > b.z.size(); }); + PROFILE_STOP("fft::Gvec::find_z_columns|sort"); } void Gvec::distribute_z_columns() @@ -255,7 +278,7 @@ void Gvec::find_gvec_shells() PROFILE("fft::Gvec::find_gvec_shells"); - auto lat_sym = sirius::find_lat_sym(lattice_vectors_, 1e-6); + auto lat_sym = sirius::find_lat_sym(this->unit_cell_lattice_vectors(), this->sym_tol_); num_gvec_shells_ = 0; gvec_shell_ = sddk::mdarray(num_gvec_, sddk::memory_t::host, "gvec_shell_"); @@ -264,10 +287,11 @@ void Gvec::find_gvec_shells() /* find G-vector shells using symmetry consideration */ for (int ig = 0; ig < num_gvec_; ig++) { + /* if the shell for this vector is not yet found */ if (gvec_shell_[ig] == -1) { auto G = gvec(ig); for (auto& R: lat_sym) { - auto G1 = dot(R, G); + auto G1 = r3::dot(G, R); auto ig1 = index_by_gvec(G1); if (ig1 == -1) { G1 = G1 * (-1); @@ -276,7 +300,24 @@ void Gvec::find_gvec_shells() RTE_THROW("symmetry-related G-vector is not found"); } } - gvec_shell_[ig1] = num_gvec_shells_; + if (gvec_shell_[ig1] == -1) { + gvec_shell_[ig1] = num_gvec_shells_; + } else { + if (gvec_shell_[ig1] != num_gvec_shells_) { + auto gc = r3::dot(lattice_vectors_, G); + auto gc1 = r3::dot(lattice_vectors_, G1); + std::stringstream s; + s << "Error in G-vector shell index" << std::endl + << " G : " << G << std::endl + << " rotated G : " << G1 << std::endl + << " current shell index : " << num_gvec_shells_ << std::endl + << " rotated G shell index : " << gvec_shell_[ig1] << std::endl + << " length of G : " << gc.length() << std::endl + << " length of rotated G : " << gc1.length() << std::endl + << " length difference : " << std::abs(gc.length() - gc1.length()); + RTE_THROW(s); + } + } } num_gvec_shells_++; } @@ -288,62 +329,19 @@ void Gvec::find_gvec_shells() } gvec_shell_len_ = sddk::mdarray(num_gvec_shells_, sddk::memory_t::host, "gvec_shell_len_"); - std::fill(&gvec_shell_len_[0], &gvec_shell_len_[0] + num_gvec_shells_, -1); + std::fill(&gvec_shell_len_[0], &gvec_shell_len_[0] + num_gvec_shells_, 0); + + std::vector ngv_sh(num_gvec_shells_, 0); for (int ig = 0; ig < num_gvec_; ig++) { auto g = gvec_cart(ig).length(); int igsh = gvec_shell_[ig]; - if (gvec_shell_len_[igsh] < 0) { - gvec_shell_len_[igsh] = g; - } else { - /* lattice symmetries were found with 1e-6 tolererance for the metric tensor, - so tolerance on length should be square root of that */ - if (std::abs(gvec_shell_len_[igsh] - g) > 1e-3) { - std::stringstream s; - s << "wrong G-vector length" << std::endl - << " length of G-vector : " << g << std::endl - << " length of G-shell : " << gvec_shell_len_[igsh] << std::endl - << " index of G-vector : " << ig << std::endl - << " index of G-shell : " << igsh << std::endl - << " length difference : " << std::abs(gvec_shell_len_[igsh] - g) << std::endl; - RTE_THROW(s); - } - } + gvec_shell_len_[igsh] += g; + ngv_sh[igsh]++; } - - // TODO: maybe, make an average G-shell length. - - /* list of pairs (length, index of G-vector) */ - std::vector> tmp(num_gvec_); - #pragma omp parallel for schedule(static) - for (int ig = 0; ig < num_gvec(); ig++) { - /* make some reasonable roundoff */ - uint64_t len = static_cast(gvec_shell_len_[gvec_shell_[ig]] * 1e10); - tmp[ig] = std::make_pair(len, ig); - } - /* sort by first element in pair (length) */ - std::sort(tmp.begin(), tmp.end()); - - /* index of the first shell */ - gvec_shell_(tmp[0].second) = 0; - num_gvec_shells_ = 1; - /* temporary vector to store G-shell radius */ - std::vector tmp_len; - /* radius of the first shell */ - tmp_len.push_back(static_cast(tmp[0].first) * 1e-10); - for (int ig = 1; ig < num_gvec_; ig++) { - /* if this G-vector has a different length */ - if (tmp[ig].first != tmp[ig - 1].first) { - /* increment number of shells */ - num_gvec_shells_++; - /* save the radius of the new shell */ - tmp_len.push_back(static_cast(tmp[ig].first) * 1e-10); - } - /* assign the index of the current shell */ - gvec_shell_(tmp[ig].second) = num_gvec_shells_ - 1; + for (int i = 0; i < num_gvec_shells_; i++) { + gvec_shell_len_[i] /= ngv_sh[i]; } - gvec_shell_len_ = sddk::mdarray(num_gvec_shells_, sddk::memory_t::host, "gvec_shell_len_"); - std::copy(tmp_len.begin(), tmp_len.end(), gvec_shell_len_.at(sddk::memory_t::host)); /* map from global index of G-shell to a list of local G-vectors */ std::map> gshmap; @@ -389,13 +387,16 @@ Gvec::init_gvec_cart_local() { gvec_cart_ = sddk::mdarray(3, count(), sddk::memory_t::host, "gvec_cart_"); gkvec_cart_ = sddk::mdarray(3, count(), sddk::memory_t::host, "gkvec_cart_"); + /* this arrays are allocated with GPU- friendly data layout */ + gvec_tp_ = sddk::mdarray(count(), 2, sddk::memory_t::host, "gvec_tp_"); + gkvec_tp_ = sddk::mdarray(count(), 2, sddk::memory_t::host, "gvec_tp_"); if (bare_gvec_) { gvec_len_ = sddk::mdarray(count(), sddk::memory_t::host, "gvec_len_"); } for (int igloc = 0; igloc < count(); igloc++) { - auto gc = dot(lattice_vectors_, r3::vector(&gvec_(0, igloc))); - auto gkc = dot(lattice_vectors_, r3::vector(&gkvec_(0, igloc))); + auto gc = r3::dot(lattice_vectors_, r3::vector(&gvec_(0, igloc))); + auto gkc = r3::dot(lattice_vectors_, r3::vector(&gkvec_(0, igloc))); for (int x : {0, 1, 2}) { gvec_cart_(x, igloc) = gc[x]; gkvec_cart_(x, igloc) = gkc[x]; @@ -403,6 +404,13 @@ Gvec::init_gvec_cart_local() if (bare_gvec_) { gvec_len_(igloc) = gvec_shell_len_(gvec_shell_(this->offset() + igloc)); } + auto gs = r3::spherical_coordinates(gc); + gvec_tp_(igloc, 0) = gs[1]; + gvec_tp_(igloc, 1) = gs[2]; + + auto gks = r3::spherical_coordinates(gkc); + gkvec_tp_(igloc, 0) = gks[1]; + gkvec_tp_(igloc, 1) = gks[2]; } } diff --git a/src/fft/gvec.hpp b/src/fft/gvec.hpp index 24709db09c..5373cc12ca 100644 --- a/src/fft/gvec.hpp +++ b/src/fft/gvec.hpp @@ -209,6 +209,12 @@ class Gvec /// Length of the local fraction of G-vectors. sddk::mdarray gvec_len_; + // Theta- and phi- angles of G-vectors. + sddk::mdarray gvec_tp_; + + // Theta- and phi- angles of G+k-vectors. + sddk::mdarray gkvec_tp_; + /// Offset in the global index for the local part of G-vectors. int offset_{-1}; @@ -218,6 +224,9 @@ class Gvec /// Local number of z-columns. int num_zcol_local_{-1}; + /// Symmetry tolerance of the real-space lattice. + double sym_tol_{1e-6}; + /// Return corresponding G-vector for an index in the range [0, num_gvec). r3::vector gvec_by_full_index(uint32_t idx__) const; @@ -255,49 +264,58 @@ class Gvec public: /// Constructor for G+k vectors. /** \param [in] vk K-point vector of G+k - * \param [in] M Reciprocal lattice vecotors in comumn order + * \param [in] M Reciprocal lattice vectors in column order * \param [in] Gmax Cutoff for G+k vectors * \param [in] comm Total communicator which is used to distribute G-vectors * \param [in] reduce_gvec True if G-vectors need to be reduced by inversion symmetry. + * \param [in] sym_tol Unit cell lattice symmetry tolerance. */ - Gvec(r3::vector vk__, r3::matrix M__, double Gmax__, mpi::Communicator const& comm__, bool reduce_gvec__) - : vk_(vk__) - , Gmax_(Gmax__) - , lattice_vectors_(M__) - , comm_(comm__) - , reduce_gvec_(reduce_gvec__) - , bare_gvec_(false) + Gvec(r3::vector vk__, r3::matrix M__, double Gmax__, mpi::Communicator const& comm__, + bool reduce_gvec__, double sym_tol__ = 1e-6) + : vk_{vk__} + , Gmax_{Gmax__} + , lattice_vectors_{M__} + , comm_{comm__} + , reduce_gvec_{reduce_gvec__} + , bare_gvec_{false} + , sym_tol_{sym_tol__} { init(fft::get_min_grid(Gmax__, M__)); } /// Constructor for G-vectors. - /** \param [in] M Reciprocal lattice vecotors in comumn order + /** \param [in] M Reciprocal lattice vectors in column order * \param [in] Gmax Cutoff for G+k vectors * \param [in] comm Total communicator which is used to distribute G-vectors * \param [in] reduce_gvec True if G-vectors need to be reduced by inversion symmetry. + * \param [in] sym_tol Unit cell lattice symmetry tolerance. */ - Gvec(r3::matrix M__, double Gmax__, mpi::Communicator const& comm__, bool reduce_gvec__) - : Gmax_(Gmax__) - , lattice_vectors_(M__) - , comm_(comm__) - , reduce_gvec_(reduce_gvec__) + Gvec(r3::matrix M__, double Gmax__, mpi::Communicator const& comm__, bool reduce_gvec__, + double sym_tol__ = 1e-6) + : Gmax_{Gmax__} + , lattice_vectors_{M__} + , comm_{comm__} + , reduce_gvec_{reduce_gvec__} + , sym_tol_{sym_tol__} { init(fft::get_min_grid(Gmax__, M__)); } /// Constructor for G-vectors. - /** \param [in] M Reciprocal lattice vecotors in comumn order + /** \param [in] M Reciprocal lattice vectors in column order * \param [in] Gmax Cutoff for G+k vectors * \param [in] fft_grid Provide explicit boundaries for the G-vector min and max frequencies. * \param [in] comm Total communicator which is used to distribute G-vectors * \param [in] reduce_gvec True if G-vectors need to be reduced by inversion symmetry. + * \param [in] sym_tol Unit cell lattice symmetry tolerance. */ - Gvec(r3::matrix M__, double Gmax__, fft::Grid const& fft_grid__, mpi::Communicator const& comm__, bool reduce_gvec__) - : Gmax_(Gmax__) - , lattice_vectors_(M__) - , comm_(comm__) - , reduce_gvec_(reduce_gvec__) + Gvec(r3::matrix M__, double Gmax__, fft::Grid const& fft_grid__, mpi::Communicator const& comm__, + bool reduce_gvec__, double sym_tol__ = 1e-6) + : Gmax_{Gmax__} + , lattice_vectors_{M__} + , comm_{comm__} + , reduce_gvec_{reduce_gvec__} + , sym_tol_{sym_tol__} { init(fft_grid__); } @@ -305,25 +323,28 @@ class Gvec /// Constructor for G-vector distribution based on a previous set. /** Previous set of G-vectors must be a subset of the current set. */ Gvec(double Gmax__, Gvec const& gvec_base__) - : Gmax_(Gmax__) - , lattice_vectors_(gvec_base__.lattice_vectors()) - , comm_(gvec_base__.comm()) - , reduce_gvec_(gvec_base__.reduced()) - , gvec_base_(&gvec_base__) + : Gmax_{Gmax__} + , lattice_vectors_{gvec_base__.lattice_vectors_} + , comm_{gvec_base__.comm()} + , reduce_gvec_{gvec_base__.reduced()} + , gvec_base_{&gvec_base__} + , sym_tol_{gvec_base__.sym_tol_} { init(fft::get_min_grid(Gmax__, lattice_vectors_)); } /// Constructor for G-vectors with mpi_comm_self() - Gvec(r3::matrix M__, double Gmax__, bool reduce_gvec__) - : Gmax_(Gmax__) - , lattice_vectors_(M__) - , comm_(mpi::Communicator::self()) - , reduce_gvec_(reduce_gvec__) + Gvec(r3::matrix M__, double Gmax__, bool reduce_gvec__, double sym_tol__ = 1e-6) + : Gmax_{Gmax__} + , lattice_vectors_{M__} + , comm_{mpi::Communicator::self()} + , reduce_gvec_{reduce_gvec__} + , sym_tol_{sym_tol__} { init(fft::get_min_grid(Gmax__, M__)); } + /// Construct with the defined order of G-vectors. Gvec(r3::vector vk__, r3::matrix M__, int ngv_loc__, int const* gv__, mpi::Communicator const& comm__, bool reduce_gvec__) : vk_(vk__) @@ -408,7 +429,7 @@ class Gvec /// Set the new reciprocal lattice vectors. /** For the varibale-cell relaxation runs we need an option to preserve the number of G- and G+k vectors. * Here we can set the new lattice vectors and update the relevant members of the Gvec class. */ - inline r3::matrix const& lattice_vectors(r3::matrix lattice_vectors__) + inline auto const& lattice_vectors(r3::matrix lattice_vectors__) { lattice_vectors_ = lattice_vectors__; find_gvec_shells(); @@ -417,11 +438,18 @@ class Gvec } /// Retrn a const reference to the reciprocal lattice vectors. - inline r3::matrix const& lattice_vectors() const + inline auto const& lattice_vectors() const { return lattice_vectors_; } + inline auto const unit_cell_lattice_vectors() const + { + double const twopi = 6.2831853071795864769; + auto r = r3::transpose(r3::inverse(lattice_vectors_)) * twopi; + return r; + } + /// Return the volume of the real space unit cell that corresponds to the reciprocal lattice of G-vectors. inline double omega() const { @@ -697,6 +725,26 @@ class Gvec this->comm().bcast(&result(0, 0), 3 * ngv, rank__); return result; } + + inline auto& gvec_tp() + { + return gvec_tp_; + } + + inline auto const& gvec_tp() const + { + return gvec_tp_; + } + + inline auto& gkvec_tp() + { + return gkvec_tp_; + } + + inline auto const& gkvec_tp() const + { + return gkvec_tp_; + } }; /// Stores information about G-vector partitioning between MPI ranks for the FFT transformation. @@ -987,6 +1035,44 @@ gkvec_factory(r3::vector vk__, r3::matrix reciprocal_lattice_vec return std::make_shared(vk__, reciprocal_lattice_vectors__, gk_cutoff__, comm__, gamma__); } +inline void print(std::ostream& out__, Gvec const& gvec__) +{ + std::map> gsh_map; + for (int i = 0; i < gvec__.num_gvec(); i++) { + int igsh = gvec__.shell(i); + if (gsh_map.count(igsh) == 0) { + gsh_map[igsh] = std::vector(); + } + gsh_map[igsh].push_back(i); + } + + out__ << "num_gvec : " << gvec__.num_gvec() << std::endl; + out__ << "num_gvec_shells : " << gvec__.num_shells() << std::endl; + + for (int igsh = 0; igsh < gvec__.num_shells(); igsh++) { + auto len = gvec__.shell_len(igsh); + out__ << "shell : " << igsh << ", length : " << len << std::endl; + for (auto ig : gsh_map[igsh]) { + auto G = gvec__.gvec(ig); + auto Gc = gvec__.gvec_cart(ig); + out__ << " ig : " << ig << ", G = " << G << ", length diff : " << std::abs(Gc.length() - len) << std::endl; + } + } + //mpi::pstdout pout(gvec__.comm()); + //pout << "rank: " << gvec_.comm().rank() << std::endl; + //pout << "-- list of G-vectors in the remapped distribution --" << std::endl; + //for (int igloc = 0; igloc < gvec_count_remapped(); igloc++) { + // auto G = gvec_remapped(igloc); + + // int igsh = gvec_shell_remapped(igloc); + // pout << "igloc=" << igloc << " igsh=" << igsh << " G=" << G[0] << " " << G[1] << " " << G[2] << std::endl; + //} + //pout << "-- reverse list --" << std::endl; + //for (auto const& e: idx_gvec_) { + // pout << "G=" << e.first[0] << " " << e.first[1] << " " << e.first[2] << ", igloc=" << e.second << std::endl; + //} + //out__ << pout.flush(0); +} } // namespace sddk #endif //__GVEC_HPP__ diff --git a/src/geometry/force.cpp b/src/geometry/force.cpp index 9f257e3ec6..9d522fc1cb 100644 --- a/src/geometry/force.cpp +++ b/src/geometry/force.cpp @@ -51,8 +51,6 @@ Force::symmetrize(sddk::mdarray& forces__) const return; } - PROFILE("sirius::Force::symmetrize"); - sddk::mdarray sym_forces(3, ctx_.unit_cell().spl_num_atoms().local_size()); sym_forces.zero(); diff --git a/src/geometry/stress.cpp b/src/geometry/stress.cpp index a720992362..2fd7070834 100644 --- a/src/geometry/stress.cpp +++ b/src/geometry/stress.cpp @@ -381,13 +381,8 @@ Stress::calc_stress_us() return stress_us_; } - auto& ri = ctx_.aug_ri(); - auto& ri_dq = ctx_.aug_ri_djl(); - potential_.fft_transform(-1); - Augmentation_operator_gvec_deriv q_deriv(ctx_, ctx_.unit_cell().lmax(), ctx_.gvec(), ctx_.gvec_tp()); - la::lib_t la{la::lib_t::none}; sddk::memory_t qmem{sddk::memory_t::none}; @@ -402,7 +397,7 @@ Stress::calc_stress_us() case sddk::device_t::GPU: { mp = &get_memory_pool(sddk::memory_t::host_pinned); la = la::lib_t::spla; - qmem = sddk::memory_t::device; + qmem = sddk::memory_t::host; break; } } @@ -413,9 +408,9 @@ Stress::calc_stress_us() continue; } - q_deriv.prepare(atom_type, ri, ri_dq); + Augmentation_operator q_deriv(ctx_.unit_cell().atom_type(iat), ctx_.gvec(), ctx_.aug_ri(), ctx_.aug_ri_djl()); - int nbf = atom_type.mt_basis_size(); + auto nbf = atom_type.mt_basis_size(); /* get auxiliary density matrix */ auto dm = density_.density_matrix_aux(density_.density_matrix(), iat); @@ -424,7 +419,7 @@ Stress::calc_stress_us() get_memory_pool(sddk::memory_t::host)); PROFILE_START("sirius::Stress|us|phase_fac"); - #pragma omp parallel for schedule(static) + #pragma omp parallel for for (int igloc = 0; igloc < ctx_.gvec().count(); igloc++) { int ig = ctx_.gvec().offset() + igloc; for (int i = 0; i < atom_type.num_atoms(); i++) { @@ -436,10 +431,10 @@ Stress::calc_stress_us() sddk::mdarray v_tmp(atom_type.num_atoms(), ctx_.gvec().count() * 2, *mp); sddk::mdarray tmp(nbf * (nbf + 1) / 2, atom_type.num_atoms(), *mp); /* over spin components, can be from 1 to 4 */ - for (int ispin = 0; ispin < ctx_.num_mag_dims() + 1; ispin++) { - for (int nu = 0; nu < 3; nu++) { - q_deriv.generate_pw_coeffs(atom_type, nu); - + for (int nu = 0; nu < 3; nu++) { + /* generate dQ(G)/dG */ + q_deriv.generate_pw_coeffs_gvec_deriv(nu); + for (int ispin = 0; ispin < ctx_.num_mag_dims() + 1; ispin++) { for (int mu = 0; mu < 3; mu++) { PROFILE_START("sirius::Stress|us|prepare"); int igloc0{0}; @@ -449,7 +444,7 @@ Stress::calc_stress_us() } igloc0 = 1; } - #pragma omp parallel for schedule(static) + #pragma omp parallel for for (int igloc = igloc0; igloc < ctx_.gvec().count(); igloc++) { auto gvc = ctx_.gvec().gvec_cart(igloc); double g = gvc.length(); diff --git a/src/geometry/wavefunction_strain_deriv.hpp b/src/geometry/wavefunction_strain_deriv.hpp index e4ea260248..0cea77c0db 100644 --- a/src/geometry/wavefunction_strain_deriv.hpp +++ b/src/geometry/wavefunction_strain_deriv.hpp @@ -17,7 +17,7 @@ wavefunctions_strain_deriv(Simulation_context const& ctx__, K_point& kp_ /* Cartesian coordinats of G-vector */ auto gvc = kp__.gkvec().gkvec_cart(igkloc); /* vs = {r, theta, phi} */ - auto gvs = SHT::spherical_coordinates(gvc); + auto gvs = r3::spherical_coordinates(gvc); std::vector> ri_values(ctx__.unit_cell().num_atom_types()); for (int iat = 0; iat < ctx__.unit_cell().num_atom_types(); iat++) { diff --git a/src/gpu/spherical_harmonics.cu b/src/gpu/spherical_harmonics.cu index 08d022ee71..d3e88d483c 100644 --- a/src/gpu/spherical_harmonics.cu +++ b/src/gpu/spherical_harmonics.cu @@ -89,14 +89,14 @@ extern "C" void spherical_harmonics_ylm_gpu(int lmax__, int ntp__, double const* } -__global__ void spherical_harmonics_rlm_gpu_kernel(int lmax__, int ntp__, double const* tp__, +__global__ void spherical_harmonics_rlm_gpu_kernel(int lmax__, int ntp__, double const* theta__, double const* phi__, double* rlm__, int ld__) { int itp = blockDim.x * blockIdx.x + threadIdx.x; if (itp < ntp__) { - double theta = tp__[itp]; - double phi = tp__[ntp__ + itp]; + double theta = theta__[itp]; + double phi = phi__[itp]; double sint = sin(theta); double cost = cos(theta); @@ -146,10 +146,11 @@ __global__ void spherical_harmonics_rlm_gpu_kernel(int lmax__, int ntp__, double } } -extern "C" void spherical_harmonics_rlm_gpu(int lmax__, int ntp__, double const* tp__, double* rlm__, int ld__) +extern "C" void spherical_harmonics_rlm_gpu(int lmax__, int ntp__, double const* theta__, double const* phi__, + double* rlm__, int ld__) { dim3 grid_t(32); dim3 grid_b(num_blocks(ntp__, grid_t.x)); accLaunchKernel((spherical_harmonics_rlm_gpu_kernel), dim3(grid_b), dim3(grid_t), 0, 0, - lmax__, ntp__, tp__, rlm__, ld__); + lmax__, ntp__, theta__, phi__, rlm__, ld__); } diff --git a/src/gpu/sum_q_pw_dm_pw.cu b/src/gpu/sum_q_pw_dm_pw.cu index 5fb521d737..80516c3747 100644 --- a/src/gpu/sum_q_pw_dm_pw.cu +++ b/src/gpu/sum_q_pw_dm_pw.cu @@ -29,14 +29,9 @@ #include "gpu/cuda_timer.hpp" #endif -__global__ void sum_q_pw_dm_pw_gpu_kernel -( - int nbf__, - double const* q_pw__, - double const* dm_pw__, - double const* sym_weight__, - acc_complex_double_t* rho_pw__ -) +__global__ void +sum_q_pw_dm_pw_gpu_kernel(int nbf__, double const* q_pw__, int ldq__, double const* dm_pw__, int ldd__, + double const* sym_weight__, acc_complex_double_t* rho_pw__) { ACC_DYNAMIC_SHARED( char, sdata_ptr) double* rho_re = (double*)&sdata_ptr[0]; @@ -54,10 +49,10 @@ __global__ void sum_q_pw_dm_pw_gpu_kernel for (int n = 0; n < N; n++) { int i = n * blockDim.x + threadIdx.x; if (i < ld) { - double qx = q_pw__[array2D_offset(i, 2 * igloc, ld)]; - double qy = q_pw__[array2D_offset(i, 2 * igloc + 1, ld)]; - double dx = dm_pw__[array2D_offset(i, 2 * igloc, ld)]; - double dy = dm_pw__[array2D_offset(i, 2 * igloc + 1, ld)]; + double qx = q_pw__[array2D_offset(i, 2 * igloc, ldq__)]; + double qy = q_pw__[array2D_offset(i, 2 * igloc + 1, ldq__)]; + double dx = dm_pw__[array2D_offset(i, 2 * igloc, ldd__)]; + double dy = dm_pw__[array2D_offset(i, 2 * igloc + 1, ldd__)]; rho_re[threadIdx.x] += sym_weight__[i] * (dx * qx - dy * qy); rho_im[threadIdx.x] += sym_weight__[i] * (dy * qx + dx * qy); @@ -77,13 +72,9 @@ __global__ void sum_q_pw_dm_pw_gpu_kernel } } -extern "C" void sum_q_pw_dm_pw_gpu(int num_gvec_loc__, - int nbf__, - double const* q_pw__, - double const* dm_pw__, - double const* sym_weight__, - acc_complex_double_t* rho_pw__, - int stream_id__) +extern "C" void sum_q_pw_dm_pw_gpu(int num_gvec_loc__, int nbf__, double const* q_pw__, int ldq__, + double const* dm_pw__, int ldd__, double const* sym_weight__, + acc_complex_double_t* rho_pw__, int stream_id__) { #ifdef SIRIUS_CUDA CUDA_timer t("sum_q_pw_dm_pw_gpu"); @@ -93,14 +84,9 @@ extern "C" void sum_q_pw_dm_pw_gpu(int num_gvec_loc__, dim3 grid_t(64); dim3 grid_b(num_gvec_loc__); - + accLaunchKernel((sum_q_pw_dm_pw_gpu_kernel), dim3(grid_b), dim3(grid_t), 2 * grid_t.x * sizeof(double), stream, - nbf__, - q_pw__, - dm_pw__, - sym_weight__, - rho_pw__ - ); + nbf__, q_pw__, ldq__, dm_pw__, ldd__, sym_weight__, rho_pw__); } diff --git a/src/hamiltonian/hamiltonian.cpp b/src/hamiltonian/hamiltonian.cpp index e030166ef8..c093a761be 100644 --- a/src/hamiltonian/hamiltonian.cpp +++ b/src/hamiltonian/hamiltonian.cpp @@ -50,7 +50,7 @@ Hamiltonian0::Hamiltonian0(Potential& potential__, bool precompute_lapw__) if (precompute_lapw__) { potential_->generate_pw_coefs(); potential_->update_atomic_potential(); - ctx_.unit_cell().generate_radial_functions(); + ctx_.unit_cell().generate_radial_functions(ctx_.out()); ctx_.unit_cell().generate_radial_integrals(); } hmt_ = std::vector, 2>>(ctx_.unit_cell().num_atoms()); diff --git a/src/hamiltonian/hamiltonian_k.cpp b/src/hamiltonian/hamiltonian_k.cpp index 4d3ccbc792..8cfce040bb 100644 --- a/src/hamiltonian/hamiltonian_k.cpp +++ b/src/hamiltonian/hamiltonian_k.cpp @@ -842,7 +842,11 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range if (pcs) { if (hphi__) { auto cs = hphi__->checksum(mem, wf::spin_index(0), b__); + auto cs_pw = hphi__->checksum_pw(mem, wf::spin_index(0), b__); + auto cs_mt = hphi__->checksum_mt(mem, wf::spin_index(0), b__); if (comm.rank() == 0) { + utils::print_checksum("hloc_phi_pw", cs_pw, RTE_OUT(std::cout)); + utils::print_checksum("hloc_phi_mt", cs_mt, RTE_OUT(std::cout)); utils::print_checksum("hloc_phi", cs, RTE_OUT(std::cout)); } } diff --git a/src/hamiltonian/local_operator.cpp b/src/hamiltonian/local_operator.cpp index e3c76d4622..1c003e23ad 100644 --- a/src/hamiltonian/local_operator.cpp +++ b/src/hamiltonian/local_operator.cpp @@ -499,18 +499,18 @@ void Local_operator::apply_fplapw(fft::spfft_transform_type& spfftk__, std wf::Wave_functions_fft phi_fft(gkvec_fft__, phi__, s0, b__, wf::shuffle_to::fft_layout); - std::map*, wf::Wave_functions_fft> map_wf_fft; + std::array, 4> wf_fft; if (hphi__) { - map_wf_fft[hphi__] = wf::Wave_functions_fft(gkvec_fft__, *hphi__, s0, b__, wf::shuffle_to::wf_layout); + wf_fft[0] = wf::Wave_functions_fft(gkvec_fft__, *hphi__, s0, b__, wf::shuffle_to::wf_layout); } if (ophi__) { - map_wf_fft[ophi__] = wf::Wave_functions_fft(gkvec_fft__, *ophi__, s0, b__, wf::shuffle_to::wf_layout); + wf_fft[1] = wf::Wave_functions_fft(gkvec_fft__, *ophi__, s0, b__, wf::shuffle_to::wf_layout); } if (bzphi__) { - map_wf_fft[bzphi__] = wf::Wave_functions_fft(gkvec_fft__, *bzphi__, s0, b__, wf::shuffle_to::wf_layout); + wf_fft[2] = wf::Wave_functions_fft(gkvec_fft__, *bzphi__, s0, b__, wf::shuffle_to::wf_layout); } if (bxyphi__) { - map_wf_fft[bxyphi__] = wf::Wave_functions_fft(gkvec_fft__, *bxyphi__, s0, b__, wf::shuffle_to::wf_layout); + wf_fft[3] = wf::Wave_functions_fft(gkvec_fft__, *bxyphi__, s0, b__, wf::shuffle_to::wf_layout); } auto pcs = env::print_checksum(); @@ -565,40 +565,40 @@ void Local_operator::apply_fplapw(fft::spfft_transform_type& spfftk__, std /* multiply phi(r) by step function */ mul_by_veff(spfftk__, reinterpret_cast(phi_r), veff_vec_, v_local_index_t::theta, spfft_buf); - auto mem = map_wf_fft[ophi__].on_device() ? sddk::memory_t::device : sddk::memory_t::host; + auto mem = wf_fft[1].on_device() ? sddk::memory_t::device : sddk::memory_t::host; /* phi(r) * Theta(r) -> ophi(G) */ - spfftk__.forward(spfft_pu, map_wf_fft[ophi__].pw_coeffs_spfft(mem, wf::band_index(j)), + spfftk__.forward(spfft_pu, wf_fft[1].pw_coeffs_spfft(mem, wf::band_index(j)), SPFFT_FULL_SCALING); } if (bzphi__) { mul_by_veff(spfftk__, reinterpret_cast(phi_r), veff_vec_, v_local_index_t::v1, spfft_buf); - auto mem = map_wf_fft[bzphi__].on_device() ? sddk::memory_t::device : sddk::memory_t::host; + auto mem = wf_fft[2].on_device() ? sddk::memory_t::device : sddk::memory_t::host; /* phi(r) * Bz(r) -> bzphi(G) */ - spfftk__.forward(spfft_pu, map_wf_fft[bzphi__].pw_coeffs_spfft(mem, wf::band_index(j)), + spfftk__.forward(spfft_pu, wf_fft[2].pw_coeffs_spfft(mem, wf::band_index(j)), SPFFT_FULL_SCALING); } if (bxyphi__) { mul_by_veff(spfftk__, reinterpret_cast(phi_r), veff_vec_, 2, spfft_buf); - auto mem = map_wf_fft[bxyphi__].on_device() ? sddk::memory_t::device : sddk::memory_t::host; + auto mem = wf_fft[3].on_device() ? sddk::memory_t::device : sddk::memory_t::host; /* phi(r) * (Bx(r) - iBy(r)) -> bxyphi(G) */ - spfftk__.forward(spfft_pu, map_wf_fft[bxyphi__].pw_coeffs_spfft(mem, wf::band_index(j)), + spfftk__.forward(spfft_pu, wf_fft[3].pw_coeffs_spfft(mem, wf::band_index(j)), SPFFT_FULL_SCALING); } if (hphi__) { mul_by_veff(spfftk__, reinterpret_cast(phi_r), veff_vec_, v_local_index_t::v0, spfft_buf); - auto mem = map_wf_fft[hphi__].on_device() ? sddk::memory_t::device : sddk::memory_t::host; + auto mem = wf_fft[0].on_device() ? sddk::memory_t::device : sddk::memory_t::host; /* phi(r) * Theta(r) * V(r) -> hphi(G) */ - spfftk__.forward(spfft_pu, map_wf_fft[hphi__].pw_coeffs_spfft(mem, wf::band_index(j)), + spfftk__.forward(spfft_pu, wf_fft[0].pw_coeffs_spfft(mem, wf::band_index(j)), SPFFT_FULL_SCALING); /* add kinetic energy */ @@ -641,13 +641,13 @@ void Local_operator::apply_fplapw(fft::spfft_transform_type& spfftk__, std #pragma omp parallel for for (int igloc = 0; igloc < gkvec_fft__->count(); igloc++) { auto gvc = gkvec_fft__->gkvec_cart(igloc); - map_wf_fft[hphi__].pw_coeffs(igloc, wf::band_index(j)) += buf_pw[igloc] * + wf_fft[0].pw_coeffs(igloc, wf::band_index(j)) += buf_pw[igloc] * static_cast(0.5 * gvc[x]); } } else { add_to_hphi_lapw_gpu(gkvec_fft__->count(), buf_pw.at(sddk::memory_t::device), gkvec_cart_.at(sddk::memory_t::device, 0, x), - map_wf_fft[hphi__].at(sddk::memory_t::device, 0, wf::band_index(j))); + wf_fft[0].at(sddk::memory_t::device, 0, wf::band_index(j))); } } // x } diff --git a/src/hubbard/hubbard_occupancies_derivatives.cpp b/src/hubbard/hubbard_occupancies_derivatives.cpp index 4bb5ab1474..3f2613897d 100644 --- a/src/hubbard/hubbard_occupancies_derivatives.cpp +++ b/src/hubbard/hubbard_occupancies_derivatives.cpp @@ -393,7 +393,7 @@ Hubbard::compute_occupancies_stress_derivatives(K_point& kp__, Q_operato for (int igkloc = 0; igkloc < kp__.num_gkvec_loc(); igkloc++) { /* gvs = {r, theta, phi} */ auto gvc = kp__.gkvec().gkvec_cart(igkloc); - auto rtp = SHT::spherical_coordinates(gvc); + auto rtp = r3::spherical_coordinates(gvc); sf::spherical_harmonics(lmax, rtp[1], rtp[2], &rlm_g(0, igkloc)); sddk::mdarray rlm_dg_tmp(&rlm_dg(0, 0, igkloc), lmmax, 3); @@ -401,9 +401,9 @@ Hubbard::compute_occupancies_stress_derivatives(K_point& kp__, Q_operato } /* atomic wave functions */ - auto& phi_atomic = kp__.atomic_wave_functions(); - auto& phi_atomic_S = kp__.atomic_wave_functions_S(); - auto& phi_hub_S = kp__.hubbard_wave_functions_S(); + auto& phi_atomic = kp__.atomic_wave_functions(); + auto& phi_atomic_S = kp__.atomic_wave_functions_S(); + auto& phi_hub_S = kp__.hubbard_wave_functions_S(); auto num_ps_atomic_wf = ctx_.unit_cell().num_ps_atomic_wf(); auto num_hubbard_wf = ctx_.unit_cell().num_hubbard_wf(); diff --git a/src/k_point/generate_fv_states.cpp b/src/k_point/generate_fv_states.cpp index a583a83102..18eaad46a4 100644 --- a/src/k_point/generate_fv_states.cpp +++ b/src/k_point/generate_fv_states.cpp @@ -38,6 +38,8 @@ void K_point::generate_fv_states() auto const& uc = ctx_.unit_cell(); + auto pcs = env::print_checksum(); + auto bs = ctx_.cyclic_block_size(); la::dmatrix> alm_fv(uc.mt_aw_basis_size(), ctx_.num_fv_states(), ctx_.blacs_grid(), bs, bs); @@ -57,6 +59,10 @@ void K_point::generate_fv_states() /* generate complex conjugated Alm coefficients for a block of atoms */ auto alm = generate_alm_block(ctx_, atom_begin, na, this->alm_coeffs_loc()); + auto cs = alm.checksum(); + if (pcs) { + utils::print_checksum("alm", cs, RTE_OUT(this->out(0))); + } /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for the block of atoms */ spla::pgemm_ssb(num_mt_aw, ctx_.num_fv_states(), this->gkvec().count(), SPLA_OP_TRANSPOSE, 1.0, @@ -106,6 +112,13 @@ void K_point::generate_fv_states() } } } + if (pcs) { + auto z1 = fv_states_->checksum_pw(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states())); + auto z2 = fv_states_->checksum_mt(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states())); + utils::print_checksum("fv_states_pw", z1, RTE_OUT(this->out(0))); + utils::print_checksum("fv_states_mt", z2, RTE_OUT(this->out(0))); + + } } template void K_point::generate_fv_states(); diff --git a/src/lapw/matching_coefficients.hpp b/src/lapw/matching_coefficients.hpp index d4b9e22e9d..8ce7799e6f 100644 --- a/src/lapw/matching_coefficients.hpp +++ b/src/lapw/matching_coefficients.hpp @@ -128,7 +128,7 @@ class Matching_coefficients // TODO: compute on GPU for (int i = 0; i < gkvec_.count(); i++) { auto gkvec_cart = gkvec_.gkvec_cart(i); /* get r, theta, phi */ - auto vs = SHT::spherical_coordinates(gkvec_cart); + auto vs = r3::spherical_coordinates(gkvec_cart); gkvec_len_[i] = vs[0]; /* get spherical harmonics */ diff --git a/src/mpi/communicator.hpp b/src/mpi/communicator.hpp index 32d075d6a2..95e8ee9697 100644 --- a/src/mpi/communicator.hpp +++ b/src/mpi/communicator.hpp @@ -276,8 +276,9 @@ class Communicator MPI_Init_thread(NULL, NULL, required__, &provided); MPI_Query_thread(&provided); - if (provided < required__) { - std::printf("Warning! Required level of thread support is not provided.\nprovided: %d \nrequired: %d\n", provided, required__); + if ((provided < required__) && (Communicator::world().rank() == 0)) { + std::printf("Warning! Required level of thread support is not provided.\n"); + std::printf("provided: %d \nrequired: %d\n", provided, required__); } } diff --git a/src/potential/generate_d_operator_matrix.cpp b/src/potential/generate_d_operator_matrix.cpp index 229325f9d0..8a04bd2e7b 100644 --- a/src/potential/generate_d_operator_matrix.cpp +++ b/src/potential/generate_d_operator_matrix.cpp @@ -43,24 +43,40 @@ void Potential::generate_D_operator_matrix() { PROFILE("sirius::Potential::generate_D_operator_matrix"); - auto spl_ngv_loc = ctx_.split_gvec_local(); + /* local number of G-vectors */ + int gvec_count = ctx_.gvec().count(); + auto spl_ngv_loc = utils::split_in_blocks(gvec_count, ctx_.cfg().control().gvec_chunk_size()); - if (ctx_.unit_cell().atom_type(0).augment()) { - ctx_.augmentation_op(0).prepare(stream_id(0), &get_memory_pool(sddk::memory_t::device)); - } - for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) { - auto& atom_type = unit_cell_.atom_type(iat); - int nbf = atom_type.mt_basis_size(); + auto& mph = get_memory_pool(sddk::memory_t::host); + auto& mpd = get_memory_pool(sddk::memory_t::device); - /* start copy of Q(G) for the next atom type */ - if (ctx_.processing_unit() == sddk::device_t::GPU) { - acc::sync_stream(stream_id(0)); - if (iat + 1 != unit_cell_.num_atom_types() && ctx_.unit_cell().atom_type(iat + 1).augment()) { - ctx_.augmentation_op(iat + 1).prepare(stream_id(0), &get_memory_pool(sddk::memory_t::device)); + int n_mag_comp{1}; + + sddk::mdarray, 2> veff; + switch (ctx_.processing_unit()) { + case sddk::device_t::CPU: { + break; + } + case sddk::device_t::GPU: { + n_mag_comp = ctx_.num_mag_dims() + 1; + veff = sddk::mdarray, 2>(gvec_count, n_mag_comp, mph); + for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) { + std::copy(&component(j).f_pw_local(0), &component(j).f_pw_local(0) + gvec_count, &veff(0, j)); } + veff.allocate(mpd).copy_to(sddk::memory_t::device); + break; } + } + + for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) { + auto& atom_type = unit_cell_.atom_type(iat); + /* number of beta-projector functions */ + int nbf = atom_type.mt_basis_size(); + /* number of Q_{xi,xi'} components for each G */ + int nqlm = nbf * (nbf + 1) / 2; /* trivial case */ + /* in absence of augmentation charge D-matrix is zero */ if (!atom_type.augment() || atom_type.num_atoms() == 0) { for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) { for (int i = 0; i < atom_type.num_atoms(); i++) { @@ -76,117 +92,130 @@ void Potential::generate_D_operator_matrix() } continue; } - sddk::matrix d_tmp(nbf * (nbf + 1) / 2, atom_type.num_atoms(), get_memory_pool(sddk::memory_t::host)); - if (ctx_.processing_unit() == sddk::device_t::GPU) { - d_tmp.allocate(get_memory_pool(sddk::memory_t::device)); - } - - print_memory_usage(ctx_.out(), FILE_LINE); - for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) { - sddk::matrix veff_a(2 * spl_ngv_loc.local_size(), atom_type.num_atoms(), get_memory_pool(sddk::memory_t::host)); - - auto la = la::lib_t::blas; - auto mem = sddk::memory_t::host; + sddk::mdarray d_tmp(nqlm, atom_type.num_atoms(), ctx_.num_mag_dims() + 1, mph); + sddk::mdarray veff_a(spl_ngv_loc[0] * 2, atom_type.num_atoms(), n_mag_comp, mph); + sddk::mdarray qpw; - d_tmp.zero(); - if (ctx_.processing_unit() == sddk::device_t::GPU) { - la = la::lib_t::gpublas; - mem = sddk::memory_t::device; - d_tmp.zero(sddk::memory_t::device); - veff_a.allocate(get_memory_pool(sddk::memory_t::device)); + switch (ctx_.processing_unit()) { + case sddk::device_t::CPU: { + d_tmp.zero(); + break; + } + case sddk::device_t::GPU: { + d_tmp.allocate(mpd).zero(sddk::memory_t::device); + veff_a.allocate(mpd); + qpw = sddk::mdarray(nqlm, 2 * spl_ngv_loc[0], mpd, "qpw"); + break; } + } - /* split a large loop over G-vectors into blocks */ - for (int ib = 0; ib < spl_ngv_loc.num_ranks(); ib++) { - int g_begin = spl_ngv_loc.global_index(0, ib); - int g_end = g_begin + spl_ngv_loc.local_size(ib); + print_memory_usage(ctx_.out(), FILE_LINE); - switch (ctx_.processing_unit()) { - case sddk::device_t::CPU: { - #pragma omp parallel for schedule(static) + int g_begin{0}; + /* loop over blocks of G-vectors */ + for (auto ng : spl_ngv_loc) { + /* work on the block of the local G-vectors */ + switch (ctx_.processing_unit()) { + case sddk::device_t::CPU: { + for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) { + /* multiply Veff(G) with the phase factors */ + #pragma omp parallel for for (int i = 0; i < atom_type.num_atoms(); i++) { int ia = atom_type.atom_id(i); - for (int igloc = g_begin; igloc < g_end; igloc++) { - int ig = ctx_.gvec().offset() + igloc; + for (int g = 0; g < ng; g++) { + int ig = ctx_.gvec().offset() + g_begin + g; /* V(G) * exp(i * G * r_{alpha}) */ - auto z = component(iv).f_pw_local(igloc) * ctx_.gvec_phase_factor(ig, ia); - veff_a(2 * (igloc - g_begin), i) = z.real(); - veff_a(2 * (igloc - g_begin) + 1, i) = z.imag(); + auto z = component(iv).f_pw_local(g_begin + g) * ctx_.gvec_phase_factor(ig, ia); + veff_a(2 * g, i, 0) = z.real(); + veff_a(2 * g + 1, i, 0) = z.imag(); } } - break; - } - case sddk::device_t::GPU: { - /* wait for stream#1 to finish previous zgemm */ - acc::sync_stream(stream_id(1)); - /* copy plane wave coefficients of effective potential to GPU */ - sddk::mdarray, 1> veff(&component(iv).f_pw_local(g_begin), spl_ngv_loc.local_size(ib)); - veff.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device); + la::wrap(la::lib_t::blas).gemm('N', 'N', nqlm, atom_type.num_atoms(), 2 * ng, + &la::constant::one(), + ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin), + ctx_.augmentation_op(iat).q_pw().ld(), + veff_a.at(sddk::memory_t::host), veff_a.ld(), + &la::constant::one(), + d_tmp.at(sddk::memory_t::host, 0, 0, iv), d_tmp.ld()); + } // iv + break; + } + case sddk::device_t::GPU: { + acc::copyin(qpw.at(sddk::memory_t::device), + ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin), 2 * ng * nqlm); + for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) { #if defined(SIRIUS_GPU) - mul_veff_with_phase_factors_gpu(atom_type.num_atoms(), spl_ngv_loc.local_size(ib), - veff.at(sddk::memory_t::device), - ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 0), - ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 1), - ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 2), - ctx_.unit_cell().atom_coord(iat).at(sddk::memory_t::device), - veff_a.at(sddk::memory_t::device), spl_ngv_loc.local_size(), 1); + mul_veff_with_phase_factors_gpu(atom_type.num_atoms(), ng, veff.at(sddk::memory_t::device, 0, iv), + ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 0), + ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 1), + ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 2), + ctx_.unit_cell().atom_coord(iat).at(sddk::memory_t::device), + veff_a.at(sddk::memory_t::device, 0, 0, iv), ng, 1 + iv); + + la::wrap(la::lib_t::gpublas).gemm('N', 'N', nqlm, atom_type.num_atoms(), 2 * ng, + &la::constant::one(), + qpw.at(sddk::memory_t::device), qpw.ld(), + veff_a.at(sddk::memory_t::device, 0, 0, iv), veff_a.ld(), + &la::constant::one(), + d_tmp.at(sddk::memory_t::device, 0, 0, iv), d_tmp.ld(), + stream_id(1 + iv)); #endif - break; - } - } - if (ctx_.cfg().control().print_checksum()) { - if (ctx_.processing_unit() == sddk::device_t::GPU) { - veff_a.copy_to(sddk::memory_t::host); + } // iv + for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) { + acc::sync_stream(stream_id(1 + iv)); } - auto cs = veff_a.checksum(); - std::stringstream s; - s << "Gvec_block_" << ib << "_veff_a"; - utils::print_checksum(s.str(), cs, ctx_.out()); + break; } - la::wrap(la).gemm('N', 'N', nbf * (nbf + 1) / 2, atom_type.num_atoms(), 2 * spl_ngv_loc.local_size(ib), - &la::constant::one(), - ctx_.augmentation_op(iat).q_pw().at(mem, 0, 2 * g_begin), - ctx_.augmentation_op(iat).q_pw().ld(), - veff_a.at(mem), veff_a.ld(), - &la::constant::one(), - d_tmp.at(mem), d_tmp.ld(), - stream_id(1)); - } // ib (blocks of G-vectors) - - if (ctx_.processing_unit() == sddk::device_t::GPU) { - d_tmp.copy_to(sddk::memory_t::host); } + g_begin += ng; + } + + if (ctx_.processing_unit() == sddk::device_t::GPU) { + d_tmp.copy_to(sddk::memory_t::host); + } + + //if (ctx_.cfg().control().print_checksum()) { + // if (ctx_.processing_unit() == sddk::device_t::GPU) { + // veff_a.copy_to(sddk::memory_t::host); + // } + // auto cs = veff_a.checksum(); + // std::stringstream s; + // s << "Gvec_block_" << ib << "_veff_a"; + // utils::print_checksum(s.str(), cs, ctx_.out()); + //} + + for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) { if (ctx_.gvec().reduced()) { if (comm_.rank() == 0) { for (int i = 0; i < atom_type.num_atoms(); i++) { - for (int j = 0; j < nbf * (nbf + 1) / 2; j++) { - d_tmp(j, i) = 2 * d_tmp(j, i) - component(iv).f_pw_local(0).real() * + for (int j = 0; j < nqlm; j++) { + d_tmp(j, i, iv) = 2 * d_tmp(j, i, iv) - component(iv).f_pw_local(0).real() * ctx_.augmentation_op(iat).q_pw(j, 0); } } } else { for (int i = 0; i < atom_type.num_atoms(); i++) { - for (int j = 0; j < nbf * (nbf + 1) / 2; j++) { - d_tmp(j, i) *= 2; + for (int j = 0; j < nqlm; j++) { + d_tmp(j, i, iv) *= 2; } } } } /* sum from all ranks */ - comm_.allreduce(d_tmp.at(sddk::memory_t::host), static_cast(d_tmp.size())); + comm_.allreduce(d_tmp.at(sddk::memory_t::host, 0, 0, iv), nqlm * atom_type.num_atoms()); - if (ctx_.cfg().control().print_checksum() && ctx_.comm().rank() == 0) { - for (int i = 0; i < atom_type.num_atoms(); i++) { - std::stringstream s; - s << "D_mtrx_val(atom_t" << iat << "_i" << i << "_c" << iv << ")"; - auto cs = sddk::mdarray(&d_tmp(0, i), nbf * (nbf + 1) / 2).checksum(); - utils::print_checksum(s.str(), cs, ctx_.out()); - } - } + //if (ctx_.cfg().control().print_checksum() && ctx_.comm().rank() == 0) { + // for (int i = 0; i < atom_type.num_atoms(); i++) { + // std::stringstream s; + // s << "D_mtrx_val(atom_t" << iat << "_i" << i << "_c" << iv << ")"; + // auto cs = sddk::mdarray(&d_tmp(0, i), nbf * (nbf + 1) / 2).checksum(); + // utils::print_checksum(s.str(), cs, ctx_.out()); + // } + //} #pragma omp parallel for schedule(static) for (int i = 0; i < atom_type.num_atoms(); i++) { @@ -197,13 +226,12 @@ void Potential::generate_D_operator_matrix() for (int xi1 = 0; xi1 <= xi2; xi1++) { int idx12 = xi2 * (xi2 + 1) / 2 + xi1; /* D-matix is symmetric */ - atom.d_mtrx(xi1, xi2, iv) = atom.d_mtrx(xi2, xi1, iv) = d_tmp(idx12, i) * unit_cell_.omega(); + atom.d_mtrx(xi1, xi2, iv) = atom.d_mtrx(xi2, xi1, iv) = d_tmp(idx12, i, iv) * unit_cell_.omega(); } } } - } - ctx_.augmentation_op(iat).dismiss(); - } + } // iv + } // iat } } // namespace sirius diff --git a/src/sht/gaunt.hpp b/src/sht/gaunt.hpp index 1adef151e0..753bc2bb94 100644 --- a/src/sht/gaunt.hpp +++ b/src/sht/gaunt.hpp @@ -201,7 +201,8 @@ class Gaunt_coefficients return gaunt_packed_L3_(lm1, lm2); } - inline sddk::mdarray get_full_set_L3() const + /// Return the full tensor of Gaunt coefficients with a (L3, L1, L2) order of indices. + inline auto get_full_set_L3() const { sddk::mdarray gc(lmmax3_, lmmax1_, lmmax2_); gc.zero(); diff --git a/src/sht/sht.hpp b/src/sht/sht.hpp index ec1a82f5f3..4defdd9f32 100644 --- a/src/sht/sht.hpp +++ b/src/sht/sht.hpp @@ -329,34 +329,6 @@ class SHT // TODO: better name // } //} - /// Transform Cartesian coordinates [x,y,z] to spherical coordinates [r,theta,phi] - static r3::vector spherical_coordinates(r3::vector vc) - { - r3::vector vs; - - const double eps{1e-12}; - - vs[0] = vc.length(); - - if (vs[0] <= eps) { - vs[1] = 0.0; - vs[2] = 0.0; - } else { - vs[1] = std::acos(vc[2] / vs[0]); // theta = cos^{-1}(z/r) - - if (std::abs(vc[0]) > eps || std::abs(vc[1]) > eps) { - vs[2] = std::atan2(vc[1], vc[0]); // phi = tan^{-1}(y/x) - if (vs[2] < 0.0) { - vs[2] += twopi; - } - } else { - vs[2] = 0.0; - } - } - - return vs; - } - /// Compute element of the transformation matrix from complex to real spherical harmonics. /** Real spherical harmonic can be written as a linear combination of complex harmonics: \f[ diff --git a/src/specfunc/specfunc.hpp b/src/specfunc/specfunc.hpp index b10e998515..88259faa02 100644 --- a/src/specfunc/specfunc.hpp +++ b/src/specfunc/specfunc.hpp @@ -493,7 +493,6 @@ inline void dRlm_dr(int lmax__, r3::vector& r__, sddk::mdarray dtheta_dr({cost * cosp, cost * sinp, -sint}); r3::vector dphi_dr({-sinp, cosp, 0}); - std::vector dRlm_dt(lmmax); std::vector dRlm_dp_sin_t(lmmax); diff --git a/src/symmetry/crystal_symmetry.cpp b/src/symmetry/crystal_symmetry.cpp index 4328f559df..bb8af75e48 100644 --- a/src/symmetry/crystal_symmetry.cpp +++ b/src/symmetry/crystal_symmetry.cpp @@ -325,7 +325,10 @@ void Crystal_symmetry::print_info(std::ostream& out__, int verbosity__) const { if (this->spg_dataset_ && (this->spg_dataset_->n_operations != this->num_spg_sym())) { - out__ << "space group found by spglib is defferent" << std::endl; + out__ << "space group found by spglib is different" << std::endl + << " num. sym. spglib : " << this->spg_dataset_->n_operations << std::endl + << " num. sym. actual : " << this->num_spg_sym() << std::endl + << " tolerance : " << this->tolerance_ << std::endl; } else { out__ << "space group number : " << this->spacegroup_number() << std::endl << "international symbol : " << this->international_symbol() << std::endl diff --git a/src/symmetry/lattice.hpp b/src/symmetry/lattice.hpp index 35d3a9f9e5..da4aa6c18d 100644 --- a/src/symmetry/lattice.hpp +++ b/src/symmetry/lattice.hpp @@ -22,6 +22,7 @@ * \brief Crystal lattice functions. */ +#include #include "linalg/r3.hpp" #include "utils/rte.hpp" @@ -54,12 +55,14 @@ metric_tensor_error(r3::matrix const& lat_vec__, r3::matrix const& } inline auto -find_lat_sym(r3::matrix const& lat_vec__, double tol__) +find_lat_sym(r3::matrix const& lat_vec__, double tol__, double* mt_error__ = nullptr) { std::vector> lat_sym; auto r = {-1, 0, 1}; + double mt_error_max{0}; + for (int i00: r) { for (int i01: r) { for (int i02: r) { @@ -73,8 +76,10 @@ find_lat_sym(r3::matrix const& lat_vec__, double tol__) r3::matrix R({{i00, i01, i02}, {i10, i11, i12}, {i20, i21, i22}}); /* valid symmetry operation has a determinant of +/- 1 */ if (std::abs(R.det()) == 1) { + auto mt_error = metric_tensor_error(lat_vec__, R); + mt_error_max = std::max(mt_error_max, mt_error); /* metric tensor should be invariant under symmetry operation */ - if (metric_tensor_error(lat_vec__, R) < tol__) { + if (mt_error < tol__) { lat_sym.push_back(R); } } @@ -88,12 +93,38 @@ find_lat_sym(r3::matrix const& lat_vec__, double tol__) } } + if (mt_error__) { + *mt_error__ = mt_error_max; + } + if (lat_sym.size() == 0 || lat_sym.size() > 48) { std::stringstream s; - s << "wrong number of lattice symmetries: " << lat_sym.size(); + s << "wrong number of lattice symmetries: " << lat_sym.size() << std::endl + << " lattice vectors : " << lat_vec__ << std::endl + << " tolerance : " << tol__ << std::endl + << " metric tensor error : " << mt_error_max; RTE_THROW(s); } + /* check if the set of symmetry operations is a group */ + for (auto& R1: lat_sym) { + for (auto& R2: lat_sym) { + auto R3 = r3::dot(R1, R2); + if (std::find(lat_sym.begin(), lat_sym.end(), R3) == lat_sym.end()) { + std::stringstream s; + s << "lattice symmetries do not form a group" << std::endl; + for (auto& R: lat_sym) { + s << " sym.op : " << R << ", metric tensor error : " << metric_tensor_error(lat_vec__, R) << std::endl; + } + s << "R1 : " << R1 << std::endl; + s << "R2 : " << R2 << std::endl; + s << "R1 * R2 : " << R3 << " is not in group" << std::endl; + s << "metric tensor tolerance : " << tol__; + RTE_THROW(s); + } + } + } + return lat_sym; } diff --git a/src/symmetry/symmetrize.hpp b/src/symmetry/symmetrize.hpp index c07ba71e98..0f20c0c05f 100644 --- a/src/symmetry/symmetrize.hpp +++ b/src/symmetry/symmetrize.hpp @@ -31,6 +31,7 @@ #include "typedefs.hpp" #include "sht/sht.hpp" #include "utils/profiler.hpp" +#include "utils/rte.hpp" namespace sirius { @@ -140,7 +141,12 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__, #if !defined(NDEBUG) if (igsh != gvec_shells__.gvec().shell(G)) { - throw std::runtime_error("wrong index of G-shell"); + std::stringstream s; + s << "wrong index of G-shell" << std::endl + << " G-vector : " << G << std::endl + << " igsh in the remapped set : " << igsh << std::endl + << " igsh in the original set : " << gvec_shells__.gvec().shell(G); + RTE_THROW(s); } #endif /* each thread is working on full shell of G-vectors */ @@ -154,7 +160,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__, /* find the symmetrized PW coefficient */ for (int i = 0; i < sym__.size(); i++) { - auto G1 = dot(G, sym__[i].spg_op.R); + auto G1 = r3::dot(G, sym__[i].spg_op.R); auto S = sym__[i].spin_rotation; @@ -163,15 +169,24 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__, /* local index of a rotated G-vector */ int ig1 = gvec_shells__.index_by_gvec(G1); + /* check the reduced G-vector */ if (ig1 == -1) { G1 = G1 * (-1); #if !defined(NDEBUG) if (igsh != gvec_shells__.gvec().shell(G1)) { - throw std::runtime_error("wrong index of G-shell"); + std::stringstream s; + s << "wrong index of G-shell" << std::endl + << " index of G-shell : " << igsh << std::endl + << " symmetry operation : " << sym__[i].spg_op.R << std::endl + << " G-vector : " << G << std::endl + << " rotated G-vector : " << G1 << std::endl + << " G-vector index : " << gvec_shells__.index_by_gvec(G1) << std::endl + << " index of rotated G-vector shell : " << gvec_shells__.gvec().shell(G1); + RTE_THROW(s); } #endif ig1 = gvec_shells__.index_by_gvec(G1); - assert(ig1 >= 0 && ig1 < ngv); + RTE_ASSERT(ig1 >= 0 && ig1 < ngv); if (f_pw__) { symf += std::conj(f_pw[ig1]) * phase; } @@ -179,7 +194,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__, symz += std::conj(z_pw[ig1]) * phase * S(2, 2); } if (is_non_collin) { - auto v = dot(S, r3::vector>({x_pw[ig1], y_pw[ig1], z_pw[ig1]})); + auto v = r3::dot(S, r3::vector>({x_pw[ig1], y_pw[ig1], z_pw[ig1]})); symx += std::conj(v[0]) * phase; symy += std::conj(v[1]) * phase; symz += std::conj(v[2]) * phase; @@ -187,10 +202,18 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__, } else { #if !defined(NDEBUG) if (igsh != gvec_shells__.gvec().shell(G1)) { - throw std::runtime_error("wrong index of G-shell"); + std::stringstream s; + s << "wrong index of G-shell" << std::endl + << " index of G-shell : " << igsh << std::endl + << " symmetry operation : " << sym__[i].spg_op.R << std::endl + << " G-vector : " << G << std::endl + << " rotated G-vector : " << G1 << std::endl + << " G-vector index : " << gvec_shells__.index_by_gvec(G1) << std::endl + << " index of rotated G-vector shell : " << gvec_shells__.gvec().shell(G1); + RTE_THROW(s); } #endif - assert(ig1 >= 0 && ig1 < ngv); + RTE_ASSERT(ig1 >= 0 && ig1 < ngv); if (f_pw__) { symf += f_pw[ig1] * phase; } @@ -198,7 +221,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__, symz += z_pw[ig1] * phase * S(2, 2); } if (is_non_collin) { - auto v = dot(S, r3::vector>({x_pw[ig1], y_pw[ig1], z_pw[ig1]})); + auto v = r3::dot(S, r3::vector>({x_pw[ig1], y_pw[ig1], z_pw[ig1]})); symx += v[0] * phase; symy += v[1] * phase; symz += v[2] * phase; @@ -216,12 +239,12 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__, for (int isym = 0; isym < sym__.size(); isym++) { auto S = sym__[isym].spin_rotation; - auto G1 = dot(sym__[isym].spg_op.invRT, G); + auto G1 = r3::dot(sym__[isym].spg_op.invRT, G); /* index of a rotated G-vector */ int ig1 = gvec_shells__.index_by_gvec(G1); if (ig1 != -1) { - assert(ig1 >= 0 && ig1 < ngv); + RTE_ASSERT(ig1 >= 0 && ig1 < ngv); auto phase = std::conj(phase_factor(isym, G1)); std::complex symf1, symx1, symy1, symz1; if (f_pw__) { @@ -231,7 +254,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__, symz1 = symz * phase * S(2, 2); } if (is_non_collin) { - auto v = dot(S, r3::vector>({symx, symy, symz})); + auto v = r3::dot(S, r3::vector>({symx, symy, symz})); symx1 = v[0] * phase; symy1 = v[1] * phase; symz1 = v[2] * phase; @@ -293,7 +316,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__, auto G = gvec_shells__.gvec_remapped(igloc); for (int isym = 0; isym < sym__.size(); isym++) { auto S = sym__[isym].spin_rotation; - auto gv_rot = dot(sym__[isym].spg_op.invRT, G); + auto gv_rot = r3::dot(sym__[isym].spg_op.invRT, G); /* index of a rotated G-vector */ int ig_rot = gvec_shells__.index_by_gvec(gv_rot); std::complex phase = std::conj(phase_factor(isym, gv_rot)); diff --git a/src/unit_cell/atom_symmetry_class.cpp b/src/unit_cell/atom_symmetry_class.cpp index 590b635880..aac8b9d23c 100644 --- a/src/unit_cell/atom_symmetry_class.cpp +++ b/src/unit_cell/atom_symmetry_class.cpp @@ -55,7 +55,7 @@ Atom_symmetry_class::Atom_symmetry_class(int id__, Atom_type const& atom_type__) o1_radial_integrals_.zero(); } - /* copy descriptors because enu is defferent between atom classes */ + /* copy descriptors because enu is different between atom classes */ aw_descriptors_.resize(atom_type_.num_aw_descriptors()); for (int i = 0; i < num_aw_descriptors(); i++) { aw_descriptors_[i] = atom_type_.aw_descriptor(i); diff --git a/src/unit_cell/unit_cell.cpp b/src/unit_cell/unit_cell.cpp index f486647dd5..b6c0bea9be 100644 --- a/src/unit_cell/unit_cell.cpp +++ b/src/unit_cell/unit_cell.cpp @@ -494,7 +494,7 @@ Unit_cell::is_point_in_mt(r3::vector vc, int& ja, int& jr, double& dr, d r3::vector vf = vr.first - posf; /* convert to spherical coordinates */ - auto vs = SHT::spherical_coordinates(get_cartesian_coordinates(vf)); + auto vs = r3::spherical_coordinates(get_cartesian_coordinates(vf)); if (vs[0] < atom(ia).mt_radius()) { ja = ia; @@ -527,7 +527,7 @@ Unit_cell::is_point_in_mt(r3::vector vc, int& ja, int& jr, double& dr, d } void -Unit_cell::generate_radial_functions() +Unit_cell::generate_radial_functions(std::ostream& out__) { PROFILE("sirius::Unit_cell::generate_radial_functions"); @@ -543,16 +543,15 @@ Unit_cell::generate_radial_functions() if (parameters_.verbosity() >= 1) { mpi::pstdout pout(comm_); + if (comm_.rank() == 0) { + pout << std::endl << "Linearization energies" << std::endl; + } for (int icloc = 0; icloc < (int)spl_num_atom_symmetry_classes().local_size(); icloc++) { int ic = spl_num_atom_symmetry_classes(icloc); atom_symmetry_class(ic).write_enu(pout); } - - if (comm_.rank() == 0) { - std::printf("\n"); - std::printf("Linearization energies\n"); - } + RTE_OUT(out__) << pout.flush(0); } if (parameters_.verbosity() >= 4 && comm_.rank() == 0) { for (int ic = 0; ic < num_atom_symmetry_classes(); ic++) { @@ -677,7 +676,6 @@ Unit_cell::initialize() max_mt_radial_basis_size_ = std::max(max_mt_radial_basis_size_, atom_type(iat).mt_radial_basis_size()); max_mt_aw_basis_size_ = std::max(max_mt_aw_basis_size_, atom_type(iat).mt_aw_basis_size()); max_mt_lo_basis_size_ = std::max(max_mt_lo_basis_size_, atom_type(iat).mt_lo_basis_size()); - lmax_ = std::max(lmax_, atom_type(iat).indexr().lmax()); offs_lo += atom_type(iat).mt_lo_basis_size(); } diff --git a/src/unit_cell/unit_cell.hpp b/src/unit_cell/unit_cell.hpp index 6e9f85572a..625ff6fc3b 100644 --- a/src/unit_cell/unit_cell.hpp +++ b/src/unit_cell/unit_cell.hpp @@ -153,9 +153,6 @@ class Unit_cell /// Maximum muffin-tin radius. double max_mt_radius_{0}; - /// Maximum orbital quantum number of radial functions between all atom types. - int lmax_{-1}; - std::unique_ptr symmetry_; /// Atomic coordinates in GPU-friendly ordering packed in arrays for each atom type. @@ -258,7 +255,7 @@ class Unit_cell bool is_point_in_mt(r3::vector vc, int& ja, int& jr, double& dr, double tp[2]) const; - void generate_radial_functions(); + void generate_radial_functions(std::ostream& out__); void generate_radial_integrals(); @@ -470,6 +467,16 @@ class Unit_cell return max_mt_lo_basis_size_; } + /// Maximum number of atoms across all atom types. + inline auto max_num_atoms() const + { + int max_na{0}; + for (int iat = 0; iat < this->num_atom_types(); iat++) { + max_na = std::max(max_na, this->atom_type(iat).num_atoms()); + } + return max_na; + } + void set_equivalent_atoms(int const* equivalent_atoms__) { equivalent_atoms_.resize(num_atoms()); @@ -506,9 +513,14 @@ class Unit_cell return volume_it_; } + /// Maximum orbital quantum number of radial functions between all atom types. inline int lmax() const { - return lmax_; + int l{-1}; + for (int iat = 0; iat < this->num_atom_types(); iat++) { + l = std::max(l, this->atom_type(iat).indexr().lmax()); + } + return l; } inline int lmax_apw() const diff --git a/src/utils/utils.hpp b/src/utils/utils.hpp index 06062f6142..a10f5a0a44 100644 --- a/src/utils/utils.hpp +++ b/src/utils/utils.hpp @@ -252,6 +252,9 @@ inline auto split_in_blocks(int length__, int block_size__) { int nb = num_blocks(length__, block_size__); /* adjust the block size; this is done to prevent very unequal block sizes */ + /* Take, for example, 21 elements and initial block size of 15. Number of blocks equals 2. + * Final block size is 21 / 2 + min(1, 21 % 2) = 11. Thus 21 elements will be split in two blocks + * of 11 and 10 elements. */ block_size__ = length__ / nb + std::min(1, length__ % nb); std::vector result(nb);