diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb1176e8f0..09835f051e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.18)
 
-project(SIRIUS VERSION 7.4.2)
+project(SIRIUS VERSION 7.4.3)
 
 # user variables
 set(CREATE_PYTHON_MODULE    OFF CACHE BOOL "create sirius Python module")
diff --git a/apps/tests/test_wf_fft.cpp b/apps/tests/test_wf_fft.cpp
index 192a33003e..8c6b161fdd 100644
--- a/apps/tests/test_wf_fft.cpp
+++ b/apps/tests/test_wf_fft.cpp
@@ -7,7 +7,7 @@ using namespace sirius;
 void test_wf_fft()
 {
     //sddk::MPI_grid mpi_grid({2, 3}, sddk::Communicator::world());
-    mpi::Grid mpi_grid({1, 1}, mpi::Communicator::world());
+    mpi::Grid mpi_grid({2, 2}, mpi::Communicator::world());
 
     /* creation of simple G+k vector set */
     auto gkvec = fft::gkvec_factory(8.0, mpi_grid.communicator());
@@ -18,7 +18,8 @@ void test_wf_fft()
     /* get the FFT box boundaries */
     auto fft_grid = fft::get_min_grid(8.0, gkvec->lattice_vectors());
 
-    std::vector<int> num_mt_coeffs({10, 20, 30, 10, 20});
+    //std::vector<int> num_mt_coeffs({10, 20, 30, 10, 20});
+    std::vector<int> num_mt_coeffs({1});
 
     wf::Wave_functions<double> wf(gkvec, num_mt_coeffs, wf::num_mag_dims(1), wf::num_bands(10), sddk::memory_t::host);
     wf::Wave_functions<double> wf_ref(gkvec, num_mt_coeffs, wf::num_mag_dims(1), wf::num_bands(10), sddk::memory_t::host);
@@ -32,7 +33,7 @@ void test_wf_fft()
             }
         }
     }
-    auto mg = wf.memory_guard(sddk::memory_t::device, wf::copy_to::device);
+    //auto mg = wf.memory_guard(sddk::memory_t::device, wf::copy_to::device);
 
     auto pu = sddk::device_t::CPU;
 
@@ -61,10 +62,10 @@ void test_wf_fft()
     for (int ispn = 0; ispn < 2; ispn++) {
 
         wf::Wave_functions_fft<double> wf_fft(gkvec_fft, wf, wf::spin_index(ispn), wf::band_range(0,10),
-                wf::shuffle_to::fft_layout | wf::shuffle_to::wf_layout);
+                wf::shuffle_to::wf_layout);
 
         for (int i = 0; i < wf_fft.num_wf_local(); i++) {
-            spfft_transform->backward(wf_fft.pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), spfft_pu);
+            spfft_transform->backward(wf1[ispn].pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), spfft_pu);
             spfft_transform->forward(spfft_pu, wf_fft.pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), SPFFT_FULL_SCALING);
         }
     }
diff --git a/apps/unit_tests/test_rlm_deriv.cpp b/apps/unit_tests/test_rlm_deriv.cpp
index 6657fe1239..7cb3ebafb1 100644
--- a/apps/unit_tests/test_rlm_deriv.cpp
+++ b/apps/unit_tests/test_rlm_deriv.cpp
@@ -482,7 +482,7 @@ int test3()
                            1 + 10 * utils::random<double>(),
                            1 + 10 * utils::random<double>());
 
-        auto rtp = SHT::spherical_coordinates(v);
+        auto rtp = r3::spherical_coordinates(v);
 
         double dr = 1e-5 * rtp[0];
 
@@ -495,8 +495,8 @@ int test3()
             r3::vector<double> v2 = v;
             v2[x] -= dr;
 
-            auto vs1 = SHT::spherical_coordinates(v1);
-            auto vs2 = SHT::spherical_coordinates(v2);
+            auto vs1 = r3::spherical_coordinates(v1);
+            auto vs2 = r3::spherical_coordinates(v2);
             std::vector<double> rlm1(lmmax);
             std::vector<double> rlm2(lmmax);
 
diff --git a/apps/unit_tests/test_rot_ylm.cpp b/apps/unit_tests/test_rot_ylm.cpp
index 6d30d40336..308dd28306 100644
--- a/apps/unit_tests/test_rot_ylm.cpp
+++ b/apps/unit_tests/test_rot_ylm.cpp
@@ -44,10 +44,10 @@ int run_test_impl(cmd_args& args)
 
             /* random Cartesian vector */
             r3::vector<double> coord(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX, double(rand()) / RAND_MAX);
-            auto scoord = SHT::spherical_coordinates(coord);
+            auto scoord = r3::spherical_coordinates(coord);
             /* rotated coordinates */
             auto coord2 = dot(Rc, coord);
-            auto scoord2 = SHT::spherical_coordinates(coord2);
+            auto scoord2 = r3::spherical_coordinates(coord2);
 
             int lmax{10};
             sddk::mdarray<T, 1> ylm(utils::lmmax(lmax));
diff --git a/doc/doxygen.cfg b/doc/doxygen.cfg
index cc42f17a59..1a5e5bd4bb 100644
--- a/doc/doxygen.cfg
+++ b/doc/doxygen.cfg
@@ -39,7 +39,7 @@ PROJECT_NAME           = "SIRIUS"
 # control system is used.
 
 
-PROJECT_NUMBER         = "7.4.2"
+PROJECT_NUMBER         = "7.4.3"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/src/SDDK/memory.hpp b/src/SDDK/memory.hpp
index 5694d69e31..ba7a2ed4c8 100644
--- a/src/SDDK/memory.hpp
+++ b/src/SDDK/memory.hpp
@@ -678,7 +678,7 @@ sddk::memory_pool& get_memory_pool(sddk::memory_t M__);
     {                                                                             \
         if (!(condition__)) {                                                     \
             std::stringstream _s;                                                 \
-            _s << "Assertion (" <<  #condition__ << ") failed "                   \
+            _s << "Assertion (" << #condition__ << ") failed "                    \
                << "at line " << __LINE__ << " of file " << __FILE__ << std::endl  \
                << "array label: " << label_ << std::endl;                         \
             for (int i = 0; i < N; i++) {                                         \
@@ -755,6 +755,20 @@ class mdarray_index_descriptor
     {
         return size_;
     }
+
+    inline bool check_range(index_type i__) const
+    {
+#ifdef NDEBUG
+        return true;
+#else
+        if (i__ < begin_ || i__ > end_) {
+            std::cout << "index " << i__ << " out of range [" << begin_ << ", " << end_ << "]" << std::endl;
+            return false;
+        } else {
+            return true;
+        }
+#endif
+    }
 };
 
 /// Multidimensional array with the column-major (Fortran) order.
@@ -833,7 +847,8 @@ class mdarray
         std::array<index_type, N> i = {args...};
 
         for (int j = 0; j < N; j++) {
-            mdarray_assert(i[j] >= dims_[j].begin() && i[j] <= dims_[j].end());
+            //mdarray_assert(dims_[j].check_range(i[j]) && i[j] >= dims_[j].begin() && i[j] <= dims_[j].end());
+            mdarray_assert(dims_[j].check_range(i[j]));
         }
 
         size_t idx = offsets_[0] + i[0];
diff --git a/src/SDDK/wave_functions.hpp b/src/SDDK/wave_functions.hpp
index 57b022f6cf..d48d25eee7 100644
--- a/src/SDDK/wave_functions.hpp
+++ b/src/SDDK/wave_functions.hpp
@@ -924,15 +924,31 @@ class Wave_functions_fft : public Wave_functions_base<T>
              */
             auto& comm_col = gkvec_fft_->comm_ortho_fft();
 
-            auto ncol = sddk::splindex_base<int>::block_size(b__.size(), comm_col.size());
-            size_t sz = gkvec_fft_->count() * ncol;
-            sddk::mdarray<std::complex<T>, 1> send_recv_buf(sz, sddk::get_memory_pool(sddk::memory_t::host), "send_recv_buf");
+            /* in full-potential case leading dimenstion is larger than the number of plane-wave
+             * coefficients, so we have to copy data into temporary storage with necessary leading
+             * dimension */
+            sddk::mdarray<std::complex<T>, 2> wf_tmp;
+            if (wf_->ld() == wf_->num_pw_) { /* pure plane-wave coeffs */
+                auto ptr = (wf_->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin());
+                wf_tmp = sddk::mdarray<std::complex<T>, 2>(ptr, wf_->num_pw_, b__.size());
+            } else {
+                wf_tmp = sddk::mdarray<std::complex<T>, 2>(wf_->num_pw_, b__.size(), sddk::get_memory_pool(sddk::memory_t::host));
+                for (int i = 0; i < b__.size(); i++) {
+                    auto in_ptr = wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin() + i);
+                    std::copy(in_ptr, in_ptr + wf_->num_pw_, wf_tmp.at(sddk::memory_t::host, 0, i));
+                }
+            }
 
-            auto& row_distr = gkvec_fft_->gvec_slab();
+            auto* send_buf = (wf_tmp.ld() == 0) ? nullptr : wf_tmp.at(sddk::memory_t::host);
 
             /* local number of columns */
             int n_loc = spl_num_wf_.local_size();
 
+            sddk::mdarray<std::complex<T>, 1> recv_buf(gkvec_fft_->count() * n_loc,
+                    sddk::get_memory_pool(sddk::memory_t::host), "recv_buf");
+
+            auto& row_distr = gkvec_fft_->gvec_slab();
+
             /* send and receive dimensions */
             mpi::block_data_descriptor sd(comm_col.size()), rd(comm_col.size());
             for (int j = 0; j < comm_col.size(); j++) {
@@ -942,12 +958,8 @@ class Wave_functions_fft : public Wave_functions_base<T>
             sd.calc_offsets();
             rd.calc_offsets();
 
-            auto send_buf = (this->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin());
-
-            {
-                comm_col.alltoall(send_buf, sd.counts.data(), sd.offsets.data(), send_recv_buf.at(sddk::memory_t::host),
-                                  rd.counts.data(), rd.offsets.data());
-            }
+            comm_col.alltoall(send_buf, sd.counts.data(), sd.offsets.data(), recv_buf.at(sddk::memory_t::host),
+                              rd.counts.data(), rd.offsets.data());
 
             /* reorder received blocks */
             #pragma omp parallel for
@@ -956,7 +968,7 @@ class Wave_functions_fft : public Wave_functions_base<T>
                     int offset = row_distr.offsets[j];
                     int count  = row_distr.counts[j];
                     if (count) {
-                        auto from = &send_recv_buf[offset * n_loc + count * i];
+                        auto from = &recv_buf[offset * n_loc + count * i];
                         std::copy(from, from + count, this->data_[0].at(sddk::memory_t::host, offset, i));
                     }
                 }
@@ -989,15 +1001,15 @@ class Wave_functions_fft : public Wave_functions_base<T>
 
             auto& comm_col = gkvec_fft_->comm_ortho_fft();
 
-            auto ncol = sddk::splindex_base<int>::block_size(b__.size(), comm_col.size());
-            size_t sz = gkvec_fft_->count() * ncol;
-            sddk::mdarray<std::complex<T>, 1> send_recv_buf(sz, sddk::get_memory_pool(sddk::memory_t::host), "send_recv_buf");
-
-            auto& row_distr = gkvec_fft_->gvec_slab();
-
             /* local number of columns */
             int n_loc = spl_num_wf_.local_size();
 
+            /* send buffer */
+            sddk::mdarray<std::complex<T>, 1> send_buf(gkvec_fft_->count() * n_loc,
+                    sddk::get_memory_pool(sddk::memory_t::host), "send_buf");
+
+            auto& row_distr = gkvec_fft_->gvec_slab();
+
             /* reorder sending blocks */
             #pragma omp parallel for
             for (int i = 0; i < n_loc; i++) {
@@ -1006,7 +1018,7 @@ class Wave_functions_fft : public Wave_functions_base<T>
                     int count  = row_distr.counts[j];
                     if (count) {
                         auto from = this->data_[0].at(sddk::memory_t::host, offset, i);
-                        std::copy(from, from + count, &send_recv_buf[offset * n_loc + count * i]);
+                        std::copy(from, from + count, &send_buf[offset * n_loc + count * i]);
                     }
                 }
             }
@@ -1018,10 +1030,43 @@ class Wave_functions_fft : public Wave_functions_base<T>
             }
             sd.calc_offsets();
             rd.calc_offsets();
-            auto* recv_buf = (this->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin());
 
-            comm_col.alltoall(send_recv_buf.at(sddk::memory_t::host), sd.counts.data(), sd.offsets.data(), recv_buf,
+#if !defined(NDEBUG)
+            for (int i = 0; i < n_loc; i++) {
+                for (int j = 0; j < comm_col.size(); j++) {
+                    int offset = row_distr.offsets[j];
+                    int count  = row_distr.counts[j];
+                    for (int igg = 0; igg < count; igg++) {
+                        if (send_buf[offset * n_loc + count * i + igg] != this->data_[0](offset + igg, i)) {
+                            RTE_THROW("wrong packing of send buffer");
+                        }
+                    }
+                }
+            }
+#endif
+            /* full potential wave-functions have extra muffin-tin part;
+             * that makes the PW block of data not consecutive and thus we need to copy to a consecutive buffer
+             * for alltoall */
+            sddk::mdarray<std::complex<T>, 2> wf_tmp;
+            if (wf_->ld() == wf_->num_pw_) { /* pure plane-wave coeffs */
+                auto ptr = (wf_->num_pw_ == 0) ? nullptr : wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin());
+                wf_tmp = sddk::mdarray<std::complex<T>, 2>(ptr, wf_->num_pw_, b__.size());
+            } else {
+                wf_tmp = sddk::mdarray<std::complex<T>, 2>(wf_->num_pw_, b__.size(), sddk::get_memory_pool(sddk::memory_t::host));
+            }
+
+            auto* recv_buf = (wf_tmp.ld() == 0) ? nullptr : wf_tmp.at(sddk::memory_t::host);
+
+            comm_col.alltoall(send_buf.at(sddk::memory_t::host), sd.counts.data(), sd.offsets.data(), recv_buf,
                           rd.counts.data(), rd.offsets.data());
+
+            if (wf_->ld() != wf_->num_pw_) {
+                for (int i = 0; i < b__.size(); i++) {
+                    auto out_ptr = wf_->data_[sp.get()].at(sddk::memory_t::host, 0, b__.begin() + i);
+                    std::copy(wf_tmp.at(sddk::memory_t::host, 0, i),
+                            wf_tmp.at(sddk::memory_t::host, 0, i) + wf_->num_pw_, out_ptr);
+                }
+            }
         }
         if (pp && wf_->gkvec().comm().rank() == 0) {
             auto t = utils::time_interval(t0);
diff --git a/src/api/sirius_api.cpp b/src/api/sirius_api.cpp
index e8aa91f192..29c0867b70 100644
--- a/src/api/sirius_api.cpp
+++ b/src/api/sirius_api.cpp
@@ -2768,7 +2768,7 @@ sirius_find_eigen_states(void* const* gs_handler__, void* const* ks_handler__, b
                 gs.potential().update_atomic_potential();
             }
             if (precompute_rf__ && *precompute_rf__) {
-                const_cast<sirius::Unit_cell&>(gs.ctx().unit_cell()).generate_radial_functions();
+                const_cast<sirius::Unit_cell&>(gs.ctx().unit_cell()).generate_radial_functions(gs.ctx().out());
             }
             if (precompute_ri__ && *precompute_ri__) {
                 const_cast<sirius::Unit_cell&>(gs.ctx().unit_cell()).generate_radial_integrals();
@@ -4033,7 +4033,7 @@ sirius_get_gkvec_arrays(void* const* ks_handler__, int* ik__, int* num_gkvec__,
                         gkvec(x, igk)      = kp->gkvec().template gkvec<sddk::index_domain_t::global>(igk)[x];
                         gkvec_cart(x, igk) = gkc[x];
                     }
-                    auto rtp         = sirius::SHT::spherical_coordinates(gkc);
+                    auto rtp         = r3::spherical_coordinates(gkc);
                     gkvec_len[igk]   = rtp[0];
                     gkvec_tp(0, igk) = rtp[1];
                     gkvec_tp(1, igk) = rtp[2];
diff --git a/src/band/diag_full_potential.cpp b/src/band/diag_full_potential.cpp
index d2830b0117..bf423fa14b 100644
--- a/src/band/diag_full_potential.cpp
+++ b/src/band/diag_full_potential.cpp
@@ -46,6 +46,8 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
     /* block size of scalapack 2d block-cyclic distribution */
     int bs = ctx_.cyclic_block_size();
 
+    auto pcs = env::print_checksum();
+
     la::dmatrix<std::complex<double>> h(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, get_memory_pool(solver.host_memory_t()));
     la::dmatrix<std::complex<double>> o(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, get_memory_pool(solver.host_memory_t()));
 
@@ -76,7 +78,7 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
         }
     }
 
-    if (ctx_.print_checksum()) {
+    if (pcs) {
         auto z1 = h.checksum(ngklo, ngklo);
         auto z2 = o.checksum(ngklo, ngklo);
         utils::print_checksum("h_lapw", z1, ctx_.out());
@@ -107,11 +109,9 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
         }
     }
 
-    if (ctx_.print_checksum()) {
+    if (pcs) {
         auto z1 = kp.fv_eigen_vectors().checksum(kp.gklo_basis_size(), ctx_.num_fv_states());
-        if (kp.comm().rank() == 0) {
-            utils::print_checksum("fv_eigen_vectors", z1, kp.out(1));
-        }
+        utils::print_checksum("fv_eigen_vectors", z1, kp.out(1));
     }
 
     /* remap to slab */
@@ -129,6 +129,12 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
             la::constant<std::complex<double>>::zero(), kp.comm().native());
     }
 
+    if (pcs) {
+        auto z1 = kp.fv_eigen_vectors_slab().checksum_pw(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states()));
+        auto z2 = kp.fv_eigen_vectors_slab().checksum_mt(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states()));
+        utils::print_checksum("fv_eigen_vectors_slab", z1 + z2, kp.out(1));
+    }
+
     /* renormalize wave-functions */
     if (ctx_.valence_relativity() == relativity_t::iora) {
 
diff --git a/src/beta_projectors/beta_projectors.hpp b/src/beta_projectors/beta_projectors.hpp
index cce2292ee7..5d9c83f357 100644
--- a/src/beta_projectors/beta_projectors.hpp
+++ b/src/beta_projectors/beta_projectors.hpp
@@ -57,7 +57,7 @@ class Beta_projectors : public Beta_projectors_base<T>
         #pragma omp parallel for
         for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) {
             /* vs = {r, theta, phi} */
-            auto vs = SHT::spherical_coordinates(this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc));
+            auto vs = r3::spherical_coordinates(this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc));
             /* compute real spherical harmonics for G+k vector */
             std::vector<double> gkvec_rlm(utils::lmmax(this->ctx_.unit_cell().lmax()));
             sf::spherical_harmonics(this->ctx_.unit_cell().lmax(), vs[1], vs[2], &gkvec_rlm[0]);
diff --git a/src/beta_projectors/beta_projectors_strain_deriv.hpp b/src/beta_projectors/beta_projectors_strain_deriv.hpp
index c4bea65427..f1215becb4 100644
--- a/src/beta_projectors/beta_projectors_strain_deriv.hpp
+++ b/src/beta_projectors/beta_projectors_strain_deriv.hpp
@@ -55,7 +55,7 @@ class Beta_projectors_strain_deriv : public Beta_projectors_base<T>
         #pragma omp parallel for schedule(static)
         for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) {
             auto gvc = this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc);
-            auto rtp = SHT::spherical_coordinates(gvc);
+            auto rtp = r3::spherical_coordinates(gvc);
 
             double theta = rtp[1];
             double phi   = rtp[2];
@@ -70,7 +70,7 @@ class Beta_projectors_strain_deriv : public Beta_projectors_base<T>
         for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) {
             auto gvc = this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc);
             /* vs = {r, theta, phi} */
-            auto gvs = SHT::spherical_coordinates(gvc);
+            auto gvs = r3::spherical_coordinates(gvc);
 
             /* |G+k|=0 case */
             if (gvs[0] < 1e-10) {
diff --git a/src/context/config.hpp b/src/context/config.hpp
index 677c4e5407..c9ccb9f1dd 100644
--- a/src/context/config.hpp
+++ b/src/context/config.hpp
@@ -943,6 +943,18 @@ class config_t
             }
             dict_["/control/output"_json_pointer] = output__;
         }
+        /// Split local G-vectors in chunks to reduce the GPU memory consumption of augmentation operator.
+        inline auto gvec_chunk_size() const
+        {
+            return dict_.at("/control/gvec_chunk_size"_json_pointer).get<int>();
+        }
+        inline void gvec_chunk_size(int gvec_chunk_size__)
+        {
+            if (dict_.contains("locked")) {
+                throw std::runtime_error(locked_msg);
+            }
+            dict_["/control/gvec_chunk_size"_json_pointer] = gvec_chunk_size__;
+        }
       private:
         nlohmann::json& dict_;
     };
diff --git a/src/context/input_schema.json b/src/context/input_schema.json
index 0a62fdcc57..dedd5264d6 100644
--- a/src/context/input_schema.json
+++ b/src/context/input_schema.json
@@ -362,7 +362,7 @@
                 },
                 "spglib_tolerance" : {
                     "type" : "number",
-                    "default" : 1e-4,
+                    "default" : 1e-6,
                     "title" : "Tolerance of the spglib in finding crystal symmetries"
                 },
                 "verbosity" : {
@@ -411,36 +411,41 @@
                     "type" : "boolean",
                     "default" : false,
                     "title" : "If true then the atomic forces are printed at the end of SCF run."
-                 },
-                 "print_timers" : {
-                     "type" : "boolean",
-                     "default" : true,
-                     "title" : "If true then the timer statistics is printed at the end of SCF run."
-                 },
-                 "print_neighbors" : {
-                     "type" : "boolean",
-                     "default" : false,
-                     "title" : "If true then the list of nearest neighbours for each atom is printed to the standard output."
-                 },
-                 "use_second_variation" : {
-                     "type" : "boolean",
-                     "default" : true,
-                     "title" : "True if second-variational diagonalization is used in LAPW method."
-                 },
-                 "beta_chunk_size" : {
-                     "type" : "integer",
-                     "default" : 256,
-                     "title" : "Number of atoms in a chunk of beta-projectors."
-                 },
-                 "ortho_rf" : {
-                     "type" : "boolean",
-                     "default" : false,
-                     "title" : "Orthogonalize LAPW radial functions."
-                 },
-                 "output" : {
-                     "type" : "string",
-                     "default" : "stdout:",
-                     "title" : "Type of the output stream (stdout:, file:name)"
+                },
+                "print_timers" : {
+                    "type" : "boolean",
+                    "default" : true,
+                    "title" : "If true then the timer statistics is printed at the end of SCF run."
+                },
+                "print_neighbors" : {
+                    "type" : "boolean",
+                    "default" : false,
+                    "title" : "If true then the list of nearest neighbours for each atom is printed to the standard output."
+                },
+                "use_second_variation" : {
+                    "type" : "boolean",
+                    "default" : true,
+                    "title" : "True if second-variational diagonalization is used in LAPW method."
+                },
+                "beta_chunk_size" : {
+                    "type" : "integer",
+                    "default" : 256,
+                    "title" : "Number of atoms in a chunk of beta-projectors."
+                },
+                "ortho_rf" : {
+                    "type" : "boolean",
+                    "default" : false,
+                    "title" : "Orthogonalize LAPW radial functions."
+                },
+                "output" : {
+                    "type" : "string",
+                    "default" : "stdout:",
+                    "title" : "Type of the output stream (stdout:, file:name)"
+                },
+                "gvec_chunk_size" : {
+                    "type" : "integer",
+                    "default" : 500000,
+                    "title" : "Split local G-vectors in chunks to reduce the GPU memory consumption of augmentation operator."
                 }
             }
         },
diff --git a/src/context/simulation_context.cpp b/src/context/simulation_context.cpp
index 9270f2dfaf..e1799846d5 100644
--- a/src/context/simulation_context.cpp
+++ b/src/context/simulation_context.cpp
@@ -156,7 +156,7 @@ Simulation_context::generate_gvec_ylm(int lmax__)
     sddk::matrix<std::complex<double>> gvec_ylm(utils::lmmax(lmax__), gvec().count(), sddk::memory_t::host, "gvec_ylm");
     #pragma omp parallel for schedule(static)
     for (int igloc = 0; igloc < gvec().count(); igloc++) {
-        auto rtp = SHT::spherical_coordinates(gvec().gvec_cart<sddk::index_domain_t::local>(igloc));
+        auto rtp = r3::spherical_coordinates(gvec().gvec_cart<sddk::index_domain_t::local>(igloc));
         sf::spherical_harmonics(lmax__, rtp[1], rtp[2], &gvec_ylm(0, igloc));
     }
     return gvec_ylm;
@@ -279,28 +279,6 @@ Simulation_context::ewald_lambda() const
     return lambda;
 }
 
-sddk::splindex<sddk::splindex_t::block>
-Simulation_context::split_gvec_local() const
-{
-    /* local number of G-vectors for this MPI rank */
-    int ngv_loc = gvec().count();
-    /* estimate number of G-vectors in a block */
-    int ld{-1};
-    for (int iat = 0; iat < unit_cell().num_atom_types(); iat++) {
-        int nat = unit_cell().atom_type(iat).num_atoms();
-        int nbf = unit_cell().atom_type(iat).mt_basis_size();
-
-        ld = std::max(ld, std::max(nbf * (nbf + 1) / 2, nat));
-    }
-    /* limit the size of relevant array to ~1Gb */
-    int ngv_b = (1 << 30) / sizeof(std::complex<double>) / ld;
-    ngv_b     = std::max(1, std::min(ngv_loc, ngv_b));
-    /* number of blocks of G-vectors */
-    int nb = ngv_loc / ngv_b;
-    /* split local number of G-vectors between blocks */
-    return sddk::splindex<sddk::splindex_t::block>(ngv_loc, nb, 0);
-}
-
 void
 Simulation_context::initialize()
 {
@@ -361,22 +339,6 @@ Simulation_context::initialize()
     /* initialize MPI communicators */
     init_comm();
 
-    auto print_mpi_layout = env::print_mpi_layout();
-
-    if (verbosity() >= 3 || print_mpi_layout) {
-        mpi::pstdout pout(comm());
-        if (comm().rank() == 0) {
-            pout << "MPI rank placement" << std::endl;
-            pout << "------------------" << std::endl;
-        }
-        pout << "rank: " << comm().rank()
-             << ", comm_band_rank: " << comm_band().rank()
-             << ", comm_k_rank: " << comm_k().rank()
-             << ", hostname: " << utils::hostname()
-             << ", mpi processor name: " << mpi::Communicator::processor_name() << std::endl;
-        rte::ostream(this->out(), "info") << pout.flush(0);
-    }
-
     switch (processing_unit()) {
         case sddk::device_t::CPU: {
             host_memory_t_ = sddk::memory_t::host;
@@ -573,6 +535,7 @@ Simulation_context::initialize()
         }
     }
 
+    /* environment variable has a highest priority */
     auto ev_str = env::get_ev_solver();
     if (ev_str.size()) {
         evsn[0] = ev_str;
@@ -646,6 +609,30 @@ Simulation_context::initialize()
         this->cfg().control().print_checksum(true);
     }
 
+    auto print_mpi_layout = env::print_mpi_layout();
+
+    if (verbosity() >= 3 || print_mpi_layout) {
+        mpi::pstdout pout(comm());
+        if (comm().rank() == 0) {
+            pout << "MPI rank placement" << std::endl;
+            pout << utils::hbar(136, '-') << std::endl;
+            pout << "             |  comm tot, band, k | comm fft, ortho | mpi_grid tot, row, col | blacs tot, row, col" << std::endl;
+        }
+        pout << std::setw(12) << utils::hostname() << " | "
+             << std::setw(6) << comm().rank()
+             << std::setw(6) << comm_band().rank()
+             << std::setw(6) << comm_k().rank() << " | "
+             << std::setw(6) << comm_fft_coarse().rank()
+             << std::setw(6) << comm_band_ortho_fft_coarse().rank() << "    |   "
+             << std::setw(6) << mpi_grid_->communicator(3).rank()
+             << std::setw(6) << mpi_grid_->communicator(1 << 0).rank()
+             << std::setw(6) << mpi_grid_->communicator(1 << 1).rank() << "   | "
+             << std::setw(6) << blacs_grid().comm().rank()
+             << std::setw(6) << blacs_grid().comm_row().rank()
+             << std::setw(6) << blacs_grid().comm_col().rank() << std::endl;
+        rte::ostream(this->out(), "info") << pout.flush(0);
+    }
+
     initialized_ = true;
     cfg().lock();
 }
@@ -709,7 +696,6 @@ Simulation_context::print_info(std::ostream& out__) const
                << "  number of G-shells                    : " << gvecs[i]->num_shells() << std::endl
                << std::endl;
         }
-        os << "number of local G-vector blocks: " << split_gvec_local().num_ranks() << std::endl;
         os << std::endl;
     }
     {
@@ -962,7 +948,8 @@ Simulation_context::update()
        the next time only reciprocal lattice of the G-vectors is updated */
     if (!gvec_coarse_) {
         /* create list of coarse G-vectors */
-        gvec_coarse_ = std::make_unique<fft::Gvec>(rlv, 2 * gk_cutoff(), comm(), cfg().control().reduce_gvec());
+        gvec_coarse_ = std::make_unique<fft::Gvec>(rlv, 2 * gk_cutoff(), comm(), cfg().control().reduce_gvec(),
+                cfg().control().spglib_tolerance());
         /* create FFT friendly partiton */
         gvec_coarse_fft_ = std::make_shared<fft::Gvec_fft>(*gvec_coarse_, comm_fft_coarse(),
                 comm_ortho_fft_coarse());
@@ -1141,21 +1128,12 @@ Simulation_context::update()
         }
     }
 
-    /* precompute some G-vector related arrays */
-    gvec_tp_ = sddk::mdarray<double, 2>(gvec().count(), 2, sddk::memory_t::host, "gvec_tp_");
-    #pragma omp parallel for schedule(static)
-    for (int igloc = 0; igloc < gvec().count(); igloc++) {
-        auto rtp           = SHT::spherical_coordinates(gvec().gvec_cart<sddk::index_domain_t::local>(igloc));
-        gvec_tp_(igloc, 0) = rtp[1];
-        gvec_tp_(igloc, 1) = rtp[2];
-    }
-
     switch (this->processing_unit()) {
         case sddk::device_t::CPU: {
             break;
         }
         case sddk::device_t::GPU: {
-            gvec_tp_.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
+            gvec_->gvec_tp().allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
             break;
         }
     }
@@ -1241,24 +1219,10 @@ Simulation_context::update()
                 new Radial_integrals_atomic_wf<true>(unit_cell(), new_gk_cutoff, 20, idxr_wf, ps_wf, ps_atomic_wf_ri_djl_callback_));
         }
 
-        /* update augmentation operator */
-        sddk::memory_pool* mp{nullptr};
-        sddk::memory_pool* mpd{nullptr};
-        switch (this->processing_unit()) {
-            case sddk::device_t::CPU: {
-                mp = &get_memory_pool(sddk::memory_t::host);
-                break;
-            }
-            case sddk::device_t::GPU: {
-                mp  = &get_memory_pool(sddk::memory_t::host_pinned);
-                mpd = &get_memory_pool(sddk::memory_t::device);
-                break;
-            }
-        }
         for (int iat = 0; iat < unit_cell().num_atom_types(); iat++) {
             if (unit_cell().atom_type(iat).augment() && unit_cell().atom_type(iat).num_atoms() > 0) {
-                augmentation_op_[iat] = std::make_unique<Augmentation_operator>(unit_cell().atom_type(iat), gvec());
-                augmentation_op_[iat]->generate_pw_coeffs(aug_ri(), gvec_tp_, *mp, mpd);
+                augmentation_op_[iat] = std::make_unique<Augmentation_operator>(unit_cell().atom_type(iat), gvec(), aug_ri(), aug_ri_djl());
+                augmentation_op_[iat]->generate_pw_coeffs();
             } else {
                 augmentation_op_[iat] = nullptr;
             }
@@ -1533,8 +1497,6 @@ Simulation_context::init_comm()
 
     /* create communicator, orthogonal to comm_fft_coarse */
     comm_ortho_fft_coarse_ = comm().split(comm_fft_coarse().rank());
-
-    /* create communicator, orthogonal to comm_fft_coarse within a band communicator */
-    comm_band_ortho_fft_coarse_ = comm_band().split(comm_fft_coarse().rank());
 }
+
 } // namespace sirius
diff --git a/src/context/simulation_context.hpp b/src/context/simulation_context.hpp
index cfc8823a0d..a610492aab 100644
--- a/src/context/simulation_context.hpp
+++ b/src/context/simulation_context.hpp
@@ -109,11 +109,6 @@ class Simulation_context : public Simulation_parameters
     /// Auxiliary communicator for the coarse-grid FFT transformation.
     mpi::Communicator comm_ortho_fft_coarse_;
 
-    /// Communicator, which is orthogonal to comm_fft_coarse within a band communicator.
-    /** This communicator is used in reshuffling the wave-functions for the FFT-friendly distribution. It will be
-        used to parallelize application of local Hamiltonian over bands. */
-    mpi::Communicator comm_band_ortho_fft_coarse_;
-
     /// Unit cell of the simulation.
     std::unique_ptr<Unit_cell> unit_cell_;
 
@@ -177,9 +172,6 @@ class Simulation_context : public Simulation_parameters
     /// Lattice coordinats of G-vectors in a GPU-friendly ordering.
     sddk::mdarray<int, 2> gvec_coord_;
 
-    /// Theta and phi angles of G-vectors in GPU-friendly ordering.
-    sddk::mdarray<double, 2> gvec_tp_;
-
     /// Volume of the initial unit cell.
     /** This is needed to estimate the new cutoff for radial integrals. */
     double omega0_;
@@ -509,18 +501,25 @@ class Simulation_context : public Simulation_parameters
         if (cfg().control().fft_mode() == "serial") {
             return mpi::Communicator::self();
         } else {
-            return comm_band();
+            return mpi_grid_->communicator(1 << 0);
         }
     }
 
-    auto const& comm_ortho_fft_coarse() const
+    /// Communicator, which is orthogonal to comm_fft_coarse within a band communicator.
+    /** This communicator is used in reshuffling the wave-functions for the FFT-friendly distribution. It will be
+        used to parallelize application of local Hamiltonian over bands. */
+    auto const& comm_band_ortho_fft_coarse() const
     {
-        return comm_ortho_fft_coarse_;
+        if (cfg().control().fft_mode() == "serial") {
+            return comm_band();
+        } else {
+            return mpi_grid_->communicator(1 << 1);
+        }
     }
 
-    auto const& comm_band_ortho_fft_coarse() const
+    auto const& comm_ortho_fft_coarse() const
     {
-        return comm_band_ortho_fft_coarse_;
+        return comm_ortho_fft_coarse_;
     }
 
     void create_storage_file() const;
@@ -562,16 +561,11 @@ class Simulation_context : public Simulation_parameters
         return gvec_phase_factor(gvec().gvec<sddk::index_domain_t::global>(ig__), ia__);
     }
 
-    inline sddk::mdarray<int, 2> const& gvec_coord() const
+    inline auto const& gvec_coord() const
     {
         return gvec_coord_;
     }
 
-    inline sddk::mdarray<double, 2> const& gvec_tp() const
-    {
-        return gvec_tp_;
-    }
-
     /// Generate phase factors \f$ e^{i {\bf G} {\bf r}_{\alpha}} \f$ for all atoms of a given type.
     void generate_phase_factors(int iat__, sddk::mdarray<std::complex<double>, 2>& phase_factors__) const;
 
@@ -764,9 +758,6 @@ class Simulation_context : public Simulation_parameters
         return (this->processing_unit() == sddk::device_t::CPU) ? sddk::memory_t::host : sddk::memory_t::device;
     }
 
-    /// Split local set of G-vectors into chunks.
-    sddk::splindex<sddk::splindex_t::block> split_gvec_local() const;
-
     /// Set the size of the fine-grained FFT grid.
     void fft_grid_size(std::array<int, 3> fft_grid_size__)
     {
diff --git a/src/density/augmentation_operator.cpp b/src/density/augmentation_operator.cpp
index ab34b15414..9351f97522 100644
--- a/src/density/augmentation_operator.cpp
+++ b/src/density/augmentation_operator.cpp
@@ -26,25 +26,7 @@
 
 namespace sirius {
 
-#if defined(SIRIUS_GPU)
-extern "C" void aug_op_pw_coeffs_gpu(int ngvec__, int const* gvec_shell__, int const* idx__, int idxmax__,
-                                     std::complex<double> const* zilm__, int const* l_by_lm__, int lmmax__,
-                                     double const* gc__, int ld0__, int ld1__,
-                                     double const* gvec_rlm__, int ld2__,
-                                     double const* ri_values__, int ld3__, int ld4__,
-                                     double* q_pw__, int ld5__, double fourpi_omega__);
-extern "C" void aug_op_pw_coeffs_deriv_gpu(int ngvec__, int const* gvec_shell__, double const* gvec_cart__,
-                                           int const* idx__, int idxmax__,
-                                           double const* gc__, int ld0__, int ld1__,
-                                           double const* rlm__, double const* rlm_dg__, int ld2__,
-                                           double const* ri_values__, double const* ri_dg_values__, int ld3__, int ld4__,
-                                           double* q_pw__, int ld5__, double fourpi__, int nu__, int lmax_q__);
-
-extern "C" void spherical_harmonics_rlm_gpu(int lmax__, int ntp__, double const* tp__, double* rlm__, int ld__);
-#endif
-
-void Augmentation_operator::generate_pw_coeffs(Radial_integrals_aug<false> const& radial_integrals__,
-    sddk::mdarray<double, 2> const& tp__, sddk::memory_pool& mp__, sddk::memory_pool* mpd__)
+void Augmentation_operator::generate_pw_coeffs()
 {
     if (!atom_type_.augment()) {
         return;
@@ -53,101 +35,42 @@ void Augmentation_operator::generate_pw_coeffs(Radial_integrals_aug<false> const
 
     double fourpi_omega = fourpi / gvec_.omega();
 
+    auto const& tp = gvec_.gvec_tp();
+
     /* maximum l of beta-projectors */
     int lmax_beta = atom_type_.indexr().lmax();
     int lmmax     = utils::lmmax(2 * lmax_beta);
 
-    auto l_by_lm = utils::l_by_lm(2 * lmax_beta);
-
-    sddk::mdarray<std::complex<double>, 1> zilm(lmmax);
-    for (int l = 0, lm = 0; l <= 2 * lmax_beta; l++) {
-        for (int m = -l; m <= l; m++, lm++) {
-            zilm[lm] = std::pow(std::complex<double>(0, 1), l);
-        }
-    }
-
-    /* Gaunt coefficients of three real spherical harmonics */
-    Gaunt_coefficients<double> gaunt_coefs(lmax_beta, 2 * lmax_beta, lmax_beta, SHT::gaunt_rrr);
-
-    /* split G-vectors between ranks */
-    int gvec_count  = gvec_.count();
-
-    /* array of real spherical harmonics for each G-vector */
-    sddk::mdarray<double, 2> gvec_rlm;
-    switch (atom_type_.parameters().processing_unit()) {
-        case sddk::device_t::CPU: {
-            gvec_rlm = sddk::mdarray<double, 2>(lmmax, gvec_count, mp__);
-            #pragma omp parallel for schedule(static)
-            for (int igloc = 0; igloc < gvec_count; igloc++) {
-                sf::spherical_harmonics(2 * lmax_beta, tp__(igloc, 0), tp__(igloc, 1), &gvec_rlm(0, igloc));
-            }
-            break;
-        }
-        case sddk::device_t::GPU: {
-            gvec_rlm = sddk::mdarray<double, 2>(lmmax, gvec_count, *mpd__);
-#if defined(SIRIUS_GPU)
-            spherical_harmonics_rlm_gpu(2 * lmax_beta, gvec_count, tp__.at(sddk::memory_t::device),
-                gvec_rlm.at(sddk::memory_t::device), gvec_rlm.ld());
-#endif
-            break;
-        }
-    }
-
-    /* number of beta- radial functions */
-    int nbrf = atom_type_.mt_radial_basis_size();
-
-    sddk::mdarray<double, 3> ri_values(nbrf * (nbrf + 1) / 2, 2 * lmax_beta + 1, gvec_.num_gvec_shells_local(), mp__);
-    #pragma omp parallel for
-    for (int j = 0; j < gvec_.num_gvec_shells_local(); j++) {
-        auto ri = radial_integrals__.values(atom_type_.id(), gvec_.gvec_shell_len_local(j));
-        for (int l = 0; l <= 2 * lmax_beta; l++) {
-            for (int i = 0; i < nbrf * (nbrf + 1) / 2; i++) {
-                ri_values(i, l, j) = ri(i, l);
-            }
-        }
-    }
-
     /* number of beta-projectors */
     int nbf = atom_type_.mt_basis_size();
-
-    int idxmax = nbf * (nbf + 1) / 2;
-
-    /* flatten the indices */
-    sddk::mdarray<int, 2> idx(3, idxmax, mp__);
-    for (int xi2 = 0; xi2 < nbf; xi2++) {
-        int lm2    = atom_type_.indexb(xi2).lm;
-        int idxrf2 = atom_type_.indexb(xi2).idxrf;
-
-        for (int xi1 = 0; xi1 <= xi2; xi1++) {
-            int lm1    = atom_type_.indexb(xi1).lm;
-            int idxrf1 = atom_type_.indexb(xi1).idxrf;
-
-            /* packed orbital index */
-            int idx12 = utils::packed_index(xi1, xi2);
-            /* packed radial-function index */
-            int idxrf12 = utils::packed_index(idxrf1, idxrf2);
-
-            idx(0, idx12) = lm1;
-            idx(1, idx12) = lm2;
-            idx(2, idx12) = idxrf12;
-        }
-    }
-
-    /* array of plane-wave coefficients */
-    q_pw_ = sddk::mdarray<double, 2>(nbf * (nbf + 1) / 2, 2 * gvec_count, mp__, "q_pw_");
-
+    /* only half of Q_{xi,xi'}(G) matrix is stored */
+    int nqlm = nbf * (nbf + 1) / 2;
+    /* local number of G-vectors */
+    int gvec_count = gvec_.count();
+
+    /* Info:
+     *   After some tests, the current GPU implementation of generating aug. operator turns out to be slower than CPU.
+     *   The reason is probaly the memory access pattern of G, lm and, idxrf indices.
+     *   The current decision is to compute aug. operator on CPU once during the initialization and
+     *   then copy the chunks of Q(G) to GPU when computing D-operator and augment charge density.
+     */
     switch (atom_type_.parameters().processing_unit()) {
-        case sddk::device_t::CPU: {
-            #pragma omp parallel for schedule(static)
+        case sddk::device_t::CPU:
+        case sddk::device_t::GPU: {
+            /* Gaunt coefficients of three real spherical harmonics */
+            Gaunt_coefficients<double> gaunt_coefs(lmax_beta, 2 * lmax_beta, lmax_beta, SHT::gaunt_rrr);
+            #pragma omp parallel for
             for (int igloc = 0; igloc < gvec_count; igloc++) {
+                std::vector<double> rlm(lmmax);
+                sf::spherical_harmonics(2 * lmax_beta, tp(igloc, 0), tp(igloc, 1), rlm.data());
                 std::vector<std::complex<double>> v(lmmax);
-                for (int idx12 = 0; idx12 < nbf * (nbf + 1) / 2; idx12++) {
-                    int lm1     = idx(0, idx12);
-                    int lm2     = idx(1, idx12);
-                    int idxrf12 = idx(2, idx12);
+                for (int idx12 = 0; idx12 < nqlm; idx12++) {
+                    int lm1     = idx_(0, idx12);
+                    int lm2     = idx_(1, idx12);
+                    int idxrf12 = idx_(2, idx12);
                     for (int lm3 = 0; lm3 < lmmax; lm3++) {
-                        v[lm3] = std::conj(zilm[lm3]) * gvec_rlm(lm3, igloc) *
-                            ri_values(idxrf12, l_by_lm[lm3], gvec_.gvec_shell_idx_local(igloc));
+                        v[lm3] = std::conj(zilm_[lm3]) * rlm[lm3] *
+                            ri_values_(idxrf12, l_by_lm_[lm3], gvec_.gvec_shell_idx_local(igloc));
                     }
                     std::complex<double> z = fourpi_omega * gaunt_coefs.sum_L3_gaunt(lm2, lm1, &v[0]);
                     q_pw_(idx12, 2 * igloc)     = z.real();
@@ -156,49 +79,28 @@ void Augmentation_operator::generate_pw_coeffs(Radial_integrals_aug<false> const
             }
             break;
         }
-        case sddk::device_t::GPU: {
-            sddk::mdarray<int, 1> gvec_shell(gvec_count, mp__);
-            for (int igloc = 0; igloc < gvec_count; igloc++) {
-                gvec_shell(igloc) = gvec_.gvec_shell_idx_local(igloc);
-            }
-            gvec_shell.allocate(*mpd__).copy_to(sddk::memory_t::device);
-
-            idx.allocate(*mpd__).copy_to(sddk::memory_t::device);
-
-            zilm.allocate(*mpd__).copy_to(sddk::memory_t::device);
-
-            sddk::mdarray<int, 1> l_by_lm_d(&l_by_lm[0], lmmax);
-            l_by_lm_d.allocate(*mpd__).copy_to(sddk::memory_t::device);
-
-            auto gc = gaunt_coefs.get_full_set_L3();
-            gc.allocate(*mpd__).copy_to(sddk::memory_t::device);
-
-            ri_values.allocate(*mpd__).copy_to(sddk::memory_t::device);
-
-            q_pw_.allocate(*mpd__);
-
-#if defined(SIRIUS_GPU)
-            int ld0 = static_cast<int>(gc.size(0));
-            int ld1 = static_cast<int>(gc.size(1));
-            aug_op_pw_coeffs_gpu(gvec_count, gvec_shell.at(sddk::memory_t::device), idx.at(sddk::memory_t::device),
-                idxmax, zilm.at(sddk::memory_t::device), l_by_lm_d.at(sddk::memory_t::device), lmmax,
-                gc.at(sddk::memory_t::device), ld0, ld1, gvec_rlm.at(sddk::memory_t::device), lmmax,
-                ri_values.at(sddk::memory_t::device), static_cast<int>(ri_values.size(0)), static_cast<int>(ri_values.size(1)),
-                q_pw_.at(sddk::memory_t::device), static_cast<int>(q_pw_.size(0)), fourpi_omega);
-#endif
-            q_pw_.copy_to(sddk::memory_t::host);
-
-            q_pw_.deallocate(sddk::memory_t::device);
-        }
-    }
-
-    sym_weight_ = sddk::mdarray<double, 1>(nbf * (nbf + 1) / 2, mp__, "sym_weight_");
-    for (int xi2 = 0; xi2 < nbf; xi2++) {
-        for (int xi1 = 0; xi1 <= xi2; xi1++) {
-            /* packed orbital index */
-            int idx12          = utils::packed_index(xi1, xi2);
-            sym_weight_(idx12) = (xi1 == xi2) ? 1 : 2;
-        }
+//        case sddk::device_t::GPU: {
+//#if defined(SIRIUS_GPU)
+//            auto spl_ngv_loc = utils::split_in_blocks(gvec_count, atom_type_.parameters().cfg().control().gvec_chunk_size());
+//            auto& mpd = sddk::get_memory_pool(sddk::memory_t::device);
+//            /* allocate buffer for Rlm on GPUs */
+//            sddk::mdarray<double, 2> gvec_rlm(lmmax, spl_ngv_loc[0], mpd, "gvec_rlm");
+//            /* allocate buffer for Q(G) on GPUs */
+//            sddk::mdarray<double, 2> qpw(nqlm, 2 * spl_ngv_loc[0], mpd, "qpw");
+//
+//            int g_begin{0};
+//            /* loop over blocks of G-vectors */
+//            for (auto ng : spl_ngv_loc) {
+//                /* generate Rlm spherical harmonics */
+//                spherical_harmonics_rlm_gpu(2 * lmax_beta, ng, tp.at(sddk::memory_t::device, g_begin, 0),
+//                        tp.at(sddk::memory_t::device, g_begin, 1), gvec_rlm.at(sddk::memory_t::device), gvec_rlm.ld());
+//                this->generate_pw_coeffs_chunk_gpu(g_begin, ng, gvec_rlm.at(sddk::memory_t::device), gvec_rlm.ld(), qpw);
+//                acc::copyout(q_pw_.at(sddk::memory_t::host, 0, 2 * g_begin), qpw.at(sddk::memory_t::device), 2 * ng * nqlm);
+//                g_begin += ng;
+//            }
+//#endif
+//            break;
+//        }
     }
 
     q_mtrx_ = sddk::mdarray<double, 2>(nbf, nbf);
@@ -227,207 +129,60 @@ void Augmentation_operator::generate_pw_coeffs(Radial_integrals_aug<false> const
     }
 }
 
-Augmentation_operator_gvec_deriv::Augmentation_operator_gvec_deriv(Simulation_parameters const& param__, int lmax__,
-    fft::Gvec const& gvec__, sddk::mdarray<double, 2> const& tp__)
-    : gvec_(gvec__)
-{
-    PROFILE("sirius::Augmentation_operator_gvec_deriv");
-
-    int lmmax = utils::lmmax(2 * lmax__);
-
-    /* Gaunt coefficients of three real spherical harmonics */
-    gaunt_coefs_ = std::unique_ptr<Gaunt_coefficients<double>>(
-        new Gaunt_coefficients<double>(lmax__, 2 * lmax__, lmax__, SHT::gaunt_rrr));
-
-    /* split G-vectors between ranks */
-    int gvec_count = gvec__.count();
-
-    rlm_g_  = sddk::mdarray<double, 2>(lmmax, gvec_count);
-    rlm_dg_ = sddk::mdarray<double, 3>(lmmax, 3, gvec_count);
-
-    /* array of real spherical harmonics and derivatives for each G-vector */
-    #pragma omp parallel for schedule(static)
-    for (int igloc = 0; igloc < gvec_count; igloc++) {
-        /* compute Rlm */
-        sf::spherical_harmonics(2 * lmax__, tp__(igloc, 0), tp__(igloc, 1), &rlm_g_(0, igloc));
-        /* compute dRlm/dG */
-        sddk::mdarray<double, 2> tmp(&rlm_dg_(0, 0, igloc), lmmax, 3);
-        auto gv = gvec__.gvec_cart<sddk::index_domain_t::local>(igloc);
-        sf::dRlm_dr(2 * lmax__, gv, tmp, false);
-    }
-    switch (param__.processing_unit()) {
-        case sddk::device_t::CPU: {
-            break;
-        }
-        case sddk::device_t::GPU: {
-            rlm_g_.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
-            rlm_dg_.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
-        }
-    }
-}
-
-void Augmentation_operator_gvec_deriv::prepare(Atom_type const& atom_type__,
-    Radial_integrals_aug<false> const& ri__, Radial_integrals_aug<true> const& ri_dq__)
+void Augmentation_operator::generate_pw_coeffs_gvec_deriv(int nu__)
 {
-    PROFILE("sirius::Augmentation_operator_gvec_deriv::prepare");
-
-    int lmax_beta = atom_type__.lmax_beta();
-
-    /* number of beta- radial functions */
-    int nbrf = atom_type__.mt_radial_basis_size();
-
-    auto& mp = get_memory_pool(sddk::memory_t::host);
-
-    ri_values_ = sddk::mdarray<double, 3>(2 * lmax_beta + 1, nbrf * (nbrf + 1) / 2, gvec_.num_gvec_shells_local(), mp);
-    ri_dg_values_ = sddk::mdarray<double, 3>(2 * lmax_beta + 1, nbrf * (nbrf + 1) / 2, gvec_.num_gvec_shells_local(),
-        mp);
-    #pragma omp parallel for
-    for (int j = 0; j < gvec_.num_gvec_shells_local(); j++) {
-        auto ri = ri__.values(atom_type__.id(), gvec_.gvec_shell_len_local(j));
-        auto ri_dg = ri_dq__.values(atom_type__.id(), gvec_.gvec_shell_len_local(j));
-        for (int l = 0; l <= 2 * lmax_beta; l++) {
-            for (int i = 0; i < nbrf * (nbrf + 1) / 2; i++) {
-                ri_values_(l, i, j) = ri(i, l);
-                ri_dg_values_(l, i, j) = ri_dg(i, l);
-            }
-        }
-    }
-
-    /* number of beta-projectors */
-    int nbf = atom_type__.mt_basis_size();
-
-    int idxmax = nbf * (nbf + 1) / 2;
-
-    /* flatten the indices */
-    idx_ = sddk::mdarray<int, 2>(3, idxmax, mp);
-    for (int xi2 = 0; xi2 < nbf; xi2++) {
-        int lm2    = atom_type__.indexb(xi2).lm;
-        int idxrf2 = atom_type__.indexb(xi2).idxrf;
-
-        for (int xi1 = 0; xi1 <= xi2; xi1++) {
-            int lm1    = atom_type__.indexb(xi1).lm;
-            int idxrf1 = atom_type__.indexb(xi1).idxrf;
-
-            /* packed orbital index */
-            int idx12 = utils::packed_index(xi1, xi2);
-            /* packed radial-function index */
-            int idxrf12 = utils::packed_index(idxrf1, idxrf2);
-
-            idx_(0, idx12) = lm1;
-            idx_(1, idx12) = lm2;
-            idx_(2, idx12) = idxrf12;
-        }
-    }
-
-    int gvec_count  = gvec_.count();
-
-    gvec_shell_ = sddk::mdarray<int, 1>(gvec_count, get_memory_pool(sddk::memory_t::host));
-    gvec_cart_ = sddk::mdarray<double, 2>(3, gvec_count, get_memory_pool(sddk::memory_t::host));
-    for (int igloc = 0; igloc < gvec_count; igloc++) {
-        auto gvc = gvec_.gvec_cart<sddk::index_domain_t::local>(igloc);
-        gvec_shell_(igloc) = gvec_.gvec_shell_idx_local(igloc);
-        for (int x: {0, 1, 2}) {
-            gvec_cart_(x, igloc) = gvc[x];
-        }
-    }
-
-    sym_weight_ = sddk::mdarray<double, 1>(nbf * (nbf + 1) / 2, mp, "sym_weight_");
-    for (int xi2 = 0; xi2 < nbf; xi2++) {
-        for (int xi1 = 0; xi1 <= xi2; xi1++) {
-            /* packed orbital index */
-            int idx12          = xi2 * (xi2 + 1) / 2 + xi1;
-            sym_weight_(idx12) = (xi1 == xi2) ? 1 : 2;
-        }
-    }
-
-    switch (atom_type__.parameters().processing_unit()) {
-        case sddk::device_t::CPU: {
-            /* array of plane-wave coefficients */
-            q_pw_ = sddk::mdarray<double, 2>(nbf * (nbf + 1) / 2, 2 * gvec_count, mp, "q_pw_dg_");
-            break;
-        }
-        case sddk::device_t::GPU: {
-            auto& mpd = get_memory_pool(sddk::memory_t::device);
-            ri_values_.allocate(mpd).copy_to(sddk::memory_t::device);
-            ri_dg_values_.allocate(mpd).copy_to(sddk::memory_t::device);
-            idx_.allocate(mpd).copy_to(sddk::memory_t::device);
-            gvec_shell_.allocate(mpd).copy_to(sddk::memory_t::device);
-            gvec_cart_.allocate(mpd).copy_to(sddk::memory_t::device);
-            /* array of plane-wave coefficients */
-            q_pw_ = sddk::mdarray<double, 2>(nbf * (nbf + 1) / 2, 2 * gvec_count, mpd, "q_pw_dg_");
-            break;
-        }
+    if (!atom_type_.augment()) {
+        return;
     }
-}
+    PROFILE("sirius::Augmentation_operator::generate_pw_coeffs_gvec_deriv");
 
-void Augmentation_operator_gvec_deriv::generate_pw_coeffs(Atom_type const& atom_type__, int nu__)
-{
-    PROFILE("sirius::Augmentation_operator_gvec_deriv::generate_pw_coeffs");
+    auto const& tp = gvec_.gvec_tp();
 
     /* maximum l of beta-projectors */
-    int lmax_beta = atom_type__.indexr().lmax();
-
-    int lmax_q = 2 * lmax_beta;
-
-    int lmmax = utils::lmmax(lmax_q);
-
-    auto l_by_lm = utils::l_by_lm(lmax_q);
-
-    sddk::mdarray<std::complex<double>, 1> zilm(lmmax);
-    for (int l = 0, lm = 0; l <= lmax_q; l++) {
-        for (int m = -l; m <= l; m++, lm++) {
-            zilm[lm] = std::pow(std::complex<double>(0, 1), l);
-        }
-    }
+    int lmax_beta = atom_type_.indexr().lmax();
+    int lmmax     = utils::lmmax(2 * lmax_beta);
 
     /* number of beta-projectors */
-    int nbf = atom_type__.mt_basis_size();
-
-    /* split G-vectors between ranks */
-    int gvec_count  = gvec_.count();
+    int nbf = atom_type_.mt_basis_size();
+    /* local number of G-vectors */
+    int gvec_count = gvec_.count();
 
-    switch (atom_type__.parameters().processing_unit()) {
+    switch (atom_type_.parameters().processing_unit()) {
+        case sddk::device_t::GPU:
         case sddk::device_t::CPU: {
-            #pragma omp parallel for schedule(static)
+            /* Gaunt coefficients of three real spherical harmonics */
+            Gaunt_coefficients<double> gaunt_coefs(lmax_beta, 2 * lmax_beta, lmax_beta, SHT::gaunt_rrr);
+            #pragma omp parallel for
             for (int igloc = 0; igloc < gvec_count; igloc++) {
                 /* index of the G-vector shell */
                 int igsh = gvec_.gvec_shell_idx_local(igloc);
+
+                auto gvc = gvec_.gvec_cart<sddk::index_domain_t::local>(igloc);
+                double gvc_nu = gvc[nu__];
+
+                std::vector<double> rlm(lmmax);
+                sddk::mdarray<double, 2> rlm_dq(lmmax, 3);
+                sf::spherical_harmonics(2 * lmax_beta, tp(igloc, 0), tp(igloc, 1), rlm.data());
+                const bool divide_by_r{false};
+                sf::dRlm_dr(2 * lmax_beta, gvc, rlm_dq, divide_by_r);
+
                 std::vector<std::complex<double>> v(lmmax);
-                double gvc_nu = gvec_cart_(nu__, igloc);
                 for (int idx12 = 0; idx12 < nbf * (nbf + 1) / 2; idx12++) {
                     int lm1     = idx_(0, idx12);
                     int lm2     = idx_(1, idx12);
                     int idxrf12 = idx_(2, idx12);
                     for (int lm3 = 0; lm3 < lmmax; lm3++) {
-                        int l = l_by_lm[lm3];
-                        v[lm3] = std::conj(zilm[lm3]) *
-                            (rlm_dg_(lm3, nu__, igloc) * ri_values_(l, idxrf12, igsh) +
-                             rlm_g_(lm3, igloc) * ri_dg_values_(l, idxrf12, igsh) * gvc_nu);
+                        int l = l_by_lm_[lm3];
+                        v[lm3] = std::conj(zilm_[lm3]) * (rlm_dq(lm3, nu__) * ri_values_(idxrf12, l, igsh) +
+                             rlm[lm3] * ri_dq_values_(idxrf12, l, igsh) * gvc_nu);
                     }
-                    std::complex<double> z = fourpi * gaunt_coefs_->sum_L3_gaunt(lm2, lm1, &v[0]);
+                    std::complex<double> z = fourpi * gaunt_coefs.sum_L3_gaunt(lm2, lm1, &v[0]);
                     q_pw_(idx12, 2 * igloc)     = z.real();
                     q_pw_(idx12, 2 * igloc + 1) = z.imag();
                 }
             }
             break;
         }
-        case sddk::device_t::GPU: {
-            auto& mpd = get_memory_pool(sddk::memory_t::device);
-
-            auto gc = gaunt_coefs_->get_full_set_L3();
-            gc.allocate(mpd).copy_to(sddk::memory_t::device);
-
-#if defined(SIRIUS_GPU)
-            aug_op_pw_coeffs_deriv_gpu(gvec_count, gvec_shell_.at(sddk::memory_t::device), gvec_cart_.at(sddk::memory_t::device),
-                idx_.at(sddk::memory_t::device), static_cast<int>(idx_.size(1)),
-                gc.at(sddk::memory_t::device), static_cast<int>(gc.size(0)), static_cast<int>(gc.size(1)),
-                rlm_g_.at(sddk::memory_t::device), rlm_dg_.at(sddk::memory_t::device), static_cast<int>(rlm_g_.size(0)),
-                ri_values_.at(sddk::memory_t::device), ri_dg_values_.at(sddk::memory_t::device), static_cast<int>(ri_values_.size(0)),
-                static_cast<int>(ri_values_.size(1)), q_pw_.at(sddk::memory_t::device), static_cast<int>(q_pw_.size(0)),
-                fourpi, nu__, lmax_q);
-#endif
-            break;
-        }
     }
 }
 
diff --git a/src/density/augmentation_operator.hpp b/src/density/augmentation_operator.hpp
index 8be3150df9..d6662203f6 100644
--- a/src/density/augmentation_operator.hpp
+++ b/src/density/augmentation_operator.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2013-2019 Anton Kozhevnikov, Thomas Schulthess
+// Copyright (c) 2013-2023 Anton Kozhevnikov, Thomas Schulthess
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without modification, are permitted provided that
@@ -28,8 +28,82 @@
 #include "radial/radial_integrals.hpp"
 #include "fft/gvec.hpp"
 
+#if defined(SIRIUS_GPU)
+extern "C" {
+
+void
+aug_op_pw_coeffs_gpu(int ngvec__, int const* gvec_shell__, int const* idx__, int idxmax__,
+        std::complex<double> const* zilm__, int const* l_by_lm__, int lmmax__, double const* gc__, int ld0__,
+        int ld1__, double const* gvec_rlm__, int ld2__, double const* ri_values__, int ld3__, int ld4__,
+        double* q_pw__, int ld5__, double fourpi_omega__);
+
+void
+aug_op_pw_coeffs_deriv_gpu(int ngvec__, int const* gvec_shell__, double const* gvec_cart__, int const* idx__,
+        int idxmax__, double const* gc__, int ld0__, int ld1__, double const* rlm__, double const* rlm_dg__, int ld2__,
+        double const* ri_values__, double const* ri_dg_values__, int ld3__, int ld4__, double* q_pw__, int ld5__,
+        double fourpi__, int nu__, int lmax_q__);
+
+void
+spherical_harmonics_rlm_gpu(int lmax__, int ntp__, double const* theta__, double const* phi__, double* rlm__, int ld__);
+
+}
+#endif
+
 namespace sirius {
 
+template <typename F>
+inline void iterate_aug_atom_types(Unit_cell const& uc__, F&& f__)
+{
+    for (int iat = 0; iat < uc__.num_atom_types(); iat++) {
+        auto& atom_type = uc__.atom_type(iat);
+
+        if (!atom_type.augment() || atom_type.num_atoms() == 0) {
+            continue;
+        }
+        f__(atom_type);
+    }
+}
+
+inline auto max_l_aug(Unit_cell const& uc__)
+{
+    int l{0};
+
+    iterate_aug_atom_types(uc__,
+        [&l](Atom_type const& type__)
+        {
+            l = std::max(l, type__.indexr().lmax());
+        });
+
+    return l;
+}
+
+inline auto max_na_aug(Unit_cell const& uc__)
+{
+    int na{0};
+
+    iterate_aug_atom_types(uc__,
+        [&na](Atom_type const& type__)
+        {
+            na = std::max(na, type__.num_atoms());
+        });
+
+    return na;
+
+}
+
+inline auto max_nb_aug(Unit_cell const& uc__)
+{
+    int nb{0};
+
+    iterate_aug_atom_types(uc__,
+        [&nb](Atom_type const& type__)
+        {
+            nb = std::max(nb, type__.mt_basis_size());
+        });
+
+    return nb;
+}
+
 /// Augmentation charge operator Q(r) of the ultrasoft pseudopotential formalism.
 /** This class generates and stores the plane-wave coefficients of the augmentation charge operator for
     a given atom type. */
@@ -42,43 +116,142 @@ class Augmentation_operator
 
     sddk::mdarray<double, 2> q_mtrx_;
 
-    mutable sddk::mdarray<double, 2> q_pw_;
+    sddk::mdarray<double, 2> q_pw_;
+
+    sddk::mdarray<double, 1> sym_weight_;
+
+    sddk::mdarray<std::complex<double>, 1> zilm_;
+
+    sddk::mdarray<double, 3> ri_values_;
+    sddk::mdarray<double, 3> ri_dq_values_;
+
+    sddk::mdarray<int, 2> idx_;
 
-    mutable sddk::mdarray<double, 1> sym_weight_;
+    sddk::mdarray<int, 1> l_by_lm_;
 
   public:
-    Augmentation_operator(Atom_type const& atom_type__, fft::Gvec const& gvec__)
+    /// Constructor.
+    /**\param [in] atom_type        Atom type instance.
+     * \param [in] gvec             G-vector instance.
+     * \param [in] radial_integrals Radial integrals of the Q(r) with spherical Bessel functions.
+     */
+    Augmentation_operator(Atom_type const& atom_type__, fft::Gvec const& gvec__,
+        Radial_integrals_aug<false> const& ri__, Radial_integrals_aug<true> const& ri_dq__)
         : atom_type_(atom_type__)
         , gvec_(gvec__)
     {
-    }
+        int lmax_beta = atom_type_.indexr().lmax();
+        int lmax      = 2 * lmax_beta;
+        int lmmax     = utils::lmmax(lmax);
+
+        /* compute l of lm index */
+        auto l_by_lm = utils::l_by_lm(lmax);
+        l_by_lm_ = sddk::mdarray<int, 1>(lmmax);
+        std::copy(l_by_lm.begin(), l_by_lm.end(), &l_by_lm_[0]);
+
+        /* compute i^l array */
+        zilm_ = sddk::mdarray<std::complex<double>, 1>(lmmax);
+        for (int l = 0, lm = 0; l <= lmax; l++) {
+            for (int m = -l; m <= l; m++, lm++) {
+                zilm_[lm] = std::pow(std::complex<double>(0, 1), l);
+            }
+        }
+
+        /* number of beta-projectors */
+        int nbf = atom_type_.mt_basis_size();
+        /* number of beta-projector radial functions */
+        int nbrf = atom_type__.mt_radial_basis_size();
+        /* only half of Q_{xi,xi'}(G) matrix is stored */
+        int nqlm = nbf * (nbf + 1) / 2;
+
+        /* flatten the indices */
+        idx_ = sddk::mdarray<int, 2>(3, nqlm);
+        for (int xi2 = 0; xi2 < nbf; xi2++) {
+            int lm2    = atom_type_.indexb(xi2).lm;
+            int idxrf2 = atom_type_.indexb(xi2).idxrf;
+
+            for (int xi1 = 0; xi1 <= xi2; xi1++) {
+                int lm1    = atom_type_.indexb(xi1).lm;
+                int idxrf1 = atom_type_.indexb(xi1).idxrf;
+
+                /* packed orbital index */
+                int idx12 = utils::packed_index(xi1, xi2);
+                /* packed radial-function index */
+                int idxrf12 = utils::packed_index(idxrf1, idxrf2);
+
+                idx_(0, idx12) = lm1;
+                idx_(1, idx12) = lm2;
+                idx_(2, idx12) = idxrf12;
+            }
+        }
 
-    void generate_pw_coeffs(Radial_integrals_aug<false> const& radial_integrals__, sddk::mdarray<double, 2> const& tp__,
-        sddk::memory_pool& mp__, sddk::memory_pool* mpd__);
+        ri_values_ = sddk::mdarray<double, 3>(nbrf * (nbrf + 1) / 2, lmax + 1, gvec_.num_gvec_shells_local());
+        ri_dq_values_ = sddk::mdarray<double, 3>(nbrf * (nbrf + 1) / 2, lmax + 1, gvec_.num_gvec_shells_local());
+        #pragma omp parallel for
+        for (int j = 0; j < gvec_.num_gvec_shells_local(); j++) {
+            auto ri = ri__.values(atom_type_.id(), gvec_.gvec_shell_len_local(j));
+            auto ri_dq = ri_dq__.values(atom_type__.id(), gvec_.gvec_shell_len_local(j));
+            for (int l = 0; l <= lmax; l++) {
+                for (int i = 0; i < nbrf * (nbrf + 1) / 2; i++) {
+                    ri_values_(i, l, j) = ri(i, l);
+                    ri_dq_values_(i, l, j) = ri_dq(i, l);
+                }
+            }
+        }
 
-    void prepare(stream_id sid, sddk::memory_pool* mp__) const
-    {
-        if (atom_type_.parameters().processing_unit() == sddk::device_t::GPU && atom_type_.augment()) {
-            if (mp__) {
-                sym_weight_.allocate(*mp__);
-                q_pw_.allocate(*mp__);
-            } else {
-                sym_weight_.allocate(sddk::memory_t::device);
-                q_pw_.allocate(sddk::memory_t::device);
+        sym_weight_ = sddk::mdarray<double, 1>(nqlm);
+        for (int xi2 = 0; xi2 < nbf; xi2++) {
+            for (int xi1 = 0; xi1 <= xi2; xi1++) {
+                /* packed orbital index */
+                int idx12          = utils::packed_index(xi1, xi2);
+                sym_weight_(idx12) = (xi1 == xi2) ? 1 : 2;
             }
-            sym_weight_.copy_to(sddk::memory_t::device, sid);
-            q_pw_.copy_to(sddk::memory_t::device, sid);
         }
-    }
 
-    void dismiss() const
-    {
-        if (atom_type_.parameters().processing_unit() == sddk::device_t::GPU && atom_type_.augment()) {
-            q_pw_.deallocate(sddk::memory_t::device);
-            sym_weight_.deallocate(sddk::memory_t::device);
+        if (atom_type_.parameters().processing_unit() == sddk::device_t::GPU) {
+            auto& mpd = sddk::get_memory_pool(sddk::memory_t::device);
+            sym_weight_.allocate(mpd).copy_to(sddk::memory_t::device);
         }
+
+        /* allocate array of plane-wave coefficients */
+        auto mt = (atom_type_.parameters().processing_unit() == sddk::device_t::CPU) ? sddk::memory_t::host :
+            sddk::memory_t::host_pinned;
+        q_pw_ = sddk::mdarray<double, 2>(nqlm, 2 * gvec_.count(), sddk::get_memory_pool(mt), "q_pw_");
+
     }
 
+// TODO: not used at the moment, evaluate the possibility to remove in the future
+//    /// Generate chunk of plane-wave coefficients on the GPU.
+//    void generate_pw_coeffs_chunk_gpu(int g_begin__, int ng__, double const* gvec_rlm__, int ld__,
+//            sddk::mdarray<double, 2>& qpw__) const
+//    {
+//#if defined(SIRIUS_GPU)
+//        double fourpi_omega = fourpi / gvec_.omega();
+//
+//        /* maximum l of beta-projectors */
+//        int lmax_beta = atom_type_.indexr().lmax();
+//        int lmmax     = utils::lmmax(2 * lmax_beta);
+//        /* number of beta-projectors */
+//        int nbf = atom_type_.mt_basis_size();
+//        /* only half of Q_{xi,xi'}(G) matrix is stored */
+//        int nqlm = nbf * (nbf + 1) / 2;
+//        /* generate Q(G) */
+//        aug_op_pw_coeffs_gpu(ng__, gvec_shell_.at(sddk::memory_t::device, g_begin__), idx_.at(sddk::memory_t::device),
+//            nqlm, zilm_.at(sddk::memory_t::device), l_by_lm_.at(sddk::memory_t::device), lmmax,
+//            gaunt_coefs_.at(sddk::memory_t::device), static_cast<int>(gaunt_coefs_.size(0)),
+//            static_cast<int>(gaunt_coefs_.size(1)), gvec_rlm__, ld__,
+//            ri_values_.at(sddk::memory_t::device), static_cast<int>(ri_values_.size(0)),
+//            static_cast<int>(ri_values_.size(1)), qpw__.at(sddk::memory_t::device), static_cast<int>(qpw__.size(0)),
+//            fourpi_omega);
+//#endif
+//    }
+
+    /// Generate Q_{xi,xi'}(G) plane wave coefficients.
+    void generate_pw_coeffs();
+
+    /// Generate G-vector derivative Q_{xi,xi'}(G)/dG of the plane-wave coefficients */
+    void generate_pw_coeffs_gvec_deriv(int nu__);
+
     auto const& q_pw() const
     {
         return q_pw_;
@@ -89,17 +262,6 @@ class Augmentation_operator
         return q_pw_(i__, ig__);
     }
 
-    /// Set Q-matrix.
-    void q_mtrx(sddk::mdarray<double, 2> const& q_mtrx__)
-    {
-        int nbf = atom_type_.mt_basis_size();
-        for (int i = 0; i < nbf; i++) {
-            for (int j = 0; j < nbf; j++) {
-                q_mtrx_(j, i) = q_mtrx__(j, i);
-            }
-        }
-    }
-
     /// Get values of the Q-matrix.
     inline double q_mtrx(int xi1__, int xi2__) const
     {
@@ -124,91 +286,6 @@ class Augmentation_operator
     }
 };
 
-// TODO:
-// can't cache it in the simulation context because each time the lattice is updated, PW coefficients must be
-// recomputed; so, the only way to accelerate it is to move to GPUs..
-
-/// Derivative of augmentation operator PW coefficients with respect to the Cartesian component of G-vector.
-class Augmentation_operator_gvec_deriv
-{
-  private:
-    fft::Gvec const& gvec_;
-
-    sddk::mdarray<double, 2> q_pw_;
-
-    sddk::mdarray<double, 1> sym_weight_;
-
-    sddk::mdarray<double, 2> rlm_g_;
-
-    sddk::mdarray<double, 3> rlm_dg_;
-
-    sddk::mdarray<double, 3> ri_values_;
-
-    sddk::mdarray<double, 3> ri_dg_values_;
-
-    sddk::mdarray<int, 2> idx_;
-
-    sddk::mdarray<double, 2> gvec_cart_;
-
-    sddk::mdarray<int, 1> gvec_shell_;
-
-    std::unique_ptr<Gaunt_coefficients<double>> gaunt_coefs_;
-
-  public:
-    Augmentation_operator_gvec_deriv(Simulation_parameters const& param__, int lmax__, fft::Gvec const& gvec__,
-        sddk::mdarray<double, 2> const& tp__);
-
-    void generate_pw_coeffs(Atom_type const& atom_type__, int nu__);
-
-    void prepare(Atom_type const& atom_type__, Radial_integrals_aug<false> const& ri__,
-        Radial_integrals_aug<true> const& ri_dq__);
-
-    //void prepare(int stream_id__) const
-    //{
-    //    #ifdef SIRIUS_GPU
-    //    if (atom_type_.parameters().processing_unit() == GPU && atom_type_.pp_desc().augment) {
-    //        sym_weight_.allocate(memory_t::device);
-    //        sym_weight_.async_copy_to_device(stream_id__);
-
-    //        q_pw_.allocate(memory_t::device);
-    //        q_pw_.async_copy_to_device(stream_id__);
-    //    }
-    //    #endif
-    //}
-
-    //void dismiss() const
-    //{
-    //    #ifdef SIRIUS_GPU
-    //    if (atom_type_.parameters().processing_unit() == GPU && atom_type_.pp_desc().augment) {
-    //        q_pw_.deallocate_on_device();
-    //        sym_weight_.deallocate_on_device();
-    //    }
-    //    #endif
-    //}
-
-    auto const& q_pw() const
-    {
-        return q_pw_;
-    }
-
-    auto& q_pw()
-    {
-        return q_pw_;
-    }
-
-    double q_pw(int i__, int ig__) const
-    {
-        return q_pw_(i__, ig__);
-    }
-
-    /// Weight of Q_{\xi,\xi'}.
-    /** 2 if off-diagonal (xi != xi'), 1 if diagonal (xi=xi') */
-    inline double sym_weight(int idx__) const
-    {
-        return sym_weight_(idx__);
-    }
-};
-
 } // namespace sirius
 
 #endif // __AUGMENTATION_OPERATOR_H__
diff --git a/src/density/density.cpp b/src/density/density.cpp
index d69fbb1cb1..ee3f57beb0 100644
--- a/src/density/density.cpp
+++ b/src/density/density.cpp
@@ -1357,43 +1357,38 @@ Density::generate_rho_aug()
 {
     PROFILE("sirius::Density::generate_rho_aug");
 
-    auto spl_ngv_loc = ctx_.split_gvec_local();
+    /* local number of G-vectors */
+    int gvec_count = ctx_.gvec().count();
+    auto spl_ngv_loc = utils::split_in_blocks(gvec_count, ctx_.cfg().control().gvec_chunk_size());
+
+    auto& mph = get_memory_pool(sddk::memory_t::host);
+    auto& mpd = get_memory_pool(sddk::memory_t::device);
+
+    sddk::mdarray<std::complex<double>, 2> rho_aug(gvec_count, ctx_.num_mag_dims() + 1, mph);
 
-    sddk::mdarray<std::complex<double>, 2> rho_aug(ctx_.gvec().count(), ctx_.num_mag_dims() + 1,
-                                             get_memory_pool(sddk::memory_t::host));
     switch (ctx_.processing_unit()) {
         case sddk::device_t::CPU: {
             rho_aug.zero(sddk::memory_t::host);
             break;
         }
         case sddk::device_t::GPU: {
-            rho_aug.allocate(get_memory_pool(sddk::memory_t::device)).zero(sddk::memory_t::device);
+            rho_aug.allocate(mpd).zero(sddk::memory_t::device);
             break;
         }
     }
 
-    // TODO: the GPU memory consumption here is huge, rewrite this; split gloc in blocks and
-    //       overlap transfer of Q(G) for two consequtive blocks within one atom type
-
-    if (ctx_.unit_cell().atom_type(0).augment()) {
-        ctx_.augmentation_op(0).prepare(stream_id(0), &get_memory_pool(sddk::memory_t::device));
-    }
-
+    /* add contribution to Q(G) from atoms of all types */
     for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
         auto& atom_type = unit_cell_.atom_type(iat);
 
-        if (ctx_.processing_unit() == sddk::device_t::GPU) {
-            acc::sync_stream(stream_id(0));
-            if (iat + 1 != unit_cell_.num_atom_types() && ctx_.unit_cell().atom_type(iat + 1).augment()) {
-                ctx_.augmentation_op(iat + 1).prepare(stream_id(0), &get_memory_pool(sddk::memory_t::device));
-            }
-        }
-
         if (!atom_type.augment() || atom_type.num_atoms() == 0) {
             continue;
         }
 
+        /* number of beta-projector functions */
         int nbf = atom_type.mt_basis_size();
+        /* number of Q_{xi,xi'} components for each G */
+        int nqlm = nbf * (nbf + 1) / 2;
 
         /* convert to real matrix */
         auto dm = density_matrix_aux(this->density_matrix(), iat);
@@ -1402,11 +1397,11 @@ Density::generate_rho_aug()
             auto cs = dm.checksum();
             utils::print_checksum("density_matrix_aux", cs, ctx_.out());
         }
-        /* treat auxiliary array as double with x2 size */
-        sddk::mdarray<double, 2> dm_pw(nbf * (nbf + 1) / 2, spl_ngv_loc.local_size() * 2,
-                                       get_memory_pool(sddk::memory_t::host));
-        sddk::mdarray<double, 2> phase_factors(atom_type.num_atoms(), spl_ngv_loc.local_size() * 2,
-                                               get_memory_pool(sddk::memory_t::host));
+
+        int ndm_pw = (ctx_.processing_unit() == sddk::device_t::CPU) ? 1 : ctx_.num_mag_dims() + 1;
+        /* treat auxiliary array as real with x2 size */
+        sddk::mdarray<double, 3> dm_pw(nqlm, spl_ngv_loc[0] * 2, ndm_pw, mph);
+        sddk::mdarray<double, 2> phase_factors(atom_type.num_atoms(), spl_ngv_loc[0] * 2, mph);
 
         print_memory_usage(ctx_.out(), FILE_LINE);
 
@@ -1415,51 +1410,60 @@ Density::generate_rho_aug()
                 break;
             }
             case sddk::device_t::GPU: {
-                phase_factors.allocate(get_memory_pool(sddk::memory_t::device));
-                dm_pw.allocate(get_memory_pool(sddk::memory_t::device));
-                dm.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
+                phase_factors.allocate(mpd);
+                dm_pw.allocate(mpd);
+                dm.allocate(mpd).copy_to(sddk::memory_t::device);
                 break;
             }
         }
 
         print_memory_usage(ctx_.out(), FILE_LINE);
 
-        for (int ib = 0; ib < spl_ngv_loc.num_ranks(); ib++) {
-            int g_begin = spl_ngv_loc.global_index(0, ib);
-            int g_end   = g_begin + spl_ngv_loc.local_size(ib);
+        auto qpw = (ctx_.processing_unit() ==  sddk::device_t::CPU) ? sddk::mdarray<double, 2>() :
+            sddk::mdarray<double, 2>(nqlm, 2 * spl_ngv_loc[0], mpd, "qpw");
+
+        int g_begin{0};
+        /* loop over blocks of G-vectors */
+        for (auto ng : spl_ngv_loc) {
 
+            /* work on the block of the local G-vectors */
             switch (ctx_.processing_unit()) {
                 case sddk::device_t::CPU: {
-                    #pragma omp parallel for schedule(static)
-                    for (int igloc = g_begin; igloc < g_end; igloc++) {
-                        int ig = ctx_.gvec().offset() + igloc;
+                    /* generate phase factors */
+                    #pragma omp parallel for
+                    for (int g = 0; g < ng; g++) {
+                        int ig = ctx_.gvec().offset() + g_begin + g;
                         for (int i = 0; i < atom_type.num_atoms(); i++) {
-                            int ia                                      = atom_type.atom_id(i);
-                            std::complex<double> z                            = std::conj(ctx_.gvec_phase_factor(ig, ia));
-                            phase_factors(i, 2 * (igloc - g_begin))     = z.real();
-                            phase_factors(i, 2 * (igloc - g_begin) + 1) = z.imag();
+                            int ia = atom_type.atom_id(i);
+                            auto z = std::conj(ctx_.gvec_phase_factor(ig, ia));
+                            phase_factors(i, 2 * g)     = z.real();
+                            phase_factors(i, 2 * g + 1) = z.imag();
                         }
                     }
                     for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
                         PROFILE_START("sirius::Density::generate_rho_aug|gemm");
-                        la::wrap(la::lib_t::blas)
-                            .gemm('N', 'N', nbf * (nbf + 1) / 2, 2 * spl_ngv_loc.local_size(ib), atom_type.num_atoms(),
-                                  &la::constant<double>::one(), dm.at(sddk::memory_t::host, 0, 0, iv), dm.ld(),
-                                  phase_factors.at(sddk::memory_t::host), phase_factors.ld(), &la::constant<double>::zero(),
-                                  dm_pw.at(sddk::memory_t::host, 0, 0), dm_pw.ld());
+                        la::wrap(la::lib_t::blas).gemm('N', 'N', nqlm, 2 * ng, atom_type.num_atoms(),
+                                  &la::constant<double>::one(),
+                                  dm.at(sddk::memory_t::host, 0, 0, iv), dm.ld(),
+                                  phase_factors.at(sddk::memory_t::host), phase_factors.ld(),
+                                  &la::constant<double>::zero(),
+                                  dm_pw.at(sddk::memory_t::host, 0, 0, 0), dm_pw.ld());
                         PROFILE_STOP("sirius::Density::generate_rho_aug|gemm");
                         PROFILE_START("sirius::Density::generate_rho_aug|sum");
                         #pragma omp parallel for
-                        for (int igloc = g_begin; igloc < g_end; igloc++) {
+                        for (int g = 0; g < ng; g++) {
+                            int igloc = g_begin + g;
                             std::complex<double> zsum(0, 0);
                             /* get contribution from non-diagonal terms */
-                            for (int i = 0; i < nbf * (nbf + 1) / 2; i++) {
-                                std::complex<double> z1 = std::complex<double>(ctx_.augmentation_op(iat).q_pw(i, 2 * igloc),
-                                                                   ctx_.augmentation_op(iat).q_pw(i, 2 * igloc + 1));
-                                std::complex<double> z2(dm_pw(i, 2 * (igloc - g_begin)), dm_pw(i, 2 * (igloc - g_begin) + 1));
+                            for (int i = 0; i < nqlm; i++) {
+                                std::complex<double> z1(ctx_.augmentation_op(iat).q_pw(i, 2 * igloc),
+                                                        ctx_.augmentation_op(iat).q_pw(i, 2 * igloc + 1));
+                                std::complex<double> z2(dm_pw(i, 2 * g, 0),
+                                                        dm_pw(i, 2 * g + 1, 0));
 
                                 zsum += z1 * z2 * ctx_.augmentation_op(iat).sym_weight(i);
                             }
+                            /* add contribution from atoms of a given type */
                             rho_aug(igloc, iv) += zsum;
                         }
                         PROFILE_STOP("sirius::Density::generate_rho_aug|sum");
@@ -1468,29 +1472,31 @@ Density::generate_rho_aug()
                 }
                 case sddk::device_t::GPU: {
 #if defined(SIRIUS_GPU)
+                    acc::copyin(qpw.at(sddk::memory_t::device),
+                            ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin), 2 * ng * nqlm);
+
                     for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
-                        generate_dm_pw_gpu(atom_type.num_atoms(), spl_ngv_loc.local_size(ib), nbf,
+                        generate_dm_pw_gpu(atom_type.num_atoms(), ng, nbf,
                                            ctx_.unit_cell().atom_coord(iat).at(sddk::memory_t::device),
                                            ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 0),
                                            ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 1),
                                            ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 2),
                                            phase_factors.at(sddk::memory_t::device), dm.at(sddk::memory_t::device, 0, 0, iv),
-                                           dm_pw.at(sddk::memory_t::device), 1);
-                        sum_q_pw_dm_pw_gpu(spl_ngv_loc.local_size(ib), nbf,
-                                           ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::device, 0, 2 * g_begin),
-                                           dm_pw.at(sddk::memory_t::device),
+                                           dm_pw.at(sddk::memory_t::device, 0, 0, iv), 1 + iv);
+                        sum_q_pw_dm_pw_gpu(ng, nbf, qpw.at(sddk::memory_t::device), qpw.ld(),
+                                           dm_pw.at(sddk::memory_t::device, 0, 0, iv), dm_pw.ld(),
                                            ctx_.augmentation_op(iat).sym_weight().at(sddk::memory_t::device),
-                                           rho_aug.at(sddk::memory_t::device, g_begin, iv), 1);
+                                           rho_aug.at(sddk::memory_t::device, g_begin, iv), 1 + iv);
+                    }
+                    for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
+                        acc::sync_stream(stream_id(1 + iv));
                     }
 #endif
                     break;
                 }
-            }
-        }
+            } // switch (pu)
 
-        if (ctx_.processing_unit() == sddk::device_t::GPU) {
-            acc::sync_stream(stream_id(1));
-            ctx_.augmentation_op(iat).dismiss();
+            g_begin += ng;
         }
     }
 
diff --git a/src/density/density.hpp b/src/density/density.hpp
index 8e4f5d953c..483ebf5e75 100644
--- a/src/density/density.hpp
+++ b/src/density/density.hpp
@@ -63,7 +63,7 @@ generate_dm_pw_gpu(int num_atoms__, int num_gvec_loc__, int num_beta__, double c
                         double const* dm__, double* dm_pw__, int stream_id__);
 
 void
-sum_q_pw_dm_pw_gpu(int num_gvec_loc__, int nbf__, double const* q_pw__, double const* dm_pw__,
+sum_q_pw_dm_pw_gpu(int num_gvec_loc__, int nbf__, double const* q_pw__, int ldq__, double const* dm_pw__, int ldd__,
                         double const* sym_weight__, std::complex<double>* rho_pw__, int stream_id__);
 
 }
diff --git a/src/fft/gvec.cpp b/src/fft/gvec.cpp
index 6b76f1997d..5768f15ee9 100644
--- a/src/fft/gvec.cpp
+++ b/src/fft/gvec.cpp
@@ -29,7 +29,7 @@
 
 namespace fft {
 
-r3::vector<int> fft::Gvec::gvec_by_full_index(uint32_t idx__) const
+r3::vector<int> Gvec::gvec_by_full_index(uint32_t idx__) const
 {
     /* index of the z coordinate of G-vector: first 12 bits */
     uint32_t j = idx__ & 0xFFF;
@@ -43,7 +43,7 @@ r3::vector<int> fft::Gvec::gvec_by_full_index(uint32_t idx__) const
     return r3::vector<int>(x, y, z);
 }
 
-void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__)
+void Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__)
 {
     PROFILE("fft::Gvec::find_z_columns");
 
@@ -71,7 +71,7 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__)
             /* get z-coordinate of G-vector */
             int k = fft_box__.freq_by_coord<2>(iz);
             /* take G+k */
-            auto vgk = dot(lattice_vectors_, (r3::vector<double>(i, j, k) + vk_));
+            auto vgk = r3::dot(lattice_vectors_, (r3::vector<double>(i, j, k) + vk_));
             /* add z-coordinate of G-vector to the list */
             if (vgk.length() <= Gmax__) {
                 zcol.push_back(k);
@@ -95,6 +95,7 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__)
         }
     };
 
+    PROFILE_START("fft::Gvec::find_z_columns|add");
     /* copy column order from previous G-vector set */
     if (gvec_base_) {
         for (int icol = 0; icol < gvec_base_->num_zcol(); icol++) {
@@ -110,6 +111,7 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__)
             add_new_column(i, j);
         }
     }
+    PROFILE_STOP("fft::Gvec::find_z_columns|add");
 
     if (!gvec_base_) {
         /* put column with {x, y} = {0, 0} to the beginning */
@@ -121,14 +123,10 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__)
         }
     }
 
-    /* sort z-columns starting from the second or skip num_zcol of base distribution */
-    int n = (gvec_base_) ? gvec_base_->num_zcol() : 1;
-    std::sort(z_columns_.begin() + n, z_columns_.end(),
-              [](z_column_descriptor const& a, z_column_descriptor const& b) { return a.z.size() > b.z.size(); });
-
+    PROFILE_START("fft::Gvec::find_z_columns|sym");
     /* now we have to remove edge G-vectors that don't form a complete shell */
     if (bare_gvec_) {
-        auto lat_sym = sirius::find_lat_sym(lattice_vectors_, 1e-6);
+        auto lat_sym = sirius::find_lat_sym(this->unit_cell_lattice_vectors(), this->sym_tol_);
 
         std::fill(non_zero_columns.at(sddk::memory_t::host), non_zero_columns.at(sddk::memory_t::host) + non_zero_columns.size(), -1);
         for (int i = 0; i < static_cast<int>(z_columns_.size()); i++) {
@@ -137,47 +135,72 @@ void fft::Gvec::find_z_columns(double Gmax__, fft::Grid const& fft_box__)
 
         std::vector<z_column_descriptor> z_columns_tmp;
 
-        num_gvec_ = 0;
-        for (int i = 0; i < static_cast<int>(z_columns_.size()); i++) {
+        auto remove_incomplete = [this, &z_columns_tmp, &lat_sym, &non_zero_columns](int i) // i - index of column
+        {
+            int z_min = (2 << 20);
+            int z_max = -(2 << 20);
+            for (int iz = 0; iz < static_cast<int>(z_columns_[i].z.size()); iz++) {
+                z_min = std::min(z_min, z_columns_[i].z[iz]);
+                z_max = std::max(z_max, z_columns_[i].z[iz]);
+            }
             std::vector<int> z;
             for (int iz = 0; iz < static_cast<int>(z_columns_[i].z.size()); iz++) {
-                r3::vector<int> G(z_columns_[i].x, z_columns_[i].y, z_columns_[i].z[iz]);
                 bool found_for_all_sym{true};
-                for (auto& R: lat_sym) {
-                    /* apply lattice symmeetry operation to a G-vector */
-                    auto G1 = dot(R, G);
-                    if (reduce_gvec_) {
-                        if (G1[0] == 0 && G1[1] == 0) {
-                            G1[2] = std::abs(G1[2]);
+                /* check only first or last z coordinate inside z column */
+                if (z_columns_[i].z[iz] == z_min || z_columns_[i].z[iz] == z_max) {
+                    r3::vector<int> G(z_columns_[i].x, z_columns_[i].y, z_columns_[i].z[iz]);
+                    for (auto& R: lat_sym) {
+                        /* apply lattice symmeetry operation to a G-vector */
+                        auto G1 = r3::dot(G, R);
+                        if (reduce_gvec_) {
+                            if (G1[0] == 0 && G1[1] == 0) {
+                                G1[2] = std::abs(G1[2]);
+                            }
                         }
-                    }
-                    int i1 = non_zero_columns(G1[0], G1[1]);
-                    if (i1 == -1) {
-                        G1 = G1 * (-1);
-                        i1 = non_zero_columns(G1[0], G1[1]);
+                        int i1 = non_zero_columns(G1[0], G1[1]);
                         if (i1 == -1) {
-                            std::stringstream s;
-                            s << "index of z-column is not found" << std::endl
-                              << "  G : " << G << std::endl
-                              << "  G1 : " << G1;
-                            RTE_THROW(s);
+                            G1 = G1 * (-1);
+                            i1 = non_zero_columns(G1[0], G1[1]);
+                            if (i1 == -1) {
+                                std::stringstream s;
+                                s << "index of z-column is not found" << std::endl
+                                  << "  G : " << G << std::endl
+                                  << "  G1 : " << G1;
+                                RTE_THROW(s);
+                            }
                         }
-                    }
 
-                    bool found = (std::find(z_columns_[i1].z.begin(), z_columns_[i1].z.end(), G1[2]) != std::end(z_columns_[i1].z));
-                    found_for_all_sym = found_for_all_sym && found;
-                } // R
+                        bool found = (std::find(z_columns_[i1].z.begin(), z_columns_[i1].z.end(), G1[2]) != std::end(z_columns_[i1].z));
+                        found_for_all_sym = found_for_all_sym && found;
+                    } // R
+                }
                 if (found_for_all_sym) {
                     z.push_back(z_columns_[i].z[iz]);
                 }
             } // iz
             if (z.size()) {
                 z_columns_tmp.push_back(z_column_descriptor(z_columns_[i].x, z_columns_[i].y, z));
-                num_gvec_ += static_cast<int>(z.size());
             }
+        };
+
+        for (int i = 0; i < static_cast<int>(z_columns_.size()); i++) {
+            remove_incomplete(i);
         }
         z_columns_ = z_columns_tmp;
+
+        num_gvec_ = 0;
+        for (auto& zc : z_columns_) {
+            num_gvec_ += static_cast<int>(zc.z.size());
+        }
     }
+    PROFILE_STOP("fft::Gvec::find_z_columns|sym");
+
+    PROFILE_START("fft::Gvec::find_z_columns|sort");
+    /* sort z-columns starting from the second or skip num_zcol of base distribution */
+    int n = (gvec_base_) ? gvec_base_->num_zcol() : 1;
+    std::sort(z_columns_.begin() + n, z_columns_.end(),
+              [](z_column_descriptor const& a, z_column_descriptor const& b) { return a.z.size() > b.z.size(); });
+    PROFILE_STOP("fft::Gvec::find_z_columns|sort");
 }
 
 void Gvec::distribute_z_columns()
@@ -255,7 +278,7 @@ void Gvec::find_gvec_shells()
 
     PROFILE("fft::Gvec::find_gvec_shells");
 
-    auto lat_sym = sirius::find_lat_sym(lattice_vectors_, 1e-6);
+    auto lat_sym = sirius::find_lat_sym(this->unit_cell_lattice_vectors(), this->sym_tol_);
 
     num_gvec_shells_ = 0;
     gvec_shell_      = sddk::mdarray<int, 1>(num_gvec_, sddk::memory_t::host, "gvec_shell_");
@@ -264,10 +287,11 @@ void Gvec::find_gvec_shells()
 
     /* find G-vector shells using symmetry consideration */
     for (int ig = 0; ig < num_gvec_; ig++) {
+        /* if the shell for this vector is not yet found */
         if (gvec_shell_[ig] == -1) {
             auto G = gvec<sddk::index_domain_t::global>(ig);
             for (auto& R: lat_sym) {
-                auto G1 = dot(R, G);
+                auto G1 = r3::dot(G, R);
                 auto ig1 = index_by_gvec(G1);
                 if (ig1 == -1) {
                     G1 = G1 * (-1);
@@ -276,7 +300,24 @@ void Gvec::find_gvec_shells()
                         RTE_THROW("symmetry-related G-vector is not found");
                     }
                 }
-                gvec_shell_[ig1] = num_gvec_shells_;
+                if (gvec_shell_[ig1] == -1) {
+                    gvec_shell_[ig1] = num_gvec_shells_;
+                } else {
+                    if (gvec_shell_[ig1] != num_gvec_shells_) {
+                        auto gc  = r3::dot(lattice_vectors_, G);
+                        auto gc1  = r3::dot(lattice_vectors_, G1);
+                        std::stringstream s;
+                        s << "Error in G-vector shell index" << std::endl
+                          << "  G : " << G << std::endl
+                          << "  rotated G : " << G1 << std::endl
+                          << "  current shell index : " << num_gvec_shells_ << std::endl
+                          << "  rotated G shell index : " << gvec_shell_[ig1] << std::endl
+                          << "  length of G : " << gc.length() << std::endl
+                          << "  length of rotated G : " << gc1.length() << std::endl
+                          << "  length difference : " << std::abs(gc.length() - gc1.length());
+                        RTE_THROW(s);
+                    }
+                }
             }
             num_gvec_shells_++;
         }
@@ -288,62 +329,19 @@ void Gvec::find_gvec_shells()
     }
 
     gvec_shell_len_ = sddk::mdarray<double, 1>(num_gvec_shells_, sddk::memory_t::host, "gvec_shell_len_");
-    std::fill(&gvec_shell_len_[0], &gvec_shell_len_[0] + num_gvec_shells_, -1);
+    std::fill(&gvec_shell_len_[0], &gvec_shell_len_[0] + num_gvec_shells_, 0);
+
+    std::vector<int> ngv_sh(num_gvec_shells_, 0);
 
     for (int ig = 0; ig < num_gvec_; ig++) {
         auto g   = gvec_cart<sddk::index_domain_t::global>(ig).length();
         int igsh = gvec_shell_[ig];
-        if (gvec_shell_len_[igsh] < 0) {
-            gvec_shell_len_[igsh] = g;
-        } else {
-            /* lattice symmetries were found with 1e-6 tolererance for the metric tensor,
-               so tolerance on length should be square root of that */
-            if (std::abs(gvec_shell_len_[igsh] - g) > 1e-3) {
-                std::stringstream s;
-                s << "wrong G-vector length" << std::endl
-                  << "  length of G-vector : " << g << std::endl
-                  << "  length of G-shell : " << gvec_shell_len_[igsh] << std::endl
-                  << "  index of G-vector : " << ig << std::endl
-                  << "  index of G-shell : " << igsh << std::endl
-                  << "  length difference : " << std::abs(gvec_shell_len_[igsh] - g) << std::endl;
-                RTE_THROW(s);
-            }
-        }
+        gvec_shell_len_[igsh] += g;
+        ngv_sh[igsh]++;
     }
-
-    // TODO: maybe, make an average G-shell length.
-
-    /* list of pairs (length, index of G-vector) */
-    std::vector<std::pair<uint64_t, int>> tmp(num_gvec_);
-    #pragma omp parallel for schedule(static)
-    for (int ig = 0; ig < num_gvec(); ig++) {
-        /* make some reasonable roundoff */
-        uint64_t len = static_cast<uint64_t>(gvec_shell_len_[gvec_shell_[ig]] * 1e10);
-        tmp[ig]      = std::make_pair(len, ig);
-    }
-    /* sort by first element in pair (length) */
-    std::sort(tmp.begin(), tmp.end());
-
-    /* index of the first shell */
-    gvec_shell_(tmp[0].second) = 0;
-    num_gvec_shells_           = 1;
-    /* temporary vector to store G-shell radius */
-    std::vector<double> tmp_len;
-    /* radius of the first shell */
-    tmp_len.push_back(static_cast<double>(tmp[0].first) * 1e-10);
-    for (int ig = 1; ig < num_gvec_; ig++) {
-        /* if this G-vector has a different length */
-        if (tmp[ig].first != tmp[ig - 1].first) {
-            /* increment number of shells */
-            num_gvec_shells_++;
-            /* save the radius of the new shell */
-            tmp_len.push_back(static_cast<double>(tmp[ig].first) * 1e-10);
-        }
-        /* assign the index of the current shell */
-        gvec_shell_(tmp[ig].second) = num_gvec_shells_ - 1;
+    for (int i = 0; i < num_gvec_shells_; i++) {
+        gvec_shell_len_[i] /= ngv_sh[i];
     }
-    gvec_shell_len_ = sddk::mdarray<double, 1>(num_gvec_shells_, sddk::memory_t::host, "gvec_shell_len_");
-    std::copy(tmp_len.begin(), tmp_len.end(), gvec_shell_len_.at(sddk::memory_t::host));
 
     /* map from global index of G-shell to a list of local G-vectors */
     std::map<int, std::vector<int>> gshmap;
@@ -389,13 +387,16 @@ Gvec::init_gvec_cart_local()
 {
     gvec_cart_  = sddk::mdarray<double, 2>(3, count(), sddk::memory_t::host, "gvec_cart_");
     gkvec_cart_ = sddk::mdarray<double, 2>(3, count(), sddk::memory_t::host, "gkvec_cart_");
+    /* this arrays are allocated with GPU- friendly data layout */
+    gvec_tp_    = sddk::mdarray<double, 2>(count(), 2, sddk::memory_t::host, "gvec_tp_");
+    gkvec_tp_   = sddk::mdarray<double, 2>(count(), 2, sddk::memory_t::host, "gvec_tp_");
     if (bare_gvec_) {
         gvec_len_ = sddk::mdarray<double, 1>(count(), sddk::memory_t::host, "gvec_len_");
     }
 
     for (int igloc = 0; igloc < count(); igloc++) {
-        auto gc  = dot(lattice_vectors_, r3::vector<int>(&gvec_(0, igloc)));
-        auto gkc = dot(lattice_vectors_, r3::vector<double>(&gkvec_(0, igloc)));
+        auto gc  = r3::dot(lattice_vectors_, r3::vector<int>(&gvec_(0, igloc)));
+        auto gkc = r3::dot(lattice_vectors_, r3::vector<double>(&gkvec_(0, igloc)));
         for (int x : {0, 1, 2}) {
             gvec_cart_(x, igloc)  = gc[x];
             gkvec_cart_(x, igloc) = gkc[x];
@@ -403,6 +404,13 @@ Gvec::init_gvec_cart_local()
         if (bare_gvec_) {
             gvec_len_(igloc) = gvec_shell_len_(gvec_shell_(this->offset() + igloc));
         }
+        auto gs = r3::spherical_coordinates(gc);
+        gvec_tp_(igloc, 0) = gs[1];
+        gvec_tp_(igloc, 1) = gs[2];
+
+        auto gks = r3::spherical_coordinates(gkc);
+        gkvec_tp_(igloc, 0) = gks[1];
+        gkvec_tp_(igloc, 1) = gks[2];
     }
 }
 
diff --git a/src/fft/gvec.hpp b/src/fft/gvec.hpp
index 24709db09c..5373cc12ca 100644
--- a/src/fft/gvec.hpp
+++ b/src/fft/gvec.hpp
@@ -209,6 +209,12 @@ class Gvec
     /// Length of the local fraction of G-vectors.
     sddk::mdarray<double, 1> gvec_len_;
 
+    // Theta- and phi- angles of G-vectors.
+    sddk::mdarray<double, 2> gvec_tp_;
+
+    // Theta- and phi- angles of G+k-vectors.
+    sddk::mdarray<double, 2> gkvec_tp_;
+
     /// Offset in the global index for the local part of G-vectors.
     int offset_{-1};
 
@@ -218,6 +224,9 @@ class Gvec
     /// Local number of z-columns.
     int num_zcol_local_{-1};
 
+    /// Symmetry tolerance of the real-space lattice.
+    double sym_tol_{1e-6};
+
     /// Return corresponding G-vector for an index in the range [0, num_gvec).
     r3::vector<int> gvec_by_full_index(uint32_t idx__) const;
 
@@ -255,49 +264,58 @@ class Gvec
   public:
     /// Constructor for G+k vectors.
     /** \param [in] vk          K-point vector of G+k
-     *  \param [in] M           Reciprocal lattice vecotors in comumn order
+     *  \param [in] M           Reciprocal lattice vectors in column order
      *  \param [in] Gmax        Cutoff for G+k vectors
      *  \param [in] comm        Total communicator which is used to distribute G-vectors
      *  \param [in] reduce_gvec True if G-vectors need to be reduced by inversion symmetry.
+     *  \param [in] sym_tol     Unit cell lattice symmetry tolerance.
      */
-    Gvec(r3::vector<double> vk__, r3::matrix<double> M__, double Gmax__, mpi::Communicator const& comm__, bool reduce_gvec__)
-        : vk_(vk__)
-        , Gmax_(Gmax__)
-        , lattice_vectors_(M__)
-        , comm_(comm__)
-        , reduce_gvec_(reduce_gvec__)
-        , bare_gvec_(false)
+    Gvec(r3::vector<double> vk__, r3::matrix<double> M__, double Gmax__, mpi::Communicator const& comm__,
+            bool reduce_gvec__, double sym_tol__ = 1e-6)
+        : vk_{vk__}
+        , Gmax_{Gmax__}
+        , lattice_vectors_{M__}
+        , comm_{comm__}
+        , reduce_gvec_{reduce_gvec__}
+        , bare_gvec_{false}
+        , sym_tol_{sym_tol__}
     {
         init(fft::get_min_grid(Gmax__, M__));
     }
 
     /// Constructor for G-vectors.
-    /** \param [in] M           Reciprocal lattice vecotors in comumn order
+    /** \param [in] M           Reciprocal lattice vectors in column order
      *  \param [in] Gmax        Cutoff for G+k vectors
      *  \param [in] comm        Total communicator which is used to distribute G-vectors
      *  \param [in] reduce_gvec True if G-vectors need to be reduced by inversion symmetry.
+     *  \param [in] sym_tol     Unit cell lattice symmetry tolerance.
      */
-    Gvec(r3::matrix<double> M__, double Gmax__, mpi::Communicator const& comm__, bool reduce_gvec__)
-        : Gmax_(Gmax__)
-        , lattice_vectors_(M__)
-        , comm_(comm__)
-        , reduce_gvec_(reduce_gvec__)
+    Gvec(r3::matrix<double> M__, double Gmax__, mpi::Communicator const& comm__, bool reduce_gvec__,
+            double sym_tol__ = 1e-6)
+        : Gmax_{Gmax__}
+        , lattice_vectors_{M__}
+        , comm_{comm__}
+        , reduce_gvec_{reduce_gvec__}
+        , sym_tol_{sym_tol__}
     {
         init(fft::get_min_grid(Gmax__, M__));
     }
 
     /// Constructor for G-vectors.
-    /** \param [in] M           Reciprocal lattice vecotors in comumn order
+    /** \param [in] M           Reciprocal lattice vectors in column order
      *  \param [in] Gmax        Cutoff for G+k vectors
      *  \param [in] fft_grid    Provide explicit boundaries for the G-vector min and max frequencies.
      *  \param [in] comm        Total communicator which is used to distribute G-vectors
      *  \param [in] reduce_gvec True if G-vectors need to be reduced by inversion symmetry.
+     *  \param [in] sym_tol     Unit cell lattice symmetry tolerance.
      */
-    Gvec(r3::matrix<double> M__, double Gmax__, fft::Grid const& fft_grid__, mpi::Communicator const& comm__, bool reduce_gvec__)
-        : Gmax_(Gmax__)
-        , lattice_vectors_(M__)
-        , comm_(comm__)
-        , reduce_gvec_(reduce_gvec__)
+    Gvec(r3::matrix<double> M__, double Gmax__, fft::Grid const& fft_grid__, mpi::Communicator const& comm__,
+            bool reduce_gvec__, double sym_tol__ = 1e-6)
+        : Gmax_{Gmax__}
+        , lattice_vectors_{M__}
+        , comm_{comm__}
+        , reduce_gvec_{reduce_gvec__}
+        , sym_tol_{sym_tol__}
     {
         init(fft_grid__);
     }
@@ -305,25 +323,28 @@ class Gvec
     /// Constructor for G-vector distribution based on a previous set.
     /** Previous set of G-vectors must be a subset of the current set. */
     Gvec(double Gmax__, Gvec const& gvec_base__)
-        : Gmax_(Gmax__)
-        , lattice_vectors_(gvec_base__.lattice_vectors())
-        , comm_(gvec_base__.comm())
-        , reduce_gvec_(gvec_base__.reduced())
-        , gvec_base_(&gvec_base__)
+        : Gmax_{Gmax__}
+        , lattice_vectors_{gvec_base__.lattice_vectors_}
+        , comm_{gvec_base__.comm()}
+        , reduce_gvec_{gvec_base__.reduced()}
+        , gvec_base_{&gvec_base__}
+        , sym_tol_{gvec_base__.sym_tol_}
     {
         init(fft::get_min_grid(Gmax__, lattice_vectors_));
     }
 
     /// Constructor for G-vectors with mpi_comm_self()
-    Gvec(r3::matrix<double> M__, double Gmax__, bool reduce_gvec__)
-        : Gmax_(Gmax__)
-        , lattice_vectors_(M__)
-        , comm_(mpi::Communicator::self())
-        , reduce_gvec_(reduce_gvec__)
+    Gvec(r3::matrix<double> M__, double Gmax__, bool reduce_gvec__, double sym_tol__ = 1e-6)
+        : Gmax_{Gmax__}
+        , lattice_vectors_{M__}
+        , comm_{mpi::Communicator::self()}
+        , reduce_gvec_{reduce_gvec__}
+        , sym_tol_{sym_tol__}
     {
         init(fft::get_min_grid(Gmax__, M__));
     }
 
+    /// Construct with the defined order of G-vectors.
     Gvec(r3::vector<double> vk__, r3::matrix<double> M__, int ngv_loc__, int const* gv__,
             mpi::Communicator const& comm__, bool reduce_gvec__)
         : vk_(vk__)
@@ -408,7 +429,7 @@ class Gvec
     /// Set the new reciprocal lattice vectors.
     /** For the varibale-cell relaxation runs we need an option to preserve the number of G- and G+k vectors.
      *  Here we can set the new lattice vectors and update the relevant members of the Gvec class. */
-    inline r3::matrix<double> const& lattice_vectors(r3::matrix<double> lattice_vectors__)
+    inline auto const& lattice_vectors(r3::matrix<double> lattice_vectors__)
     {
         lattice_vectors_ = lattice_vectors__;
         find_gvec_shells();
@@ -417,11 +438,18 @@ class Gvec
     }
 
     /// Retrn a const reference to the reciprocal lattice vectors.
-    inline r3::matrix<double> const& lattice_vectors() const
+    inline auto const& lattice_vectors() const
     {
         return lattice_vectors_;
     }
 
+    inline auto const unit_cell_lattice_vectors() const
+    {
+        double const twopi = 6.2831853071795864769;
+        auto r = r3::transpose(r3::inverse(lattice_vectors_)) * twopi;
+        return r;
+    }
+
     /// Return the volume of the real space unit cell that corresponds to the reciprocal lattice of G-vectors.
     inline double omega() const
     {
@@ -697,6 +725,26 @@ class Gvec
         this->comm().bcast(&result(0, 0), 3 * ngv, rank__);
         return result;
     }
+
+    inline auto& gvec_tp()
+    {
+        return gvec_tp_;
+    }
+
+    inline auto const& gvec_tp() const
+    {
+        return gvec_tp_;
+    }
+
+    inline auto& gkvec_tp()
+    {
+        return gkvec_tp_;
+    }
+
+    inline auto const& gkvec_tp() const
+    {
+        return gkvec_tp_;
+    }
 };
 
 /// Stores information about G-vector partitioning between MPI ranks for the FFT transformation.
@@ -987,6 +1035,44 @@ gkvec_factory(r3::vector<double> vk__, r3::matrix<double> reciprocal_lattice_vec
     return std::make_shared<Gvec>(vk__, reciprocal_lattice_vectors__, gk_cutoff__, comm__, gamma__);
 }
 
+inline void print(std::ostream& out__, Gvec const& gvec__)
+{
+    std::map<int, std::vector<int>> gsh_map;
+    for (int i = 0; i < gvec__.num_gvec(); i++) {
+        int igsh = gvec__.shell(i);
+        if (gsh_map.count(igsh) == 0) {
+            gsh_map[igsh] = std::vector<int>();
+        }
+        gsh_map[igsh].push_back(i);
+    }
+
+    out__ << "num_gvec : " << gvec__.num_gvec() << std::endl;
+    out__ << "num_gvec_shells : " << gvec__.num_shells() << std::endl;
+
+    for (int igsh = 0; igsh < gvec__.num_shells(); igsh++) {
+        auto len = gvec__.shell_len(igsh);
+        out__ << "shell : " << igsh << ", length : " << len << std::endl;
+        for (auto ig : gsh_map[igsh]) {
+            auto G = gvec__.gvec<sddk::index_domain_t::global>(ig);
+            auto Gc = gvec__.gvec_cart<sddk::index_domain_t::global>(ig);
+            out__ << "  ig : " << ig << ", G = " << G << ", length diff : " << std::abs(Gc.length() - len) << std::endl;
+        }
+    }
+    //mpi::pstdout pout(gvec__.comm());
+    //pout << "rank: " << gvec_.comm().rank() << std::endl;
+    //pout << "-- list of G-vectors in the remapped distribution --" << std::endl;
+    //for (int igloc = 0; igloc < gvec_count_remapped(); igloc++) {
+    //    auto G = gvec_remapped(igloc);
+
+    //    int igsh = gvec_shell_remapped(igloc);
+    //    pout << "igloc=" << igloc << " igsh=" << igsh << " G=" << G[0] << " " << G[1] << " " << G[2] << std::endl;
+    //}
+    //pout << "-- reverse list --" << std::endl;
+    //for (auto const& e: idx_gvec_) {
+    //    pout << "G=" << e.first[0] << " " << e.first[1] << " " << e.first[2] << ", igloc=" << e.second << std::endl;
+    //}
+    //out__ << pout.flush(0);
+}
 } // namespace sddk
 
 #endif //__GVEC_HPP__
diff --git a/src/geometry/force.cpp b/src/geometry/force.cpp
index 9f257e3ec6..9d522fc1cb 100644
--- a/src/geometry/force.cpp
+++ b/src/geometry/force.cpp
@@ -51,8 +51,6 @@ Force::symmetrize(sddk::mdarray<double, 2>& forces__) const
         return;
     }
 
-    PROFILE("sirius::Force::symmetrize");
-
     sddk::mdarray<double, 2> sym_forces(3, ctx_.unit_cell().spl_num_atoms().local_size());
     sym_forces.zero();
 
diff --git a/src/geometry/stress.cpp b/src/geometry/stress.cpp
index a720992362..2fd7070834 100644
--- a/src/geometry/stress.cpp
+++ b/src/geometry/stress.cpp
@@ -381,13 +381,8 @@ Stress::calc_stress_us()
         return stress_us_;
     }
 
-    auto& ri    = ctx_.aug_ri();
-    auto& ri_dq = ctx_.aug_ri_djl();
-
     potential_.fft_transform(-1);
 
-    Augmentation_operator_gvec_deriv q_deriv(ctx_, ctx_.unit_cell().lmax(), ctx_.gvec(), ctx_.gvec_tp());
-
     la::lib_t la{la::lib_t::none};
     sddk::memory_t qmem{sddk::memory_t::none};
 
@@ -402,7 +397,7 @@ Stress::calc_stress_us()
         case sddk::device_t::GPU: {
             mp   = &get_memory_pool(sddk::memory_t::host_pinned);
             la   = la::lib_t::spla;
-            qmem = sddk::memory_t::device;
+            qmem = sddk::memory_t::host;
             break;
         }
     }
@@ -413,9 +408,9 @@ Stress::calc_stress_us()
             continue;
         }
 
-        q_deriv.prepare(atom_type, ri, ri_dq);
+        Augmentation_operator q_deriv(ctx_.unit_cell().atom_type(iat), ctx_.gvec(), ctx_.aug_ri(), ctx_.aug_ri_djl());
 
-        int nbf = atom_type.mt_basis_size();
+        auto nbf = atom_type.mt_basis_size();
 
         /* get auxiliary density matrix */
         auto dm = density_.density_matrix_aux(density_.density_matrix(), iat);
@@ -424,7 +419,7 @@ Stress::calc_stress_us()
                                                        get_memory_pool(sddk::memory_t::host));
 
         PROFILE_START("sirius::Stress|us|phase_fac");
-        #pragma omp parallel for schedule(static)
+        #pragma omp parallel for
         for (int igloc = 0; igloc < ctx_.gvec().count(); igloc++) {
             int ig = ctx_.gvec().offset() + igloc;
             for (int i = 0; i < atom_type.num_atoms(); i++) {
@@ -436,10 +431,10 @@ Stress::calc_stress_us()
         sddk::mdarray<double, 2> v_tmp(atom_type.num_atoms(), ctx_.gvec().count() * 2, *mp);
         sddk::mdarray<double, 2> tmp(nbf * (nbf + 1) / 2, atom_type.num_atoms(), *mp);
         /* over spin components, can be from 1 to 4 */
-        for (int ispin = 0; ispin < ctx_.num_mag_dims() + 1; ispin++) {
-            for (int nu = 0; nu < 3; nu++) {
-                q_deriv.generate_pw_coeffs(atom_type, nu);
-
+        for (int nu = 0; nu < 3; nu++) {
+            /* generate dQ(G)/dG */
+            q_deriv.generate_pw_coeffs_gvec_deriv(nu);
+            for (int ispin = 0; ispin < ctx_.num_mag_dims() + 1; ispin++) {
                 for (int mu = 0; mu < 3; mu++) {
                     PROFILE_START("sirius::Stress|us|prepare");
                     int igloc0{0};
@@ -449,7 +444,7 @@ Stress::calc_stress_us()
                         }
                         igloc0 = 1;
                     }
-                    #pragma omp parallel for schedule(static)
+                    #pragma omp parallel for
                     for (int igloc = igloc0; igloc < ctx_.gvec().count(); igloc++) {
                         auto gvc = ctx_.gvec().gvec_cart<sddk::index_domain_t::local>(igloc);
                         double g = gvc.length();
diff --git a/src/geometry/wavefunction_strain_deriv.hpp b/src/geometry/wavefunction_strain_deriv.hpp
index e4ea260248..0cea77c0db 100644
--- a/src/geometry/wavefunction_strain_deriv.hpp
+++ b/src/geometry/wavefunction_strain_deriv.hpp
@@ -17,7 +17,7 @@ wavefunctions_strain_deriv(Simulation_context const& ctx__, K_point<double>& kp_
         /* Cartesian coordinats of G-vector */
         auto gvc = kp__.gkvec().gkvec_cart<sddk::index_domain_t::local>(igkloc);
         /* vs = {r, theta, phi} */
-        auto gvs = SHT::spherical_coordinates(gvc);
+        auto gvs = r3::spherical_coordinates(gvc);
 
         std::vector<sddk::mdarray<double, 1>> ri_values(ctx__.unit_cell().num_atom_types());
         for (int iat = 0; iat < ctx__.unit_cell().num_atom_types(); iat++) {
diff --git a/src/gpu/spherical_harmonics.cu b/src/gpu/spherical_harmonics.cu
index 08d022ee71..d3e88d483c 100644
--- a/src/gpu/spherical_harmonics.cu
+++ b/src/gpu/spherical_harmonics.cu
@@ -89,14 +89,14 @@ extern "C" void spherical_harmonics_ylm_gpu(int lmax__, int ntp__, double const*
 }
 
 
-__global__ void spherical_harmonics_rlm_gpu_kernel(int lmax__, int ntp__, double const* tp__,
+__global__ void spherical_harmonics_rlm_gpu_kernel(int lmax__, int ntp__, double const* theta__, double const* phi__,
                                                    double* rlm__, int ld__)
 {
     int itp = blockDim.x * blockIdx.x + threadIdx.x;
 
     if (itp < ntp__) {
-        double theta = tp__[itp];
-        double phi   = tp__[ntp__ + itp];
+        double theta = theta__[itp];
+        double phi   = phi__[itp];
         double sint = sin(theta);
         double cost = cos(theta);
 
@@ -146,10 +146,11 @@ __global__ void spherical_harmonics_rlm_gpu_kernel(int lmax__, int ntp__, double
     }
 }
 
-extern "C" void spherical_harmonics_rlm_gpu(int lmax__, int ntp__, double const* tp__, double* rlm__, int ld__)
+extern "C" void spherical_harmonics_rlm_gpu(int lmax__, int ntp__, double const* theta__, double const* phi__,
+        double* rlm__, int ld__)
 {
     dim3 grid_t(32);
     dim3 grid_b(num_blocks(ntp__, grid_t.x));
     accLaunchKernel((spherical_harmonics_rlm_gpu_kernel), dim3(grid_b), dim3(grid_t), 0, 0,
-        lmax__, ntp__, tp__, rlm__, ld__);
+        lmax__, ntp__, theta__, phi__, rlm__, ld__);
 }
diff --git a/src/gpu/sum_q_pw_dm_pw.cu b/src/gpu/sum_q_pw_dm_pw.cu
index 5fb521d737..80516c3747 100644
--- a/src/gpu/sum_q_pw_dm_pw.cu
+++ b/src/gpu/sum_q_pw_dm_pw.cu
@@ -29,14 +29,9 @@
 #include "gpu/cuda_timer.hpp"
 #endif
 
-__global__ void sum_q_pw_dm_pw_gpu_kernel
-(
-    int nbf__,
-    double const* q_pw__,
-    double const* dm_pw__,
-    double const* sym_weight__,
-    acc_complex_double_t* rho_pw__
-)
+__global__ void
+sum_q_pw_dm_pw_gpu_kernel(int nbf__, double const* q_pw__, int ldq__, double const* dm_pw__, int ldd__,
+    double const* sym_weight__, acc_complex_double_t* rho_pw__)
 {
     ACC_DYNAMIC_SHARED( char, sdata_ptr)
     double* rho_re = (double*)&sdata_ptr[0];
@@ -54,10 +49,10 @@ __global__ void sum_q_pw_dm_pw_gpu_kernel
     for (int n = 0; n < N; n++) {
         int i = n * blockDim.x + threadIdx.x;
         if (i < ld) {
-            double qx =  q_pw__[array2D_offset(i, 2 * igloc,     ld)];
-            double qy =  q_pw__[array2D_offset(i, 2 * igloc + 1, ld)];
-            double dx = dm_pw__[array2D_offset(i, 2 * igloc,     ld)];
-            double dy = dm_pw__[array2D_offset(i, 2 * igloc + 1, ld)];
+            double qx =  q_pw__[array2D_offset(i, 2 * igloc,     ldq__)];
+            double qy =  q_pw__[array2D_offset(i, 2 * igloc + 1, ldq__)];
+            double dx = dm_pw__[array2D_offset(i, 2 * igloc,     ldd__)];
+            double dy = dm_pw__[array2D_offset(i, 2 * igloc + 1, ldd__)];
 
             rho_re[threadIdx.x] += sym_weight__[i] * (dx * qx - dy * qy);
             rho_im[threadIdx.x] += sym_weight__[i] * (dy * qx + dx * qy);
@@ -77,13 +72,9 @@ __global__ void sum_q_pw_dm_pw_gpu_kernel
     }
 }
 
-extern "C" void sum_q_pw_dm_pw_gpu(int num_gvec_loc__,
-                                   int nbf__,
-                                   double const* q_pw__,
-                                   double const* dm_pw__,
-                                   double const* sym_weight__,
-                                   acc_complex_double_t* rho_pw__,
-                                   int stream_id__)
+extern "C" void sum_q_pw_dm_pw_gpu(int num_gvec_loc__, int nbf__, double const* q_pw__, int ldq__,
+                                   double const* dm_pw__, int ldd__, double const* sym_weight__,
+                                   acc_complex_double_t* rho_pw__, int stream_id__)
 {
 #ifdef SIRIUS_CUDA
     CUDA_timer t("sum_q_pw_dm_pw_gpu");
@@ -93,14 +84,9 @@ extern "C" void sum_q_pw_dm_pw_gpu(int num_gvec_loc__,
 
     dim3 grid_t(64);
     dim3 grid_b(num_gvec_loc__);
-    
+
     accLaunchKernel((sum_q_pw_dm_pw_gpu_kernel), dim3(grid_b), dim3(grid_t), 2 * grid_t.x * sizeof(double), stream, 
-        nbf__, 
-        q_pw__, 
-        dm_pw__,
-        sym_weight__,
-        rho_pw__
-    );
+        nbf__, q_pw__, ldq__, dm_pw__, ldd__, sym_weight__, rho_pw__);
 }
 
 
diff --git a/src/hamiltonian/hamiltonian.cpp b/src/hamiltonian/hamiltonian.cpp
index e030166ef8..c093a761be 100644
--- a/src/hamiltonian/hamiltonian.cpp
+++ b/src/hamiltonian/hamiltonian.cpp
@@ -50,7 +50,7 @@ Hamiltonian0<T>::Hamiltonian0(Potential& potential__, bool precompute_lapw__)
         if (precompute_lapw__) {
             potential_->generate_pw_coefs();
             potential_->update_atomic_potential();
-            ctx_.unit_cell().generate_radial_functions();
+            ctx_.unit_cell().generate_radial_functions(ctx_.out());
             ctx_.unit_cell().generate_radial_integrals();
         }
         hmt_ = std::vector<sddk::mdarray<std::complex<T>, 2>>(ctx_.unit_cell().num_atoms());
diff --git a/src/hamiltonian/hamiltonian_k.cpp b/src/hamiltonian/hamiltonian_k.cpp
index 4d3ccbc792..8cfce040bb 100644
--- a/src/hamiltonian/hamiltonian_k.cpp
+++ b/src/hamiltonian/hamiltonian_k.cpp
@@ -842,7 +842,11 @@ Hamiltonian_k<T>::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range
         if (pcs) {
             if (hphi__) {
                 auto cs = hphi__->checksum(mem, wf::spin_index(0), b__);
+                auto cs_pw = hphi__->checksum_pw(mem, wf::spin_index(0), b__);
+                auto cs_mt = hphi__->checksum_mt(mem, wf::spin_index(0), b__);
                 if (comm.rank() == 0) {
+                    utils::print_checksum("hloc_phi_pw", cs_pw, RTE_OUT(std::cout));
+                    utils::print_checksum("hloc_phi_mt", cs_mt, RTE_OUT(std::cout));
                     utils::print_checksum("hloc_phi", cs, RTE_OUT(std::cout));
                 }
             }
diff --git a/src/hamiltonian/local_operator.cpp b/src/hamiltonian/local_operator.cpp
index e3c76d4622..1c003e23ad 100644
--- a/src/hamiltonian/local_operator.cpp
+++ b/src/hamiltonian/local_operator.cpp
@@ -499,18 +499,18 @@ void Local_operator<T>::apply_fplapw(fft::spfft_transform_type<T>& spfftk__, std
 
     wf::Wave_functions_fft<T> phi_fft(gkvec_fft__, phi__, s0, b__, wf::shuffle_to::fft_layout);
 
-    std::map<wf::Wave_functions<T>*, wf::Wave_functions_fft<T>> map_wf_fft;
+    std::array<wf::Wave_functions_fft<T>, 4> wf_fft;
     if (hphi__) {
-        map_wf_fft[hphi__] = wf::Wave_functions_fft<T>(gkvec_fft__, *hphi__, s0, b__, wf::shuffle_to::wf_layout);
+        wf_fft[0] = wf::Wave_functions_fft<T>(gkvec_fft__, *hphi__, s0, b__, wf::shuffle_to::wf_layout);
     }
     if (ophi__) {
-        map_wf_fft[ophi__] = wf::Wave_functions_fft<T>(gkvec_fft__, *ophi__, s0, b__, wf::shuffle_to::wf_layout);
+        wf_fft[1] = wf::Wave_functions_fft<T>(gkvec_fft__, *ophi__, s0, b__, wf::shuffle_to::wf_layout);
     }
     if (bzphi__) {
-        map_wf_fft[bzphi__] = wf::Wave_functions_fft<T>(gkvec_fft__, *bzphi__, s0, b__, wf::shuffle_to::wf_layout);
+        wf_fft[2] = wf::Wave_functions_fft<T>(gkvec_fft__, *bzphi__, s0, b__, wf::shuffle_to::wf_layout);
     }
     if (bxyphi__) {
-        map_wf_fft[bxyphi__] = wf::Wave_functions_fft<T>(gkvec_fft__, *bxyphi__, s0, b__, wf::shuffle_to::wf_layout);
+        wf_fft[3] = wf::Wave_functions_fft<T>(gkvec_fft__, *bxyphi__, s0, b__, wf::shuffle_to::wf_layout);
     }
 
     auto pcs = env::print_checksum();
@@ -565,40 +565,40 @@ void Local_operator<T>::apply_fplapw(fft::spfft_transform_type<T>& spfftk__, std
             /* multiply phi(r) by step function */
             mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, v_local_index_t::theta, spfft_buf);
 
-            auto mem = map_wf_fft[ophi__].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
+            auto mem = wf_fft[1].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
 
             /* phi(r) * Theta(r) -> ophi(G) */
-            spfftk__.forward(spfft_pu, map_wf_fft[ophi__].pw_coeffs_spfft(mem, wf::band_index(j)),
+            spfftk__.forward(spfft_pu, wf_fft[1].pw_coeffs_spfft(mem, wf::band_index(j)),
                     SPFFT_FULL_SCALING);
         }
 
         if (bzphi__) {
             mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, v_local_index_t::v1, spfft_buf);
 
-            auto mem = map_wf_fft[bzphi__].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
+            auto mem = wf_fft[2].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
 
             /* phi(r) * Bz(r) -> bzphi(G) */
-            spfftk__.forward(spfft_pu, map_wf_fft[bzphi__].pw_coeffs_spfft(mem, wf::band_index(j)),
+            spfftk__.forward(spfft_pu, wf_fft[2].pw_coeffs_spfft(mem, wf::band_index(j)),
                     SPFFT_FULL_SCALING);
         }
 
         if (bxyphi__) {
             mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, 2, spfft_buf);
 
-            auto mem = map_wf_fft[bxyphi__].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
+            auto mem = wf_fft[3].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
 
             /* phi(r) * (Bx(r) - iBy(r)) -> bxyphi(G) */
-            spfftk__.forward(spfft_pu, map_wf_fft[bxyphi__].pw_coeffs_spfft(mem, wf::band_index(j)),
+            spfftk__.forward(spfft_pu, wf_fft[3].pw_coeffs_spfft(mem, wf::band_index(j)),
                     SPFFT_FULL_SCALING);
         }
 
         if (hphi__) {
             mul_by_veff(spfftk__, reinterpret_cast<T*>(phi_r), veff_vec_, v_local_index_t::v0, spfft_buf);
 
-            auto mem = map_wf_fft[hphi__].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
+            auto mem = wf_fft[0].on_device() ? sddk::memory_t::device : sddk::memory_t::host;
 
             /* phi(r) * Theta(r) * V(r) -> hphi(G) */
-            spfftk__.forward(spfft_pu, map_wf_fft[hphi__].pw_coeffs_spfft(mem, wf::band_index(j)),
+            spfftk__.forward(spfft_pu, wf_fft[0].pw_coeffs_spfft(mem, wf::band_index(j)),
                     SPFFT_FULL_SCALING);
 
             /* add kinetic energy */
@@ -641,13 +641,13 @@ void Local_operator<T>::apply_fplapw(fft::spfft_transform_type<T>& spfftk__, std
                     #pragma omp parallel for
                     for (int igloc = 0; igloc < gkvec_fft__->count(); igloc++) {
                         auto gvc = gkvec_fft__->gkvec_cart(igloc);
-                        map_wf_fft[hphi__].pw_coeffs(igloc, wf::band_index(j)) += buf_pw[igloc] *
+                        wf_fft[0].pw_coeffs(igloc, wf::band_index(j)) += buf_pw[igloc] *
                             static_cast<T>(0.5 * gvc[x]);
                     }
                 } else {
                     add_to_hphi_lapw_gpu(gkvec_fft__->count(), buf_pw.at(sddk::memory_t::device),
                             gkvec_cart_.at(sddk::memory_t::device, 0, x),
-                            map_wf_fft[hphi__].at(sddk::memory_t::device, 0, wf::band_index(j)));
+                            wf_fft[0].at(sddk::memory_t::device, 0, wf::band_index(j)));
                 }
             } // x
         }
diff --git a/src/hubbard/hubbard_occupancies_derivatives.cpp b/src/hubbard/hubbard_occupancies_derivatives.cpp
index 4bb5ab1474..3f2613897d 100644
--- a/src/hubbard/hubbard_occupancies_derivatives.cpp
+++ b/src/hubbard/hubbard_occupancies_derivatives.cpp
@@ -393,7 +393,7 @@ Hubbard::compute_occupancies_stress_derivatives(K_point<double>& kp__, Q_operato
     for (int igkloc = 0; igkloc < kp__.num_gkvec_loc(); igkloc++) {
         /* gvs = {r, theta, phi} */
         auto gvc = kp__.gkvec().gkvec_cart<sddk::index_domain_t::local>(igkloc);
-        auto rtp = SHT::spherical_coordinates(gvc);
+        auto rtp = r3::spherical_coordinates(gvc);
 
         sf::spherical_harmonics(lmax, rtp[1], rtp[2], &rlm_g(0, igkloc));
         sddk::mdarray<double, 2> rlm_dg_tmp(&rlm_dg(0, 0, igkloc), lmmax, 3);
@@ -401,9 +401,9 @@ Hubbard::compute_occupancies_stress_derivatives(K_point<double>& kp__, Q_operato
     }
 
     /* atomic wave functions  */
-    auto& phi_atomic    = kp__.atomic_wave_functions();
-    auto& phi_atomic_S  = kp__.atomic_wave_functions_S();
-    auto& phi_hub_S     = kp__.hubbard_wave_functions_S();
+    auto& phi_atomic   = kp__.atomic_wave_functions();
+    auto& phi_atomic_S = kp__.atomic_wave_functions_S();
+    auto& phi_hub_S    = kp__.hubbard_wave_functions_S();
 
     auto num_ps_atomic_wf = ctx_.unit_cell().num_ps_atomic_wf();
     auto num_hubbard_wf   = ctx_.unit_cell().num_hubbard_wf();
diff --git a/src/k_point/generate_fv_states.cpp b/src/k_point/generate_fv_states.cpp
index a583a83102..18eaad46a4 100644
--- a/src/k_point/generate_fv_states.cpp
+++ b/src/k_point/generate_fv_states.cpp
@@ -38,6 +38,8 @@ void K_point<T>::generate_fv_states()
 
     auto const& uc = ctx_.unit_cell();
 
+    auto pcs = env::print_checksum();
+
     auto bs = ctx_.cyclic_block_size();
     la::dmatrix<std::complex<T>> alm_fv(uc.mt_aw_basis_size(), ctx_.num_fv_states(),
             ctx_.blacs_grid(), bs, bs);
@@ -57,6 +59,10 @@ void K_point<T>::generate_fv_states()
 
         /* generate complex conjugated Alm coefficients for a block of atoms */
         auto alm = generate_alm_block<false, T>(ctx_, atom_begin, na, this->alm_coeffs_loc());
+        auto cs = alm.checksum();
+        if (pcs) {
+            utils::print_checksum("alm", cs, RTE_OUT(this->out(0)));
+        }
 
         /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for the block of atoms */
         spla::pgemm_ssb(num_mt_aw, ctx_.num_fv_states(), this->gkvec().count(), SPLA_OP_TRANSPOSE, 1.0,
@@ -106,6 +112,13 @@ void K_point<T>::generate_fv_states()
             }
         }
     }
+    if (pcs) {
+        auto z1 = fv_states_->checksum_pw(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states()));
+        auto z2 = fv_states_->checksum_mt(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states()));
+        utils::print_checksum("fv_states_pw", z1, RTE_OUT(this->out(0)));
+        utils::print_checksum("fv_states_mt", z2, RTE_OUT(this->out(0)));
+
+    }
 }
 
 template void K_point<double>::generate_fv_states();
diff --git a/src/lapw/matching_coefficients.hpp b/src/lapw/matching_coefficients.hpp
index d4b9e22e9d..8ce7799e6f 100644
--- a/src/lapw/matching_coefficients.hpp
+++ b/src/lapw/matching_coefficients.hpp
@@ -128,7 +128,7 @@ class Matching_coefficients // TODO: compute on GPU
             for (int i = 0; i < gkvec_.count(); i++) {
                 auto gkvec_cart = gkvec_.gkvec_cart<sddk::index_domain_t::local>(i);
                 /* get r, theta, phi */
-                auto vs = SHT::spherical_coordinates(gkvec_cart);
+                auto vs = r3::spherical_coordinates(gkvec_cart);
                 gkvec_len_[i] = vs[0];
 
                 /* get spherical harmonics */
diff --git a/src/mpi/communicator.hpp b/src/mpi/communicator.hpp
index 32d075d6a2..95e8ee9697 100644
--- a/src/mpi/communicator.hpp
+++ b/src/mpi/communicator.hpp
@@ -276,8 +276,9 @@ class Communicator
         MPI_Init_thread(NULL, NULL, required__, &provided);
 
         MPI_Query_thread(&provided);
-        if (provided < required__) {
-            std::printf("Warning! Required level of thread support is not provided.\nprovided: %d \nrequired: %d\n", provided, required__);
+        if ((provided < required__) && (Communicator::world().rank() == 0)) {
+            std::printf("Warning! Required level of thread support is not provided.\n");
+            std::printf("provided: %d \nrequired: %d\n", provided, required__);
         }
     }
 
diff --git a/src/potential/generate_d_operator_matrix.cpp b/src/potential/generate_d_operator_matrix.cpp
index 229325f9d0..8a04bd2e7b 100644
--- a/src/potential/generate_d_operator_matrix.cpp
+++ b/src/potential/generate_d_operator_matrix.cpp
@@ -43,24 +43,40 @@ void Potential::generate_D_operator_matrix()
 {
     PROFILE("sirius::Potential::generate_D_operator_matrix");
 
-    auto spl_ngv_loc = ctx_.split_gvec_local();
+    /* local number of G-vectors */
+    int gvec_count = ctx_.gvec().count();
+    auto spl_ngv_loc = utils::split_in_blocks(gvec_count, ctx_.cfg().control().gvec_chunk_size());
 
-    if (ctx_.unit_cell().atom_type(0).augment()) {
-        ctx_.augmentation_op(0).prepare(stream_id(0), &get_memory_pool(sddk::memory_t::device));
-    }
-    for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
-        auto& atom_type = unit_cell_.atom_type(iat);
-        int nbf         = atom_type.mt_basis_size();
+    auto& mph = get_memory_pool(sddk::memory_t::host);
+    auto& mpd = get_memory_pool(sddk::memory_t::device);
 
-        /* start copy of Q(G) for the next atom type */
-        if (ctx_.processing_unit() == sddk::device_t::GPU) {
-            acc::sync_stream(stream_id(0));
-            if (iat + 1 != unit_cell_.num_atom_types() && ctx_.unit_cell().atom_type(iat + 1).augment()) {
-                ctx_.augmentation_op(iat + 1).prepare(stream_id(0), &get_memory_pool(sddk::memory_t::device));
+    int n_mag_comp{1};
+
+    sddk::mdarray<std::complex<double>, 2> veff;
+    switch (ctx_.processing_unit()) {
+        case sddk::device_t::CPU: {
+            break;
+        }
+        case sddk::device_t::GPU: {
+            n_mag_comp = ctx_.num_mag_dims() + 1;
+            veff = sddk::mdarray<std::complex<double>, 2>(gvec_count, n_mag_comp, mph);
+            for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
+                std::copy(&component(j).f_pw_local(0), &component(j).f_pw_local(0) + gvec_count, &veff(0, j));
             }
+            veff.allocate(mpd).copy_to(sddk::memory_t::device);
+            break;
         }
+    }
+
+    for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
+        auto& atom_type = unit_cell_.atom_type(iat);
+        /* number of beta-projector functions */
+        int nbf = atom_type.mt_basis_size();
+        /* number of Q_{xi,xi'} components for each G */
+        int nqlm = nbf * (nbf + 1) / 2;
 
         /* trivial case */
+        /* in absence of augmentation charge D-matrix is zero */
         if (!atom_type.augment() || atom_type.num_atoms() == 0) {
             for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
                 for (int i = 0; i < atom_type.num_atoms(); i++) {
@@ -76,117 +92,130 @@ void Potential::generate_D_operator_matrix()
             }
             continue;
         }
-        sddk::matrix<double> d_tmp(nbf * (nbf + 1) / 2, atom_type.num_atoms(), get_memory_pool(sddk::memory_t::host));
-        if (ctx_.processing_unit() == sddk::device_t::GPU) {
-            d_tmp.allocate(get_memory_pool(sddk::memory_t::device));
-        }
-
-        print_memory_usage(ctx_.out(), FILE_LINE);
 
-        for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
-            sddk::matrix<double> veff_a(2 * spl_ngv_loc.local_size(), atom_type.num_atoms(), get_memory_pool(sddk::memory_t::host));
-
-            auto la = la::lib_t::blas;
-            auto mem = sddk::memory_t::host;
+        sddk::mdarray<double, 3> d_tmp(nqlm, atom_type.num_atoms(), ctx_.num_mag_dims() + 1, mph);
+        sddk::mdarray<double, 3> veff_a(spl_ngv_loc[0] * 2, atom_type.num_atoms(), n_mag_comp, mph);
+        sddk::mdarray<double, 2> qpw;
 
-            d_tmp.zero();
-            if (ctx_.processing_unit() == sddk::device_t::GPU) {
-                la = la::lib_t::gpublas;
-                mem = sddk::memory_t::device;
-                d_tmp.zero(sddk::memory_t::device);
-                veff_a.allocate(get_memory_pool(sddk::memory_t::device));
+        switch (ctx_.processing_unit()) {
+            case sddk::device_t::CPU: {
+                d_tmp.zero();
+                break;
+            }
+            case sddk::device_t::GPU: {
+                d_tmp.allocate(mpd).zero(sddk::memory_t::device);
+                veff_a.allocate(mpd);
+                qpw = sddk::mdarray<double, 2>(nqlm, 2 * spl_ngv_loc[0], mpd, "qpw");
+                break;
             }
+        }
 
-            /* split a large loop over G-vectors into blocks */
-            for (int ib = 0; ib < spl_ngv_loc.num_ranks(); ib++) {
-                int g_begin = spl_ngv_loc.global_index(0, ib);
-                int g_end = g_begin + spl_ngv_loc.local_size(ib);
+        print_memory_usage(ctx_.out(), FILE_LINE);
 
-                switch (ctx_.processing_unit()) {
-                    case sddk::device_t::CPU: {
-                        #pragma omp parallel for schedule(static)
+        int g_begin{0};
+        /* loop over blocks of G-vectors */
+        for (auto ng : spl_ngv_loc) {
+            /* work on the block of the local G-vectors */
+            switch (ctx_.processing_unit()) {
+                case sddk::device_t::CPU: {
+                    for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
+                        /* multiply Veff(G) with the phase factors */
+                        #pragma omp parallel for
                         for (int i = 0; i < atom_type.num_atoms(); i++) {
                             int ia = atom_type.atom_id(i);
 
-                            for (int igloc = g_begin; igloc < g_end; igloc++) {
-                                int ig = ctx_.gvec().offset() + igloc;
+                            for (int g = 0; g < ng; g++) {
+                                int ig = ctx_.gvec().offset() + g_begin + g;
                                 /* V(G) * exp(i * G * r_{alpha}) */
-                                auto z = component(iv).f_pw_local(igloc) * ctx_.gvec_phase_factor(ig, ia);
-                                veff_a(2 * (igloc - g_begin),     i) = z.real();
-                                veff_a(2 * (igloc - g_begin) + 1, i) = z.imag();
+                                auto z = component(iv).f_pw_local(g_begin + g) * ctx_.gvec_phase_factor(ig, ia);
+                                veff_a(2 * g,     i, 0) = z.real();
+                                veff_a(2 * g + 1, i, 0) = z.imag();
                             }
                         }
-                        break;
-                    }
-                    case sddk::device_t::GPU: {
-                        /* wait for stream#1 to finish previous zgemm */
-                        acc::sync_stream(stream_id(1));
-                        /* copy plane wave coefficients of effective potential to GPU */
-                        sddk::mdarray<std::complex<double>, 1> veff(&component(iv).f_pw_local(g_begin), spl_ngv_loc.local_size(ib));
-                        veff.allocate(get_memory_pool(sddk::memory_t::device)).copy_to(sddk::memory_t::device);
+                        la::wrap(la::lib_t::blas).gemm('N', 'N', nqlm, atom_type.num_atoms(), 2 * ng,
+                                  &la::constant<double>::one(),
+                                  ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin),
+                                  ctx_.augmentation_op(iat).q_pw().ld(),
+                                  veff_a.at(sddk::memory_t::host), veff_a.ld(),
+                                  &la::constant<double>::one(),
+                                  d_tmp.at(sddk::memory_t::host, 0, 0, iv), d_tmp.ld());
+                    } // iv
+                    break;
+                }
+                case sddk::device_t::GPU: {
+                    acc::copyin(qpw.at(sddk::memory_t::device),
+                            ctx_.augmentation_op(iat).q_pw().at(sddk::memory_t::host, 0, 2 * g_begin), 2 * ng * nqlm);
+                    for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
 #if defined(SIRIUS_GPU)
-                        mul_veff_with_phase_factors_gpu(atom_type.num_atoms(), spl_ngv_loc.local_size(ib),
-                                                        veff.at(sddk::memory_t::device),
-                                                        ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 0),
-                                                        ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 1),
-                                                        ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 2),
-                                                        ctx_.unit_cell().atom_coord(iat).at(sddk::memory_t::device),
-                                                        veff_a.at(sddk::memory_t::device), spl_ngv_loc.local_size(), 1);
+                        mul_veff_with_phase_factors_gpu(atom_type.num_atoms(), ng, veff.at(sddk::memory_t::device, 0, iv),
+                            ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 0),
+                            ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 1),
+                            ctx_.gvec_coord().at(sddk::memory_t::device, g_begin, 2),
+                            ctx_.unit_cell().atom_coord(iat).at(sddk::memory_t::device),
+                            veff_a.at(sddk::memory_t::device, 0, 0, iv), ng, 1 + iv);
+
+                        la::wrap(la::lib_t::gpublas).gemm('N', 'N', nqlm, atom_type.num_atoms(), 2 * ng,
+                                  &la::constant<double>::one(),
+                                  qpw.at(sddk::memory_t::device), qpw.ld(),
+                                  veff_a.at(sddk::memory_t::device, 0, 0, iv), veff_a.ld(),
+                                  &la::constant<double>::one(),
+                                  d_tmp.at(sddk::memory_t::device, 0, 0, iv), d_tmp.ld(),
+                                  stream_id(1 + iv));
 #endif
-                        break;
-                    }
-                }
-                if (ctx_.cfg().control().print_checksum()) {
-                    if (ctx_.processing_unit() == sddk::device_t::GPU) {
-                        veff_a.copy_to(sddk::memory_t::host);
+                    } // iv
+                    for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
+                        acc::sync_stream(stream_id(1 + iv));
                     }
-                    auto cs = veff_a.checksum();
-                    std::stringstream s;
-                    s << "Gvec_block_" << ib << "_veff_a";
-                    utils::print_checksum(s.str(), cs, ctx_.out());
+                    break;
                 }
-                la::wrap(la).gemm('N', 'N', nbf * (nbf + 1) / 2, atom_type.num_atoms(), 2 * spl_ngv_loc.local_size(ib),
-                                  &la::constant<double>::one(),
-                                  ctx_.augmentation_op(iat).q_pw().at(mem, 0, 2 * g_begin),
-                                  ctx_.augmentation_op(iat).q_pw().ld(),
-                                  veff_a.at(mem), veff_a.ld(),
-                                  &la::constant<double>::one(),
-                                  d_tmp.at(mem), d_tmp.ld(),
-                                  stream_id(1));
-            } // ib (blocks of G-vectors)
-
-            if (ctx_.processing_unit() == sddk::device_t::GPU) {
-                d_tmp.copy_to(sddk::memory_t::host);
             }
 
+            g_begin += ng;
+        }
+
+        if (ctx_.processing_unit() == sddk::device_t::GPU) {
+            d_tmp.copy_to(sddk::memory_t::host);
+        }
+
+        //if (ctx_.cfg().control().print_checksum()) {
+        //    if (ctx_.processing_unit() == sddk::device_t::GPU) {
+        //        veff_a.copy_to(sddk::memory_t::host);
+        //    }
+        //    auto cs = veff_a.checksum();
+        //    std::stringstream s;
+        //    s << "Gvec_block_" << ib << "_veff_a";
+        //    utils::print_checksum(s.str(), cs, ctx_.out());
+        //}
+
+        for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
             if (ctx_.gvec().reduced()) {
                 if (comm_.rank() == 0) {
                     for (int i = 0; i < atom_type.num_atoms(); i++) {
-                        for (int j = 0; j < nbf * (nbf + 1) / 2; j++) {
-                            d_tmp(j, i) = 2 * d_tmp(j, i) - component(iv).f_pw_local(0).real() *
+                        for (int j = 0; j < nqlm; j++) {
+                            d_tmp(j, i, iv) = 2 * d_tmp(j, i, iv) - component(iv).f_pw_local(0).real() *
                                 ctx_.augmentation_op(iat).q_pw(j, 0);
                         }
                     }
                 } else {
                     for (int i = 0; i < atom_type.num_atoms(); i++) {
-                        for (int j = 0; j < nbf * (nbf + 1) / 2; j++) {
-                            d_tmp(j, i) *= 2;
+                        for (int j = 0; j < nqlm; j++) {
+                            d_tmp(j, i, iv) *= 2;
                         }
                     }
                 }
             }
 
             /* sum from all ranks */
-            comm_.allreduce(d_tmp.at(sddk::memory_t::host), static_cast<int>(d_tmp.size()));
+            comm_.allreduce(d_tmp.at(sddk::memory_t::host, 0, 0, iv), nqlm * atom_type.num_atoms());
 
-            if (ctx_.cfg().control().print_checksum() && ctx_.comm().rank() == 0) {
-                for (int i = 0; i < atom_type.num_atoms(); i++) {
-                    std::stringstream s;
-                    s << "D_mtrx_val(atom_t" << iat << "_i" << i << "_c" << iv << ")";
-                    auto cs = sddk::mdarray<double, 1>(&d_tmp(0, i), nbf * (nbf + 1) / 2).checksum();
-                    utils::print_checksum(s.str(), cs, ctx_.out());
-                }
-            }
+            //if (ctx_.cfg().control().print_checksum() && ctx_.comm().rank() == 0) {
+            //    for (int i = 0; i < atom_type.num_atoms(); i++) {
+            //        std::stringstream s;
+            //        s << "D_mtrx_val(atom_t" << iat << "_i" << i << "_c" << iv << ")";
+            //        auto cs = sddk::mdarray<double, 1>(&d_tmp(0, i), nbf * (nbf + 1) / 2).checksum();
+            //        utils::print_checksum(s.str(), cs, ctx_.out());
+            //    }
+            //}
 
             #pragma omp parallel for schedule(static)
             for (int i = 0; i < atom_type.num_atoms(); i++) {
@@ -197,13 +226,12 @@ void Potential::generate_D_operator_matrix()
                     for (int xi1 = 0; xi1 <= xi2; xi1++) {
                         int idx12 = xi2 * (xi2 + 1) / 2 + xi1;
                         /* D-matix is symmetric */
-                        atom.d_mtrx(xi1, xi2, iv) = atom.d_mtrx(xi2, xi1, iv) = d_tmp(idx12, i) * unit_cell_.omega();
+                        atom.d_mtrx(xi1, xi2, iv) = atom.d_mtrx(xi2, xi1, iv) = d_tmp(idx12, i, iv) * unit_cell_.omega();
                     }
                 }
             }
-        }
-        ctx_.augmentation_op(iat).dismiss();
-    }
+        } // iv
+    } // iat
 }
 
 } // namespace sirius
diff --git a/src/sht/gaunt.hpp b/src/sht/gaunt.hpp
index 1adef151e0..753bc2bb94 100644
--- a/src/sht/gaunt.hpp
+++ b/src/sht/gaunt.hpp
@@ -201,7 +201,8 @@ class Gaunt_coefficients
         return gaunt_packed_L3_(lm1, lm2);
     }
 
-    inline sddk::mdarray<T, 3> get_full_set_L3() const
+    /// Return the full tensor of Gaunt coefficients <R_{L1}|R_{L3}|R_{L2}> with a (L3, L1, L2) order of indices.
+    inline auto get_full_set_L3() const
     {
         sddk::mdarray<T, 3> gc(lmmax3_, lmmax1_, lmmax2_);
         gc.zero();
diff --git a/src/sht/sht.hpp b/src/sht/sht.hpp
index ec1a82f5f3..4defdd9f32 100644
--- a/src/sht/sht.hpp
+++ b/src/sht/sht.hpp
@@ -329,34 +329,6 @@ class SHT // TODO: better name
     //    }
     //}
 
-    /// Transform Cartesian coordinates [x,y,z] to spherical coordinates [r,theta,phi]
-    static r3::vector<double> spherical_coordinates(r3::vector<double> vc)
-    {
-        r3::vector<double> vs;
-
-        const double eps{1e-12};
-
-        vs[0] = vc.length();
-
-        if (vs[0] <= eps) {
-            vs[1] = 0.0;
-            vs[2] = 0.0;
-        } else {
-            vs[1] = std::acos(vc[2] / vs[0]); // theta = cos^{-1}(z/r)
-
-            if (std::abs(vc[0]) > eps || std::abs(vc[1]) > eps) {
-                vs[2] = std::atan2(vc[1], vc[0]); // phi = tan^{-1}(y/x)
-                if (vs[2] < 0.0) {
-                    vs[2] += twopi;
-                }
-            } else {
-                vs[2] = 0.0;
-            }
-        }
-
-        return vs;
-    }
-
     /// Compute element of the transformation matrix from complex to real spherical harmonics.
     /** Real spherical harmonic can be written as a linear combination of complex harmonics:
         \f[
diff --git a/src/specfunc/specfunc.hpp b/src/specfunc/specfunc.hpp
index b10e998515..88259faa02 100644
--- a/src/specfunc/specfunc.hpp
+++ b/src/specfunc/specfunc.hpp
@@ -493,7 +493,6 @@ inline void dRlm_dr(int lmax__, r3::vector<double>& r__, sddk::mdarray<double, 2
     r3::vector<double> dtheta_dr({cost * cosp, cost * sinp, -sint});
     r3::vector<double> dphi_dr({-sinp, cosp, 0});
 
-
     std::vector<double> dRlm_dt(lmmax);
     std::vector<double> dRlm_dp_sin_t(lmmax);
 
diff --git a/src/symmetry/crystal_symmetry.cpp b/src/symmetry/crystal_symmetry.cpp
index 4328f559df..bb8af75e48 100644
--- a/src/symmetry/crystal_symmetry.cpp
+++ b/src/symmetry/crystal_symmetry.cpp
@@ -325,7 +325,10 @@ void
 Crystal_symmetry::print_info(std::ostream& out__, int verbosity__) const
 {
     if (this->spg_dataset_ && (this->spg_dataset_->n_operations != this->num_spg_sym())) {
-        out__ << "space group found by spglib is defferent" << std::endl;
+        out__ << "space group found by spglib is different" << std::endl
+              << "  num. sym. spglib : " << this->spg_dataset_->n_operations << std::endl
+              << "  num. sym. actual : " << this->num_spg_sym() << std::endl
+              << "  tolerance : " << this->tolerance_ << std::endl;
     } else {
         out__ << "space group number   : " << this->spacegroup_number() << std::endl
               << "international symbol : " << this->international_symbol() << std::endl
diff --git a/src/symmetry/lattice.hpp b/src/symmetry/lattice.hpp
index 35d3a9f9e5..da4aa6c18d 100644
--- a/src/symmetry/lattice.hpp
+++ b/src/symmetry/lattice.hpp
@@ -22,6 +22,7 @@
  *  \brief Crystal lattice functions.
  */
 
+#include <algorithm>
 #include "linalg/r3.hpp"
 #include "utils/rte.hpp"
 
@@ -54,12 +55,14 @@ metric_tensor_error(r3::matrix<double> const& lat_vec__, r3::matrix<int> const&
 }
 
 inline auto
-find_lat_sym(r3::matrix<double> const& lat_vec__, double tol__)
+find_lat_sym(r3::matrix<double> const& lat_vec__, double tol__, double* mt_error__ = nullptr)
 {
     std::vector<r3::matrix<int>> lat_sym;
 
     auto r = {-1, 0, 1};
 
+    double mt_error_max{0};
+
     for (int i00: r) {
     for (int i01: r) {
     for (int i02: r) {
@@ -73,8 +76,10 @@ find_lat_sym(r3::matrix<double> const& lat_vec__, double tol__)
                 r3::matrix<int> R({{i00, i01, i02}, {i10, i11, i12}, {i20, i21, i22}});
                 /* valid symmetry operation has a determinant of +/- 1 */
                 if (std::abs(R.det()) == 1) {
+                    auto mt_error = metric_tensor_error(lat_vec__, R);
+                    mt_error_max = std::max(mt_error_max, mt_error);
                     /* metric tensor should be invariant under symmetry operation */
-                    if (metric_tensor_error(lat_vec__, R) < tol__) {
+                    if (mt_error < tol__) {
                         lat_sym.push_back(R);
                     }
                 }
@@ -88,12 +93,38 @@ find_lat_sym(r3::matrix<double> const& lat_vec__, double tol__)
     }
     }
 
+    if (mt_error__) {
+        *mt_error__ = mt_error_max;
+    }
+
     if (lat_sym.size() == 0 || lat_sym.size() > 48) {
         std::stringstream s;
-        s << "wrong number of lattice symmetries: " << lat_sym.size();
+        s << "wrong number of lattice symmetries: " << lat_sym.size() << std::endl
+          << "  lattice vectors : " << lat_vec__ << std::endl
+          << "  tolerance : " << tol__ << std::endl
+          << "  metric tensor error : " << mt_error_max;
         RTE_THROW(s);
     }
 
+    /* check if the set of symmetry operations is a group */
+    for (auto& R1: lat_sym) {
+        for (auto& R2: lat_sym) {
+            auto R3 = r3::dot(R1, R2);
+            if (std::find(lat_sym.begin(), lat_sym.end(), R3) == lat_sym.end()) {
+                std::stringstream s;
+                s << "lattice symmetries do not form a group" << std::endl;
+                for (auto& R: lat_sym) {
+                    s << " sym.op : " << R << ", metric tensor error : " << metric_tensor_error(lat_vec__, R) << std::endl;
+                }
+                s << "R1 : " << R1 << std::endl;
+                s << "R2 : " << R2 << std::endl;
+                s << "R1 * R2 : " << R3 << " is not in group" << std::endl;
+                s << "metric tensor tolerance : " << tol__;
+                RTE_THROW(s);
+            }
+        }
+    }
+
     return lat_sym;
 }
 
diff --git a/src/symmetry/symmetrize.hpp b/src/symmetry/symmetrize.hpp
index c07ba71e98..0f20c0c05f 100644
--- a/src/symmetry/symmetrize.hpp
+++ b/src/symmetry/symmetrize.hpp
@@ -31,6 +31,7 @@
 #include "typedefs.hpp"
 #include "sht/sht.hpp"
 #include "utils/profiler.hpp"
+#include "utils/rte.hpp"
 
 namespace sirius {
 
@@ -140,7 +141,12 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__,
 
 #if !defined(NDEBUG)
             if (igsh != gvec_shells__.gvec().shell(G)) {
-                throw std::runtime_error("wrong index of G-shell");
+                std::stringstream s;
+                s << "wrong index of G-shell" << std::endl
+                  << "  G-vector : " << G << std::endl
+                  << "  igsh in the remapped set : " << igsh << std::endl
+                  << "  igsh in the original set : " << gvec_shells__.gvec().shell(G);
+                RTE_THROW(s);
             }
 #endif
             /* each thread is working on full shell of G-vectors */
@@ -154,7 +160,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__,
                 /* find the symmetrized PW coefficient */
 
                 for (int i = 0; i < sym__.size(); i++) {
-                    auto G1 = dot(G, sym__[i].spg_op.R);
+                    auto G1 = r3::dot(G, sym__[i].spg_op.R);
 
                     auto S = sym__[i].spin_rotation;
 
@@ -163,15 +169,24 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__,
                     /* local index of a rotated G-vector */
                     int ig1 = gvec_shells__.index_by_gvec(G1);
 
+                    /* check the reduced G-vector */
                     if (ig1 == -1) {
                         G1 = G1 * (-1);
 #if !defined(NDEBUG)
                         if (igsh != gvec_shells__.gvec().shell(G1)) {
-                            throw std::runtime_error("wrong index of G-shell");
+                            std::stringstream s;
+                            s << "wrong index of G-shell" << std::endl
+                              << "  index of G-shell : " << igsh << std::endl
+                              << "  symmetry operation : " << sym__[i].spg_op.R << std::endl
+                              << "  G-vector : " << G << std::endl
+                              << "  rotated G-vector : " << G1 << std::endl
+                              << "  G-vector index : " << gvec_shells__.index_by_gvec(G1) << std::endl
+                              << "  index of rotated G-vector shell : " << gvec_shells__.gvec().shell(G1);
+                            RTE_THROW(s);
                         }
 #endif
                         ig1 = gvec_shells__.index_by_gvec(G1);
-                        assert(ig1 >= 0 && ig1 < ngv);
+                        RTE_ASSERT(ig1 >= 0 && ig1 < ngv);
                         if (f_pw__) {
                             symf += std::conj(f_pw[ig1]) * phase;
                         }
@@ -179,7 +194,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__,
                             symz += std::conj(z_pw[ig1]) * phase * S(2, 2);
                         }
                         if (is_non_collin) {
-                            auto v = dot(S, r3::vector<std::complex<double>>({x_pw[ig1], y_pw[ig1], z_pw[ig1]}));
+                            auto v = r3::dot(S, r3::vector<std::complex<double>>({x_pw[ig1], y_pw[ig1], z_pw[ig1]}));
                             symx += std::conj(v[0]) * phase;
                             symy += std::conj(v[1]) * phase;
                             symz += std::conj(v[2]) * phase;
@@ -187,10 +202,18 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__,
                     } else {
 #if !defined(NDEBUG)
                         if (igsh != gvec_shells__.gvec().shell(G1)) {
-                            throw std::runtime_error("wrong index of G-shell");
+                            std::stringstream s;
+                            s << "wrong index of G-shell" << std::endl
+                              << "  index of G-shell : " << igsh << std::endl
+                              << "  symmetry operation : " << sym__[i].spg_op.R << std::endl
+                              << "  G-vector : " << G << std::endl
+                              << "  rotated G-vector : " << G1 << std::endl
+                              << "  G-vector index : " << gvec_shells__.index_by_gvec(G1) << std::endl
+                              << "  index of rotated G-vector shell : " << gvec_shells__.gvec().shell(G1);
+                            RTE_THROW(s);
                         }
 #endif
-                        assert(ig1 >= 0 && ig1 < ngv);
+                        RTE_ASSERT(ig1 >= 0 && ig1 < ngv);
                         if (f_pw__) {
                             symf += f_pw[ig1] * phase;
                         }
@@ -198,7 +221,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__,
                             symz += z_pw[ig1] * phase * S(2, 2);
                         }
                         if (is_non_collin) {
-                            auto v = dot(S, r3::vector<std::complex<double>>({x_pw[ig1], y_pw[ig1], z_pw[ig1]}));
+                            auto v = r3::dot(S, r3::vector<std::complex<double>>({x_pw[ig1], y_pw[ig1], z_pw[ig1]}));
                             symx += v[0] * phase;
                             symy += v[1] * phase;
                             symz += v[2] * phase;
@@ -216,12 +239,12 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__,
                 for (int isym = 0; isym < sym__.size(); isym++) {
                     auto S = sym__[isym].spin_rotation;
 
-                    auto G1 = dot(sym__[isym].spg_op.invRT, G);
+                    auto G1 = r3::dot(sym__[isym].spg_op.invRT, G);
                     /* index of a rotated G-vector */
                     int ig1 = gvec_shells__.index_by_gvec(G1);
 
                     if (ig1 != -1) {
-                        assert(ig1 >= 0 && ig1 < ngv);
+                        RTE_ASSERT(ig1 >= 0 && ig1 < ngv);
                         auto phase = std::conj(phase_factor(isym, G1));
                         std::complex<double> symf1, symx1, symy1, symz1;
                         if (f_pw__) {
@@ -231,7 +254,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__,
                             symz1 = symz * phase * S(2, 2);
                         }
                         if (is_non_collin) {
-                            auto v = dot(S, r3::vector<std::complex<double>>({symx, symy, symz}));
+                            auto v = r3::dot(S, r3::vector<std::complex<double>>({symx, symy, symz}));
                             symx1  = v[0] * phase;
                             symy1  = v[1] * phase;
                             symz1  = v[2] * phase;
@@ -293,7 +316,7 @@ symmetrize(Crystal_symmetry const& sym__, fft::Gvec_shells const& gvec_shells__,
         auto G = gvec_shells__.gvec_remapped(igloc);
         for (int isym = 0; isym < sym__.size(); isym++) {
             auto S      = sym__[isym].spin_rotation;
-            auto gv_rot = dot(sym__[isym].spg_op.invRT, G);
+            auto gv_rot = r3::dot(sym__[isym].spg_op.invRT, G);
             /* index of a rotated G-vector */
             int ig_rot           = gvec_shells__.index_by_gvec(gv_rot);
             std::complex<double> phase = std::conj(phase_factor(isym, gv_rot));
diff --git a/src/unit_cell/atom_symmetry_class.cpp b/src/unit_cell/atom_symmetry_class.cpp
index 590b635880..aac8b9d23c 100644
--- a/src/unit_cell/atom_symmetry_class.cpp
+++ b/src/unit_cell/atom_symmetry_class.cpp
@@ -55,7 +55,7 @@ Atom_symmetry_class::Atom_symmetry_class(int id__, Atom_type const& atom_type__)
         o1_radial_integrals_.zero();
     }
 
-    /* copy descriptors because enu is defferent between atom classes */
+    /* copy descriptors because enu is different between atom classes */
     aw_descriptors_.resize(atom_type_.num_aw_descriptors());
     for (int i = 0; i < num_aw_descriptors(); i++) {
         aw_descriptors_[i] = atom_type_.aw_descriptor(i);
diff --git a/src/unit_cell/unit_cell.cpp b/src/unit_cell/unit_cell.cpp
index f486647dd5..b6c0bea9be 100644
--- a/src/unit_cell/unit_cell.cpp
+++ b/src/unit_cell/unit_cell.cpp
@@ -494,7 +494,7 @@ Unit_cell::is_point_in_mt(r3::vector<double> vc, int& ja, int& jr, double& dr, d
                     r3::vector<double> vf = vr.first - posf;
 
                     /* convert to spherical coordinates */
-                    auto vs = SHT::spherical_coordinates(get_cartesian_coordinates(vf));
+                    auto vs = r3::spherical_coordinates(get_cartesian_coordinates(vf));
 
                     if (vs[0] < atom(ia).mt_radius()) {
                         ja    = ia;
@@ -527,7 +527,7 @@ Unit_cell::is_point_in_mt(r3::vector<double> vc, int& ja, int& jr, double& dr, d
 }
 
 void
-Unit_cell::generate_radial_functions()
+Unit_cell::generate_radial_functions(std::ostream& out__)
 {
     PROFILE("sirius::Unit_cell::generate_radial_functions");
 
@@ -543,16 +543,15 @@ Unit_cell::generate_radial_functions()
 
     if (parameters_.verbosity() >= 1) {
         mpi::pstdout pout(comm_);
+        if (comm_.rank() == 0) {
+            pout << std::endl << "Linearization energies" << std::endl;
+        }
 
         for (int icloc = 0; icloc < (int)spl_num_atom_symmetry_classes().local_size(); icloc++) {
             int ic = spl_num_atom_symmetry_classes(icloc);
             atom_symmetry_class(ic).write_enu(pout);
         }
-
-        if (comm_.rank() == 0) {
-            std::printf("\n");
-            std::printf("Linearization energies\n");
-        }
+        RTE_OUT(out__) << pout.flush(0);
     }
     if (parameters_.verbosity() >= 4 && comm_.rank() == 0) {
         for (int ic = 0; ic < num_atom_symmetry_classes(); ic++) {
@@ -677,7 +676,6 @@ Unit_cell::initialize()
         max_mt_radial_basis_size_ = std::max(max_mt_radial_basis_size_, atom_type(iat).mt_radial_basis_size());
         max_mt_aw_basis_size_     = std::max(max_mt_aw_basis_size_, atom_type(iat).mt_aw_basis_size());
         max_mt_lo_basis_size_     = std::max(max_mt_lo_basis_size_, atom_type(iat).mt_lo_basis_size());
-        lmax_                     = std::max(lmax_, atom_type(iat).indexr().lmax());
         offs_lo += atom_type(iat).mt_lo_basis_size();
     }
 
diff --git a/src/unit_cell/unit_cell.hpp b/src/unit_cell/unit_cell.hpp
index 6e9f85572a..625ff6fc3b 100644
--- a/src/unit_cell/unit_cell.hpp
+++ b/src/unit_cell/unit_cell.hpp
@@ -153,9 +153,6 @@ class Unit_cell
     /// Maximum muffin-tin radius.
     double max_mt_radius_{0};
 
-    /// Maximum orbital quantum number of radial functions between all atom types.
-    int lmax_{-1};
-
     std::unique_ptr<Crystal_symmetry> symmetry_;
 
     /// Atomic coordinates in GPU-friendly ordering packed in arrays for each atom type.
@@ -258,7 +255,7 @@ class Unit_cell
 
     bool is_point_in_mt(r3::vector<double> vc, int& ja, int& jr, double& dr, double tp[2]) const;
 
-    void generate_radial_functions();
+    void generate_radial_functions(std::ostream& out__);
 
     void generate_radial_integrals();
 
@@ -470,6 +467,16 @@ class Unit_cell
         return max_mt_lo_basis_size_;
     }
 
+    /// Maximum number of atoms across all atom types.
+    inline auto max_num_atoms() const
+    {
+        int max_na{0};
+        for (int iat = 0; iat < this->num_atom_types(); iat++) {
+            max_na = std::max(max_na, this->atom_type(iat).num_atoms());
+        }
+        return max_na;
+    }
+
     void set_equivalent_atoms(int const* equivalent_atoms__)
     {
         equivalent_atoms_.resize(num_atoms());
@@ -506,9 +513,14 @@ class Unit_cell
         return volume_it_;
     }
 
+    /// Maximum orbital quantum number of radial functions between all atom types.
     inline int lmax() const
     {
-        return lmax_;
+        int l{-1};
+        for (int iat = 0; iat < this->num_atom_types(); iat++) {
+            l = std::max(l, this->atom_type(iat).indexr().lmax());
+        }
+        return l;
     }
 
     inline int lmax_apw() const
diff --git a/src/utils/utils.hpp b/src/utils/utils.hpp
index 06062f6142..a10f5a0a44 100644
--- a/src/utils/utils.hpp
+++ b/src/utils/utils.hpp
@@ -252,6 +252,9 @@ inline auto split_in_blocks(int length__, int block_size__)
 {
     int nb = num_blocks(length__, block_size__);
     /* adjust the block size; this is done to prevent very unequal block sizes */
+    /* Take, for example, 21 elements and initial block size of 15. Number of blocks equals 2.
+     * Final block size is 21 / 2 + min(1, 21 % 2) = 11. Thus 21 elements will be split in two blocks
+     * of 11 and 10 elements. */
     block_size__ = length__ / nb + std::min(1, length__ % nb);
 
     std::vector<int> result(nb);