diff --git a/src/Band/band.hpp b/src/Band/band.hpp
index c39b6ecc0..01f0814fd 100644
--- a/src/Band/band.hpp
+++ b/src/Band/band.hpp
@@ -26,7 +26,7 @@
 #define __BAND_HPP__
 
 #include "periodic_function.h"
-#include "k_point_set.h"
+#include "K_point/k_point_set.hpp"
 #include "Hamiltonian/hamiltonian.hpp"
 
 namespace sirius {
diff --git a/src/augmentation_operator.h b/src/Density/augmentation_operator.hpp
similarity index 98%
rename from src/augmentation_operator.h
rename to src/Density/augmentation_operator.hpp
index 0fedba041..ce2dc03e3 100644
--- a/src/augmentation_operator.h
+++ b/src/Density/augmentation_operator.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
+// Copyright (c) 2013-2018 Anton Kozhevnikov, Thomas Schulthess
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without modification, are permitted provided that
@@ -17,13 +17,13 @@
 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-/** \file augmentation_operator.h
+/** \file augmentation_operator.hpp
  *
  *  \brief Contains implementation of sirius::Augmentation_operator class.
  */
 
-#ifndef __AUGMENTATION_OPERATOR_H__
-#define __AUGMENTATION_OPERATOR_H__
+#ifndef __AUGMENTATION_OPERATOR_HPP__
+#define __AUGMENTATION_OPERATOR_HPP__
 
 #include "radial_integrals.h"
 
@@ -220,6 +220,7 @@ class Augmentation_operator
     }
 };
 
+/// Derivative of augmentation operator PW coefficients with respect to the Cartesian component of G-vector.
 class Augmentation_operator_gvec_deriv
 {
   private:
diff --git a/src/Density/density.hpp b/src/Density/density.hpp
index e58453ebb..5ffb0b2ad 100644
--- a/src/Density/density.hpp
+++ b/src/Density/density.hpp
@@ -26,7 +26,7 @@
 #define __DENSITY_HPP__
 
 #include "periodic_function.h"
-#include "k_point_set.h"
+#include "K_point/k_point_set.hpp"
 #include "simulation_context.h"
 #include "mixer.h"
 #include "field4d.hpp"
diff --git a/src/Geometry/force.hpp b/src/Geometry/force.hpp
index f7e019a4d..92b60195a 100644
--- a/src/Geometry/force.hpp
+++ b/src/Geometry/force.hpp
@@ -26,12 +26,11 @@
 #ifndef __FORCE_HPP__
 #define __FORCE_HPP__
 
-#include "../simulation_context.h"
-#include "../periodic_function.h"
-#include "../augmentation_operator.h"
-#include "../Beta_projectors/beta_projectors.h"
-#include "../Beta_projectors/beta_projectors_gradient.h"
-//#include "../potential.h"
+#include "simulation_context.h"
+#include "periodic_function.h"
+#include "Density/augmentation_operator.hpp"
+#include "Beta_projectors/beta_projectors.h"
+#include "Beta_projectors/beta_projectors_gradient.h"
 #include "non_local_functor.hpp"
 
 namespace sirius {
diff --git a/src/Geometry/non_local_functor.hpp b/src/Geometry/non_local_functor.hpp
index 847c53176..d29544800 100644
--- a/src/Geometry/non_local_functor.hpp
+++ b/src/Geometry/non_local_functor.hpp
@@ -25,11 +25,10 @@
 #ifndef __NON_LOCAL_FUNCTOR_HPP__
 #define __NON_LOCAL_FUNCTOR_HPP__
 
-#include "../simulation_context.h"
-#include "../periodic_function.h"
-#include "../augmentation_operator.h"
-#include "../Beta_projectors/beta_projectors.h"
-//#include "../potential.h"
+#include "simulation_context.h"
+#include "periodic_function.h"
+#include "Density/augmentation_operator.hpp"
+#include "Beta_projectors/beta_projectors.h"
 
 namespace sirius {
 
diff --git a/src/Hamiltonian/hamiltonian.hpp b/src/Hamiltonian/hamiltonian.hpp
index 22d06ca77..bc7fdcc21 100644
--- a/src/Hamiltonian/hamiltonian.hpp
+++ b/src/Hamiltonian/hamiltonian.hpp
@@ -29,7 +29,7 @@
 #include "simulation_context.h"
 #include "Hubbard/hubbard.hpp"
 #include "Potential/potential.hpp"
-#include "k_point.h"
+#include "K_point/k_point.hpp"
 #include "local_operator.hpp"
 #include "non_local_operator.hpp"
 #include "memory_pool.hpp"
diff --git a/src/Hubbard/hubbard.hpp b/src/Hubbard/hubbard.hpp
index 6b0832e8c..dcd9cbde9 100644
--- a/src/Hubbard/hubbard.hpp
+++ b/src/Hubbard/hubbard.hpp
@@ -4,12 +4,12 @@
 #include <cstdio>
 #include <cstdlib>
 #include "simulation_context.h"
-#include "k_point.h"
+#include "K_point/k_point.hpp"
 #include "wave_functions.hpp"
 #include "Hamiltonian/non_local_operator.hpp"
-#include "../Beta_projectors/beta_projectors.h"
-#include "../Beta_projectors/beta_projectors_gradient.h"
-#include "../Beta_projectors/beta_projectors_strain_deriv.h"
+#include "Beta_projectors/beta_projectors.h"
+#include "Beta_projectors/beta_projectors_gradient.h"
+#include "Beta_projectors/beta_projectors_strain_deriv.h"
 #include "radial_integrals.h"
 #include "mixer.h"
 
diff --git a/src/K_point/k_point.hpp b/src/K_point/k_point.hpp
index cc5ae7ca0..9efe7ad00 100644
--- a/src/K_point/k_point.hpp
+++ b/src/K_point/k_point.hpp
@@ -1,3 +1,775 @@
+// Copyright (c) 2013-2016 Anton Kozhevnikov, Thomas Schulthess
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
+// the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
+//    following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
+//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/** \file k_point.hpp
+ *
+ *  \brief Contains definition and partial implementation of sirius::K_point class.
+ */
+
+#ifndef __K_POINT_HPP__
+#define __K_POINT_HPP__
+
+#include "periodic_function.h"
+#include "matching_coefficients.h"
+#include "Beta_projectors/beta_projectors.h"
+#include "wave_functions.hpp"
+
+namespace sirius {
+
+/// K-point related variables and methods.
+/** \image html wf_storage.png "Wave-function storage"
+ *  \image html fv_eigen_vectors.png "First-variational eigen vectors" */
+class K_point
+{
+    private:
+
+        /// Simulation context.
+        Simulation_context& ctx_;
+
+        /// Unit cell object.
+        Unit_cell const& unit_cell_;
+
+        /// Weight of k-point.
+        double weight_;
+
+        /// Fractional k-point coordinates.
+        vector3d<double> vk_;
+
+        /// List of G-vectors with |G+k| < cutoff.
+        std::unique_ptr<Gvec> gkvec_;
+
+        /// G-vector distribution for the FFT transformation.
+        std::unique_ptr<Gvec_partition> gkvec_partition_;
+
+        /// First-variational eigen values
+        std::vector<double> fv_eigen_values_;
+
+        /// First-variational eigen vectors, distributed over 2D BLACS grid.
+        dmatrix<double_complex> fv_eigen_vectors_;
+
+        /// First-variational eigen vectors, distributed in slabs.
+        std::unique_ptr<Wave_functions> fv_eigen_vectors_slab_;
+
+        /// Lowest eigen-vectors of the LAPW overlap matrix with small aigen-values.
+        std::unique_ptr<Wave_functions> singular_components_;
+
+        /// Second-variational eigen vectors.
+        /** Second-variational eigen-vectors are stored as one or two \f$ N_{fv} \times N_{fv} \f$ matrices in
+         *  case of non-magnetic or collinear magnetic case or as a single \f$ 2 N_{fv} \times 2 N_{fv} \f$
+         *  matrix in case of general non-collinear magnetism. */
+        dmatrix<double_complex> sv_eigen_vectors_[2];
+
+        /// Full-diagonalization eigen vectors.
+        mdarray<double_complex, 2> fd_eigen_vectors_;
+
+        /// First-variational states.
+        std::unique_ptr<Wave_functions> fv_states_{nullptr};
+
+        /// Two-component (spinor) wave functions describing the bands.
+        std::unique_ptr<Wave_functions> spinor_wave_functions_{nullptr};
+
+        /// Two-component (spinor) hubbard wave functions where the S matrix is applied (if ppus).
+        std::unique_ptr<Wave_functions> hubbard_wave_functions_{nullptr};
+
+        /// Band occupation numbers.
+        mdarray<double, 2> band_occupancies_;
+
+        /// Band energies.
+        mdarray<double, 2> band_energies_;
+
+        /// LAPW matching coefficients for the row G+k vectors.
+        /** Used to setup the distributed LAPW Hamiltonian and overlap matrices. */
+        std::unique_ptr<Matching_coefficients> alm_coeffs_row_{nullptr};
+
+        /// LAPW matching coefficients for the column G+k vectors.
+        /** Used to setup the distributed LAPW Hamiltonian and overlap matrices. */
+        std::unique_ptr<Matching_coefficients> alm_coeffs_col_{nullptr};
+
+        /// LAPW matching coefficients for the local set G+k vectors.
+        std::unique_ptr<Matching_coefficients> alm_coeffs_loc_{nullptr};
+
+        /// Mapping between local row and global G+k vecotor index.
+        /** Used by matching_coefficients class. */
+        std::vector<int> igk_row_;
+
+        /// Mapping between local column and global G+k vecotor index.
+        /** Used by matching_coefficients class. */
+        std::vector<int> igk_col_;
+
+        /// Mapping between local and global G+k vecotor index.
+        /** Used by matching_coefficients class. */
+        std::vector<int> igk_loc_;
+
+        /// Number of G+k vectors distributed along rows of MPI grid
+        int num_gkvec_row_{0};
+
+        /// Number of G+k vectors distributed along columns of MPI grid
+        int num_gkvec_col_{0};
+
+        /// Offset of the local fraction of G+k vectors in the global index.
+        int gkvec_offset_{0};
+
+        /// Basis descriptors distributed between rows of the 2D MPI grid.
+        /** This is a local array. Only MPI ranks belonging to the same column have identical copies of this array. */
+        std::vector<lo_basis_descriptor> lo_basis_descriptors_row_;
+
+        /// Basis descriptors distributed between columns of the 2D MPI grid.
+        /** This is a local array. Only MPI ranks belonging to the same row have identical copies of this array. */
+        std::vector<lo_basis_descriptor> lo_basis_descriptors_col_;
+
+        /// List of columns of the Hamiltonian and overlap matrix lo block (local index) for a given atom.
+        std::vector<std::vector<int>> atom_lo_cols_;
+
+        /// list of rows of the Hamiltonian and overlap matrix lo block (local index) for a given atom
+        std::vector<std::vector<int>> atom_lo_rows_;
+
+        /// Imaginary unit to the power of l.
+        std::vector<double_complex> zil_;
+
+        /// Mapping between lm and l.
+        std::vector<int> l_by_lm_;
+
+        /// Column rank of the processors of ScaLAPACK/ELPA diagonalization grid.
+        int rank_col_;
+
+        /// Number of processors along the columns of the diagonalization grid.
+        int num_ranks_col_;
+
+        /// Row rank of the processors of ScaLAPACK/ELPA diagonalization grid.
+        int rank_row_;
+
+        /// Number of processors along the rows of the diagonalization grid.
+        int num_ranks_row_;
+
+        /// Beta projectors for a local set of G+k vectors.
+        std::unique_ptr<Beta_projectors> beta_projectors_{nullptr};
+
+        /// Beta projectors for row G+k vectors.
+        /** Used to setup the full Hamiltonian in PP-PW case (for verification purpose only) */
+        std::unique_ptr<Beta_projectors> beta_projectors_row_{nullptr};
+
+        /// Beta projectors for column G+k vectors.
+        /** Used to setup the full Hamiltonian in PP-PW case (for verification purpose only) */
+        std::unique_ptr<Beta_projectors> beta_projectors_col_{nullptr};
+
+        /// Preconditioner matrix for Chebyshev solver.
+        mdarray<double_complex, 3> p_mtrx_;
+
+        /// Communicator for parallelization inside k-point.
+        /** This communicator is used to split G+k vectors and wave-functions. */
+        Communicator const& comm_;
+
+        /// Communicator between(!!) rows.
+        Communicator const& comm_row_;
+
+        /// Communicator between(!!) columns.
+        Communicator const& comm_col_;
+
+        /// Generate G+k and local orbital basis sets.
+        inline void generate_gklo_basis();
+
+        /// Test orthonormalization of first-variational states.
+        inline void test_fv_states();
+
+        inline double& band_energy_aux(int j__, int ispn__)
+        {
+            if (ctx_.num_mag_dims() == 3) {
+                return band_energies_(j__, 0);
+            } else {
+                if (!(ispn__ == 0 || ispn__ == 1)) {
+                    TERMINATE("wrong spin index");
+                }
+                return band_energies_(j__, ispn__);
+            }
+        }
+
+        inline double& band_occupancy_aux(int j__, int ispn__)
+        {
+            if (ctx_.num_mag_dims() == 3) {
+                return band_occupancies_(j__, 0);
+            } else {
+                if (!(ispn__ == 0 || ispn__ == 1)) {
+                    TERMINATE("wrong spin index");
+                }
+                return band_occupancies_(j__, ispn__);
+            }
+        }
+
+        /// Find G+k vectors within the cutoff.
+        inline void generate_gkvec(double gk_cutoff__)
+        {
+            PROFILE("sirius::K_point::generate_gkvec");
+
+            if (ctx_.full_potential() && (gk_cutoff__ * unit_cell_.max_mt_radius() > ctx_.lmax_apw()) &&
+                comm_.rank() == 0 && ctx_.control().verbosity_ >= 0) {
+                std::stringstream s;
+                s << "G+k cutoff (" << gk_cutoff__ << ") is too large for a given lmax ("
+                  << ctx_.lmax_apw() << ") and a maximum MT radius (" << unit_cell_.max_mt_radius() << ")" << std::endl
+                  << "suggested minimum value for lmax : " << int(gk_cutoff__ * unit_cell_.max_mt_radius()) + 1;
+                WARNING(s);
+            }
+
+            if (gk_cutoff__ * 2 > ctx_.pw_cutoff()) {
+                std::stringstream s;
+                s << "G+k cutoff is too large for a given plane-wave cutoff" << std::endl
+                  << "  pw cutoff : " << ctx_.pw_cutoff() << std::endl
+                  << "  doubled G+k cutoff : " << gk_cutoff__ * 2;
+                TERMINATE(s);
+            }
+
+            /* create G+k vectors; communicator of the coarse FFT grid is used because wave-functions will be transformed
+             * only on the coarse grid; G+k-vectors will be distributed between MPI ranks assigned to the k-point */
+            gkvec_ = std::unique_ptr<Gvec>(new Gvec(vk_, ctx_.unit_cell().reciprocal_lattice_vectors(), gk_cutoff__, comm(),
+                                                    ctx_.gamma_point()));
+
+            gkvec_partition_ = std::unique_ptr<Gvec_partition>(new Gvec_partition(*gkvec_, ctx_.comm_fft_coarse(),
+                                                                                  ctx_.comm_band_ortho_fft_coarse()));
+
+            gkvec_offset_ = gkvec().gvec_offset(comm().rank());
+        }
+
+    public:
+
+        /// Constructor
+        K_point(Simulation_context& ctx__,
+                double const* vk__,
+                double weight__)
+            : ctx_(ctx__)
+            , unit_cell_(ctx_.unit_cell())
+            , weight_(weight__)
+            , comm_(ctx_.comm_band())
+            , comm_row_(ctx_.blacs_grid().comm_row())
+            , comm_col_(ctx_.blacs_grid().comm_col())
+        {
+            PROFILE("sirius::K_point::K_point");
+
+            for (int x = 0; x < 3; x++) {
+                vk_[x] = vk__[x];
+            }
+
+            band_occupancies_ = mdarray<double, 2>(ctx_.num_bands(), ctx_.num_spin_dims());
+            band_occupancies_.zero();
+            band_energies_ = mdarray<double, 2>(ctx_.num_bands(), ctx_.num_spin_dims());
+            band_energies_.zero();
+
+            num_ranks_row_ = comm_row_.size();
+            num_ranks_col_ = comm_col_.size();
+
+            rank_row_ = comm_row_.rank();
+            rank_col_ = comm_col_.rank();
+        }
+
+        /// Initialize the k-point related arrays and data.
+        inline void initialize();
+
+        inline void update()
+        {
+            PROFILE("sirius::K_point::update");
+
+            gkvec_->lattice_vectors(ctx_.unit_cell().reciprocal_lattice_vectors());
+
+            if (ctx_.full_potential()) {
+                if (ctx_.iterative_solver_input().type_ == "exact") {
+                    alm_coeffs_row_ = std::unique_ptr<Matching_coefficients>(
+                        new Matching_coefficients(unit_cell_, ctx_.lmax_apw(), num_gkvec_row(), igk_row_, gkvec()));
+                    alm_coeffs_col_ = std::unique_ptr<Matching_coefficients>(
+                        new Matching_coefficients(unit_cell_, ctx_.lmax_apw(), num_gkvec_col(), igk_col_, gkvec()));
+                }
+                alm_coeffs_loc_ = std::unique_ptr<Matching_coefficients>(
+                    new Matching_coefficients(unit_cell_, ctx_.lmax_apw(), num_gkvec_loc(), igk_loc_, gkvec()));
+            }
+
+            if (!ctx_.full_potential()) {
+                /* compute |beta> projectors for atom types */
+                beta_projectors_ = std::unique_ptr<Beta_projectors>(new Beta_projectors(ctx_, gkvec(), igk_loc_));
+
+                if (ctx_.iterative_solver_input().type_ == "exact") {
+                    beta_projectors_row_ = std::unique_ptr<Beta_projectors>(new Beta_projectors(ctx_, gkvec(), igk_row_));
+                    beta_projectors_col_ = std::unique_ptr<Beta_projectors>(new Beta_projectors(ctx_, gkvec(), igk_col_));
+
+                }
+
+                //if (false) {
+                //    p_mtrx_ = mdarray<double_complex, 3>(unit_cell_.max_mt_basis_size(), unit_cell_.max_mt_basis_size(), unit_cell_.num_atom_types());
+                //    p_mtrx_.zero();
+
+                //    for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
+                //        auto& atom_type = unit_cell_.atom_type(iat);
+
+                //        if (!atom_type.pp_desc().augment) {
+                //            continue;
+                //        }
+                //        int nbf = atom_type.mt_basis_size();
+                //        int ofs = atom_type.offset_lo();
+
+                //        matrix<double_complex> qinv(nbf, nbf);
+                //        for (int xi1 = 0; xi1 < nbf; xi1++) {
+                //            for (int xi2 = 0; xi2 < nbf; xi2++) {
+                //                qinv(xi2, xi1) = ctx_.augmentation_op(iat).q_mtrx(xi2, xi1);
+                //            }
+                //        }
+                //        linalg<CPU>::geinv(nbf, qinv);
+                //
+                //        /* compute P^{+}*P */
+                //        linalg<CPU>::gemm(2, 0, nbf, nbf, num_gkvec_loc(),
+                //                          beta_projectors_->beta_gk_t().at<CPU>(0, ofs), beta_projectors_->beta_gk_t().ld(),
+                //                          beta_projectors_->beta_gk_t().at<CPU>(0, ofs), beta_projectors_->beta_gk_t().ld(),
+                //                          &p_mtrx_(0, 0, iat), p_mtrx_.ld());
+                //        comm().allreduce(&p_mtrx_(0, 0, iat), unit_cell_.max_mt_basis_size() * unit_cell_.max_mt_basis_size());
+
+                //        for (int xi1 = 0; xi1 < nbf; xi1++) {
+                //            for (int xi2 = 0; xi2 < nbf; xi2++) {
+                //                qinv(xi2, xi1) += p_mtrx_(xi2, xi1, iat);
+                //            }
+                //        }
+                //        /* compute (Q^{-1} + P^{+}*P)^{-1} */
+                //        linalg<CPU>::geinv(nbf, qinv);
+                //        for (int xi1 = 0; xi1 < nbf; xi1++) {
+                //            for (int xi2 = 0; xi2 < nbf; xi2++) {
+                //                p_mtrx_(xi2, xi1, iat) = qinv(xi2, xi1);
+                //            }
+                //        }
+                //    }
+                //}
+            }
+
+        }
+
+        /// Generate first-variational states from eigen-vectors.
+        /** First-variational states are obtained from the first-variational eigen-vectors and
+         *  LAPW matching coefficients.
+         *
+         *  APW part:
+         *  \f[
+         *      \psi_{\xi j}^{\bf k} = \sum_{{\bf G}} Z_{{\bf G} j}^{\bf k} * A_{\xi}({\bf G+k})
+         *  \f]
+         */
+        void generate_fv_states();
+
+        #ifdef __GPU
+        void generate_fv_states_aw_mt_gpu();
+        #endif
+
+        /// Generate two-component spinor wave functions
+        inline void generate_spinor_wave_functions();
+
+        inline void generate_atomic_centered_wavefunctions(const int num_ao__, Wave_functions &phi);
+
+        inline void generate_atomic_centered_wavefunctions_aux(const int num_ao__, Wave_functions &phi, std::vector<int> &offset, bool hubbard);
+
+        void compute_gradient_wavefunctions(Wave_functions &phi,
+                                            const int starting_position_i,
+                                            const int num_wf,
+                                            Wave_functions &dphi,
+                                            const int starting_position_j,
+                                            const int direction);
+
+        void save(int id);
+
+        void load(HDF5_tree h5in, int id);
+
+        //== void save_wave_functions(int id);
+
+        //== void load_wave_functions(int id);
+
+        void get_fv_eigen_vectors(mdarray<double_complex, 2>& fv_evec);
+
+        void get_sv_eigen_vectors(mdarray<double_complex, 2>& sv_evec);
+
+        /// Test orthonormalization of spinor wave-functions
+        void test_spinor_wave_functions(int use_fft);
+
+        /// Get the number of occupied bands for each spin channel.
+        int num_occupied_bands(int ispn__ = -1)
+        {
+            for (int j = ctx_.num_bands() - 1; j >= 0; j--) {
+                if (std::abs(band_occupancy(j, ispn__) * weight()) > 1e-14) {
+                    return j + 1;
+                }
+            }
+            return 0;
+        }
+
+        /// Total number of G+k vectors within the cutoff distance
+        inline int num_gkvec() const
+        {
+            return gkvec_->num_gvec();
+        }
+
+        /// Total number of muffin-tin and plane-wave expansion coefficients for the wave-functions.
+        /** APW+lo basis \f$ \varphi_{\mu {\bf k}}({\bf r}) = \{ \varphi_{\bf G+k}({\bf r}),
+         *  \varphi_{j{\bf k}}({\bf r}) \} \f$ is used to expand first-variational wave-functions:
+         *
+         *  \f[
+         *      \psi_{i{\bf k}}({\bf r}) = \sum_{\mu} c_{\mu i}^{\bf k} \varphi_{\mu \bf k}({\bf r}) =
+         *      \sum_{{\bf G}}c_{{\bf G} i}^{\bf k} \varphi_{\bf G+k}({\bf r}) +
+         *      \sum_{j}c_{j i}^{\bf k}\varphi_{j{\bf k}}({\bf r})
+         *  \f]
+         *
+         *  Inside muffin-tins the expansion is converted into the following form:
+         *  \f[
+         *      \psi_{i {\bf k}}({\bf r})= \begin{array}{ll}
+         *      \displaystyle \sum_{L} \sum_{\lambda=1}^{N_{\ell}^{\alpha}}
+         *      F_{L \lambda}^{i {\bf k},\alpha}f_{\ell \lambda}^{\alpha}(r)
+         *      Y_{\ell m}(\hat {\bf r}) & {\bf r} \in MT_{\alpha} \end{array}
+         *  \f]
+         *
+         *  Thus, the total number of coefficients representing a wave-funstion is equal
+         *  to the number of muffin-tin basis functions of the form \f$ f_{\ell \lambda}^{\alpha}(r)
+         *  Y_{\ell m}(\hat {\bf r}) \f$ plust the number of G+k plane waves. */
+        //inline int wf_size() const // TODO: better name for this
+        //{
+        //    if (ctx_.full_potential()) {
+        //        return unit_cell_.mt_basis_size() + num_gkvec();
+        //    } else {
+        //        return num_gkvec();
+        //    }
+        //}
+
+        inline double& band_energy(int j__, int ispn__)
+        {
+            return band_energy_aux(j__, ispn__);
+        }
+
+        inline double band_energy(int j__, int ispn__) const
+        {
+            auto const& e = const_cast<K_point*>(this)->band_energy_aux(j__, ispn__);
+            return e;
+        }
+
+        inline double& band_occupancy(int j__, int ispn__)
+        {
+            return band_occupancy_aux(j__, ispn__);
+        }
+
+        inline double band_occupancy(int j__, int ispn__) const
+        {
+            auto const& e = const_cast<K_point*>(this)->band_occupancy_aux(j__, ispn__);
+            return e;
+        }
+
+        inline double fv_eigen_value(int i) const
+        {
+            return fv_eigen_values_[i];
+        }
+
+        void set_fv_eigen_values(double* eval)
+        {
+            std::memcpy(&fv_eigen_values_[0], eval, ctx_.num_fv_states() * sizeof(double));
+        }
+
+        inline double weight() const
+        {
+            return weight_;
+        }
+
+        inline Wave_functions& fv_states()
+        {
+            return *fv_states_;
+        }
+
+        inline Wave_functions& spinor_wave_functions()
+        {
+            return *spinor_wave_functions_;
+        }
+
+        inline Wave_functions& hubbard_wave_functions()
+        {
+            return *hubbard_wave_functions_;
+        }
+
+        inline Wave_functions const& hubbard_wave_functions() const
+        {
+            return *hubbard_wave_functions_;
+        }
+
+        inline void allocate_hubbard_wave_functions(int size)
+        {
+            if (hubbard_wave_functions_ != nullptr) {
+                return;
+            }
+            const int num_sc = ctx_.num_mag_dims() == 3 ?  2 : 1;
+            hubbard_wave_functions_ = std::unique_ptr<Wave_functions>(new Wave_functions(gkvec_partition(),
+                                                                                         size,
+                                                                                         num_sc));
+        }
+
+        inline bool hubbard_wave_functions_calculated()
+        {
+            return (hubbard_wave_functions_ != nullptr);
+        }
+
+        inline Wave_functions& singular_components()
+        {
+            return *singular_components_;
+        }
+
+        inline vector3d<double> vk() const
+        {
+            return vk_;
+        }
+
+        /// Basis size of LAPW+lo method.
+        inline int gklo_basis_size() const
+        {
+            return static_cast<int>(num_gkvec() + unit_cell_.mt_lo_basis_size());
+        }
+
+        /// Local number of G+k vectors in case of flat distribution.
+        inline int num_gkvec_loc() const
+        {
+            return gkvec().count();
+        }
+
+        /// Return global index of G+k vector.
+        inline int idxgk(int igkloc__) const
+        {
+            return gkvec_offset_ + igkloc__;
+        }
+
+        /// Local number of G+k vectors for each MPI rank in the row of the 2D MPI grid.
+        inline int num_gkvec_row() const
+        {
+            return num_gkvec_row_;
+        }
+
+        /// Local number of local orbitals for each MPI rank in the row of the 2D MPI grid.
+        inline int num_lo_row() const
+        {
+            return static_cast<int>(lo_basis_descriptors_row_.size());
+        }
+
+        /// Local number of basis functions for each MPI rank in the row of the 2D MPI grid.
+        inline int gklo_basis_size_row() const
+        {
+            return num_gkvec_row() + num_lo_row();
+        }
+
+        /// Local number of G+k vectors for each MPI rank in the column of the 2D MPI grid.
+        inline int num_gkvec_col() const
+        {
+            return num_gkvec_col_;
+        }
+
+        /// Local number of local orbitals for each MPI rank in the column of the 2D MPI grid.
+        inline int num_lo_col() const
+        {
+            return static_cast<int>(lo_basis_descriptors_col_.size());
+        }
+
+        /// Local number of basis functions for each MPI rank in the column of the 2D MPI grid.
+        inline int gklo_basis_size_col() const
+        {
+            return num_gkvec_col() + num_lo_col();
+        }
+
+        inline lo_basis_descriptor const& lo_basis_descriptor_col(int idx) const
+        {
+            assert(idx >=0 && idx < (int)lo_basis_descriptors_col_.size());
+            return lo_basis_descriptors_col_[idx];
+        }
+
+        inline lo_basis_descriptor const& lo_basis_descriptor_row(int idx) const
+        {
+            assert(idx >= 0 && idx < (int)lo_basis_descriptors_row_.size());
+            return lo_basis_descriptors_row_[idx];
+        }
+
+        inline int igk_loc(int idx__) const
+        {
+            return igk_loc_[idx__];
+        }
+
+        inline std::vector<int> const& igk_loc() const
+        {
+            return igk_loc_;
+        }
+
+        inline int igk_row(int idx__) const
+        {
+            return igk_row_[idx__];
+        }
+
+        inline std::vector<int> const& igk_row() const
+        {
+            return igk_row_;
+        }
+
+        inline int igk_col(int idx__) const
+        {
+            return igk_col_[idx__];
+        }
+
+        inline std::vector<int> const& igk_col() const
+        {
+            return igk_col_;
+        }
+
+        inline int num_ranks_row() const
+        {
+            return num_ranks_row_;
+        }
+
+        inline int rank_row() const
+        {
+            return rank_row_;
+        }
+
+        inline int num_ranks_col() const
+        {
+            return num_ranks_col_;
+        }
+
+        inline int rank_col() const
+        {
+            return rank_col_;
+        }
+
+        /// Number of MPI ranks for a given k-point
+        inline int num_ranks() const
+        {
+            return comm_.size();
+        }
+
+        inline int rank() const
+        {
+            return comm_.rank();
+        }
+
+        /// Return number of lo columns for a given atom
+        inline int num_atom_lo_cols(int ia) const
+        {
+            return (int)atom_lo_cols_[ia].size();
+        }
+
+        /// Return local index (for the current MPI rank) of a column for a given atom and column index within an atom
+        inline int lo_col(int ia, int i) const
+        {
+            return atom_lo_cols_[ia][i];
+        }
+
+        /// Return number of lo rows for a given atom
+        inline int num_atom_lo_rows(int ia) const
+        {
+            return (int)atom_lo_rows_[ia].size();
+        }
+
+        /// Return local index (for the current MPI rank) of a row for a given atom and row index within an atom
+        inline int lo_row(int ia, int i) const
+        {
+            return atom_lo_rows_[ia][i];
+        }
+
+        inline dmatrix<double_complex>& fv_eigen_vectors()
+        {
+            return fv_eigen_vectors_;
+        }
+
+        inline Wave_functions& fv_eigen_vectors_slab()
+        {
+            return *fv_eigen_vectors_slab_;
+        }
+
+        inline dmatrix<double_complex>& sv_eigen_vectors(int ispn)
+        {
+            return sv_eigen_vectors_[ispn];
+        }
+
+        inline mdarray<double_complex, 2>& fd_eigen_vectors()
+        {
+            return fd_eigen_vectors_;
+        }
+
+        void bypass_sv()
+        {
+            std::memcpy(&band_energies_[0], &fv_eigen_values_[0], ctx_.num_fv_states() * sizeof(double));
+        }
+
+        inline Gvec const& gkvec() const
+        {
+            return *gkvec_;
+        }
+
+        inline Gvec_partition const& gkvec_partition() const
+        {
+            return *gkvec_partition_;
+        }
+
+        inline Matching_coefficients const& alm_coeffs_row()
+        {
+            return *alm_coeffs_row_;
+        }
+
+        inline Matching_coefficients const& alm_coeffs_col()
+        {
+            return *alm_coeffs_col_;
+        }
+
+        inline Matching_coefficients const& alm_coeffs_loc() const
+        {
+            return *alm_coeffs_loc_;
+        }
+
+        inline Communicator const& comm() const
+        {
+            return comm_;
+        }
+
+        inline Communicator const& comm_row() const
+        {
+            return comm_row_;
+        }
+
+        inline Communicator const& comm_col() const
+        {
+            return comm_col_;
+        }
+
+        inline double_complex p_mtrx(int xi1, int xi2, int iat) const
+        {
+            return p_mtrx_(xi1, xi2, iat);
+        }
+
+        inline mdarray<double_complex, 3>& p_mtrx()
+        {
+            return p_mtrx_;
+        }
+
+        Beta_projectors& beta_projectors()
+        {
+            assert(beta_projectors_ != nullptr);
+            return *beta_projectors_;
+        }
+
+        Beta_projectors& beta_projectors_row()
+        {
+            assert(beta_projectors_ != nullptr);
+            return *beta_projectors_row_;
+        }
+
+        Beta_projectors& beta_projectors_col()
+        {
+            assert(beta_projectors_ != nullptr);
+            return *beta_projectors_col_;
+        }
+};
+
 //== void K_point::check_alm(int num_gkvec_loc, int ia, mdarray<double_complex, 2>& alm)
 //== {
 //==     static SHT* sht = NULL;
@@ -505,3 +1277,61 @@ inline void K_point::get_sv_eigen_vectors(mdarray<double_complex, 2>& sv_evec)
     comm_.allreduce(sv_evec.at<CPU>(), (int)sv_evec.size());
 }
 
+#include "generate_fv_states.hpp"
+#include "generate_spinor_wave_functions.hpp"
+#include "generate_gklo_basis.hpp"
+#include "initialize.hpp"
+#include "test_fv_states.hpp"
+#include "generate_atomic_centered_wavefunctions.hpp"
+
+}
+
+/** \page basis Basis functions for Kohn-Sham wave-functions expansion
+ *
+ *  \section basis1 LAPW+lo basis
+ *
+ *  LAPW+lo basis consists of two different sets of functions: LAPW functions \f$ \varphi_{{\bf G+k}} \f$ defined over
+ *  entire unit cell:
+ *  \f[
+ *      \varphi_{{\bf G+k}}({\bf r}) = \left\{ \begin{array}{ll}
+ *      \displaystyle \sum_{L} \sum_{\nu=1}^{O_{\ell}^{\alpha}} a_{L\nu}^{\alpha}({\bf G+k})u_{\ell \nu}^{\alpha}(r)
+ *      Y_{\ell m}(\hat {\bf r}) & {\bf r} \in {\rm MT} \alpha \\
+ *      \displaystyle \frac{1}{\sqrt  \Omega} e^{i({\bf G+k}){\bf r}} & {\bf r} \in {\rm I} \end{array} \right.
+ *  \f]
+ *  and Bloch sums of local orbitals defined inside muffin-tin spheres only:
+ *  \f[
+ *      \begin{array}{ll} \displaystyle \varphi_{j{\bf k}}({\bf r})=\sum_{{\bf T}} e^{i{\bf kT}}
+ *      \varphi_{j}({\bf r - T}) & {\rm {\bf r} \in MT} \end{array}
+ *  \f]
+ *  Each local orbital is composed of radial and angular parts:
+ *  \f[
+ *      \varphi_{j}({\bf r}) = \phi_{\ell_j}^{\zeta_j,\alpha_j}(r) Y_{\ell_j m_j}(\hat {\bf r})
+ *  \f]
+ *  Radial part of local orbital is defined as a linear combination of radial functions (minimum two radial functions
+ *  are required) such that local orbital vanishes at the sphere boundary:
+ *  \f[
+ *      \phi_{\ell}^{\zeta, \alpha}(r) = \sum_{p}\gamma_{p}^{\zeta,\alpha} u_{\ell \nu_p}^{\alpha}(r)
+ *  \f]
+ *
+ *  Arbitrary number of local orbitals can be introduced for each angular quantum number (this is highlighted by
+ *  the index \f$ \zeta \f$).
+ *
+ *  Radial functions are m-th order (with zero-order being a function itself) energy derivatives of the radial
+ *  Schrödinger equation:
+ *  \f[
+ *      u_{\ell \nu}^{\alpha}(r) = \frac{\partial^{m_{\nu}}}{\partial^{m_{\nu}}E}u_{\ell}^{\alpha}(r,E)\Big|_{E=E_{\nu}}
+ *  \f]
+ */
+
+/** \page data_dist K-point data distribution
+ *
+ *  \section data_dist1 "Panel" and "full" data storage
+ *
+ *  We have to deal with big arrays (matching coefficients, eigen vectors, wave functions, etc.) which may not fit
+ *  into the memory of a single node. For some operations we need a "panel" distribution of data, where each
+ *  MPI rank gets a local panel of block-cyclic distributed matrix. This way of storing data is necessary for the
+ *  distributed PBLAS and ScaLAPACK operations.
+ *
+ */
+
+#endif // __K_POINT_H__
diff --git a/src/k_point_set.h b/src/K_point/k_point_set.hpp
similarity index 99%
rename from src/k_point_set.h
rename to src/K_point/k_point_set.hpp
index 503a963f8..892fa37c0 100644
--- a/src/k_point_set.h
+++ b/src/K_point/k_point_set.hpp
@@ -17,15 +17,15 @@
 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-/** \file k_point_set.h
+/** \file k_point_set.hpp
  *
  *  \brief Contains declaration and partial implementation of sirius::K_point_set class.
  */
 
-#ifndef __K_POINT_SET_H__
-#define __K_POINT_SET_H__
+#ifndef __K_POINT_SET_HPP__
+#define __K_POINT_SET_HPP__
 
-#include "k_point.h"
+#include "k_point.hpp"
 #include "geometry3d.hpp"
 #include "smearing.hpp"
 
diff --git a/src/Potential/generate_d_operator_matrix.hpp b/src/Potential/generate_d_operator_matrix.hpp
index 923c24d54..07ef8e362 100644
--- a/src/Potential/generate_d_operator_matrix.hpp
+++ b/src/Potential/generate_d_operator_matrix.hpp
@@ -40,7 +40,7 @@ inline void Potential::generate_D_operator_matrix()
     if (ctx_.processing_unit() == GPU) {
         veff_tmp.allocate(memory_t::device);
     }
-    
+
     if (ctx_.unit_cell().atom_type(0).augment() && ctx_.unit_cell().atom_type(0).num_atoms() > 0) {
         ctx_.augmentation_op(0).prepare(0);
     }
diff --git a/src/xc_functional.h b/src/Potential/xc_functional.hpp
similarity index 99%
rename from src/xc_functional.h
rename to src/Potential/xc_functional.hpp
index d97eadd47..5779288b9 100644
--- a/src/xc_functional.h
+++ b/src/Potential/xc_functional.hpp
@@ -17,13 +17,13 @@
 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-/** \file xc_functional.h
+/** \file xc_functional.hpp
  *
  *  \brief Contains implementation of sirius::XC_functional class.
  */
 
-#ifndef __XC_FUNCTIONAL_H__
-#define __XC_FUNCTIONAL_H__
+#ifndef __XC_FUNCTIONAL_HPP__
+#define __XC_FUNCTIONAL_HPP__
 
 #include <xc.h>
 #include <string.h>
diff --git a/src/SHT/gaunt.hpp b/src/SHT/gaunt.hpp
new file mode 100644
index 000000000..8fdfb4269
--- /dev/null
+++ b/src/SHT/gaunt.hpp
@@ -0,0 +1,208 @@
+// Copyright (c) 2013-2018 Anton Kozhevnikov, Thomas Schulthess
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
+// the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
+//    following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
+//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/** \file gaunt.hpp
+ *
+ *  \brief Contains definition and implementation of sirius::Gaunt class.
+ */
+
+#ifndef __GAUNT_HPP__
+#define __GAUNT_HPP__
+
+#include "mdarray.hpp"
+
+namespace sirius {
+
+/// Used in the {lm1, lm2} : {lm3, coefficient} way of grouping non-zero Gaunt coefficients
+template <typename T>
+struct gaunt_L3
+{
+    int lm3;
+    int l3;
+    T   coef;
+};
+
+/// Used in the {lm1, lm2, coefficient} : {lm3} way of grouping non-zero Gaunt coefficients
+template <typename T>
+struct gaunt_L1_L2
+{
+    int lm1;
+    int lm2;
+    T   coef;
+};
+
+/// Compact storage of non-zero Gaunt coefficients \f$ \langle \ell_1 m_1 | \ell_3 m_3 | \ell_2 m_2 \rangle \f$.
+/** Very important! The following notation is adopted and used everywhere: lm1 and lm2 represent 'bra' and 'ket' 
+ *  spherical harmonics of the Gaunt integral and lm3 represent the inner spherical harmonic. 
+ */
+template <typename T>
+class Gaunt_coefficients
+{
+  private:
+    /// lmax of <lm1|
+    int lmax1_;
+    /// lmmax of <lm1|
+    int lmmax1_;
+
+    /// lmax of inner real or complex spherical harmonic
+    int lmax3_;
+    /// lmmax of inner real or complex spherical harmonic
+    int lmmax3_;
+
+    /// lmax of |lm2>
+    int lmax2_;
+    /// lmmax of |lm2>
+    int lmmax2_;
+
+    /// List of non-zero Gaunt coefficients for each lm3.
+    mdarray<std::vector<gaunt_L1_L2<T>>, 1> gaunt_packed_L1_L2_;
+
+    /// List of non-zero Gaunt coefficients for each combination of lm1, lm2.
+    mdarray<std::vector<gaunt_L3<T>>, 2> gaunt_packed_L3_;
+
+  public:
+    /// Class constructor.
+    Gaunt_coefficients(int                                            lmax1__,
+                       int                                            lmax3__,
+                       int                                            lmax2__,
+                       std::function<T(int, int, int, int, int, int)> get__)
+        : lmax1_(lmax1__)
+        , lmax3_(lmax3__)
+        , lmax2_(lmax2__)
+    {
+        lmmax1_ = utils::lmmax(lmax1_);
+        lmmax3_ = utils::lmmax(lmax3_);
+        lmmax2_ = utils::lmmax(lmax2_);
+
+        gaunt_packed_L1_L2_ = mdarray<std::vector<gaunt_L1_L2<T>>, 1>(lmmax3_);
+        gaunt_L1_L2<T> g12;
+
+        gaunt_packed_L3_ = mdarray<std::vector<gaunt_L3<T>>, 2>(lmmax1_, lmmax2_);
+        gaunt_L3<T> g3;
+
+        for (int l1 = 0, lm1 = 0; l1 <= lmax1_; l1++) {
+            for (int m1 = -l1; m1 <= l1; m1++, lm1++) {
+                for (int l2 = 0, lm2 = 0; l2 <= lmax2_; l2++) {
+                    for (int m2 = -l2; m2 <= l2; m2++, lm2++) {
+                        for (int l3 = 0, lm3 = 0; l3 <= lmax3_; l3++) {
+                            for (int m3 = -l3; m3 <= l3; m3++, lm3++) {
+
+                                T gc = get__(l1, l3, l2, m1, m3, m2);
+                                if (std::abs(gc) > 1e-12) {
+                                    g12.lm1  = lm1;
+                                    g12.lm2  = lm2;
+                                    g12.coef = gc;
+                                    gaunt_packed_L1_L2_[lm3].push_back(g12);
+
+                                    g3.lm3  = lm3;
+                                    g3.l3   = l3;
+                                    g3.coef = gc;
+                                    gaunt_packed_L3_(lm1, lm2).push_back(g3);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// Return number of non-zero Gaunt coefficients for a given lm3.
+    inline int num_gaunt(int lm3) const
+    {
+        assert(lm3 >= 0 && lm3 < lmmax3_);
+        return static_cast<int>(gaunt_packed_L1_L2_[lm3].size());
+    }
+
+    /// Return a structure containing {lm1, lm2, coef} for a given lm3 and index.
+    /** Example:
+     *  \code{.cpp}
+     *  for (int lm3 = 0; lm3 < lmmax3; lm3++)
+     *  {
+     *      for (int i = 0; i < gaunt_coefs.num_gaunt(lm3); i++) {
+     *          int lm1 = gaunt_coefs.gaunt(lm3, i).lm1;
+     *          int lm2 = gaunt_coefs.gaunt(lm3, i).lm2;
+     *          double coef = gaunt_coefs.gaunt(lm3, i).coef;
+     *
+     *          // do something with lm1,lm2,lm3 and coef
+     *      }
+     *  }
+     *  \endcode
+     */
+    inline gaunt_L1_L2<T> const& gaunt(int lm3, int idx) const
+    {
+        assert(lm3 >= 0 && lm3 < lmmax3_);
+        assert(idx >= 0 && idx < (int)gaunt_packed_L1_L2_[lm3].size());
+        return gaunt_packed_L1_L2_[lm3][idx];
+    }
+
+    /// Return number of non-zero Gaunt coefficients for a combination of lm1 and lm2.
+    inline int num_gaunt(int lm1, int lm2) const
+    {
+        return static_cast<int>(gaunt_packed_L3_(lm1, lm2).size());
+    }
+
+    /// Return a structure containing {lm3, coef} for a given lm1, lm2 and index
+    inline gaunt_L3<T> const& gaunt(int lm1, int lm2, int idx) const
+    {
+        return gaunt_packed_L3_(lm1, lm2)[idx];
+    }
+
+    /// Return a sum over L3 (lm3) index of Gaunt coefficients and a complex vector.
+    /** The following operation is performed:
+     *  \f[
+     *      \sum_{\ell_3 m_3} \langle \ell_1 m_1 | \ell_3 m_3 | \ell_2 m_2 \rangle v_{\ell_3 m_3}
+     *  \f]
+     *  Result is assumed to be complex.
+     */
+    inline double_complex sum_L3_gaunt(int lm1, int lm2, double_complex const* v) const
+    {
+        double_complex zsum(0, 0);
+        for (int k = 0; k < (int)gaunt_packed_L3_(lm1, lm2).size(); k++) {
+            zsum += gaunt_packed_L3_(lm1, lm2)[k].coef * v[gaunt_packed_L3_(lm1, lm2)[k].lm3];
+        }
+        return zsum;
+    }
+
+    /// Return a sum over L3 (lm3) index of Gaunt coefficients and a real vector.
+    /** The following operation is performed:
+     *  \f[
+     *      \sum_{\ell_3 m_3} \langle \ell_1 m_1 | \ell_3 m_3 | \ell_2 m_2 \rangle v_{\ell_3 m_3}
+     *  \f]
+     *  Result is assumed to be of the same type as Gaunt coefficients.
+     */
+    inline T sum_L3_gaunt(int lm1, int lm2, double const* v) const
+    {
+        T sum = 0;
+        for (int k = 0; k < (int)gaunt_packed_L3_(lm1, lm2).size(); k++) {
+            sum += gaunt_packed_L3_(lm1, lm2)[k].coef * v[gaunt_packed_L3_(lm1, lm2)[k].lm3];
+        }
+        return sum;
+    }
+
+    /// Return vector of non-zero Gaunt coefficients for a given combination of lm1 and lm2
+    inline std::vector<gaunt_L3<T>> const& gaunt_vector(int lm1, int lm2) const
+    {
+        return gaunt_packed_L3_(lm1, lm2);
+    }
+};
+
+}; // namespace sirius
+
+#endif
diff --git a/src/lebedev_grids.hpp b/src/SHT/lebedev_grids.hpp
similarity index 100%
rename from src/lebedev_grids.hpp
rename to src/SHT/lebedev_grids.hpp
diff --git a/src/SHT/sht.hpp b/src/SHT/sht.hpp
new file mode 100644
index 000000000..51335a83a
--- /dev/null
+++ b/src/SHT/sht.hpp
@@ -0,0 +1,1296 @@
+// Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
+// the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
+//    following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
+//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/** \file sht.hpp
+ *
+ *  \brief Contains declaration and particular implementation of sirius::SHT class.
+ */
+
+#ifndef __SHT_HPP__
+#define __SHT_HPP__
+
+#include <math.h>
+#include <stddef.h>
+#include <gsl/gsl_sf_coupling.h>
+#include <gsl/gsl_sf_legendre.h>
+#include <string.h>
+#include <vector>
+#include <algorithm>
+
+#include "typedefs.h"
+#include "linalg.hpp"
+#include "lebedev_grids.hpp"
+
+namespace sirius {
+
+/// Spherical harmonics transformations and related oprtations.
+/** This class is responsible for the generation of complex and real spherical harmonics, generation of transformation
+ *  matrices, transformation between spectral and real-space representations, generation of Gaunt and Clebsch-Gordan
+ *  coefficients and calculation of spherical harmonic derivatives */
+class SHT // TODO: better name
+{
+  private:
+    /// Maximum \f$ \ell \f$ of spherical harmonics.
+    int lmax_;
+
+    /// Maximum number of \f$ \ell, m \f$ components.
+    int lmmax_;
+
+    /// Number of real-space \f$ (\theta, \phi) \f$ points on the sphere.
+    int num_points_;
+
+    /// Cartesian coordinates of points (normalized to 1).
+    mdarray<double, 2> coord_;
+
+    /// \f$ (\theta, \phi) \f$ angles of points.
+    mdarray<double, 2> tp_;
+
+    /// Point weights.
+    std::vector<double> w_;
+
+    /// Backward transformation from Ylm to spherical coordinates.
+    mdarray<double_complex, 2> ylm_backward_;
+
+    /// Forward transformation from spherical coordinates to Ylm.
+    mdarray<double_complex, 2> ylm_forward_;
+
+    /// Backward transformation from Rlm to spherical coordinates.
+    mdarray<double, 2> rlm_backward_;
+
+    /// Forward transformation from spherical coordinates to Rlm.
+    mdarray<double, 2> rlm_forward_;
+
+    /// Type of spherical grid (0: Lebedev-Laikov, 1: uniform).
+    int mesh_type_{0};
+
+  public:
+    /// Default constructor.
+    SHT(int lmax__)
+        : lmax_(lmax__)
+    {
+        lmmax_ = (lmax_ + 1) * (lmax_ + 1);
+
+        if (mesh_type_ == 0) {
+            num_points_ = Lebedev_Laikov_npoint(2 * lmax_);
+        }
+        if (mesh_type_ == 1) {
+            num_points_ = lmmax_;
+        }
+
+        std::vector<double> x(num_points_);
+        std::vector<double> y(num_points_);
+        std::vector<double> z(num_points_);
+
+        coord_ = mdarray<double, 2>(3, num_points_);
+
+        tp_ = mdarray<double, 2>(2, num_points_);
+
+        w_.resize(num_points_);
+
+        if (mesh_type_ == 0)
+            Lebedev_Laikov_sphere(num_points_, &x[0], &y[0], &z[0], &w_[0]);
+        if (mesh_type_ == 1)
+            uniform_coverage();
+
+        ylm_backward_ = mdarray<double_complex, 2>(lmmax_, num_points_);
+
+        ylm_forward_ = mdarray<double_complex, 2>(num_points_, lmmax_);
+
+        rlm_backward_ = mdarray<double, 2>(lmmax_, num_points_);
+
+        rlm_forward_ = mdarray<double, 2>(num_points_, lmmax_);
+
+        for (int itp = 0; itp < num_points_; itp++) {
+            if (mesh_type_ == 0) {
+                coord_(0, itp) = x[itp];
+                coord_(1, itp) = y[itp];
+                coord_(2, itp) = z[itp];
+
+                auto vs     = spherical_coordinates(vector3d<double>(x[itp], y[itp], z[itp]));
+                tp_(0, itp) = vs[1];
+                tp_(1, itp) = vs[2];
+                spherical_harmonics(lmax_, vs[1], vs[2], &ylm_backward_(0, itp));
+                spherical_harmonics(lmax_, vs[1], vs[2], &rlm_backward_(0, itp));
+                for (int lm = 0; lm < lmmax_; lm++) {
+                    ylm_forward_(itp, lm) = std::conj(ylm_backward_(lm, itp)) * w_[itp] * fourpi;
+                    rlm_forward_(itp, lm) = rlm_backward_(lm, itp) * w_[itp] * fourpi;
+                }
+            }
+            if (mesh_type_ == 1) {
+                double t = tp_(0, itp);
+                double p = tp_(1, itp);
+
+                coord_(0, itp) = sin(t) * cos(p);
+                coord_(1, itp) = sin(t) * sin(p);
+                coord_(2, itp) = cos(t);
+
+                spherical_harmonics(lmax_, t, p, &ylm_backward_(0, itp));
+                spherical_harmonics(lmax_, t, p, &rlm_backward_(0, itp));
+
+                for (int lm = 0; lm < lmmax_; lm++) {
+                    ylm_forward_(lm, itp) = ylm_backward_(lm, itp);
+                    rlm_forward_(lm, itp) = rlm_backward_(lm, itp);
+                }
+            }
+        }
+
+        if (mesh_type_ == 1) {
+            linalg<CPU>::geinv(lmmax_, ylm_forward_);
+            linalg<CPU>::geinv(lmmax_, rlm_forward_);
+        }
+
+#if (__VERIFICATION > 0)
+        {
+            double dr = 0;
+            double dy = 0;
+
+            for (int lm = 0; lm < lmmax_; lm++) {
+                for (int lm1 = 0; lm1 < lmmax_; lm1++) {
+                    double         t = 0;
+                    double_complex zt(0, 0);
+                    for (int itp = 0; itp < num_points_; itp++) {
+                        zt += ylm_forward_(itp, lm) * ylm_backward_(lm1, itp);
+                        t += rlm_forward_(itp, lm) * rlm_backward_(lm1, itp);
+                    }
+
+                    if (lm == lm1) {
+                        zt -= 1.0;
+                        t -= 1.0;
+                    }
+                    dr += std::abs(t);
+                    dy += std::abs(zt);
+                }
+            }
+            dr = dr / lmmax_ / lmmax_;
+            dy = dy / lmmax_ / lmmax_;
+
+            if (dr > 1e-15 || dy > 1e-15) {
+                std::stringstream s;
+                s << "spherical mesh error is too big" << std::endl
+                  << "  real spherical integration error " << dr << std::endl
+                  << "  complex spherical integration error " << dy;
+                WARNING(s.str())
+            }
+
+            std::vector<double> flm(lmmax_);
+            std::vector<double> ftp(num_points_);
+            for (int lm = 0; lm < lmmax_; lm++) {
+                std::memset(&flm[0], 0, lmmax_ * sizeof(double));
+                flm[lm] = 1.0;
+                backward_transform(lmmax_, &flm[0], 1, lmmax_, &ftp[0]);
+                forward_transform(&ftp[0], 1, lmmax_, lmmax_, &flm[0]);
+                flm[lm] -= 1.0;
+
+                double t = 0.0;
+                for (int lm1 = 0; lm1 < lmmax_; lm1++)
+                    t += std::abs(flm[lm1]);
+
+                t /= lmmax_;
+
+                if (t > 1e-15) {
+                    std::stringstream s;
+                    s << "test of backward / forward real SHT failed" << std::endl
+                      << "  total error " << t;
+                    WARNING(s.str());
+                }
+            }
+        }
+#endif
+    }
+
+    /// Perform a backward transformation from spherical harmonics to spherical coordinates.
+    /** \f[
+     *      f(\theta, \phi, r) = \sum_{\ell m} f_{\ell m}(r) Y_{\ell m}(\theta, \phi)
+     *  \f]
+     *
+     *  \param [in] ld Size of leading dimension of flm.
+     *  \param [in] flm Raw pointer to \f$ f_{\ell m}(r) \f$.
+     *  \param [in] nr Number of radial points.
+     *  \param [in] lmmax Maximum number of lm- harmonics to take into sum.
+     *  \param [out] ftp Raw pointer to \f$ f(\theta, \phi, r) \f$.
+     */
+    template <typename T>
+    void backward_transform(int ld, T const* flm, int nr, int lmmax, T* ftp);
+
+    /// Perform a forward transformation from spherical coordinates to spherical harmonics.
+    /** \f[
+     *      f_{\ell m}(r) = \iint  f(\theta, \phi, r) Y_{\ell m}^{*}(\theta, \phi) \sin \theta d\phi d\theta =
+     *        \sum_{i} f(\theta_i, \phi_i, r) Y_{\ell m}^{*}(\theta_i, \phi_i) w_i
+     *  \f]
+     *
+     *  \param [in] ftp Raw pointer to \f$ f(\theta, \phi, r) \f$.
+     *  \param [in] nr Number of radial points.
+     *  \param [in] lmmax Maximum number of lm- coefficients to generate.
+     *  \param [in] ld Size of leading dimension of flm.
+     *  \param [out] flm Raw pointer to \f$ f_{\ell m}(r) \f$.
+     */
+    template <typename T>
+    void forward_transform(T const* ftp, int nr, int lmmax, int ld, T* flm);
+
+    /// Convert form Rlm to Ylm representation.
+    static void convert(int lmax__, double const* f_rlm__, double_complex* f_ylm__)
+    {
+        int lm = 0;
+        for (int l = 0; l <= lmax__; l++) {
+            for (int m = -l; m <= l; m++) {
+                if (m == 0) {
+                    f_ylm__[lm] = f_rlm__[lm];
+                } else {
+                    int lm1     = utils::lm(l, -m);
+                    f_ylm__[lm] = ylm_dot_rlm(l, m, m) * f_rlm__[lm] + ylm_dot_rlm(l, m, -m) * f_rlm__[lm1];
+                }
+                lm++;
+            }
+        }
+    }
+
+    /// Convert from Ylm to Rlm representation.
+    static void convert(int lmax__, double_complex const* f_ylm__, double* f_rlm__)
+    {
+        int lm = 0;
+        for (int l = 0; l <= lmax__; l++) {
+            for (int m = -l; m <= l; m++) {
+                if (m == 0) {
+                    f_rlm__[lm] = std::real(f_ylm__[lm]);
+                } else {
+                    int lm1     = utils::lm(l, -m);
+                    f_rlm__[lm] = std::real(rlm_dot_ylm(l, m, m) * f_ylm__[lm] + rlm_dot_ylm(l, m, -m) * f_ylm__[lm1]);
+                }
+                lm++;
+            }
+        }
+    }
+
+    //void rlm_forward_iterative_transform(double *ftp__, int lmmax, int ncol, double* flm)
+    //{
+    //    Timer t("sirius::SHT::rlm_forward_iterative_transform");
+    //
+    //    assert(lmmax <= lmmax_);
+
+    //    mdarray<double, 2> ftp(ftp__, num_points_, ncol);
+    //    mdarray<double, 2> ftp1(num_points_, ncol);
+    //
+    //    blas<cpu>::gemm(1, 0, lmmax, ncol, num_points_, 1.0, &rlm_forward_(0, 0), num_points_, &ftp(0, 0), num_points_, 0.0,
+    //                    flm, lmmax);
+    //
+    //    for (int i = 0; i < 2; i++)
+    //    {
+    //        rlm_backward_transform(flm, lmmax, ncol, &ftp1(0, 0));
+    //        double tdiff = 0.0;
+    //        for (int ir = 0; ir < ncol; ir++)
+    //        {
+    //            for (int itp = 0; itp < num_points_; itp++)
+    //            {
+    //                ftp1(itp, ir) = ftp(itp, ir) - ftp1(itp, ir);
+    //                //tdiff += fabs(ftp1(itp, ir));
+    //            }
+    //        }
+    //
+    //        for (int itp = 0; itp < num_points_; itp++)
+    //        {
+    //            tdiff += fabs(ftp1(itp, ncol - 1));
+    //        }
+    //        std::cout << "iter : " << i << " avg. MT diff = " << tdiff / num_points_ << std::endl;
+    //        blas<cpu>::gemm(1, 0, lmmax, ncol, num_points_, 1.0, &rlm_forward_(0, 0), num_points_, &ftp1(0, 0), num_points_, 1.0,
+    //                        flm, lmmax);
+    //    }
+    //}
+
+    /// Transform Cartesian coordinates [x,y,z] to spherical coordinates [r,theta,phi]
+    static vector3d<double> spherical_coordinates(vector3d<double> vc)
+    {
+        vector3d<double> vs;
+
+        const double eps{1e-12};
+
+        vs[0] = vc.length();
+
+        if (vs[0] <= eps) {
+            vs[1] = 0.0;
+            vs[2] = 0.0;
+        } else {
+            vs[1] = std::acos(vc[2] / vs[0]); // theta = cos^{-1}(z/r)
+
+            if (std::abs(vc[0]) > eps || std::abs(vc[1]) > eps) {
+                vs[2] = std::atan2(vc[1], vc[0]); // phi = tan^{-1}(y/x)
+                if (vs[2] < 0.0) {
+                    vs[2] += twopi;
+                }
+            } else {
+                vs[2] = 0.0;
+            }
+        }
+
+        return vs;
+    }
+
+    /// Generate complex spherical harmonics Ylm
+    static void spherical_harmonics(int lmax, double theta, double phi, double_complex* ylm)
+    {
+        double x = std::cos(theta);
+
+        std::vector<double> result_array(lmax + 1);
+
+        for (int l = 0; l <= lmax; l++) {
+            for (int m = 0; m <= l; m++) {
+                double_complex z     = std::exp(double_complex(0.0, m * phi));
+                ylm[utils::lm(l, m)] = gsl_sf_legendre_sphPlm(l, m, x) * z;
+                if (m % 2) {
+                    ylm[utils::lm(l, -m)] = -std::conj(ylm[utils::lm(l, m)]);
+                } else {
+                    ylm[utils::lm(l, -m)] = std::conj(ylm[utils::lm(l, m)]);
+                }
+            }
+        }
+    }
+
+    /// Generate real spherical harmonics Rlm
+    /** Mathematica code:
+     *  \verbatim
+     *  R[l_, m_, th_, ph_] :=
+     *   If[m > 0, std::sqrt[2]*ComplexExpand[Re[SphericalHarmonicY[l, m, th, ph]]],
+     *   If[m < 0, std::sqrt[2]*ComplexExpand[Im[SphericalHarmonicY[l, m, th, ph]]],
+     *   If[m == 0, ComplexExpand[Re[SphericalHarmonicY[l, 0, th, ph]]]]]]
+     *  \endverbatim
+     */
+    static void spherical_harmonics(int lmax, double theta, double phi, double* rlm)
+    {
+        int lmmax = (lmax + 1) * (lmax + 1);
+
+        std::vector<double_complex> ylm(lmmax);
+        spherical_harmonics(lmax, theta, phi, &ylm[0]);
+
+        double const t = std::sqrt(2.0);
+
+        rlm[0] = y00;
+
+        for (int l = 1; l <= lmax; l++) {
+            for (int m = -l; m < 0; m++) {
+                rlm[utils::lm(l, m)] = t * ylm[utils::lm(l, m)].imag();
+            }
+
+            rlm[utils::lm(l, 0)] = ylm[utils::lm(l, 0)].real();
+
+            for (int m = 1; m <= l; m++) {
+                rlm[utils::lm(l, m)] = t * ylm[utils::lm(l, m)].real();
+            }
+        }
+    }
+
+    /// Generate real or complex spherical harmonics for a given Cartesian vectors.
+    template <typename T>
+    static std::vector<T> spherical_harmonics(int lmax__, vector3d<double> vc__)
+    {
+        auto rtp = spherical_coordinates(vc__);
+
+        std::vector<T> v(utils::lmmax(lmax__));
+        spherical_harmonics(lmax__, rtp[1], rtp[2], v.data());
+        return std::move(v);
+    }
+
+    /// Compute element of the transformation matrix from complex to real spherical harmonics.
+    /** Real spherical harmonic can be written as a linear combination of complex harmonics:
+        \f[
+            R_{\ell m}(\theta, \phi) = \sum_{m'} a^{\ell}_{m' m}Y_{\ell m'}(\theta, \phi)
+        \f]
+        where
+        \f[
+            a^{\ell}_{m' m} = \langle Y_{\ell m'} | R_{\ell m} \rangle
+        \f]
+        which gives the name to this function.
+
+        Transformation from real to complex spherical harmonics is conjugate transpose:
+
+        \f[
+            Y_{\ell m}(\theta, \phi) = \sum_{m'} a^{\ell*}_{m m'}R_{\ell m'}(\theta, \phi)
+        \f]
+
+        Mathematica code:
+        \verbatim
+        b[m1_, m2_] :=
+         If[m1 == 0, 1,
+         If[m1 < 0 && m2 < 0, -I/Sqrt[2],
+         If[m1 > 0 && m2 < 0, (-1)^m1*I/Sqrt[2],
+         If[m1 < 0 && m2 > 0, (-1)^m2/Sqrt[2],
+         If[m1 > 0 && m2 > 0, 1/Sqrt[2]]]]]]
+
+        a[m1_, m2_] := If[Abs[m1] == Abs[m2], b[m1, m2], 0]
+
+        Rlm[l_, m_, t_, p_] := Sum[a[m1, m]*SphericalHarmonicY[l, m1, t, p], {m1, -l, l}]
+        \endverbatim
+     */
+    static inline double_complex ylm_dot_rlm(int l, int m1, int m2)
+    {
+        double const isqrt2 = 1.0 / std::sqrt(2);
+
+        assert(l >= 0 && std::abs(m1) <= l && std::abs(m2) <= l);
+
+        if (!((m1 == m2) || (m1 == -m2))) {
+            return double_complex(0, 0);
+        }
+
+        if (m1 == 0) {
+            return double_complex(1, 0);
+        }
+
+        if (m1 < 0) {
+            if (m2 < 0) {
+                return -double_complex(0, isqrt2);
+            } else {
+                return std::pow(-1.0, m2) * double_complex(isqrt2, 0);
+            }
+        } else {
+            if (m2 < 0) {
+                return std::pow(-1.0, m1) * double_complex(0, isqrt2);
+            } else {
+                return double_complex(isqrt2, 0);
+            }
+        }
+    }
+
+    static inline double_complex rlm_dot_ylm(int l, int m1, int m2)
+    {
+        return std::conj(ylm_dot_rlm(l, m2, m1));
+    }
+
+    /// Gaunt coefficent of three complex spherical harmonics.
+    /**
+     *  \f[
+     *    \langle Y_{\ell_1 m_1} | Y_{\ell_2 m_2} | Y_{\ell_3 m_3} \rangle
+     *  \f]
+     */
+    static double gaunt_ylm(int l1, int l2, int l3, int m1, int m2, int m3)
+    {
+        assert(l1 >= 0);
+        assert(l2 >= 0);
+        assert(l3 >= 0);
+        assert(m1 >= -l1 && m1 <= l1);
+        assert(m2 >= -l2 && m2 <= l2);
+        assert(m3 >= -l3 && m3 <= l3);
+
+        return std::pow(-1.0, std::abs(m1)) * std::sqrt(double(2 * l1 + 1) * double(2 * l2 + 1) * double(2 * l3 + 1) / fourpi) *
+               gsl_sf_coupling_3j(2 * l1, 2 * l2, 2 * l3, 0, 0, 0) *
+               gsl_sf_coupling_3j(2 * l1, 2 * l2, 2 * l3, -2 * m1, 2 * m2, 2 * m3);
+    }
+
+    /// Gaunt coefficent of three real spherical harmonics.
+    /**
+     *  \f[
+     *    \langle R_{\ell_1 m_1} | R_{\ell_2 m_2} | R_{\ell_3 m_3} \rangle
+     *  \f]
+     */
+    static double gaunt_rlm(int l1, int l2, int l3, int m1, int m2, int m3)
+    {
+        assert(l1 >= 0);
+        assert(l2 >= 0);
+        assert(l3 >= 0);
+        assert(m1 >= -l1 && m1 <= l1);
+        assert(m2 >= -l2 && m2 <= l2);
+        assert(m3 >= -l3 && m3 <= l3);
+
+        double d = 0;
+        for (int k1 = -l1; k1 <= l1; k1++) {
+            for (int k2 = -l2; k2 <= l2; k2++) {
+                for (int k3 = -l3; k3 <= l3; k3++) {
+                    d += std::real(std::conj(SHT::ylm_dot_rlm(l1, k1, m1)) *
+                                   SHT::ylm_dot_rlm(l2, k2, m2) *
+                                   SHT::ylm_dot_rlm(l3, k3, m3)) *
+                         SHT::gaunt_ylm(l1, l2, l3, k1, k2, k3);
+                }
+            }
+        }
+        return d;
+    }
+
+    /// Gaunt coefficent of two real spherical harmonics with a complex one.
+    /**
+     *  \f[
+     *    \langle R_{\ell_1 m_1} | Y_{\ell_2 m_2} | R_{\ell_3 m_3} \rangle
+     *  \f]
+     */
+    static double gaunt_rlm_ylm_rlm(int l1, int l2, int l3, int m1, int m2, int m3)
+    {
+        assert(l1 >= 0);
+        assert(l2 >= 0);
+        assert(l3 >= 0);
+        assert(m1 >= -l1 && m1 <= l1);
+        assert(m2 >= -l2 && m2 <= l2);
+        assert(m3 >= -l3 && m3 <= l3);
+
+        double d = 0;
+        for (int k1 = -l1; k1 <= l1; k1++) {
+            for (int k3 = -l3; k3 <= l3; k3++) {
+                d += std::real(std::conj(SHT::ylm_dot_rlm(l1, k1, m1)) *
+                               SHT::ylm_dot_rlm(l3, k3, m3)) *
+                     SHT::gaunt_ylm(l1, l2, l3, k1, m2, k3);
+            }
+        }
+        return d;
+    }
+
+    /// Gaunt coefficent of two complex and one real spherical harmonics.
+    /**
+     *  \f[
+     *    \langle Y_{\ell_1 m_1} | R_{\ell_2 m_2} | Y_{\ell_3 m_3} \rangle
+     *  \f]
+     */
+    static double_complex gaunt_hybrid(int l1, int l2, int l3, int m1, int m2, int m3)
+    {
+        assert(l1 >= 0);
+        assert(l2 >= 0);
+        assert(l3 >= 0);
+        assert(m1 >= -l1 && m1 <= l1);
+        assert(m2 >= -l2 && m2 <= l2);
+        assert(m3 >= -l3 && m3 <= l3);
+
+        if (m2 == 0) {
+            return double_complex(gaunt_ylm(l1, l2, l3, m1, m2, m3), 0.0);
+        } else {
+            return (ylm_dot_rlm(l2, m2, m2) * gaunt_ylm(l1, l2, l3, m1, m2, m3) +
+                    ylm_dot_rlm(l2, -m2, m2) * gaunt_ylm(l1, l2, l3, m1, -m2, m3));
+        }
+    }
+
+    void uniform_coverage()
+    {
+        tp_(0, 0) = pi;
+        tp_(1, 0) = 0;
+
+        for (int k = 1; k < num_points_ - 1; k++) {
+            double hk = -1.0 + double(2 * k) / double(num_points_ - 1);
+            tp_(0, k) = std::acos(hk);
+            double t  = tp_(1, k - 1) + 3.80925122745582 / std::sqrt(double(num_points_)) / std::sqrt(1 - hk * hk);
+            tp_(1, k) = std::fmod(t, twopi);
+        }
+
+        tp_(0, num_points_ - 1) = 0;
+        tp_(1, num_points_ - 1) = 0;
+    }
+
+    /// Return Clebsch-Gordan coefficient.
+    /** Clebsch-Gordan coefficients arise when two angular momenta are combined into a
+     *  total angular momentum.
+     */
+    static inline double clebsch_gordan(int l1, int l2, int l3, int m1, int m2, int m3)
+    {
+        assert(l1 >= 0);
+        assert(l2 >= 0);
+        assert(l3 >= 0);
+        assert(m1 >= -l1 && m1 <= l1);
+        assert(m2 >= -l2 && m2 <= l2);
+        assert(m3 >= -l3 && m3 <= l3);
+
+        return std::pow(-1, l1 - l2 + m3) * std::sqrt(double(2 * l3 + 1)) *
+               gsl_sf_coupling_3j(2 * l1, 2 * l2, 2 * l3, 2 * m1, 2 * m2, -2 * m3);
+    }
+
+    inline double_complex ylm_backward(int lm, int itp) const
+    {
+        return ylm_backward_(lm, itp);
+    }
+
+    inline double rlm_backward(int lm, int itp) const
+    {
+        return rlm_backward_(lm, itp);
+    }
+
+    inline double coord(int x, int itp) const
+    {
+        return coord_(x, itp);
+    }
+
+    inline vector3d<double> coord(int idx__) const
+    {
+        return vector3d<double>(coord_(0, idx__), coord_(1, idx__), coord(2, idx__));
+    }
+
+    inline double theta(int idx__) const
+    {
+        return tp_(0, idx__);
+    }
+
+    inline double phi(int idx__) const
+    {
+        return tp_(1, idx__);
+    }
+
+    inline double weight(int idx__) const
+    {
+        return w_[idx__];
+    }
+
+    inline int num_points() const
+    {
+        return num_points_;
+    }
+
+    inline int lmax() const
+    {
+        return lmax_;
+    }
+
+    inline int lmmax() const
+    {
+        return lmmax_;
+    }
+
+    static void wigner_d_matrix(int l, double beta, mdarray<double, 2>& d_mtrx__)
+    {
+        long double cos_b2 = std::cos((long double)beta / 2.0L);
+        long double sin_b2 = std::sin((long double)beta / 2.0L);
+
+        for (int m1 = -l; m1 <= l; m1++) {
+            for (int m2 = -l; m2 <= l; m2++) {
+                long double d = 0;
+                for (int j = 0; j <= std::min(l + m1, l - m2); j++) {
+                    if ((l - m2 - j) >= 0 && (l + m1 - j) >= 0 && (j + m2 - m1) >= 0) {
+                        long double g = (std::sqrt(utils::factorial<long double>(l + m1)) / utils::factorial<long double>(l - m2 - j)) *
+                                        (std::sqrt(utils::factorial<long double>(l - m1)) / utils::factorial<long double>(l + m1 - j)) *
+                                        (std::sqrt(utils::factorial<long double>(l - m2)) / utils::factorial<long double>(j + m2 - m1)) *
+                                        (std::sqrt(utils::factorial<long double>(l + m2)) / utils::factorial<long double>(j));
+                        d += g * std::pow(-1, j) * std::pow(cos_b2, 2 * l + m1 - m2 - 2 * j) * std::pow(sin_b2, 2 * j + m2 - m1);
+                    }
+                }
+                d_mtrx__(m1 + l, m2 + l) = (double)d;
+            }
+        }
+    }
+
+    static void rotation_matrix_l(int l, vector3d<double> euler_angles, int proper_rotation,
+                                  double_complex* rot_mtrx__, int ld)
+    {
+        mdarray<double_complex, 2> rot_mtrx(rot_mtrx__, ld, 2 * l + 1);
+
+        mdarray<double, 2> d_mtrx(2 * l + 1, 2 * l + 1);
+        wigner_d_matrix(l, euler_angles[1], d_mtrx);
+
+        for (int m1 = -l; m1 <= l; m1++) {
+            for (int m2 = -l; m2 <= l; m2++) {
+                rot_mtrx(m1 + l, m2 + l) = std::exp(double_complex(0, -euler_angles[0] * m1 - euler_angles[2] * m2)) *
+                                           d_mtrx(m1 + l, m2 + l) * std::pow(proper_rotation, l);
+            }
+        }
+    }
+
+    static void rotation_matrix_l(int l, vector3d<double> euler_angles, int proper_rotation,
+                                  double* rot_mtrx__, int ld)
+    {
+        mdarray<double, 2>         rot_mtrx_rlm(rot_mtrx__, ld, 2 * l + 1);
+        mdarray<double_complex, 2> rot_mtrx_ylm(2 * l + 1, 2 * l + 1);
+
+        mdarray<double, 2> d_mtrx(2 * l + 1, 2 * l + 1);
+        wigner_d_matrix(l, euler_angles[1], d_mtrx);
+
+        for (int m1 = -l; m1 <= l; m1++) {
+            for (int m2 = -l; m2 <= l; m2++) {
+                rot_mtrx_ylm(m1 + l, m2 + l) = std::exp(double_complex(0, -euler_angles[0] * m1 - euler_angles[2] * m2)) *
+                                               d_mtrx(m1 + l, m2 + l) * std::pow(proper_rotation, l);
+            }
+        }
+        for (int m1 = -l; m1 <= l; m1++) {
+            auto i13 = (m1 == 0) ? std::vector<int>({0}) : std::vector<int>({-m1, m1});
+
+            for (int m2 = -l; m2 <= l; m2++) {
+                auto i24 = (m2 == 0) ? std::vector<int>({0}) : std::vector<int>({-m2, m2});
+
+                for (int m3 : i13) {
+                    for (int m4 : i24) {
+                        rot_mtrx_rlm(m1 + l, m2 + l) += std::real(rlm_dot_ylm(l, m1, m3) *
+                                                                  rot_mtrx_ylm(m3 + l, m4 + l) *
+                                                                  ylm_dot_rlm(l, m4, m2));
+                    }
+                }
+            }
+        }
+    }
+
+    template <typename T>
+    static void rotation_matrix(int              lmax,
+                                vector3d<double> euler_angles,
+                                int              proper_rotation,
+                                mdarray<T, 2>&   rotm)
+    {
+        rotm.zero();
+
+        for (int l = 0; l <= lmax; l++) {
+            rotation_matrix_l(l, euler_angles, proper_rotation, &rotm(l * l, l * l), rotm.ld());
+        }
+    }
+
+    /// Compute derivative of real-spherical harmonic with respect to theta angle.
+    static void dRlm_dtheta(int lmax, double theta, double phi, mdarray<double, 1>& data)
+    {
+        assert(lmax <= 8);
+
+        data[0] = 0;
+
+        if (lmax == 0)
+            return;
+
+        auto cos_theta = SHT::cosxn(lmax, theta);
+        auto sin_theta = SHT::sinxn(lmax, theta);
+        auto cos_phi   = SHT::cosxn(lmax, phi);
+        auto sin_phi   = SHT::sinxn(lmax, phi);
+
+        data[1] = -(std::sqrt(3 / pi) * cos_theta[0] * sin_phi[0]) / 2.;
+
+        data[2] = -(std::sqrt(3 / pi) * sin_theta[0]) / 2.;
+
+        data[3] = -(std::sqrt(3 / pi) * cos_phi[0] * cos_theta[0]) / 2.;
+
+        if (lmax == 1)
+            return;
+
+        data[4] = -(std::sqrt(15 / pi) * cos_phi[0] * cos_theta[0] * sin_phi[0] * sin_theta[0]);
+
+        data[5] = -(std::sqrt(15 / pi) * cos_theta[1] * sin_phi[0]) / 2.;
+
+        data[6] = (-3 * std::sqrt(5 / pi) * cos_theta[0] * sin_theta[0]) / 2.;
+
+        data[7] = -(std::sqrt(15 / pi) * cos_phi[0] * cos_theta[1]) / 2.;
+
+        data[8] = (std::sqrt(15 / pi) * cos_phi[1] * sin_theta[1]) / 4.;
+
+        if (lmax == 2)
+            return;
+
+        data[9] = (-3 * std::sqrt(35 / (2. * pi)) * cos_theta[0] * sin_phi[2] * std::pow(sin_theta[0], 2)) / 4.;
+
+        data[10] = (std::sqrt(105 / pi) * sin_phi[1] * (sin_theta[0] - 3 * sin_theta[2])) / 16.;
+
+        data[11] = -(std::sqrt(21 / (2. * pi)) * (cos_theta[0] + 15 * cos_theta[2]) * sin_phi[0]) / 16.;
+
+        data[12] = (-3 * std::sqrt(7 / pi) * (sin_theta[0] + 5 * sin_theta[2])) / 16.;
+
+        data[13] = -(std::sqrt(21 / (2. * pi)) * cos_phi[0] * (cos_theta[0] + 15 * cos_theta[2])) / 16.;
+
+        data[14] = -(std::sqrt(105 / pi) * cos_phi[1] * (sin_theta[0] - 3 * sin_theta[2])) / 16.;
+
+        data[15] = (-3 * std::sqrt(35 / (2. * pi)) * cos_phi[2] * cos_theta[0] * std::pow(sin_theta[0], 2)) / 4.;
+
+        if (lmax == 3)
+            return;
+
+        data[16] = (-3 * std::sqrt(35 / pi) * cos_theta[0] * sin_phi[3] * std::pow(sin_theta[0], 3)) / 4.;
+
+        data[17] = (-3 * std::sqrt(35 / (2. * pi)) * (1 + 2 * cos_theta[1]) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 4.;
+
+        data[18] = (3 * std::sqrt(5 / pi) * sin_phi[1] * (2 * sin_theta[1] - 7 * sin_theta[3])) / 16.;
+
+        data[19] = (-3 * std::sqrt(5 / (2. * pi)) * (cos_theta[1] + 7 * cos_theta[3]) * sin_phi[0]) / 8.;
+
+        data[20] = (-15 * (2 * sin_theta[1] + 7 * sin_theta[3])) / (32. * std::sqrt(pi));
+
+        data[21] = (-3 * std::sqrt(5 / (2. * pi)) * cos_phi[0] * (cos_theta[1] + 7 * cos_theta[3])) / 8.;
+
+        data[22] = (3 * std::sqrt(5 / pi) * cos_phi[1] * (-2 * sin_theta[1] + 7 * sin_theta[3])) / 16.;
+
+        data[23] = (-3 * std::sqrt(35 / (2. * pi)) * cos_phi[2] * (1 + 2 * cos_theta[1]) * std::pow(sin_theta[0], 2)) / 4.;
+
+        data[24] = (3 * std::sqrt(35 / pi) * cos_phi[3] * cos_theta[0] * std::pow(sin_theta[0], 3)) / 4.;
+
+        if (lmax == 4)
+            return;
+
+        data[25] = (-15 * std::sqrt(77 / (2. * pi)) * cos_theta[0] * sin_phi[4] * std::pow(sin_theta[0], 4)) / 16.;
+
+        data[26] = (-3 * std::sqrt(385 / pi) * (3 + 5 * cos_theta[1]) * sin_phi[3] * std::pow(sin_theta[0], 3)) / 32.;
+
+        data[27] = (-3 * std::sqrt(385 / (2. * pi)) * cos_theta[0] * (1 + 15 * cos_theta[1]) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 32.;
+
+        data[28] = (std::sqrt(1155 / pi) * sin_phi[1] * (2 * sin_theta[0] + 3 * (sin_theta[2] - 5 * sin_theta[4]))) / 128.;
+
+        data[29] = -(std::sqrt(165 / pi) * (2 * cos_theta[0] + 21 * (cos_theta[2] + 5 * cos_theta[4])) * sin_phi[0]) / 256.;
+
+        data[30] = (-15 * std::sqrt(11 / pi) * (2 * sin_theta[0] + 7 * (sin_theta[2] + 3 * sin_theta[4]))) / 256.;
+
+        data[31] = -(std::sqrt(165 / pi) * cos_phi[0] * (2 * cos_theta[0] + 21 * (cos_theta[2] + 5 * cos_theta[4]))) / 256.;
+
+        data[32] = (std::sqrt(1155 / pi) * cos_phi[1] * (-2 * sin_theta[0] - 3 * sin_theta[2] + 15 * sin_theta[4])) / 128.;
+
+        data[33] = (-3 * std::sqrt(385 / (2. * pi)) * cos_phi[2] * (17 * cos_theta[0] + 15 * cos_theta[2]) * std::pow(sin_theta[0], 2)) / 64.;
+
+        data[34] = (3 * std::sqrt(385 / pi) * cos_phi[3] * (3 + 5 * cos_theta[1]) * std::pow(sin_theta[0], 3)) / 32.;
+
+        data[35] = (-15 * std::sqrt(77 / (2. * pi)) * cos_phi[4] * cos_theta[0] * std::pow(sin_theta[0], 4)) / 16.;
+
+        if (lmax == 5)
+            return;
+
+        data[36] = (-3 * std::sqrt(3003 / (2. * pi)) * cos_theta[0] * sin_phi[5] * std::pow(sin_theta[0], 5)) / 16.;
+
+        data[37] = (-3 * std::sqrt(1001 / (2. * pi)) * (2 + 3 * cos_theta[1]) * sin_phi[4] * std::pow(sin_theta[0], 4)) / 16.;
+
+        data[38] = (-3 * std::sqrt(91 / pi) * cos_theta[0] * (7 + 33 * cos_theta[1]) * sin_phi[3] * std::pow(sin_theta[0], 3)) / 32.;
+
+        data[39] = (-3 * std::sqrt(1365 / (2. * pi)) * (7 + 14 * cos_theta[1] + 11 * cos_theta[3]) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 64.;
+
+        data[40] = (std::sqrt(1365 / (2. * pi)) * sin_phi[1] * (17 * sin_theta[1] + 12 * sin_theta[3] - 99 * sin_theta[5])) / 512.;
+
+        data[41] = -(std::sqrt(273 / pi) * (5 * cos_theta[1] + 24 * cos_theta[3] + 99 * cos_theta[5]) * sin_phi[0]) / 256.;
+
+        data[42] = (-21 * std::sqrt(13 / pi) * (5 * sin_theta[1] + 12 * sin_theta[3] + 33 * sin_theta[5])) / 512.;
+
+        data[43] = -(std::sqrt(273 / pi) * cos_phi[0] * (5 * cos_theta[1] + 24 * cos_theta[3] + 99 * cos_theta[5])) / 256.;
+
+        data[44] = (std::sqrt(1365 / (2. * pi)) * cos_phi[1] * (-17 * sin_theta[1] - 12 * sin_theta[3] + 99 * sin_theta[5])) / 512.;
+
+        data[45] = (-3 * std::sqrt(1365 / (2. * pi)) * cos_phi[2] * (7 + 14 * cos_theta[1] + 11 * cos_theta[3]) * std::pow(sin_theta[0], 2)) / 64.;
+
+        data[46] = (3 * std::sqrt(91 / pi) * cos_phi[3] * (47 * cos_theta[0] + 33 * cos_theta[2]) * std::pow(sin_theta[0], 3)) / 64.;
+
+        data[47] = (-3 * std::sqrt(1001 / (2. * pi)) * cos_phi[4] * (2 + 3 * cos_theta[1]) * std::pow(sin_theta[0], 4)) / 16.;
+
+        data[48] = (3 * std::sqrt(3003 / (2. * pi)) * cos_phi[5] * cos_theta[0] * std::pow(sin_theta[0], 5)) / 16.;
+
+        if (lmax == 6)
+            return;
+
+        data[49] = (-21 * std::sqrt(715 / pi) * cos_theta[0] * sin_phi[6] * std::pow(sin_theta[0], 6)) / 64.;
+
+        data[50] = (-3 * std::sqrt(5005 / (2. * pi)) * (5 + 7 * cos_theta[1]) * sin_phi[5] * std::pow(sin_theta[0], 5)) / 64.;
+
+        data[51] = (-3 * std::sqrt(385 / pi) * cos_theta[0] * (29 + 91 * cos_theta[1]) * sin_phi[4] * std::pow(sin_theta[0], 4)) / 128.;
+
+        data[52] = (-3 * std::sqrt(385 / pi) * (81 + 148 * cos_theta[1] + 91 * cos_theta[3]) * sin_phi[3] * std::pow(sin_theta[0], 3)) / 256.;
+
+        data[53] = (-3 * std::sqrt(35 / pi) * cos_theta[0] * (523 + 396 * cos_theta[1] + 1001 * cos_theta[3]) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 512.;
+
+        data[54] = (3 * std::sqrt(35 / (2. * pi)) * sin_phi[1] * (75 * sin_theta[0] + 171 * sin_theta[2] + 55 * sin_theta[4] - 1001 * sin_theta[6])) / 2048.;
+
+        data[55] = -(std::sqrt(105 / pi) * (25 * cos_theta[0] + 243 * cos_theta[2] + 825 * cos_theta[4] + 3003 * cos_theta[6]) * sin_phi[0]) / 4096.;
+
+        data[56] = (-7 * std::sqrt(15 / pi) * (25 * sin_theta[0] + 81 * sin_theta[2] + 165 * sin_theta[4] + 429 * sin_theta[6])) / 2048.;
+
+        data[57] = -(std::sqrt(105 / pi) * cos_phi[0] * (25 * cos_theta[0] + 243 * cos_theta[2] + 825 * cos_theta[4] + 3003 * cos_theta[6])) / 4096.;
+
+        data[58] = (-3 * std::sqrt(35 / (2. * pi)) * cos_phi[1] * (75 * sin_theta[0] + 171 * sin_theta[2] + 55 * sin_theta[4] - 1001 * sin_theta[6])) / 2048.;
+
+        data[59] = (-3 * std::sqrt(35 / pi) * cos_phi[2] * (1442 * cos_theta[0] + 1397 * cos_theta[2] + 1001 * cos_theta[4]) * std::pow(sin_theta[0], 2)) / 1024.;
+
+        data[60] = (3 * std::sqrt(385 / pi) * cos_phi[3] * (81 + 148 * cos_theta[1] + 91 * cos_theta[3]) * std::pow(sin_theta[0], 3)) / 256.;
+
+        data[61] = (-3 * std::sqrt(385 / pi) * cos_phi[4] * (149 * cos_theta[0] + 91 * cos_theta[2]) * std::pow(sin_theta[0], 4)) / 256.;
+
+        data[62] = (3 * std::sqrt(5005 / (2. * pi)) * cos_phi[5] * (5 + 7 * cos_theta[1]) * std::pow(sin_theta[0], 5)) / 64.;
+
+        data[63] = (-21 * std::sqrt(715 / pi) * cos_phi[6] * cos_theta[0] * std::pow(sin_theta[0], 6)) / 64.;
+
+        if (lmax == 7)
+            return;
+
+        data[64] = (-3 * std::sqrt(12155 / pi) * cos_theta[0] * sin_phi[7] * std::pow(sin_theta[0], 7)) / 32.;
+
+        data[65] = (-3 * std::sqrt(12155 / pi) * (3 + 4 * cos_theta[1]) * sin_phi[6] * std::pow(sin_theta[0], 6)) / 64.;
+
+        data[66] = (-3 * std::sqrt(7293 / (2. * pi)) * cos_theta[0] * (2 + 5 * cos_theta[1]) * sin_phi[5] * std::pow(sin_theta[0], 5)) / 16.;
+
+        data[67] = (-3 * std::sqrt(17017 / pi) * (11 + 19 * cos_theta[1] + 10 * cos_theta[3]) * sin_phi[4] * std::pow(sin_theta[0], 4)) / 128.;
+
+        data[68] = (-3 * std::sqrt(1309 / pi) * cos_theta[0] * (43 + 52 * cos_theta[1] + 65 * cos_theta[3]) * sin_phi[3] * std::pow(sin_theta[0], 3)) / 128.;
+
+        data[69] = (-3 * std::sqrt(19635 / pi) * (21 + 42 * cos_theta[1] + 39 * cos_theta[3] + 26 * cos_theta[5]) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 512.;
+
+        data[70] = (-3 * std::sqrt(595 / (2. * pi)) * (-8 + 121 * cos_theta[1] + 143 * cos_theta[5]) * sin_phi[1] * sin_theta[1]) / 512.;
+
+        data[71] = (-3 * std::sqrt(17 / pi) * (35 * cos_theta[1] + 154 * cos_theta[3] + 429 * cos_theta[5] + 1430 * cos_theta[7]) * sin_phi[0]) / 2048.;
+
+        data[72] = (-9 * std::sqrt(17 / pi) * (70 * sin_theta[1] + 154 * sin_theta[3] + 286 * sin_theta[5] + 715 * sin_theta[7])) / 4096.;
+
+        data[73] = (-3 * std::sqrt(17 / pi) * cos_phi[0] * (35 * cos_theta[1] + 154 * cos_theta[3] + 429 * cos_theta[5] + 1430 * cos_theta[7])) / 2048.;
+
+        data[74] = (3 * std::sqrt(595 / (2. * pi)) * cos_phi[1] * (-16 * sin_theta[1] - 22 * sin_theta[3] + 143 * sin_theta[7])) / 1024.;
+
+        data[75] = (-3 * std::sqrt(19635 / pi) * cos_phi[2] * (21 + 42 * cos_theta[1] + 39 * cos_theta[3] + 26 * cos_theta[5]) * std::pow(sin_theta[0], 2)) / 512.;
+
+        data[76] = (3 * std::sqrt(1309 / pi) * cos_phi[3] * (138 * cos_theta[0] + 117 * cos_theta[2] + 65 * cos_theta[4]) * std::pow(sin_theta[0], 3)) / 256.;
+
+        data[77] = (-3 * std::sqrt(17017 / pi) * cos_phi[4] * (11 + 19 * cos_theta[1] + 10 * cos_theta[3]) * std::pow(sin_theta[0], 4)) / 128.;
+
+        data[78] = (3 * std::sqrt(7293 / (2. * pi)) * cos_phi[5] * (9 * cos_theta[0] + 5 * cos_theta[2]) * std::pow(sin_theta[0], 5)) / 32.;
+
+        data[79] = (-3 * std::sqrt(12155 / pi) * cos_phi[6] * (3 + 4 * cos_theta[1]) * std::pow(sin_theta[0], 6)) / 64.;
+
+        data[80] = (3 * std::sqrt(12155 / pi) * cos_phi[7] * cos_theta[0] * std::pow(sin_theta[0], 7)) / 32.;
+    }
+
+    ///  Compute derivative of real-spherical harmonic with respect to phi angle and divide by sin(theta).
+    static void dRlm_dphi_sin_theta(int lmax, double theta, double phi, mdarray<double, 1>& data)
+    {
+        assert(lmax <= 8);
+
+        data[0] = 0;
+
+        if (lmax == 0)
+            return;
+
+        auto cos_theta = SHT::cosxn(lmax, theta);
+        auto sin_theta = SHT::sinxn(lmax, theta);
+        auto cos_phi   = SHT::cosxn(lmax, phi);
+        auto sin_phi   = SHT::sinxn(lmax, phi);
+
+        data[1] = -(std::sqrt(3 / pi) * cos_phi[0]) / 2.;
+
+        data[2] = 0;
+
+        data[3] = (std::sqrt(3 / pi) * sin_phi[0]) / 2.;
+
+        if (lmax == 1)
+            return;
+
+        data[4] = -(std::sqrt(15 / pi) * cos_phi[1] * sin_theta[0]) / 2.;
+
+        data[5] = -(std::sqrt(15 / pi) * cos_phi[0] * cos_theta[0]) / 2.;
+
+        data[6] = 0;
+
+        data[7] = (std::sqrt(15 / pi) * cos_theta[0] * sin_phi[0]) / 2.;
+
+        data[8] = -(std::sqrt(15 / pi) * cos_phi[0] * sin_phi[0] * sin_theta[0]);
+
+        if (lmax == 2)
+            return;
+
+        data[9] = (-3 * std::sqrt(35 / (2. * pi)) * cos_phi[2] * std::pow(sin_theta[0], 2)) / 4.;
+
+        data[10] = -(std::sqrt(105 / pi) * cos_phi[1] * sin_theta[1]) / 4.;
+
+        data[11] = -(std::sqrt(21 / (2. * pi)) * cos_phi[0] * (3 + 5 * cos_theta[1])) / 8.;
+
+        data[12] = 0;
+
+        data[13] = (std::sqrt(21 / (2. * pi)) * (3 + 5 * cos_theta[1]) * sin_phi[0]) / 8.;
+
+        data[14] = -(std::sqrt(105 / pi) * cos_phi[0] * cos_theta[0] * sin_phi[0] * sin_theta[0]);
+
+        data[15] = (3 * std::sqrt(35 / (2. * pi)) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 4.;
+
+        if (lmax == 3)
+            return;
+
+        data[16] = (-3 * std::sqrt(35 / pi) * cos_phi[3] * std::pow(sin_theta[0], 3)) / 4.;
+
+        data[17] = (-9 * std::sqrt(35 / (2. * pi)) * cos_phi[2] * cos_theta[0] * std::pow(sin_theta[0], 2)) / 4.;
+
+        data[18] = (-3 * std::sqrt(5 / pi) * cos_phi[1] * (3 * sin_theta[0] + 7 * sin_theta[2])) / 16.;
+
+        data[19] = (-3 * std::sqrt(5 / (2. * pi)) * cos_phi[0] * (9 * cos_theta[0] + 7 * cos_theta[2])) / 16.;
+
+        data[20] = 0;
+
+        data[21] = (3 * std::sqrt(5 / (2. * pi)) * cos_theta[0] * (1 + 7 * cos_theta[1]) * sin_phi[0]) / 8.;
+
+        data[22] = (-3 * std::sqrt(5 / pi) * sin_phi[1] * (3 * sin_theta[0] + 7 * sin_theta[2])) / 16.;
+
+        data[23] = (9 * std::sqrt(35 / (2. * pi)) * cos_theta[0] * sin_phi[2] * std::pow(sin_theta[0], 2)) / 4.;
+
+        data[24] = (-3 * std::sqrt(35 / pi) * sin_phi[3] * std::pow(sin_theta[0], 3)) / 4.;
+
+        if (lmax == 4)
+            return;
+
+        data[25] = (-15 * std::sqrt(77 / (2. * pi)) * cos_phi[4] * std::pow(sin_theta[0], 4)) / 16.;
+
+        data[26] = (-3 * std::sqrt(385 / pi) * cos_phi[3] * cos_theta[0] * std::pow(sin_theta[0], 3)) / 4.;
+
+        data[27] = (-3 * std::sqrt(385 / (2. * pi)) * cos_phi[2] * (7 + 9 * cos_theta[1]) * std::pow(sin_theta[0], 2)) / 32.;
+
+        data[28] = -(std::sqrt(1155 / pi) * cos_phi[1] * (2 * sin_theta[1] + 3 * sin_theta[3])) / 32.;
+
+        data[29] = -(std::sqrt(165 / pi) * cos_phi[0] * (15 + 28 * cos_theta[1] + 21 * cos_theta[3])) / 128.;
+
+        data[30] = 0;
+
+        data[31] = (std::sqrt(165 / pi) * (15 + 28 * cos_theta[1] + 21 * cos_theta[3]) * sin_phi[0]) / 128.;
+
+        data[32] = -(std::sqrt(1155 / pi) * sin_phi[1] * (2 * sin_theta[1] + 3 * sin_theta[3])) / 32.;
+
+        data[33] = (3 * std::sqrt(385 / (2. * pi)) * (7 + 9 * cos_theta[1]) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 32.;
+
+        data[34] = (-3 * std::sqrt(385 / pi) * cos_theta[0] * sin_phi[3] * std::pow(sin_theta[0], 3)) / 4.;
+
+        data[35] = (15 * std::sqrt(77 / (2. * pi)) * sin_phi[4] * std::pow(sin_theta[0], 4)) / 16.;
+
+        if (lmax == 5)
+            return;
+
+        data[36] = (-3 * std::sqrt(3003 / (2. * pi)) * cos_phi[5] * std::pow(sin_theta[0], 5)) / 16.;
+
+        data[37] = (-15 * std::sqrt(1001 / (2. * pi)) * cos_phi[4] * cos_theta[0] * std::pow(sin_theta[0], 4)) / 16.;
+
+        data[38] = (-3 * std::sqrt(91 / pi) * cos_phi[3] * (9 + 11 * cos_theta[1]) * std::pow(sin_theta[0], 3)) / 16.;
+
+        data[39] = (-3 * std::sqrt(1365 / (2. * pi)) * cos_phi[2] * (21 * cos_theta[0] + 11 * cos_theta[2]) * std::pow(sin_theta[0], 2)) / 64.;
+
+        data[40] = -(std::sqrt(1365 / (2. * pi)) * cos_phi[1] * (10 * sin_theta[0] + 27 * sin_theta[2] + 33 * sin_theta[4])) / 256.;
+
+        data[41] = -(std::sqrt(273 / pi) * cos_phi[0] * (50 * cos_theta[0] + 45 * cos_theta[2] + 33 * cos_theta[4])) / 256.;
+
+        data[42] = 0;
+
+        data[43] = (std::sqrt(273 / pi) * cos_theta[0] * (19 + 12 * cos_theta[1] + 33 * cos_theta[3]) * sin_phi[0]) / 128.;
+
+        data[44] = -(std::sqrt(1365 / (2. * pi)) * sin_phi[1] * (10 * sin_theta[0] + 27 * sin_theta[2] + 33 * sin_theta[4])) / 256.;
+
+        data[45] = (3 * std::sqrt(1365 / (2. * pi)) * cos_theta[0] * (5 + 11 * cos_theta[1]) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 32.;
+
+        data[46] = (-3 * std::sqrt(91 / pi) * (9 + 11 * cos_theta[1]) * sin_phi[3] * std::pow(sin_theta[0], 3)) / 16.;
+
+        data[47] = (15 * std::sqrt(1001 / (2. * pi)) * cos_theta[0] * sin_phi[4] * std::pow(sin_theta[0], 4)) / 16.;
+
+        data[48] = (-3 * std::sqrt(3003 / (2. * pi)) * sin_phi[5] * std::pow(sin_theta[0], 5)) / 16.;
+
+        if (lmax == 6)
+            return;
+
+        data[49] = (-21 * std::sqrt(715 / pi) * cos_phi[6] * std::pow(sin_theta[0], 6)) / 64.;
+
+        data[50] = (-9 * std::sqrt(5005 / (2. * pi)) * cos_phi[5] * cos_theta[0] * std::pow(sin_theta[0], 5)) / 16.;
+
+        data[51] = (-15 * std::sqrt(385 / pi) * cos_phi[4] * (11 + 13 * cos_theta[1]) * std::pow(sin_theta[0], 4)) / 128.;
+
+        data[52] = (-3 * std::sqrt(385 / pi) * cos_phi[3] * (27 * cos_theta[0] + 13 * cos_theta[2]) * std::pow(sin_theta[0], 3)) / 32.;
+
+        data[53] = (-9 * std::sqrt(35 / pi) * cos_phi[2] * (189 + 308 * cos_theta[1] + 143 * cos_theta[3]) * std::pow(sin_theta[0], 2)) / 512.;
+
+        data[54] = (-3 * std::sqrt(35 / (2. * pi)) * cos_phi[1] * (75 * sin_theta[1] + 132 * sin_theta[3] + 143 * sin_theta[5])) / 512.;
+
+        data[55] = -(std::sqrt(105 / pi) * cos_phi[0] * (350 + 675 * cos_theta[1] + 594 * cos_theta[3] + 429 * cos_theta[5])) / 2048.;
+
+        data[56] = 0;
+
+        data[57] = (std::sqrt(105 / pi) * (350 + 675 * cos_theta[1] + 594 * cos_theta[3] + 429 * cos_theta[5]) * sin_phi[0]) / 2048.;
+
+        data[58] = (-3 * std::sqrt(35 / (2. * pi)) * sin_phi[1] * (75 * sin_theta[1] + 132 * sin_theta[3] + 143 * sin_theta[5])) / 512.;
+
+        data[59] = (9 * std::sqrt(35 / pi) * (189 + 308 * cos_theta[1] + 143 * cos_theta[3]) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 512.;
+
+        data[60] = (-3 * std::sqrt(385 / pi) * cos_theta[0] * (7 + 13 * cos_theta[1]) * sin_phi[3] * std::pow(sin_theta[0], 3)) / 16.;
+
+        data[61] = (15 * std::sqrt(385 / pi) * (11 + 13 * cos_theta[1]) * sin_phi[4] * std::pow(sin_theta[0], 4)) / 128.;
+
+        data[62] = (-9 * std::sqrt(5005 / (2. * pi)) * cos_theta[0] * sin_phi[5] * std::pow(sin_theta[0], 5)) / 16.;
+
+        data[63] = (21 * std::sqrt(715 / pi) * sin_phi[6] * std::pow(sin_theta[0], 6)) / 64.;
+
+        if (lmax == 7)
+            return;
+
+        data[64] = (-3 * std::sqrt(12155 / pi) * cos_phi[7] * std::pow(sin_theta[0], 7)) / 32.;
+
+        data[65] = (-21 * std::sqrt(12155 / pi) * cos_phi[6] * cos_theta[0] * std::pow(sin_theta[0], 6)) / 64.;
+
+        data[66] = (-3 * std::sqrt(7293 / (2. * pi)) * cos_phi[5] * (13 + 15 * cos_theta[1]) * std::pow(sin_theta[0], 5)) / 64.;
+
+        data[67] = (-15 * std::sqrt(17017 / pi) * cos_phi[4] * (11 * cos_theta[0] + 5 * cos_theta[2]) * std::pow(sin_theta[0], 4)) / 256.;
+
+        data[68] = (-3 * std::sqrt(1309 / pi) * cos_phi[3] * (99 + 156 * cos_theta[1] + 65 * cos_theta[3]) * std::pow(sin_theta[0], 3)) / 256.;
+
+        data[69] = (-3 * std::sqrt(19635 / pi) * cos_phi[2] * (126 * cos_theta[0] + 91 * cos_theta[2] + 39 * cos_theta[4]) * std::pow(sin_theta[0], 2)) / 1024.;
+
+        data[70] = (-3 * std::sqrt(595 / (2. * pi)) * cos_phi[1] * (35 * sin_theta[0] + 11 * (9 * sin_theta[2] + 13 * (sin_theta[4] + sin_theta[6])))) / 2048.;
+
+        data[71] = (-3 * std::sqrt(17 / pi) * cos_phi[0] * (1225 * cos_theta[0] + 11 * (105 * cos_theta[2] + 91 * cos_theta[4] + 65 * cos_theta[6]))) / 4096.;
+
+        data[72] = 0;
+
+        data[73] = (3 * std::sqrt(17 / pi) * cos_theta[0] * (178 + 869 * cos_theta[1] + 286 * cos_theta[3] + 715 * cos_theta[5]) * sin_phi[0]) / 2048.;
+
+        data[74] = (-3 * std::sqrt(595 / (2. * pi)) * sin_phi[1] * (35 * sin_theta[0] + 11 * (9 * sin_theta[2] + 13 * (sin_theta[4] + sin_theta[6])))) / 2048.;
+
+        data[75] = (3 * std::sqrt(19635 / pi) * cos_theta[0] * (37 + 52 * cos_theta[1] + 39 * cos_theta[3]) * sin_phi[2] * std::pow(sin_theta[0], 2)) / 512.;
+
+        data[76] = (-3 * std::sqrt(1309 / pi) * (99 + 156 * cos_theta[1] + 65 * cos_theta[3]) * sin_phi[3] * std::pow(sin_theta[0], 3)) / 256.;
+
+        data[77] = (15 * std::sqrt(17017 / pi) * (11 * cos_theta[0] + 5 * cos_theta[2]) * sin_phi[4] * std::pow(sin_theta[0], 4)) / 256.;
+
+        data[78] = (-3 * std::sqrt(7293 / (2. * pi)) * (13 + 15 * cos_theta[1]) * sin_phi[5] * std::pow(sin_theta[0], 5)) / 64.;
+
+        data[79] = (21 * std::sqrt(12155 / pi) * cos_theta[0] * sin_phi[6] * std::pow(sin_theta[0], 6)) / 64.;
+
+        data[80] = (-3 * std::sqrt(12155 / pi) * sin_phi[7] * std::pow(sin_theta[0], 7)) / 32.;
+    }
+
+    /// convert 3x3 transformation matrix to SU2 2x2 matrix
+    /// Create quaternion components from the 3x3 matrix. The components are just a w = Cos(\Omega/2)
+    /// and {x,y,z} = unit rotation vector multiplied by Sin[\Omega/2]
+    /// see https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation
+    /// and https://en.wikipedia.org/wiki/Rotation_group_SO(3)#Quaternions_of_unit_norm
+    static mdarray<double_complex, 2> rotation_matrix_su2(const matrix3d<double>& m)
+    {
+        double det = m.det() > 0 ? 1.0 : -1.0;
+
+        matrix3d<double>           mat = m * det;
+        mdarray<double_complex, 2> su2mat(2, 2);
+
+        su2mat.zero();
+
+        /* make quaternion components*/
+        double w = sqrt(std::max(0., 1. + mat(0, 0) + mat(1, 1) + mat(2, 2))) / 2.;
+        double x = sqrt(std::max(0., 1. + mat(0, 0) - mat(1, 1) - mat(2, 2))) / 2.;
+        double y = sqrt(std::max(0., 1. - mat(0, 0) + mat(1, 1) - mat(2, 2))) / 2.;
+        double z = sqrt(std::max(0., 1. - mat(0, 0) - mat(1, 1) + mat(2, 2))) / 2.;
+        x        = std::copysign(x, mat(2, 1) - mat(1, 2));
+        y        = std::copysign(y, mat(0, 2) - mat(2, 0));
+        z        = std::copysign(z, mat(1, 0) - mat(0, 1));
+
+        su2mat(0, 0) = double_complex(w, -z);
+        su2mat(1, 1) = double_complex(w, z);
+        su2mat(0, 1) = double_complex(-y, -x);
+        su2mat(1, 0) = double_complex(y, -x);
+
+        return std::move(su2mat);
+    }
+
+    /// Compute the derivatives of real spherical harmonics over the components of cartesian vector.
+    /** The following derivative is computed:
+      *  \f[
+      *    \frac{\partial R_{\ell m}(\theta_r, \phi_r)}{\partial r_{\mu}} =
+      *      \frac{\partial R_{\ell m}(\theta_r, \phi_r)}{\partial \theta_r} \frac{\partial \theta_r}{\partial r_{\mu}} +
+      *      \frac{\partial R_{\ell m}(\theta_r, \phi_r)}{\partial \phi_r} \frac{\partial \phi_r}{\partial r_{\mu}}
+      *  \f]
+      *  The derivatives of angles are:
+      *  \f[
+      *     \frac{\partial \theta_r}{\partial r_{x}} = \frac{\cos(\phi_r) \cos(\theta_r)}{r} \\
+      *     \frac{\partial \theta_r}{\partial r_{y}} = \frac{\cos(\theta_r) \sin(\phi_r)}{r} \\
+      *     \frac{\partial \theta_r}{\partial r_{z}} = -\frac{\sin(\theta_r)}{r}
+      *  \f]
+      *  and
+      *  \f[
+      *     \frac{\partial \phi_r}{\partial r_{x}} = -\frac{\sin(\phi_r)}{\sin(\theta_r) r} \\
+      *     \frac{\partial \phi_r}{\partial r_{y}} = \frac{\cos(\phi_r)}{\sin(\theta_r) r} \\
+      *     \frac{\partial \phi_r}{\partial r_{z}} = 0
+      *  \f]
+      *  The derivative of \f$ \phi \f$ has discontinuities at \f$ \theta = 0, \theta=\pi \f$. This, however, is not a problem, because
+      *  multiplication by the the derivative of \f$ R_{\ell m} \f$ removes it. The following functions have to be hardcoded:
+      *  \f[
+      *    \frac{\partial R_{\ell m}(\theta, \phi)}{\partial \theta} \\
+      *    \frac{\partial R_{\ell m}(\theta, \phi)}{\partial \phi} \frac{1}{\sin(\theta)}
+      *  \f]
+      *
+      *  Mathematica script for spherical harmonic derivatives:
+         \verbatim
+         Rlm[l_, m_, th_, ph_] :=
+          If[m > 0, Sqrt[2]*ComplexExpand[Re[SphericalHarmonicY[l, m, th, ph]]],
+            If[m < 0, Sqrt[2]*ComplexExpand[Im[SphericalHarmonicY[l, m, th, ph]]],
+              If[m == 0, ComplexExpand[Re[SphericalHarmonicY[l, 0, th, ph]]]]
+            ]
+          ]
+         Do[Print[FullSimplify[D[Rlm[l, m, theta, phi], theta]]], {l, 0, 4}, {m, -l, l}]
+         Do[Print[FullSimplify[TrigExpand[D[Rlm[l, m, theta, phi], phi]/Sin[theta]]]], {l, 0, 4}, {m, -l, l}]
+         \endverbatim
+     */
+    static void dRlm_dr(int lmax__, vector3d<double>& r__, mdarray<double, 2>& data__)
+    {
+        /* get spherical coordinates of the Cartesian vector */
+        auto vrs = spherical_coordinates(r__);
+
+        if (vrs[0] < 1e-12) {
+            data__.zero();
+            return;
+        }
+
+        int lmmax = (lmax__ + 1) * (lmax__ + 1);
+
+        double theta = vrs[1];
+        double phi   = vrs[2];
+
+        vector3d<double> dtheta_dr({std::cos(phi) * std::cos(theta), std::cos(theta) * std::sin(phi), -std::sin(theta)});
+        vector3d<double> dphi_dr({-std::sin(phi), std::cos(phi), 0.0});
+
+        mdarray<double, 1> dRlm_dt(lmmax);
+        mdarray<double, 1> dRlm_dp_sin_t(lmmax);
+
+        dRlm_dtheta(lmax__, theta, phi, dRlm_dt);
+        dRlm_dphi_sin_theta(lmax__, theta, phi, dRlm_dp_sin_t);
+
+        for (int mu = 0; mu < 3; mu++) {
+            for (int lm = 0; lm < lmmax; lm++) {
+                data__(lm, mu) = (dRlm_dt[lm] * dtheta_dr[mu] + dRlm_dp_sin_t[lm] * dphi_dr[mu]) / vrs[0];
+            }
+        }
+    }
+
+    /// Generate \f$ \cos(m x) \f$ for m in [1, n] using recursion.
+    static mdarray<double, 1> cosxn(int n__, double x__)
+    {
+        assert(n__ > 0);
+        mdarray<double, 1> data(n__);
+        data[0] = std::cos(x__);
+        if (n__ > 1) {
+            data[1] = std::cos(2 * x__);
+            for (int i = 2; i < n__; i++) {
+                data[i] = 2 * data[0] * data[i - 1] - data[i - 2];
+            }
+        }
+        return std::move(data);
+    }
+
+    /// Generate \f$ \sin(m x) \f$ for m in [1, n] using recursion.
+    static mdarray<double, 1> sinxn(int n__, double x__)
+    {
+        assert(n__ > 0);
+        mdarray<double, 1> data(n__);
+        auto               cosx = std::cos(x__);
+        data[0]                 = std::sin(x__);
+        if (n__ > 1) {
+            data[1] = std::sin(2 * x__);
+            for (int i = 2; i < n__; i++) {
+                data[i] = 2 * cosx * data[i - 1] - data[i - 2];
+            }
+        }
+        return std::move(data);
+    }
+};
+
+template <>
+inline void SHT::backward_transform<double>(int ld, double const* flm, int nr, int lmmax, double* ftp)
+{
+    assert(lmmax <= lmmax_);
+    assert(ld >= lmmax);
+    linalg<CPU>::gemm(1, 0, num_points_, nr, lmmax, &rlm_backward_(0, 0), lmmax_, flm, ld, ftp, num_points_);
+}
+
+template <>
+inline void SHT::backward_transform<double_complex>(int ld, double_complex const* flm, int nr, int lmmax, double_complex* ftp)
+{
+    assert(lmmax <= lmmax_);
+    assert(ld >= lmmax);
+    linalg<CPU>::gemm(1, 0, num_points_, nr, lmmax, &ylm_backward_(0, 0), lmmax_, flm, ld, ftp, num_points_);
+}
+
+template <>
+inline void SHT::forward_transform<double>(double const* ftp, int nr, int lmmax, int ld, double* flm)
+{
+    assert(lmmax <= lmmax_);
+    assert(ld >= lmmax);
+    linalg<CPU>::gemm(1, 0, lmmax, nr, num_points_, &rlm_forward_(0, 0), num_points_, ftp, num_points_, flm, ld);
+}
+
+template <>
+inline void SHT::forward_transform<double_complex>(double_complex const* ftp, int nr, int lmmax, int ld, double_complex* flm)
+{
+    assert(lmmax <= lmmax_);
+    assert(ld >= lmmax);
+    linalg<CPU>::gemm(1, 0, lmmax, nr, num_points_, &ylm_forward_(0, 0), num_points_, ftp, num_points_, flm, ld);
+}
+
+} // namespace sirius
+
+#endif // __SHT_HPP__
diff --git a/src/Unit_cell/atom.hpp b/src/Unit_cell/atom.hpp
index 8c049a4e0..d0459a8c5 100644
--- a/src/Unit_cell/atom.hpp
+++ b/src/Unit_cell/atom.hpp
@@ -25,7 +25,7 @@
 #ifndef __ATOM_HPP__
 #define __ATOM_HPP__
 
-#include "gaunt.h"
+#include "SHT/gaunt.hpp"
 #include "atom_symmetry_class.hpp"
 #include "sddk.hpp"
 #include "spheric_function.h"
diff --git a/src/Unit_cell/atom_type.hpp b/src/Unit_cell/atom_type.hpp
index 2455cf174..1ae4d8716 100644
--- a/src/Unit_cell/atom_type.hpp
+++ b/src/Unit_cell/atom_type.hpp
@@ -31,9 +31,9 @@
 #include "geometry3d.hpp"
 #include "radial_grid.h"
 #include "radial_solver.h"
-#include "xc_functional.h"
+#include "Potential/xc_functional.hpp"
 #include "simulation_parameters.h"
-#include "sht.h"
+//#include "SHT/sht.hpp"
 #include "radial_functions_index.hpp"
 #include "basis_functions_index.hpp"
 #include "hubbard_orbitals_descriptor.hpp"
diff --git a/src/dft_ground_state.h b/src/dft_ground_state.h
index be156bbaf..90e5a066b 100644
--- a/src/dft_ground_state.h
+++ b/src/dft_ground_state.h
@@ -25,7 +25,7 @@
 #ifndef __DFT_GROUND_STATE_H__
 #define __DFT_GROUND_STATE_H__
 
-#include "k_point_set.h"
+#include "K_point/k_point_set.hpp"
 #include "utils/json.hpp"
 #include "Hubbard/hubbard.hpp"
 #include "Geometry/stress.hpp"
diff --git a/src/gaunt.h b/src/gaunt.h
deleted file mode 100644
index 80adf2cc1..000000000
--- a/src/gaunt.h
+++ /dev/null
@@ -1,211 +0,0 @@
-// Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
-// All rights reserved.
-// 
-// Redistribution and use in source and binary forms, with or without modification, are permitted provided that 
-// the following conditions are met:
-// 
-// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the 
-//    following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions 
-//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
-// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
-// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-/** \file gaunt.h
- *   
- *  \brief Contains definition and implementation of sirius::Gaunt class.
- */
-
-#ifndef __GAUNT_H__
-#define __GAUNT_H__
-
-#include "mdarray.hpp"
-
-namespace sirius {
-
-/// Used in the {lm1, lm2} : {lm3, coefficient} way of grouping non-zero Gaunt coefficients
-template <typename T>
-struct gaunt_L3
-{
-    int lm3;
-    int l3;
-    T coef;
-};
-
-/// Used in the {lm1, lm2, coefficient} : {lm3} way of grouping non-zero Gaunt coefficients
-template <typename T>
-struct gaunt_L1_L2
-{
-    int lm1;
-    int lm2;
-    T coef;
-};
-
-/// Compact storage of non-zero Gaunt coefficients \f$ \langle \ell_1 m_1 | \ell_3 m_3 | \ell_2 m_2 \rangle \f$.
-/** Very important! The following notation is adopted and used everywhere: lm1 and lm2 represent 'bra' and 'ket' 
- *  spherical harmonics of the Gaunt integral and lm3 represent the inner spherical harmonic. 
- */
-template <typename T>
-class Gaunt_coefficients
-{
-    private:
-
-        /// lmax of <lm1|
-        int lmax1_;
-        /// lmmax of <lm1|
-        int lmmax1_;
-        
-        /// lmax of inner real or complex spherical harmonic
-        int lmax3_;
-        /// lmmax of inner real or complex spherical harmonic
-        int lmmax3_;
-
-        /// lmax of |lm2>
-        int lmax2_;
-        /// lmmax of |lm2>
-        int lmmax2_;
-
-        /// List of non-zero Gaunt coefficients for each lm3.
-        mdarray<std::vector<gaunt_L1_L2<T>>, 1> gaunt_packed_L1_L2_;
-
-        /// List of non-zero Gaunt coefficients for each combination of lm1, lm2.
-        mdarray<std::vector<gaunt_L3<T>>, 2> gaunt_packed_L3_;
-
-    public:
-        
-        /// Class constructor.
-        Gaunt_coefficients(int lmax1__,
-                           int lmax3__,
-                           int lmax2__,
-                           std::function<T(int, int, int, int, int, int)> get__)
-            : lmax1_(lmax1__)
-            , lmax3_(lmax3__)
-            , lmax2_(lmax2__)
-        {
-            lmmax1_ = utils::lmmax(lmax1_);
-            lmmax3_ = utils::lmmax(lmax3_);
-            lmmax2_ = utils::lmmax(lmax2_);
-
-            gaunt_packed_L1_L2_ = mdarray<std::vector<gaunt_L1_L2<T>>, 1>(lmmax3_);
-            gaunt_L1_L2<T> g12;
-            
-            gaunt_packed_L3_ = mdarray<std::vector<gaunt_L3<T>>, 2>(lmmax1_, lmmax2_);
-            gaunt_L3<T> g3;
-
-            for (int l1 = 0, lm1 = 0; l1 <= lmax1_; l1++) {
-                for (int m1 = -l1; m1 <= l1; m1++, lm1++) {
-                    for (int l2 = 0, lm2 = 0; l2 <= lmax2_; l2++) {
-                        for (int m2 = -l2; m2 <= l2; m2++, lm2++) {
-                            for (int l3 = 0, lm3 = 0; l3 <= lmax3_; l3++) {
-                                for (int m3 = -l3; m3 <= l3; m3++, lm3++) {
-                                    
-                                    T gc = get__(l1, l3, l2, m1, m3, m2);
-                                    if (std::abs(gc) > 1e-12) {
-                                        g12.lm1 = lm1;
-                                        g12.lm2 = lm2;
-                                        g12.coef = gc;
-                                        gaunt_packed_L1_L2_[lm3].push_back(g12);
-
-                                        g3.lm3 = lm3;
-                                        g3.l3 = l3;
-                                        g3.coef = gc;
-                                        gaunt_packed_L3_(lm1, lm2).push_back(g3);
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        /// Return number of non-zero Gaunt coefficients for a given lm3.
-        inline int num_gaunt(int lm3) const
-        {
-            assert(lm3 >= 0 && lm3 < lmmax3_);
-            return static_cast<int>(gaunt_packed_L1_L2_[lm3].size());
-        }
-
-        /// Return a structure containing {lm1, lm2, coef} for a given lm3 and index.
-        /** Example:
-         *  \code{.cpp}
-         *  for (int lm3 = 0; lm3 < lmmax3; lm3++)
-         *  {
-         *      for (int i = 0; i < gaunt_coefs.num_gaunt(lm3); i++) {
-         *          int lm1 = gaunt_coefs.gaunt(lm3, i).lm1;
-         *          int lm2 = gaunt_coefs.gaunt(lm3, i).lm2;
-         *          double coef = gaunt_coefs.gaunt(lm3, i).coef;
-         *
-         *          // do something with lm1,lm2,lm3 and coef
-         *      }
-         *  }
-         *  \endcode
-         */
-        inline gaunt_L1_L2<T> const& gaunt(int lm3, int idx) const
-        {
-            assert(lm3 >= 0 && lm3 < lmmax3_);
-            assert(idx >= 0 && idx < (int)gaunt_packed_L1_L2_[lm3].size());
-            return gaunt_packed_L1_L2_[lm3][idx];
-        }
-
-        /// Return number of non-zero Gaunt coefficients for a combination of lm1 and lm2.
-        inline int num_gaunt(int lm1, int lm2) const
-        {
-            return static_cast<int>(gaunt_packed_L3_(lm1, lm2).size());
-        }
-
-        /// Return a structure containing {lm3, coef} for a given lm1, lm2 and index
-        inline gaunt_L3<T> const& gaunt(int lm1, int lm2, int idx) const
-        {
-            return gaunt_packed_L3_(lm1, lm2)[idx];
-        }
-
-        /// Return a sum over L3 (lm3) index of Gaunt coefficients and a complex vector.
-        /** The following operation is performed:
-         *  \f[
-         *      \sum_{\ell_3 m_3} \langle \ell_1 m_1 | \ell_3 m_3 | \ell_2 m_2 \rangle v_{\ell_3 m_3}
-         *  \f]
-         *  Result is assumed to be complex.
-         */
-        inline double_complex sum_L3_gaunt(int lm1, int lm2, double_complex const* v) const
-        {
-            double_complex zsum(0, 0);
-            for (int k = 0; k < (int)gaunt_packed_L3_(lm1, lm2).size(); k++) {
-                zsum += gaunt_packed_L3_(lm1, lm2)[k].coef * v[gaunt_packed_L3_(lm1, lm2)[k].lm3];
-            }
-            return zsum;
-        }
-
-        /// Return a sum over L3 (lm3) index of Gaunt coefficients and a real vector.
-        /** The following operation is performed:
-         *  \f[
-         *      \sum_{\ell_3 m_3} \langle \ell_1 m_1 | \ell_3 m_3 | \ell_2 m_2 \rangle v_{\ell_3 m_3}
-         *  \f]
-         *  Result is assumed to be of the same type as Gaunt coefficients.
-         */
-        inline T sum_L3_gaunt(int lm1, int lm2, double const* v) const
-        {
-            T sum = 0;
-            for (int k = 0; k < (int)gaunt_packed_L3_(lm1, lm2).size(); k++) {
-                sum += gaunt_packed_L3_(lm1, lm2)[k].coef * v[gaunt_packed_L3_(lm1, lm2)[k].lm3];
-            }
-            return sum;
-        }
-
-        /// Return vector of non-zero Gaunt coefficients for a given combination of lm1 and lm2
-        inline std::vector<gaunt_L3<T>> const& gaunt_vector(int lm1, int lm2) const
-        {
-            return gaunt_packed_L3_(lm1, lm2);
-        }
-};
-
-};
-
-#endif
-
diff --git a/src/k_point.h b/src/k_point.h
deleted file mode 100644
index ded5c2622..000000000
--- a/src/k_point.h
+++ /dev/null
@@ -1,833 +0,0 @@
-// Copyright (c) 2013-2016 Anton Kozhevnikov, Thomas Schulthess
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
-// the following conditions are met:
-//
-// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
-//    following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
-//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-/** \file k_point.h
- *
- *  \brief Contains definition and partial implementation of sirius::K_point class.
- */
-
-#ifndef __K_POINT_H__
-#define __K_POINT_H__
-
-#include "periodic_function.h"
-#include "matching_coefficients.h"
-#include "Beta_projectors/beta_projectors.h"
-#include "wave_functions.hpp"
-
-namespace sirius
-{
-
-/// K-point related variables and methods.
-/** \image html wf_storage.png "Wave-function storage"
- *  \image html fv_eigen_vectors.png "First-variational eigen vectors" */
-class K_point
-{
-    private:
-
-        /// Simulation context.
-        Simulation_context& ctx_;
-
-        /// Unit cell object.
-        Unit_cell const& unit_cell_;
-
-        /// Weight of k-point.
-        double weight_;
-
-        /// Fractional k-point coordinates.
-        vector3d<double> vk_;
-
-        /// List of G-vectors with |G+k| < cutoff.
-        std::unique_ptr<Gvec> gkvec_;
-
-        /// G-vector distribution for the FFT transformation.
-        std::unique_ptr<Gvec_partition> gkvec_partition_;
-
-        /// First-variational eigen values
-        std::vector<double> fv_eigen_values_;
-
-        /// First-variational eigen vectors, distributed over 2D BLACS grid.
-        dmatrix<double_complex> fv_eigen_vectors_;
-
-        /// First-variational eigen vectors, distributed in slabs.
-        std::unique_ptr<Wave_functions> fv_eigen_vectors_slab_;
-
-        /// Lowest eigen-vectors of the LAPW overlap matrix with small aigen-values.
-        std::unique_ptr<Wave_functions> singular_components_;
-
-        /// Second-variational eigen vectors.
-        /** Second-variational eigen-vectors are stored as one or two \f$ N_{fv} \times N_{fv} \f$ matrices in
-         *  case of non-magnetic or collinear magnetic case or as a single \f$ 2 N_{fv} \times 2 N_{fv} \f$
-         *  matrix in case of general non-collinear magnetism. */
-        dmatrix<double_complex> sv_eigen_vectors_[2];
-
-        /// Full-diagonalization eigen vectors.
-        mdarray<double_complex, 2> fd_eigen_vectors_;
-
-        /// First-variational states.
-        std::unique_ptr<Wave_functions> fv_states_{nullptr};
-
-        /// Two-component (spinor) wave functions describing the bands.
-        std::unique_ptr<Wave_functions> spinor_wave_functions_{nullptr};
-
-        /// Two-component (spinor) hubbard wave functions where the S matrix is applied (if ppus).
-        std::unique_ptr<Wave_functions> hubbard_wave_functions_{nullptr};
-
-        /// Band occupation numbers.
-        mdarray<double, 2> band_occupancies_;
-
-        /// Band energies.
-        mdarray<double, 2> band_energies_;
-
-        /// LAPW matching coefficients for the row G+k vectors.
-        /** Used to setup the distributed LAPW Hamiltonian and overlap matrices. */
-        std::unique_ptr<Matching_coefficients> alm_coeffs_row_{nullptr};
-
-        /// LAPW matching coefficients for the column G+k vectors.
-        /** Used to setup the distributed LAPW Hamiltonian and overlap matrices. */
-        std::unique_ptr<Matching_coefficients> alm_coeffs_col_{nullptr};
-
-        /// LAPW matching coefficients for the local set G+k vectors.
-        std::unique_ptr<Matching_coefficients> alm_coeffs_loc_{nullptr};
-
-        /// Mapping between local row and global G+k vecotor index.
-        /** Used by matching_coefficients class. */
-        std::vector<int> igk_row_;
-
-        /// Mapping between local column and global G+k vecotor index.
-        /** Used by matching_coefficients class. */
-        std::vector<int> igk_col_;
-
-        /// Mapping between local and global G+k vecotor index.
-        /** Used by matching_coefficients class. */
-        std::vector<int> igk_loc_;
-
-        /// Number of G+k vectors distributed along rows of MPI grid
-        int num_gkvec_row_{0};
-
-        /// Number of G+k vectors distributed along columns of MPI grid
-        int num_gkvec_col_{0};
-
-        /// Offset of the local fraction of G+k vectors in the global index.
-        int gkvec_offset_{0};
-
-        /// Basis descriptors distributed between rows of the 2D MPI grid.
-        /** This is a local array. Only MPI ranks belonging to the same column have identical copies of this array. */
-        std::vector<lo_basis_descriptor> lo_basis_descriptors_row_;
-
-        /// Basis descriptors distributed between columns of the 2D MPI grid.
-        /** This is a local array. Only MPI ranks belonging to the same row have identical copies of this array. */
-        std::vector<lo_basis_descriptor> lo_basis_descriptors_col_;
-
-        /// List of columns of the Hamiltonian and overlap matrix lo block (local index) for a given atom.
-        std::vector<std::vector<int>> atom_lo_cols_;
-
-        /// list of rows of the Hamiltonian and overlap matrix lo block (local index) for a given atom
-        std::vector<std::vector<int>> atom_lo_rows_;
-
-        /// Imaginary unit to the power of l.
-        std::vector<double_complex> zil_;
-
-        /// Mapping between lm and l.
-        std::vector<int> l_by_lm_;
-
-        /// Column rank of the processors of ScaLAPACK/ELPA diagonalization grid.
-        int rank_col_;
-
-        /// Number of processors along the columns of the diagonalization grid.
-        int num_ranks_col_;
-
-        /// Row rank of the processors of ScaLAPACK/ELPA diagonalization grid.
-        int rank_row_;
-
-        /// Number of processors along the rows of the diagonalization grid.
-        int num_ranks_row_;
-
-        /// Beta projectors for a local set of G+k vectors.
-        std::unique_ptr<Beta_projectors> beta_projectors_{nullptr};
-
-        /// Beta projectors for row G+k vectors.
-        /** Used to setup the full Hamiltonian in PP-PW case (for verification purpose only) */
-        std::unique_ptr<Beta_projectors> beta_projectors_row_{nullptr};
-
-        /// Beta projectors for column G+k vectors.
-        /** Used to setup the full Hamiltonian in PP-PW case (for verification purpose only) */
-        std::unique_ptr<Beta_projectors> beta_projectors_col_{nullptr};
-
-        /// Preconditioner matrix for Chebyshev solver.
-        mdarray<double_complex, 3> p_mtrx_;
-
-        /// Communicator for parallelization inside k-point.
-        /** This communicator is used to split G+k vectors and wave-functions. */
-        Communicator const& comm_;
-
-        /// Communicator between(!!) rows.
-        Communicator const& comm_row_;
-
-        /// Communicator between(!!) columns.
-        Communicator const& comm_col_;
-
-        /// Generate G+k and local orbital basis sets.
-        inline void generate_gklo_basis();
-
-        /// Test orthonormalization of first-variational states.
-        inline void test_fv_states();
-
-        inline double& band_energy_aux(int j__, int ispn__)
-        {
-            if (ctx_.num_mag_dims() == 3) {
-                return band_energies_(j__, 0);
-            } else {
-                if (!(ispn__ == 0 || ispn__ == 1)) {
-                    TERMINATE("wrong spin index");
-                }
-                return band_energies_(j__, ispn__);
-            }
-        }
-
-        inline double& band_occupancy_aux(int j__, int ispn__)
-        {
-            if (ctx_.num_mag_dims() == 3) {
-                return band_occupancies_(j__, 0);
-            } else {
-                if (!(ispn__ == 0 || ispn__ == 1)) {
-                    TERMINATE("wrong spin index");
-                }
-                return band_occupancies_(j__, ispn__);
-            }
-        }
-
-        /// Find G+k vectors within the cutoff.
-        inline void generate_gkvec(double gk_cutoff__)
-        {
-            PROFILE("sirius::K_point::generate_gkvec");
-
-            if (ctx_.full_potential() && (gk_cutoff__ * unit_cell_.max_mt_radius() > ctx_.lmax_apw()) &&
-                comm_.rank() == 0 && ctx_.control().verbosity_ >= 0) {
-                std::stringstream s;
-                s << "G+k cutoff (" << gk_cutoff__ << ") is too large for a given lmax ("
-                  << ctx_.lmax_apw() << ") and a maximum MT radius (" << unit_cell_.max_mt_radius() << ")" << std::endl
-                  << "suggested minimum value for lmax : " << int(gk_cutoff__ * unit_cell_.max_mt_radius()) + 1;
-                WARNING(s);
-            }
-
-            if (gk_cutoff__ * 2 > ctx_.pw_cutoff()) {
-                std::stringstream s;
-                s << "G+k cutoff is too large for a given plane-wave cutoff" << std::endl
-                  << "  pw cutoff : " << ctx_.pw_cutoff() << std::endl
-                  << "  doubled G+k cutoff : " << gk_cutoff__ * 2;
-                TERMINATE(s);
-            }
-
-            /* create G+k vectors; communicator of the coarse FFT grid is used because wave-functions will be transformed
-             * only on the coarse grid; G+k-vectors will be distributed between MPI ranks assigned to the k-point */
-            gkvec_ = std::unique_ptr<Gvec>(new Gvec(vk_, ctx_.unit_cell().reciprocal_lattice_vectors(), gk_cutoff__, comm(),
-                                                    ctx_.gamma_point()));
-
-            gkvec_partition_ = std::unique_ptr<Gvec_partition>(new Gvec_partition(*gkvec_, ctx_.comm_fft_coarse(),
-                                                                                  ctx_.comm_band_ortho_fft_coarse()));
-
-            gkvec_offset_ = gkvec().gvec_offset(comm().rank());
-        }
-
-    public:
-
-        /// Constructor
-        K_point(Simulation_context& ctx__,
-                double const* vk__,
-                double weight__)
-            : ctx_(ctx__)
-            , unit_cell_(ctx_.unit_cell())
-            , weight_(weight__)
-            , comm_(ctx_.comm_band())
-            , comm_row_(ctx_.blacs_grid().comm_row())
-            , comm_col_(ctx_.blacs_grid().comm_col())
-        {
-            PROFILE("sirius::K_point::K_point");
-
-            for (int x = 0; x < 3; x++) {
-                vk_[x] = vk__[x];
-            }
-
-            band_occupancies_ = mdarray<double, 2>(ctx_.num_bands(), ctx_.num_spin_dims());
-            band_occupancies_.zero();
-            band_energies_ = mdarray<double, 2>(ctx_.num_bands(), ctx_.num_spin_dims());
-            band_energies_.zero();
-
-            num_ranks_row_ = comm_row_.size();
-            num_ranks_col_ = comm_col_.size();
-
-            rank_row_ = comm_row_.rank();
-            rank_col_ = comm_col_.rank();
-        }
-
-        /// Initialize the k-point related arrays and data.
-        inline void initialize();
-
-        inline void update()
-        {
-            PROFILE("sirius::K_point::update");
-
-            gkvec_->lattice_vectors(ctx_.unit_cell().reciprocal_lattice_vectors());
-
-            if (ctx_.full_potential()) {
-                if (ctx_.iterative_solver_input().type_ == "exact") {
-                    alm_coeffs_row_ = std::unique_ptr<Matching_coefficients>(
-                        new Matching_coefficients(unit_cell_, ctx_.lmax_apw(), num_gkvec_row(), igk_row_, gkvec()));
-                    alm_coeffs_col_ = std::unique_ptr<Matching_coefficients>(
-                        new Matching_coefficients(unit_cell_, ctx_.lmax_apw(), num_gkvec_col(), igk_col_, gkvec()));
-                }
-                alm_coeffs_loc_ = std::unique_ptr<Matching_coefficients>(
-                    new Matching_coefficients(unit_cell_, ctx_.lmax_apw(), num_gkvec_loc(), igk_loc_, gkvec()));
-            }
-
-            if (!ctx_.full_potential()) {
-                /* compute |beta> projectors for atom types */
-                beta_projectors_ = std::unique_ptr<Beta_projectors>(new Beta_projectors(ctx_, gkvec(), igk_loc_));
-
-                if (ctx_.iterative_solver_input().type_ == "exact") {
-                    beta_projectors_row_ = std::unique_ptr<Beta_projectors>(new Beta_projectors(ctx_, gkvec(), igk_row_));
-                    beta_projectors_col_ = std::unique_ptr<Beta_projectors>(new Beta_projectors(ctx_, gkvec(), igk_col_));
-
-                }
-
-                //if (false) {
-                //    p_mtrx_ = mdarray<double_complex, 3>(unit_cell_.max_mt_basis_size(), unit_cell_.max_mt_basis_size(), unit_cell_.num_atom_types());
-                //    p_mtrx_.zero();
-
-                //    for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
-                //        auto& atom_type = unit_cell_.atom_type(iat);
-
-                //        if (!atom_type.pp_desc().augment) {
-                //            continue;
-                //        }
-                //        int nbf = atom_type.mt_basis_size();
-                //        int ofs = atom_type.offset_lo();
-
-                //        matrix<double_complex> qinv(nbf, nbf);
-                //        for (int xi1 = 0; xi1 < nbf; xi1++) {
-                //            for (int xi2 = 0; xi2 < nbf; xi2++) {
-                //                qinv(xi2, xi1) = ctx_.augmentation_op(iat).q_mtrx(xi2, xi1);
-                //            }
-                //        }
-                //        linalg<CPU>::geinv(nbf, qinv);
-                //
-                //        /* compute P^{+}*P */
-                //        linalg<CPU>::gemm(2, 0, nbf, nbf, num_gkvec_loc(),
-                //                          beta_projectors_->beta_gk_t().at<CPU>(0, ofs), beta_projectors_->beta_gk_t().ld(),
-                //                          beta_projectors_->beta_gk_t().at<CPU>(0, ofs), beta_projectors_->beta_gk_t().ld(),
-                //                          &p_mtrx_(0, 0, iat), p_mtrx_.ld());
-                //        comm().allreduce(&p_mtrx_(0, 0, iat), unit_cell_.max_mt_basis_size() * unit_cell_.max_mt_basis_size());
-
-                //        for (int xi1 = 0; xi1 < nbf; xi1++) {
-                //            for (int xi2 = 0; xi2 < nbf; xi2++) {
-                //                qinv(xi2, xi1) += p_mtrx_(xi2, xi1, iat);
-                //            }
-                //        }
-                //        /* compute (Q^{-1} + P^{+}*P)^{-1} */
-                //        linalg<CPU>::geinv(nbf, qinv);
-                //        for (int xi1 = 0; xi1 < nbf; xi1++) {
-                //            for (int xi2 = 0; xi2 < nbf; xi2++) {
-                //                p_mtrx_(xi2, xi1, iat) = qinv(xi2, xi1);
-                //            }
-                //        }
-                //    }
-                //}
-            }
-
-        }
-
-        /// Generate first-variational states from eigen-vectors.
-        /** First-variational states are obtained from the first-variational eigen-vectors and
-         *  LAPW matching coefficients.
-         *
-         *  APW part:
-         *  \f[
-         *      \psi_{\xi j}^{\bf k} = \sum_{{\bf G}} Z_{{\bf G} j}^{\bf k} * A_{\xi}({\bf G+k})
-         *  \f]
-         */
-        void generate_fv_states();
-
-        #ifdef __GPU
-        void generate_fv_states_aw_mt_gpu();
-        #endif
-
-        /// Generate two-component spinor wave functions
-        inline void generate_spinor_wave_functions();
-
-        inline void generate_atomic_centered_wavefunctions(const int num_ao__, Wave_functions &phi);
-
-        inline void generate_atomic_centered_wavefunctions_aux(const int num_ao__, Wave_functions &phi, std::vector<int> &offset, bool hubbard);
-
-        void compute_gradient_wavefunctions(Wave_functions &phi,
-                                            const int starting_position_i,
-                                            const int num_wf,
-                                            Wave_functions &dphi,
-                                            const int starting_position_j,
-                                            const int direction);
-
-        void save(int id);
-
-        void load(HDF5_tree h5in, int id);
-
-        //== void save_wave_functions(int id);
-
-        //== void load_wave_functions(int id);
-
-        void get_fv_eigen_vectors(mdarray<double_complex, 2>& fv_evec);
-
-        void get_sv_eigen_vectors(mdarray<double_complex, 2>& sv_evec);
-
-        /// Test orthonormalization of spinor wave-functions
-        void test_spinor_wave_functions(int use_fft);
-
-        /// Get the number of occupied bands for each spin channel.
-        int num_occupied_bands(int ispn__ = -1)
-        {
-            for (int j = ctx_.num_bands() - 1; j >= 0; j--) {
-                if (std::abs(band_occupancy(j, ispn__) * weight()) > 1e-14) {
-                    return j + 1;
-                }
-            }
-            return 0;
-        }
-
-        /// Total number of G+k vectors within the cutoff distance
-        inline int num_gkvec() const
-        {
-            return gkvec_->num_gvec();
-        }
-
-        /// Total number of muffin-tin and plane-wave expansion coefficients for the wave-functions.
-        /** APW+lo basis \f$ \varphi_{\mu {\bf k}}({\bf r}) = \{ \varphi_{\bf G+k}({\bf r}),
-         *  \varphi_{j{\bf k}}({\bf r}) \} \f$ is used to expand first-variational wave-functions:
-         *
-         *  \f[
-         *      \psi_{i{\bf k}}({\bf r}) = \sum_{\mu} c_{\mu i}^{\bf k} \varphi_{\mu \bf k}({\bf r}) =
-         *      \sum_{{\bf G}}c_{{\bf G} i}^{\bf k} \varphi_{\bf G+k}({\bf r}) +
-         *      \sum_{j}c_{j i}^{\bf k}\varphi_{j{\bf k}}({\bf r})
-         *  \f]
-         *
-         *  Inside muffin-tins the expansion is converted into the following form:
-         *  \f[
-         *      \psi_{i {\bf k}}({\bf r})= \begin{array}{ll}
-         *      \displaystyle \sum_{L} \sum_{\lambda=1}^{N_{\ell}^{\alpha}}
-         *      F_{L \lambda}^{i {\bf k},\alpha}f_{\ell \lambda}^{\alpha}(r)
-         *      Y_{\ell m}(\hat {\bf r}) & {\bf r} \in MT_{\alpha} \end{array}
-         *  \f]
-         *
-         *  Thus, the total number of coefficients representing a wave-funstion is equal
-         *  to the number of muffin-tin basis functions of the form \f$ f_{\ell \lambda}^{\alpha}(r)
-         *  Y_{\ell m}(\hat {\bf r}) \f$ plust the number of G+k plane waves. */
-        //inline int wf_size() const // TODO: better name for this
-        //{
-        //    if (ctx_.full_potential()) {
-        //        return unit_cell_.mt_basis_size() + num_gkvec();
-        //    } else {
-        //        return num_gkvec();
-        //    }
-        //}
-
-        inline double& band_energy(int j__, int ispn__)
-        {
-            return band_energy_aux(j__, ispn__);
-        }
-
-        inline double band_energy(int j__, int ispn__) const
-        {
-            auto const& e = const_cast<K_point*>(this)->band_energy_aux(j__, ispn__);
-            return e;
-        }
-
-        inline double& band_occupancy(int j__, int ispn__)
-        {
-            return band_occupancy_aux(j__, ispn__);
-        }
-
-        inline double band_occupancy(int j__, int ispn__) const
-        {
-            auto const& e = const_cast<K_point*>(this)->band_occupancy_aux(j__, ispn__);
-            return e;
-        }
-
-        inline double fv_eigen_value(int i) const
-        {
-            return fv_eigen_values_[i];
-        }
-
-        void set_fv_eigen_values(double* eval)
-        {
-            std::memcpy(&fv_eigen_values_[0], eval, ctx_.num_fv_states() * sizeof(double));
-        }
-
-        inline double weight() const
-        {
-            return weight_;
-        }
-
-        inline Wave_functions& fv_states()
-        {
-            return *fv_states_;
-        }
-
-        inline Wave_functions& spinor_wave_functions()
-        {
-            return *spinor_wave_functions_;
-        }
-
-        inline Wave_functions& hubbard_wave_functions()
-        {
-            return *hubbard_wave_functions_;
-        }
-
-        inline Wave_functions const& hubbard_wave_functions() const
-        {
-            return *hubbard_wave_functions_;
-        }
-
-        inline void allocate_hubbard_wave_functions(int size)
-        {
-            if (hubbard_wave_functions_ != nullptr) {
-                return;
-            }
-            const int num_sc = ctx_.num_mag_dims() == 3 ?  2 : 1;
-            hubbard_wave_functions_ = std::unique_ptr<Wave_functions>(new Wave_functions(gkvec_partition(),
-                                                                                         size,
-                                                                                         num_sc));
-        }
-
-        inline bool hubbard_wave_functions_calculated()
-        {
-            return (hubbard_wave_functions_ != nullptr);
-        }
-
-        inline Wave_functions& singular_components()
-        {
-            return *singular_components_;
-        }
-
-        inline vector3d<double> vk() const
-        {
-            return vk_;
-        }
-
-        /// Basis size of LAPW+lo method.
-        inline int gklo_basis_size() const
-        {
-            return static_cast<int>(num_gkvec() + unit_cell_.mt_lo_basis_size());
-        }
-
-        /// Local number of G+k vectors in case of flat distribution.
-        inline int num_gkvec_loc() const
-        {
-            return gkvec().count();
-        }
-
-        /// Return global index of G+k vector.
-        inline int idxgk(int igkloc__) const
-        {
-            return gkvec_offset_ + igkloc__;
-        }
-
-        /// Local number of G+k vectors for each MPI rank in the row of the 2D MPI grid.
-        inline int num_gkvec_row() const
-        {
-            return num_gkvec_row_;
-        }
-
-        /// Local number of local orbitals for each MPI rank in the row of the 2D MPI grid.
-        inline int num_lo_row() const
-        {
-            return static_cast<int>(lo_basis_descriptors_row_.size());
-        }
-
-        /// Local number of basis functions for each MPI rank in the row of the 2D MPI grid.
-        inline int gklo_basis_size_row() const
-        {
-            return num_gkvec_row() + num_lo_row();
-        }
-
-        /// Local number of G+k vectors for each MPI rank in the column of the 2D MPI grid.
-        inline int num_gkvec_col() const
-        {
-            return num_gkvec_col_;
-        }
-
-        /// Local number of local orbitals for each MPI rank in the column of the 2D MPI grid.
-        inline int num_lo_col() const
-        {
-            return static_cast<int>(lo_basis_descriptors_col_.size());
-        }
-
-        /// Local number of basis functions for each MPI rank in the column of the 2D MPI grid.
-        inline int gklo_basis_size_col() const
-        {
-            return num_gkvec_col() + num_lo_col();
-        }
-
-        inline lo_basis_descriptor const& lo_basis_descriptor_col(int idx) const
-        {
-            assert(idx >=0 && idx < (int)lo_basis_descriptors_col_.size());
-            return lo_basis_descriptors_col_[idx];
-        }
-
-        inline lo_basis_descriptor const& lo_basis_descriptor_row(int idx) const
-        {
-            assert(idx >= 0 && idx < (int)lo_basis_descriptors_row_.size());
-            return lo_basis_descriptors_row_[idx];
-        }
-
-        inline int igk_loc(int idx__) const
-        {
-            return igk_loc_[idx__];
-        }
-
-        inline std::vector<int> const& igk_loc() const
-        {
-            return igk_loc_;
-        }
-
-        inline int igk_row(int idx__) const
-        {
-            return igk_row_[idx__];
-        }
-
-        inline std::vector<int> const& igk_row() const
-        {
-            return igk_row_;
-        }
-
-        inline int igk_col(int idx__) const
-        {
-            return igk_col_[idx__];
-        }
-
-        inline std::vector<int> const& igk_col() const
-        {
-            return igk_col_;
-        }
-
-        inline int num_ranks_row() const
-        {
-            return num_ranks_row_;
-        }
-
-        inline int rank_row() const
-        {
-            return rank_row_;
-        }
-
-        inline int num_ranks_col() const
-        {
-            return num_ranks_col_;
-        }
-
-        inline int rank_col() const
-        {
-            return rank_col_;
-        }
-
-        /// Number of MPI ranks for a given k-point
-        inline int num_ranks() const
-        {
-            return comm_.size();
-        }
-
-        inline int rank() const
-        {
-            return comm_.rank();
-        }
-
-        /// Return number of lo columns for a given atom
-        inline int num_atom_lo_cols(int ia) const
-        {
-            return (int)atom_lo_cols_[ia].size();
-        }
-
-        /// Return local index (for the current MPI rank) of a column for a given atom and column index within an atom
-        inline int lo_col(int ia, int i) const
-        {
-            return atom_lo_cols_[ia][i];
-        }
-
-        /// Return number of lo rows for a given atom
-        inline int num_atom_lo_rows(int ia) const
-        {
-            return (int)atom_lo_rows_[ia].size();
-        }
-
-        /// Return local index (for the current MPI rank) of a row for a given atom and row index within an atom
-        inline int lo_row(int ia, int i) const
-        {
-            return atom_lo_rows_[ia][i];
-        }
-
-        inline dmatrix<double_complex>& fv_eigen_vectors()
-        {
-            return fv_eigen_vectors_;
-        }
-
-        inline Wave_functions& fv_eigen_vectors_slab()
-        {
-            return *fv_eigen_vectors_slab_;
-        }
-
-        inline dmatrix<double_complex>& sv_eigen_vectors(int ispn)
-        {
-            return sv_eigen_vectors_[ispn];
-        }
-
-        inline mdarray<double_complex, 2>& fd_eigen_vectors()
-        {
-            return fd_eigen_vectors_;
-        }
-
-        void bypass_sv()
-        {
-            std::memcpy(&band_energies_[0], &fv_eigen_values_[0], ctx_.num_fv_states() * sizeof(double));
-        }
-
-        inline Gvec const& gkvec() const
-        {
-            return *gkvec_;
-        }
-
-        inline Gvec_partition const& gkvec_partition() const
-        {
-            return *gkvec_partition_;
-        }
-
-        inline Matching_coefficients const& alm_coeffs_row()
-        {
-            return *alm_coeffs_row_;
-        }
-
-        inline Matching_coefficients const& alm_coeffs_col()
-        {
-            return *alm_coeffs_col_;
-        }
-
-        inline Matching_coefficients const& alm_coeffs_loc() const
-        {
-            return *alm_coeffs_loc_;
-        }
-
-        inline Communicator const& comm() const
-        {
-            return comm_;
-        }
-
-        inline Communicator const& comm_row() const
-        {
-            return comm_row_;
-        }
-
-        inline Communicator const& comm_col() const
-        {
-            return comm_col_;
-        }
-
-        inline double_complex p_mtrx(int xi1, int xi2, int iat) const
-        {
-            return p_mtrx_(xi1, xi2, iat);
-        }
-
-        inline mdarray<double_complex, 3>& p_mtrx()
-        {
-            return p_mtrx_;
-        }
-
-        Beta_projectors& beta_projectors()
-        {
-            assert(beta_projectors_ != nullptr);
-            return *beta_projectors_;
-        }
-
-        Beta_projectors& beta_projectors_row()
-        {
-            assert(beta_projectors_ != nullptr);
-            return *beta_projectors_row_;
-        }
-
-        Beta_projectors& beta_projectors_col()
-        {
-            assert(beta_projectors_ != nullptr);
-            return *beta_projectors_col_;
-        }
-};
-
-#include "K_point/generate_fv_states.hpp"
-#include "K_point/generate_spinor_wave_functions.hpp"
-#include "K_point/generate_gklo_basis.hpp"
-#include "K_point/initialize.hpp"
-#include "K_point/k_point.hpp"
-#include "K_point/test_fv_states.hpp"
-#include "K_point/generate_atomic_centered_wavefunctions.hpp"
-
-}
-
-/** \page basis Basis functions for Kohn-Sham wave-functions expansion
- *
- *  \section basis1 LAPW+lo basis
- *
- *  LAPW+lo basis consists of two different sets of functions: LAPW functions \f$ \varphi_{{\bf G+k}} \f$ defined over
- *  entire unit cell:
- *  \f[
- *      \varphi_{{\bf G+k}}({\bf r}) = \left\{ \begin{array}{ll}
- *      \displaystyle \sum_{L} \sum_{\nu=1}^{O_{\ell}^{\alpha}} a_{L\nu}^{\alpha}({\bf G+k})u_{\ell \nu}^{\alpha}(r)
- *      Y_{\ell m}(\hat {\bf r}) & {\bf r} \in {\rm MT} \alpha \\
- *      \displaystyle \frac{1}{\sqrt  \Omega} e^{i({\bf G+k}){\bf r}} & {\bf r} \in {\rm I} \end{array} \right.
- *  \f]
- *  and Bloch sums of local orbitals defined inside muffin-tin spheres only:
- *  \f[
- *      \begin{array}{ll} \displaystyle \varphi_{j{\bf k}}({\bf r})=\sum_{{\bf T}} e^{i{\bf kT}}
- *      \varphi_{j}({\bf r - T}) & {\rm {\bf r} \in MT} \end{array}
- *  \f]
- *  Each local orbital is composed of radial and angular parts:
- *  \f[
- *      \varphi_{j}({\bf r}) = \phi_{\ell_j}^{\zeta_j,\alpha_j}(r) Y_{\ell_j m_j}(\hat {\bf r})
- *  \f]
- *  Radial part of local orbital is defined as a linear combination of radial functions (minimum two radial functions
- *  are required) such that local orbital vanishes at the sphere boundary:
- *  \f[
- *      \phi_{\ell}^{\zeta, \alpha}(r) = \sum_{p}\gamma_{p}^{\zeta,\alpha} u_{\ell \nu_p}^{\alpha}(r)
- *  \f]
- *
- *  Arbitrary number of local orbitals can be introduced for each angular quantum number (this is highlighted by
- *  the index \f$ \zeta \f$).
- *
- *  Radial functions are m-th order (with zero-order being a function itself) energy derivatives of the radial
- *  Schrödinger equation:
- *  \f[
- *      u_{\ell \nu}^{\alpha}(r) = \frac{\partial^{m_{\nu}}}{\partial^{m_{\nu}}E}u_{\ell}^{\alpha}(r,E)\Big|_{E=E_{\nu}}
- *  \f]
- */
-
-/** \page data_dist K-point data distribution
- *
- *  \section data_dist1 "Panel" and "full" data storage
- *
- *  We have to deal with big arrays (matching coefficients, eigen vectors, wave functions, etc.) which may not fit
- *  into the memory of a single node. For some operations we need a "panel" distribution of data, where each
- *  MPI rank gets a local panel of block-cyclic distributed matrix. This way of storing data is necessary for the
- *  distributed PBLAS and ScaLAPACK operations.
- *
- */
-
-
-#endif // __K_POINT_H__
diff --git a/src/sht.h b/src/sht.h
deleted file mode 100644
index 5eda743b6..000000000
--- a/src/sht.h
+++ /dev/null
@@ -1,1288 +0,0 @@
-// Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
-// the following conditions are met:
-//
-// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
-//    following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
-//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-/** \file sht.h
- *
- *  \brief Contains declaration and particular implementation of sirius::SHT class.
- */
-
-#ifndef __SHT_H__
-#define __SHT_H__
-
-#include <math.h>
-#include <stddef.h>
-#include <gsl/gsl_sf_coupling.h>
-#include <gsl/gsl_sf_legendre.h>
-#include <string.h>
-#include <vector>
-#include <algorithm>
-
-#include "typedefs.h"
-#include "linalg.hpp"
-#include "lebedev_grids.hpp"
-
-namespace sirius {
-
-/// Spherical harmonics transformations and related oprtations.
-/** This class is responsible for the generation of complex and real spherical harmonics, generation of transformation
- *  matrices, transformation between spectral and real-space representations, generation of Gaunt and Clebsch-Gordan
- *  coefficients and calculation of spherical harmonic derivatives */
-class SHT // TODO: better name
-{
-    private:
-
-        /// Maximum \f$ \ell \f$ of spherical harmonics.
-        int lmax_;
-
-        /// Maximum number of \f$ \ell, m \f$ components.
-        int lmmax_;
-
-        /// Number of real-space \f$ (\theta, \phi) \f$ points on the sphere.
-        int num_points_;
-
-        /// Cartesian coordinates of points (normalized to 1).
-        mdarray<double, 2> coord_;
-
-        /// \f$ (\theta, \phi) \f$ angles of points.
-        mdarray<double, 2> tp_;
-
-        /// Point weights.
-        std::vector<double> w_;
-
-        /// Backward transformation from Ylm to spherical coordinates.
-        mdarray<double_complex, 2> ylm_backward_;
-
-        /// Forward transformation from spherical coordinates to Ylm.
-        mdarray<double_complex, 2> ylm_forward_;
-
-        /// Backward transformation from Rlm to spherical coordinates.
-        mdarray<double, 2> rlm_backward_;
-
-        /// Forward transformation from spherical coordinates to Rlm.
-        mdarray<double, 2> rlm_forward_;
-
-        /// Type of spherical grid (0: Lebedev-Laikov, 1: uniform).
-        int mesh_type_{0};
-
-    public:
-
-        /// Default constructor.
-        SHT(int lmax__)
-            : lmax_(lmax__)
-        {
-            lmmax_ = (lmax_ + 1) * (lmax_ + 1);
-
-            if (mesh_type_ == 0) {
-                num_points_ = Lebedev_Laikov_npoint(2 * lmax_);
-            }
-            if (mesh_type_ == 1) {
-                num_points_ = lmmax_;
-            }
-
-            std::vector<double> x(num_points_);
-            std::vector<double> y(num_points_);
-            std::vector<double> z(num_points_);
-
-            coord_ = mdarray<double, 2>(3, num_points_);
-
-            tp_ = mdarray<double, 2>(2, num_points_);
-
-            w_.resize(num_points_);
-
-            if (mesh_type_ == 0) Lebedev_Laikov_sphere(num_points_, &x[0], &y[0], &z[0], &w_[0]);
-            if (mesh_type_ == 1) uniform_coverage();
-
-            ylm_backward_ = mdarray<double_complex, 2>(lmmax_, num_points_);
-
-            ylm_forward_ = mdarray<double_complex, 2>(num_points_, lmmax_);
-
-            rlm_backward_ = mdarray<double, 2>(lmmax_, num_points_);
-
-            rlm_forward_ = mdarray<double, 2>(num_points_, lmmax_);
-
-            for (int itp = 0; itp < num_points_; itp++) {
-                if (mesh_type_ == 0) {
-                    coord_(0, itp) = x[itp];
-                    coord_(1, itp) = y[itp];
-                    coord_(2, itp) = z[itp];
-
-                    auto vs = spherical_coordinates(vector3d<double>(x[itp], y[itp], z[itp]));
-                    tp_(0, itp) = vs[1];
-                    tp_(1, itp) = vs[2];
-                    spherical_harmonics(lmax_, vs[1], vs[2], &ylm_backward_(0, itp));
-                    spherical_harmonics(lmax_, vs[1], vs[2], &rlm_backward_(0, itp));
-                    for (int lm = 0; lm < lmmax_; lm++) {
-                        ylm_forward_(itp, lm) = std::conj(ylm_backward_(lm, itp)) * w_[itp] * fourpi;
-                        rlm_forward_(itp, lm) = rlm_backward_(lm, itp) * w_[itp] * fourpi;
-                    }
-                }
-                if (mesh_type_ == 1) {
-                    double t = tp_(0, itp);
-                    double p = tp_(1, itp);
-
-                    coord_(0, itp) = sin(t) * cos(p);
-                    coord_(1, itp) = sin(t) * sin(p);
-                    coord_(2, itp) = cos(t);
-
-                    spherical_harmonics(lmax_, t, p, &ylm_backward_(0, itp));
-                    spherical_harmonics(lmax_, t, p, &rlm_backward_(0, itp));
-
-                    for (int lm = 0; lm < lmmax_; lm++) {
-                        ylm_forward_(lm, itp) = ylm_backward_(lm, itp);
-                        rlm_forward_(lm, itp) = rlm_backward_(lm, itp);
-                    }
-                }
-            }
-
-            if (mesh_type_ == 1) {
-                linalg<CPU>::geinv(lmmax_, ylm_forward_);
-                linalg<CPU>::geinv(lmmax_, rlm_forward_);
-            }
-
-            #if (__VERIFICATION > 0)
-            {
-                double dr = 0;
-                double dy = 0;
-
-                for (int lm = 0; lm < lmmax_; lm++)
-                {
-                    for (int lm1 = 0; lm1 < lmmax_; lm1++)
-                    {
-                        double t = 0;
-                        double_complex zt(0, 0);
-                        for (int itp = 0; itp < num_points_; itp++)
-                        {
-                            zt += ylm_forward_(itp, lm) * ylm_backward_(lm1, itp);
-                            t += rlm_forward_(itp, lm) * rlm_backward_(lm1, itp);
-                        }
-
-                        if (lm == lm1)
-                        {
-                            zt -= 1.0;
-                            t -= 1.0;
-                        }
-                        dr += std::abs(t);
-                        dy += std::abs(zt);
-                    }
-                }
-                dr = dr / lmmax_ / lmmax_;
-                dy = dy / lmmax_ / lmmax_;
-
-                if (dr > 1e-15 || dy > 1e-15)
-                {
-                    std::stringstream s;
-                    s << "spherical mesh error is too big" << std::endl
-                      << "  real spherical integration error " << dr << std::endl
-                      << "  complex spherical integration error " << dy;
-                    WARNING(s.str())
-                }
-
-                std::vector<double> flm(lmmax_);
-                std::vector<double> ftp(num_points_);
-                for (int lm = 0; lm < lmmax_; lm++)
-                {
-                    std::memset(&flm[0], 0, lmmax_ * sizeof(double));
-                    flm[lm] = 1.0;
-                    backward_transform(lmmax_, &flm[0], 1, lmmax_, &ftp[0]);
-                    forward_transform(&ftp[0], 1, lmmax_, lmmax_, &flm[0]);
-                    flm[lm] -= 1.0;
-
-                    double t = 0.0;
-                    for (int lm1 = 0; lm1 < lmmax_; lm1++) t += std::abs(flm[lm1]);
-
-                    t /= lmmax_;
-
-                    if (t > 1e-15)
-                    {
-                        std::stringstream s;
-                        s << "test of backward / forward real SHT failed" << std::endl
-                          << "  total error " << t;
-                        WARNING(s.str());
-                    }
-                }
-            }
-            #endif
-        }
-
-        /// Perform a backward transformation from spherical harmonics to spherical coordinates.
-        /** \f[
-         *      f(\theta, \phi, r) = \sum_{\ell m} f_{\ell m}(r) Y_{\ell m}(\theta, \phi)
-         *  \f]
-         *
-         *  \param [in] ld Size of leading dimension of flm.
-         *  \param [in] flm Raw pointer to \f$ f_{\ell m}(r) \f$.
-         *  \param [in] nr Number of radial points.
-         *  \param [in] lmmax Maximum number of lm- harmonics to take into sum.
-         *  \param [out] ftp Raw pointer to \f$ f(\theta, \phi, r) \f$.
-         */
-        template <typename T>
-        void backward_transform(int ld, T const* flm, int nr, int lmmax, T* ftp);
-
-        /// Perform a forward transformation from spherical coordinates to spherical harmonics.
-        /** \f[
-         *      f_{\ell m}(r) = \iint  f(\theta, \phi, r) Y_{\ell m}^{*}(\theta, \phi) \sin \theta d\phi d\theta =
-         *        \sum_{i} f(\theta_i, \phi_i, r) Y_{\ell m}^{*}(\theta_i, \phi_i) w_i
-         *  \f]
-         *
-         *  \param [in] ftp Raw pointer to \f$ f(\theta, \phi, r) \f$.
-         *  \param [in] nr Number of radial points.
-         *  \param [in] lmmax Maximum number of lm- coefficients to generate.
-         *  \param [in] ld Size of leading dimension of flm.
-         *  \param [out] flm Raw pointer to \f$ f_{\ell m}(r) \f$.
-         */
-        template <typename T>
-        void forward_transform(T const* ftp, int nr, int lmmax, int ld, T* flm);
-
-        /// Convert form Rlm to Ylm representation.
-        static void convert(int lmax__, double const* f_rlm__, double_complex* f_ylm__)
-        {
-            int lm = 0;
-            for (int l = 0; l <= lmax__; l++) {
-                for (int m = -l; m <= l; m++) {
-                    if (m == 0) {
-                        f_ylm__[lm] = f_rlm__[lm];
-                    } else {
-                        int lm1 = utils::lm(l, -m);
-                        f_ylm__[lm] = ylm_dot_rlm(l, m, m) * f_rlm__[lm] + ylm_dot_rlm(l, m, -m) * f_rlm__[lm1];
-                    }
-                    lm++;
-                }
-            }
-        }
-
-        /// Convert from Ylm to Rlm representation.
-        static void convert(int lmax__, double_complex const* f_ylm__, double* f_rlm__)
-        {
-            int lm = 0;
-            for (int l = 0; l <= lmax__; l++) {
-                for (int m = -l; m <= l; m++) {
-                    if (m == 0) {
-                        f_rlm__[lm] = std::real(f_ylm__[lm]);
-                    } else {
-                        int lm1 = utils::lm(l, -m);
-                        f_rlm__[lm] = std::real(rlm_dot_ylm(l, m, m) * f_ylm__[lm] + rlm_dot_ylm(l, m, -m) * f_ylm__[lm1]);
-                    }
-                    lm++;
-                }
-            }
-        }
-
-        //void rlm_forward_iterative_transform(double *ftp__, int lmmax, int ncol, double* flm)
-        //{
-        //    Timer t("sirius::SHT::rlm_forward_iterative_transform");
-        //
-        //    assert(lmmax <= lmmax_);
-
-        //    mdarray<double, 2> ftp(ftp__, num_points_, ncol);
-        //    mdarray<double, 2> ftp1(num_points_, ncol);
-        //
-        //    blas<cpu>::gemm(1, 0, lmmax, ncol, num_points_, 1.0, &rlm_forward_(0, 0), num_points_, &ftp(0, 0), num_points_, 0.0,
-        //                    flm, lmmax);
-        //
-        //    for (int i = 0; i < 2; i++)
-        //    {
-        //        rlm_backward_transform(flm, lmmax, ncol, &ftp1(0, 0));
-        //        double tdiff = 0.0;
-        //        for (int ir = 0; ir < ncol; ir++)
-        //        {
-        //            for (int itp = 0; itp < num_points_; itp++)
-        //            {
-        //                ftp1(itp, ir) = ftp(itp, ir) - ftp1(itp, ir);
-        //                //tdiff += fabs(ftp1(itp, ir));
-        //            }
-        //        }
-        //
-        //        for (int itp = 0; itp < num_points_; itp++)
-        //        {
-        //            tdiff += fabs(ftp1(itp, ncol - 1));
-        //        }
-        //        std::cout << "iter : " << i << " avg. MT diff = " << tdiff / num_points_ << std::endl;
-        //        blas<cpu>::gemm(1, 0, lmmax, ncol, num_points_, 1.0, &rlm_forward_(0, 0), num_points_, &ftp1(0, 0), num_points_, 1.0,
-        //                        flm, lmmax);
-        //    }
-        //}
-
-        /// Transform Cartesian coordinates [x,y,z] to spherical coordinates [r,theta,phi]
-        static vector3d<double> spherical_coordinates(vector3d<double> vc)
-        {
-            vector3d<double> vs;
-
-            const double eps{1e-12};
-
-            vs[0] = vc.length();
-
-            if (vs[0] <= eps) {
-                vs[1] = 0.0;
-                vs[2] = 0.0;
-            } else {
-                vs[1] = std::acos(vc[2] / vs[0]); // theta = cos^{-1}(z/r)
-
-                if (std::abs(vc[0]) > eps || std::abs(vc[1]) > eps) {
-                    vs[2] = std::atan2(vc[1], vc[0]); // phi = tan^{-1}(y/x)
-                    if (vs[2] < 0.0) {
-                        vs[2] += twopi;
-                    }
-                } else {
-                    vs[2] = 0.0;
-                }
-            }
-
-            return vs;
-        }
-
-        /// Generate complex spherical harmonics Ylm
-        static void spherical_harmonics(int lmax, double theta, double phi, double_complex* ylm)
-        {
-            double x = std::cos(theta);
-            std::vector<double> result_array(lmax + 1);
-
-            for (int l = 0; l <= lmax; l++) {
-                for (int m = 0; m <= l; m++) {
-                    double_complex z = std::exp(double_complex(0.0, m * phi));
-                    ylm[utils::lm(l, m)] = gsl_sf_legendre_sphPlm(l, m, x) * z;
-                    if (m % 2) {
-                        ylm[utils::lm(l, -m)] = -std::conj(ylm[utils::lm(l, m)]);
-                    } else {
-                        ylm[utils::lm(l, -m)] = std::conj(ylm[utils::lm(l, m)]);
-                    }
-                }
-            }
-        }
-
-        /// Generate real spherical harmonics Rlm
-        /** Mathematica code:
-         *  \verbatim
-         *  R[l_, m_, th_, ph_] :=
-         *   If[m > 0, std::sqrt[2]*ComplexExpand[Re[SphericalHarmonicY[l, m, th, ph]]],
-         *   If[m < 0, std::sqrt[2]*ComplexExpand[Im[SphericalHarmonicY[l, m, th, ph]]],
-         *   If[m == 0, ComplexExpand[Re[SphericalHarmonicY[l, 0, th, ph]]]]]]
-         *  \endverbatim
-         */
-        static void spherical_harmonics(int lmax, double theta, double phi, double* rlm)
-        {
-            int lmmax = (lmax + 1) * (lmax + 1);
-            std::vector<double_complex> ylm(lmmax);
-            spherical_harmonics(lmax, theta, phi, &ylm[0]);
-
-            double const t = std::sqrt(2.0);
-
-            rlm[0] = y00;
-
-            for (int l = 1; l <= lmax; l++) {
-                for (int m = -l; m < 0; m++) {
-                    rlm[utils::lm(l, m)] = t * ylm[utils::lm(l, m)].imag();
-                }
-
-                rlm[utils::lm(l, 0)] = ylm[utils::lm(l, 0)].real();
-
-                for (int m = 1; m <= l; m++) {
-                    rlm[utils::lm(l, m)] = t * ylm[utils::lm(l, m)].real();
-                }
-            }
-        }
-
-        /// Generate real or complex spherical harmonics for a given Cartesian vectors.
-        template <typename T>
-        static std::vector<T> spherical_harmonics(int lmax__, vector3d<double> vc__)
-        {
-            auto rtp = spherical_coordinates(vc__);
-            std::vector<T> v(utils::lmmax(lmax__));
-            spherical_harmonics(lmax__, rtp[1], rtp[2], v.data());
-            return std::move(v);
-        }
-
-        /// Compute element of the transformation matrix from complex to real spherical harmonics.
-        /** Real spherical harmonic can be written as a linear combination of complex harmonics:
-
-            \f[
-                R_{\ell m}(\theta, \phi) = \sum_{m'} a^{\ell}_{m' m}Y_{\ell m'}(\theta, \phi)
-            \f]
-            where
-            \f[
-                a^{\ell}_{m' m} = \langle Y_{\ell m'} | R_{\ell m} \rangle
-            \f]
-            which gives the name to this function.
-
-            Transformation from real to complex spherical harmonics is conjugate transpose:
-
-            \f[
-                Y_{\ell m}(\theta, \phi) = \sum_{m'} a^{\ell*}_{m m'}R_{\ell m'}(\theta, \phi)
-            \f]
-
-            Mathematica code:
-            \verbatim
-            b[m1_, m2_] :=
-             If[m1 == 0, 1,
-             If[m1 < 0 && m2 < 0, -I/Sqrt[2],
-             If[m1 > 0 && m2 < 0, (-1)^m1*I/Sqrt[2],
-             If[m1 < 0 && m2 > 0, (-1)^m2/Sqrt[2],
-             If[m1 > 0 && m2 > 0, 1/Sqrt[2]]]]]]
-
-            a[m1_, m2_] := If[Abs[m1] == Abs[m2], b[m1, m2], 0]
-
-            Rlm[l_, m_, t_, p_] := Sum[a[m1, m]*SphericalHarmonicY[l, m1, t, p], {m1, -l, l}]
-            \endverbatim
-         */
-        static inline double_complex ylm_dot_rlm(int l, int m1, int m2)
-        {
-            double const isqrt2 = 1.0 / std::sqrt(2);
-
-            assert(l >= 0 && std::abs(m1) <= l && std::abs(m2) <= l);
-
-            if (!((m1 == m2) || (m1 == -m2))) {
-                return double_complex(0, 0);
-            }
-
-            if (m1 == 0) {
-                return double_complex(1, 0);
-            }
-
-            if (m1 < 0) {
-                if (m2 < 0) {
-                    return -double_complex(0, isqrt2);
-                } else {
-                    return std::pow(-1.0, m2) * double_complex(isqrt2, 0);
-                }
-            } else {
-                if (m2 < 0) {
-                    return std::pow(-1.0, m1) * double_complex(0, isqrt2);
-                } else {
-                    return double_complex(isqrt2, 0);
-                }
-            }
-        }
-
-        static inline double_complex rlm_dot_ylm(int l, int m1, int m2)
-        {
-            return std::conj(ylm_dot_rlm(l, m2, m1));
-        }
-
-        /// Gaunt coefficent of three complex spherical harmonics.
-        /**
-         *  \f[
-         *    \langle Y_{\ell_1 m_1} | Y_{\ell_2 m_2} | Y_{\ell_3 m_3} \rangle
-         *  \f]
-         */
-        static double gaunt_ylm(int l1, int l2, int l3, int m1, int m2, int m3)
-        {
-            assert(l1 >= 0);
-            assert(l2 >= 0);
-            assert(l3 >= 0);
-            assert(m1 >= -l1 && m1 <= l1);
-            assert(m2 >= -l2 && m2 <= l2);
-            assert(m3 >= -l3 && m3 <= l3);
-
-            return std::pow(-1.0, std::abs(m1)) * std::sqrt(double(2 * l1 + 1) * double(2 * l2 + 1) * double(2 * l3 + 1) / fourpi) *
-                   gsl_sf_coupling_3j(2 * l1, 2 * l2, 2 * l3, 0, 0, 0) *
-                   gsl_sf_coupling_3j(2 * l1, 2 * l2, 2 * l3, -2 * m1, 2 * m2, 2 * m3);
-        }
-
-        /// Gaunt coefficent of three real spherical harmonics.
-        /**
-         *  \f[
-         *    \langle R_{\ell_1 m_1} | R_{\ell_2 m_2} | R_{\ell_3 m_3} \rangle
-         *  \f]
-         */
-        static double gaunt_rlm(int l1, int l2, int l3, int m1, int m2, int m3)
-        {
-            assert(l1 >= 0);
-            assert(l2 >= 0);
-            assert(l3 >= 0);
-            assert(m1 >= -l1 && m1 <= l1);
-            assert(m2 >= -l2 && m2 <= l2);
-            assert(m3 >= -l3 && m3 <= l3);
-
-            double d = 0;
-            for (int k1 = -l1; k1 <= l1; k1++) {
-                for (int k2 = -l2; k2 <= l2; k2++) {
-                    for (int k3 = -l3; k3 <= l3; k3++) {
-                        d += std::real(std::conj(SHT::ylm_dot_rlm(l1, k1, m1)) *
-                                                 SHT::ylm_dot_rlm(l2, k2, m2) *
-                                                 SHT::ylm_dot_rlm(l3, k3, m3)) * SHT::gaunt_ylm(l1, l2, l3, k1, k2, k3);
-                    }
-                }
-            }
-            return d;
-        }
-
-        /// Gaunt coefficent of two real spherical harmonics with a complex one.
-        /**
-         *  \f[
-         *    \langle R_{\ell_1 m_1} | Y_{\ell_2 m_2} | R_{\ell_3 m_3} \rangle
-         *  \f]
-         */
-        static double gaunt_rlm_ylm_rlm(int l1, int l2, int l3, int m1, int m2, int m3)
-        {
-            assert(l1 >= 0);
-            assert(l2 >= 0);
-            assert(l3 >= 0);
-            assert(m1 >= -l1 && m1 <= l1);
-            assert(m2 >= -l2 && m2 <= l2);
-            assert(m3 >= -l3 && m3 <= l3);
-
-            double d = 0;
-            for (int k1 = -l1; k1 <= l1; k1++) {
-                for (int k3 = -l3; k3 <= l3; k3++) {
-                    d += std::real(std::conj(SHT::ylm_dot_rlm(l1, k1, m1)) *
-                                   SHT::ylm_dot_rlm(l3, k3, m3)) * SHT::gaunt_ylm(l1, l2, l3, k1, m2, k3);
-                }
-            }
-            return d;
-        }
-
-        /// Gaunt coefficent of two complex and one real spherical harmonics.
-        /**
-         *  \f[
-         *    \langle Y_{\ell_1 m_1} | R_{\ell_2 m_2} | Y_{\ell_3 m_3} \rangle
-         *  \f]
-         */
-        static double_complex gaunt_hybrid(int l1, int l2, int l3, int m1, int m2, int m3)
-        {
-            assert(l1 >= 0);
-            assert(l2 >= 0);
-            assert(l3 >= 0);
-            assert(m1 >= -l1 && m1 <= l1);
-            assert(m2 >= -l2 && m2 <= l2);
-            assert(m3 >= -l3 && m3 <= l3);
-
-            if (m2 == 0) {
-                return double_complex(gaunt_ylm(l1, l2, l3, m1, m2, m3), 0.0);
-            } else {
-                return (ylm_dot_rlm(l2, m2, m2) * gaunt_ylm(l1, l2, l3, m1, m2, m3) +
-                        ylm_dot_rlm(l2, -m2, m2) * gaunt_ylm(l1, l2, l3, m1, -m2, m3));
-            }
-        }
-
-        void uniform_coverage()
-        {
-            tp_(0, 0) = pi;
-            tp_(1, 0) = 0;
-
-            for (int k = 1; k < num_points_ - 1; k++) {
-                double hk = -1.0 + double(2 * k) / double(num_points_ - 1);
-                tp_(0, k) = std::acos(hk);
-                double t = tp_(1, k - 1) + 3.80925122745582 / std::sqrt(double(num_points_)) / std::sqrt(1 - hk * hk);
-                tp_(1, k) = std::fmod(t, twopi);
-            }
-
-            tp_(0, num_points_ - 1) = 0;
-            tp_(1, num_points_ - 1) = 0;
-        }
-
-        /// Return Clebsch-Gordan coefficient.
-        /** Clebsch-Gordan coefficients arise when two angular momenta are combined into a
-         *  total angular momentum.
-         */
-        static inline double clebsch_gordan(int l1, int l2, int l3, int m1, int m2, int m3)
-        {
-            assert(l1 >= 0);
-            assert(l2 >= 0);
-            assert(l3 >= 0);
-            assert(m1 >= -l1 && m1 <= l1);
-            assert(m2 >= -l2 && m2 <= l2);
-            assert(m3 >= -l3 && m3 <= l3);
-
-            return std::pow(-1, l1 - l2 + m3) * std::sqrt(double(2 * l3 + 1)) *
-                   gsl_sf_coupling_3j(2 * l1, 2 * l2, 2 * l3, 2 * m1, 2 * m2, -2 * m3);
-        }
-
-        inline double_complex ylm_backward(int lm,  int itp) const
-        {
-            return ylm_backward_(lm, itp);
-        }
-
-        inline double rlm_backward(int lm,  int itp) const
-        {
-            return rlm_backward_(lm, itp);
-        }
-
-        inline double coord(int x, int itp) const
-        {
-            return coord_(x, itp);
-        }
-
-        inline vector3d<double> coord(int idx__) const
-        {
-            return vector3d<double>(coord_(0, idx__), coord_(1, idx__), coord(2, idx__));
-        }
-
-        inline double theta(int idx__) const
-        {
-            return tp_(0, idx__);
-        }
-
-        inline double phi(int idx__) const
-        {
-            return tp_(1, idx__);
-        }
-
-        inline double weight(int idx__) const
-        {
-            return w_[idx__];
-        }
-
-        inline int num_points() const
-        {
-            return num_points_;
-        }
-
-        inline int lmax() const
-        {
-            return lmax_;
-        }
-
-        inline int lmmax() const
-        {
-            return lmmax_;
-        }
-
-        static void wigner_d_matrix(int l, double beta, mdarray<double, 2>& d_mtrx__)
-        {
-            long double cos_b2 = std::cos((long double)beta / 2.0L);
-            long double sin_b2 = std::sin((long double)beta / 2.0L);
-
-            for (int m1 = -l; m1 <= l; m1++) {
-                for (int m2 = -l; m2 <= l; m2++) {
-                    long double d = 0;
-                    for (int j = 0; j <= std::min(l + m1, l - m2); j++) {
-                        if ((l - m2 - j) >= 0 && (l + m1 - j) >= 0 && (j + m2 - m1) >= 0) {
-                            long double g = (std::sqrt(utils::factorial<long double>(l + m1)) / utils::factorial<long double>(l - m2 - j)) *
-                                            (std::sqrt(utils::factorial<long double>(l - m1)) / utils::factorial<long double>(l + m1 - j)) *
-                                            (std::sqrt(utils::factorial<long double>(l - m2)) / utils::factorial<long double>(j + m2 - m1)) *
-                                            (std::sqrt(utils::factorial<long double>(l + m2)) / utils::factorial<long double>(j));
-                            d += g * std::pow(-1, j) * std::pow(cos_b2, 2 * l + m1 - m2 - 2 * j) * std::pow(sin_b2, 2 * j + m2 - m1);
-                        }
-                    }
-                    d_mtrx__(m1 + l, m2 + l) = (double)d;
-                }
-            }
-        }
-
-        static void rotation_matrix_l(int l, vector3d<double> euler_angles, int proper_rotation,
-                                      double_complex* rot_mtrx__, int ld)
-        {
-            mdarray<double_complex, 2> rot_mtrx(rot_mtrx__, ld, 2 * l + 1);
-
-            mdarray<double, 2> d_mtrx(2 * l + 1, 2 * l + 1);
-            wigner_d_matrix(l, euler_angles[1], d_mtrx);
-
-            for (int m1 = -l; m1 <= l; m1++) {
-                for (int m2 = -l; m2 <= l; m2++) {
-                    rot_mtrx(m1 + l, m2 + l) = std::exp(double_complex(0, -euler_angles[0] * m1 - euler_angles[2] * m2)) *
-                                               d_mtrx(m1 + l, m2 + l) * std::pow(proper_rotation, l);
-                }
-            }
-        }
-
-        static void rotation_matrix_l(int l, vector3d<double> euler_angles, int proper_rotation,
-                                      double* rot_mtrx__, int ld)
-        {
-            mdarray<double, 2> rot_mtrx_rlm(rot_mtrx__, ld, 2 * l + 1);
-            mdarray<double_complex, 2> rot_mtrx_ylm(2 * l + 1, 2 * l + 1);
-
-            mdarray<double, 2> d_mtrx(2 * l + 1, 2 * l + 1);
-            wigner_d_matrix(l, euler_angles[1], d_mtrx);
-
-            for (int m1 = -l; m1 <= l; m1++)
-            {
-                for (int m2 = -l; m2 <= l; m2++)
-                {
-                    rot_mtrx_ylm(m1 + l, m2 + l) = std::exp(double_complex(0, -euler_angles[0] * m1 - euler_angles[2] * m2)) *
-                                                   d_mtrx(m1 + l, m2 + l) * std::pow(proper_rotation, l);
-                }
-            }
-            for (int m1 = -l; m1 <= l; m1++)
-            {
-                auto i13 = (m1 == 0) ? std::vector<int>({0}) : std::vector<int>({-m1, m1});
-
-                for (int m2 = -l; m2 <= l; m2++)
-                {
-                    auto i24 = (m2 == 0) ? std::vector<int>({0}) : std::vector<int>({-m2, m2});
-
-                    for (int m3: i13)
-                    {
-                        for (int m4: i24)
-                        {
-                            rot_mtrx_rlm(m1 + l, m2 + l) += std::real(rlm_dot_ylm(l, m1, m3) *
-                                                                      rot_mtrx_ylm(m3 + l, m4 + l) *
-                                                                      ylm_dot_rlm(l, m4, m2));
-                        }
-                    }
-                }
-            }
-        }
-
-        template <typename T>
-        static void rotation_matrix(int              lmax,
-                                    vector3d<double> euler_angles,
-                                    int              proper_rotation,
-                                    mdarray<T, 2>&   rotm)
-        {
-            rotm.zero();
-
-            for (int l = 0; l <= lmax; l++) {
-                rotation_matrix_l(l, euler_angles, proper_rotation, &rotm(l * l, l * l), rotm.ld());
-            }
-        }
-
-        /// Compute derivative of real-spherical harmonic with respect to theta angle.
-        static void dRlm_dtheta(int lmax, double theta, double phi, mdarray<double, 1>& data)
-        {
-            assert(lmax <= 8);
-
-            data[0]=0;
-
-            if (lmax==0) return;
-
-            auto cos_theta = SHT::cosxn(lmax, theta);
-            auto sin_theta = SHT::sinxn(lmax, theta);
-            auto cos_phi = SHT::cosxn(lmax, phi);
-            auto sin_phi = SHT::sinxn(lmax, phi);
-
-            data[1]=-(std::sqrt(3/pi)*cos_theta[0]*sin_phi[0])/2.;
-
-            data[2]=-(std::sqrt(3/pi)*sin_theta[0])/2.;
-
-            data[3]=-(std::sqrt(3/pi)*cos_phi[0]*cos_theta[0])/2.;
-
-            if (lmax==1) return;
-
-            data[4]=-(std::sqrt(15/pi)*cos_phi[0]*cos_theta[0]*sin_phi[0]*sin_theta[0]);
-
-            data[5]=-(std::sqrt(15/pi)*cos_theta[1]*sin_phi[0])/2.;
-
-            data[6]=(-3*std::sqrt(5/pi)*cos_theta[0]*sin_theta[0])/2.;
-
-            data[7]=-(std::sqrt(15/pi)*cos_phi[0]*cos_theta[1])/2.;
-
-            data[8]=(std::sqrt(15/pi)*cos_phi[1]*sin_theta[1])/4.;
-
-            if (lmax==2) return;
-
-            data[9]=(-3*std::sqrt(35/(2.*pi))*cos_theta[0]*sin_phi[2]*std::pow(sin_theta[0],2))/4.;
-
-            data[10]=(std::sqrt(105/pi)*sin_phi[1]*(sin_theta[0] - 3*sin_theta[2]))/16.;
-
-            data[11]=-(std::sqrt(21/(2.*pi))*(cos_theta[0] + 15*cos_theta[2])*sin_phi[0])/16.;
-
-            data[12]=(-3*std::sqrt(7/pi)*(sin_theta[0] + 5*sin_theta[2]))/16.;
-
-            data[13]=-(std::sqrt(21/(2.*pi))*cos_phi[0]*(cos_theta[0] + 15*cos_theta[2]))/16.;
-
-            data[14]=-(std::sqrt(105/pi)*cos_phi[1]*(sin_theta[0] - 3*sin_theta[2]))/16.;
-
-            data[15]=(-3*std::sqrt(35/(2.*pi))*cos_phi[2]*cos_theta[0]*std::pow(sin_theta[0],2))/4.;
-
-            if (lmax==3) return;
-
-            data[16]=(-3*std::sqrt(35/pi)*cos_theta[0]*sin_phi[3]*std::pow(sin_theta[0],3))/4.;
-
-            data[17]=(-3*std::sqrt(35/(2.*pi))*(1 + 2*cos_theta[1])*sin_phi[2]*std::pow(sin_theta[0],2))/4.;
-
-            data[18]=(3*std::sqrt(5/pi)*sin_phi[1]*(2*sin_theta[1] - 7*sin_theta[3]))/16.;
-
-            data[19]=(-3*std::sqrt(5/(2.*pi))*(cos_theta[1] + 7*cos_theta[3])*sin_phi[0])/8.;
-
-            data[20]=(-15*(2*sin_theta[1] + 7*sin_theta[3]))/(32.*std::sqrt(pi));
-
-            data[21]=(-3*std::sqrt(5/(2.*pi))*cos_phi[0]*(cos_theta[1] + 7*cos_theta[3]))/8.;
-
-            data[22]=(3*std::sqrt(5/pi)*cos_phi[1]*(-2*sin_theta[1] + 7*sin_theta[3]))/16.;
-
-            data[23]=(-3*std::sqrt(35/(2.*pi))*cos_phi[2]*(1 + 2*cos_theta[1])*std::pow(sin_theta[0],2))/4.;
-
-            data[24]=(3*std::sqrt(35/pi)*cos_phi[3]*cos_theta[0]*std::pow(sin_theta[0],3))/4.;
-
-            if (lmax==4) return;
-
-            data[25]=(-15*std::sqrt(77/(2.*pi))*cos_theta[0]*sin_phi[4]*std::pow(sin_theta[0],4))/16.;
-
-            data[26]=(-3*std::sqrt(385/pi)*(3 + 5*cos_theta[1])*sin_phi[3]*std::pow(sin_theta[0],3))/32.;
-
-            data[27]=(-3*std::sqrt(385/(2.*pi))*cos_theta[0]*(1 + 15*cos_theta[1])*sin_phi[2]*std::pow(sin_theta[0],2))/32.;
-
-            data[28]=(std::sqrt(1155/pi)*sin_phi[1]*(2*sin_theta[0] + 3*(sin_theta[2] - 5*sin_theta[4])))/128.;
-
-            data[29]=-(std::sqrt(165/pi)*(2*cos_theta[0] + 21*(cos_theta[2] + 5*cos_theta[4]))*sin_phi[0])/256.;
-
-            data[30]=(-15*std::sqrt(11/pi)*(2*sin_theta[0] + 7*(sin_theta[2] + 3*sin_theta[4])))/256.;
-
-            data[31]=-(std::sqrt(165/pi)*cos_phi[0]*(2*cos_theta[0] + 21*(cos_theta[2] + 5*cos_theta[4])))/256.;
-
-            data[32]=(std::sqrt(1155/pi)*cos_phi[1]*(-2*sin_theta[0] - 3*sin_theta[2] + 15*sin_theta[4]))/128.;
-
-            data[33]=(-3*std::sqrt(385/(2.*pi))*cos_phi[2]*(17*cos_theta[0] + 15*cos_theta[2])*std::pow(sin_theta[0],2))/64.;
-
-            data[34]=(3*std::sqrt(385/pi)*cos_phi[3]*(3 + 5*cos_theta[1])*std::pow(sin_theta[0],3))/32.;
-
-            data[35]=(-15*std::sqrt(77/(2.*pi))*cos_phi[4]*cos_theta[0]*std::pow(sin_theta[0],4))/16.;
-
-            if (lmax==5) return;
-
-            data[36]=(-3*std::sqrt(3003/(2.*pi))*cos_theta[0]*sin_phi[5]*std::pow(sin_theta[0],5))/16.;
-
-            data[37]=(-3*std::sqrt(1001/(2.*pi))*(2 + 3*cos_theta[1])*sin_phi[4]*std::pow(sin_theta[0],4))/16.;
-
-            data[38]=(-3*std::sqrt(91/pi)*cos_theta[0]*(7 + 33*cos_theta[1])*sin_phi[3]*std::pow(sin_theta[0],3))/32.;
-
-            data[39]=(-3*std::sqrt(1365/(2.*pi))*(7 + 14*cos_theta[1] + 11*cos_theta[3])*sin_phi[2]*std::pow(sin_theta[0],2))/64.;
-
-            data[40]=(std::sqrt(1365/(2.*pi))*sin_phi[1]*(17*sin_theta[1] + 12*sin_theta[3] - 99*sin_theta[5]))/512.;
-
-            data[41]=-(std::sqrt(273/pi)*(5*cos_theta[1] + 24*cos_theta[3] + 99*cos_theta[5])*sin_phi[0])/256.;
-
-            data[42]=(-21*std::sqrt(13/pi)*(5*sin_theta[1] + 12*sin_theta[3] + 33*sin_theta[5]))/512.;
-
-            data[43]=-(std::sqrt(273/pi)*cos_phi[0]*(5*cos_theta[1] + 24*cos_theta[3] + 99*cos_theta[5]))/256.;
-
-            data[44]=(std::sqrt(1365/(2.*pi))*cos_phi[1]*(-17*sin_theta[1] - 12*sin_theta[3] + 99*sin_theta[5]))/512.;
-
-            data[45]=(-3*std::sqrt(1365/(2.*pi))*cos_phi[2]*(7 + 14*cos_theta[1] + 11*cos_theta[3])*std::pow(sin_theta[0],2))/64.;
-
-            data[46]=(3*std::sqrt(91/pi)*cos_phi[3]*(47*cos_theta[0] + 33*cos_theta[2])*std::pow(sin_theta[0],3))/64.;
-
-            data[47]=(-3*std::sqrt(1001/(2.*pi))*cos_phi[4]*(2 + 3*cos_theta[1])*std::pow(sin_theta[0],4))/16.;
-
-            data[48]=(3*std::sqrt(3003/(2.*pi))*cos_phi[5]*cos_theta[0]*std::pow(sin_theta[0],5))/16.;
-
-            if (lmax==6) return;
-
-            data[49]=(-21*std::sqrt(715/pi)*cos_theta[0]*sin_phi[6]*std::pow(sin_theta[0],6))/64.;
-
-            data[50]=(-3*std::sqrt(5005/(2.*pi))*(5 + 7*cos_theta[1])*sin_phi[5]*std::pow(sin_theta[0],5))/64.;
-
-            data[51]=(-3*std::sqrt(385/pi)*cos_theta[0]*(29 + 91*cos_theta[1])*sin_phi[4]*std::pow(sin_theta[0],4))/128.;
-
-            data[52]=(-3*std::sqrt(385/pi)*(81 + 148*cos_theta[1] + 91*cos_theta[3])*sin_phi[3]*std::pow(sin_theta[0],3))/256.;
-
-            data[53]=(-3*std::sqrt(35/pi)*cos_theta[0]*(523 + 396*cos_theta[1] + 1001*cos_theta[3])*sin_phi[2]*std::pow(sin_theta[0],2))/512.;
-
-            data[54]=(3*std::sqrt(35/(2.*pi))*sin_phi[1]*(75*sin_theta[0] + 171*sin_theta[2] + 55*sin_theta[4] - 1001*sin_theta[6]))/2048.;
-
-            data[55]=-(std::sqrt(105/pi)*(25*cos_theta[0] + 243*cos_theta[2] + 825*cos_theta[4] + 3003*cos_theta[6])*sin_phi[0])/4096.;
-
-            data[56]=(-7*std::sqrt(15/pi)*(25*sin_theta[0] + 81*sin_theta[2] + 165*sin_theta[4] + 429*sin_theta[6]))/2048.;
-
-            data[57]=-(std::sqrt(105/pi)*cos_phi[0]*(25*cos_theta[0] + 243*cos_theta[2] + 825*cos_theta[4] + 3003*cos_theta[6]))/4096.;
-
-            data[58]=(-3*std::sqrt(35/(2.*pi))*cos_phi[1]*(75*sin_theta[0] + 171*sin_theta[2] + 55*sin_theta[4] - 1001*sin_theta[6]))/2048.;
-
-            data[59]=(-3*std::sqrt(35/pi)*cos_phi[2]*(1442*cos_theta[0] + 1397*cos_theta[2] + 1001*cos_theta[4])*std::pow(sin_theta[0],2))/1024.;
-
-            data[60]=(3*std::sqrt(385/pi)*cos_phi[3]*(81 + 148*cos_theta[1] + 91*cos_theta[3])*std::pow(sin_theta[0],3))/256.;
-
-            data[61]=(-3*std::sqrt(385/pi)*cos_phi[4]*(149*cos_theta[0] + 91*cos_theta[2])*std::pow(sin_theta[0],4))/256.;
-
-            data[62]=(3*std::sqrt(5005/(2.*pi))*cos_phi[5]*(5 + 7*cos_theta[1])*std::pow(sin_theta[0],5))/64.;
-
-            data[63]=(-21*std::sqrt(715/pi)*cos_phi[6]*cos_theta[0]*std::pow(sin_theta[0],6))/64.;
-
-            if (lmax==7) return;
-
-            data[64]=(-3*std::sqrt(12155/pi)*cos_theta[0]*sin_phi[7]*std::pow(sin_theta[0],7))/32.;
-
-            data[65]=(-3*std::sqrt(12155/pi)*(3 + 4*cos_theta[1])*sin_phi[6]*std::pow(sin_theta[0],6))/64.;
-
-            data[66]=(-3*std::sqrt(7293/(2.*pi))*cos_theta[0]*(2 + 5*cos_theta[1])*sin_phi[5]*std::pow(sin_theta[0],5))/16.;
-
-            data[67]=(-3*std::sqrt(17017/pi)*(11 + 19*cos_theta[1] + 10*cos_theta[3])*sin_phi[4]*std::pow(sin_theta[0],4))/128.;
-
-            data[68]=(-3*std::sqrt(1309/pi)*cos_theta[0]*(43 + 52*cos_theta[1] + 65*cos_theta[3])*sin_phi[3]*std::pow(sin_theta[0],3))/128.;
-
-            data[69]=(-3*std::sqrt(19635/pi)*(21 + 42*cos_theta[1] + 39*cos_theta[3] + 26*cos_theta[5])*sin_phi[2]*std::pow(sin_theta[0],2))/512.;
-
-            data[70]=(-3*std::sqrt(595/(2.*pi))*(-8 + 121*cos_theta[1] + 143*cos_theta[5])*sin_phi[1]*sin_theta[1])/512.;
-
-            data[71]=(-3*std::sqrt(17/pi)*(35*cos_theta[1] + 154*cos_theta[3] + 429*cos_theta[5] + 1430*cos_theta[7])*sin_phi[0])/2048.;
-
-            data[72]=(-9*std::sqrt(17/pi)*(70*sin_theta[1] + 154*sin_theta[3] + 286*sin_theta[5] + 715*sin_theta[7]))/4096.;
-
-            data[73]=(-3*std::sqrt(17/pi)*cos_phi[0]*(35*cos_theta[1] + 154*cos_theta[3] + 429*cos_theta[5] + 1430*cos_theta[7]))/2048.;
-
-            data[74]=(3*std::sqrt(595/(2.*pi))*cos_phi[1]*(-16*sin_theta[1] - 22*sin_theta[3] + 143*sin_theta[7]))/1024.;
-
-            data[75]=(-3*std::sqrt(19635/pi)*cos_phi[2]*(21 + 42*cos_theta[1] + 39*cos_theta[3] + 26*cos_theta[5])*std::pow(sin_theta[0],2))/512.;
-
-            data[76]=(3*std::sqrt(1309/pi)*cos_phi[3]*(138*cos_theta[0] + 117*cos_theta[2] + 65*cos_theta[4])*std::pow(sin_theta[0],3))/256.;
-
-            data[77]=(-3*std::sqrt(17017/pi)*cos_phi[4]*(11 + 19*cos_theta[1] + 10*cos_theta[3])*std::pow(sin_theta[0],4))/128.;
-
-            data[78]=(3*std::sqrt(7293/(2.*pi))*cos_phi[5]*(9*cos_theta[0] + 5*cos_theta[2])*std::pow(sin_theta[0],5))/32.;
-
-            data[79]=(-3*std::sqrt(12155/pi)*cos_phi[6]*(3 + 4*cos_theta[1])*std::pow(sin_theta[0],6))/64.;
-
-            data[80]=(3*std::sqrt(12155/pi)*cos_phi[7]*cos_theta[0]*std::pow(sin_theta[0],7))/32.;
-        }
-
-        ///  Compute derivative of real-spherical harmonic with respect to phi angle and divide by sin(theta).
-        static void dRlm_dphi_sin_theta(int lmax, double theta, double phi, mdarray<double, 1>& data)
-        {
-            assert(lmax <= 8);
-
-            data[0]=0;
-
-            if (lmax==0) return;
-
-            auto cos_theta = SHT::cosxn(lmax, theta);
-            auto sin_theta = SHT::sinxn(lmax, theta);
-            auto cos_phi = SHT::cosxn(lmax, phi);
-            auto sin_phi = SHT::sinxn(lmax, phi);
-
-            data[1]=-(std::sqrt(3/pi)*cos_phi[0])/2.;
-
-            data[2]=0;
-
-            data[3]=(std::sqrt(3/pi)*sin_phi[0])/2.;
-
-            if (lmax==1) return;
-
-            data[4]=-(std::sqrt(15/pi)*cos_phi[1]*sin_theta[0])/2.;
-
-            data[5]=-(std::sqrt(15/pi)*cos_phi[0]*cos_theta[0])/2.;
-
-            data[6]=0;
-
-            data[7]=(std::sqrt(15/pi)*cos_theta[0]*sin_phi[0])/2.;
-
-            data[8]=-(std::sqrt(15/pi)*cos_phi[0]*sin_phi[0]*sin_theta[0]);
-
-            if (lmax==2) return;
-
-            data[9]=(-3*std::sqrt(35/(2.*pi))*cos_phi[2]*std::pow(sin_theta[0],2))/4.;
-
-            data[10]=-(std::sqrt(105/pi)*cos_phi[1]*sin_theta[1])/4.;
-
-            data[11]=-(std::sqrt(21/(2.*pi))*cos_phi[0]*(3 + 5*cos_theta[1]))/8.;
-
-            data[12]=0;
-
-            data[13]=(std::sqrt(21/(2.*pi))*(3 + 5*cos_theta[1])*sin_phi[0])/8.;
-
-            data[14]=-(std::sqrt(105/pi)*cos_phi[0]*cos_theta[0]*sin_phi[0]*sin_theta[0]);
-
-            data[15]=(3*std::sqrt(35/(2.*pi))*sin_phi[2]*std::pow(sin_theta[0],2))/4.;
-
-            if (lmax==3) return;
-
-            data[16]=(-3*std::sqrt(35/pi)*cos_phi[3]*std::pow(sin_theta[0],3))/4.;
-
-            data[17]=(-9*std::sqrt(35/(2.*pi))*cos_phi[2]*cos_theta[0]*std::pow(sin_theta[0],2))/4.;
-
-            data[18]=(-3*std::sqrt(5/pi)*cos_phi[1]*(3*sin_theta[0] + 7*sin_theta[2]))/16.;
-
-            data[19]=(-3*std::sqrt(5/(2.*pi))*cos_phi[0]*(9*cos_theta[0] + 7*cos_theta[2]))/16.;
-
-            data[20]=0;
-
-            data[21]=(3*std::sqrt(5/(2.*pi))*cos_theta[0]*(1 + 7*cos_theta[1])*sin_phi[0])/8.;
-
-            data[22]=(-3*std::sqrt(5/pi)*sin_phi[1]*(3*sin_theta[0] + 7*sin_theta[2]))/16.;
-
-            data[23]=(9*std::sqrt(35/(2.*pi))*cos_theta[0]*sin_phi[2]*std::pow(sin_theta[0],2))/4.;
-
-            data[24]=(-3*std::sqrt(35/pi)*sin_phi[3]*std::pow(sin_theta[0],3))/4.;
-
-            if (lmax==4) return;
-
-            data[25]=(-15*std::sqrt(77/(2.*pi))*cos_phi[4]*std::pow(sin_theta[0],4))/16.;
-
-            data[26]=(-3*std::sqrt(385/pi)*cos_phi[3]*cos_theta[0]*std::pow(sin_theta[0],3))/4.;
-
-            data[27]=(-3*std::sqrt(385/(2.*pi))*cos_phi[2]*(7 + 9*cos_theta[1])*std::pow(sin_theta[0],2))/32.;
-
-            data[28]=-(std::sqrt(1155/pi)*cos_phi[1]*(2*sin_theta[1] + 3*sin_theta[3]))/32.;
-
-            data[29]=-(std::sqrt(165/pi)*cos_phi[0]*(15 + 28*cos_theta[1] + 21*cos_theta[3]))/128.;
-
-            data[30]=0;
-
-            data[31]=(std::sqrt(165/pi)*(15 + 28*cos_theta[1] + 21*cos_theta[3])*sin_phi[0])/128.;
-
-            data[32]=-(std::sqrt(1155/pi)*sin_phi[1]*(2*sin_theta[1] + 3*sin_theta[3]))/32.;
-
-            data[33]=(3*std::sqrt(385/(2.*pi))*(7 + 9*cos_theta[1])*sin_phi[2]*std::pow(sin_theta[0],2))/32.;
-
-            data[34]=(-3*std::sqrt(385/pi)*cos_theta[0]*sin_phi[3]*std::pow(sin_theta[0],3))/4.;
-
-            data[35]=(15*std::sqrt(77/(2.*pi))*sin_phi[4]*std::pow(sin_theta[0],4))/16.;
-
-            if (lmax==5) return;
-
-            data[36]=(-3*std::sqrt(3003/(2.*pi))*cos_phi[5]*std::pow(sin_theta[0],5))/16.;
-
-            data[37]=(-15*std::sqrt(1001/(2.*pi))*cos_phi[4]*cos_theta[0]*std::pow(sin_theta[0],4))/16.;
-
-            data[38]=(-3*std::sqrt(91/pi)*cos_phi[3]*(9 + 11*cos_theta[1])*std::pow(sin_theta[0],3))/16.;
-
-            data[39]=(-3*std::sqrt(1365/(2.*pi))*cos_phi[2]*(21*cos_theta[0] + 11*cos_theta[2])*std::pow(sin_theta[0],2))/64.;
-
-            data[40]=-(std::sqrt(1365/(2.*pi))*cos_phi[1]*(10*sin_theta[0] + 27*sin_theta[2] + 33*sin_theta[4]))/256.;
-
-            data[41]=-(std::sqrt(273/pi)*cos_phi[0]*(50*cos_theta[0] + 45*cos_theta[2] + 33*cos_theta[4]))/256.;
-
-            data[42]=0;
-
-            data[43]=(std::sqrt(273/pi)*cos_theta[0]*(19 + 12*cos_theta[1] + 33*cos_theta[3])*sin_phi[0])/128.;
-
-            data[44]=-(std::sqrt(1365/(2.*pi))*sin_phi[1]*(10*sin_theta[0] + 27*sin_theta[2] + 33*sin_theta[4]))/256.;
-
-            data[45]=(3*std::sqrt(1365/(2.*pi))*cos_theta[0]*(5 + 11*cos_theta[1])*sin_phi[2]*std::pow(sin_theta[0],2))/32.;
-
-            data[46]=(-3*std::sqrt(91/pi)*(9 + 11*cos_theta[1])*sin_phi[3]*std::pow(sin_theta[0],3))/16.;
-
-            data[47]=(15*std::sqrt(1001/(2.*pi))*cos_theta[0]*sin_phi[4]*std::pow(sin_theta[0],4))/16.;
-
-            data[48]=(-3*std::sqrt(3003/(2.*pi))*sin_phi[5]*std::pow(sin_theta[0],5))/16.;
-
-            if (lmax==6) return;
-
-            data[49]=(-21*std::sqrt(715/pi)*cos_phi[6]*std::pow(sin_theta[0],6))/64.;
-
-            data[50]=(-9*std::sqrt(5005/(2.*pi))*cos_phi[5]*cos_theta[0]*std::pow(sin_theta[0],5))/16.;
-
-            data[51]=(-15*std::sqrt(385/pi)*cos_phi[4]*(11 + 13*cos_theta[1])*std::pow(sin_theta[0],4))/128.;
-
-            data[52]=(-3*std::sqrt(385/pi)*cos_phi[3]*(27*cos_theta[0] + 13*cos_theta[2])*std::pow(sin_theta[0],3))/32.;
-
-            data[53]=(-9*std::sqrt(35/pi)*cos_phi[2]*(189 + 308*cos_theta[1] + 143*cos_theta[3])*std::pow(sin_theta[0],2))/512.;
-
-            data[54]=(-3*std::sqrt(35/(2.*pi))*cos_phi[1]*(75*sin_theta[1] + 132*sin_theta[3] + 143*sin_theta[5]))/512.;
-
-            data[55]=-(std::sqrt(105/pi)*cos_phi[0]*(350 + 675*cos_theta[1] + 594*cos_theta[3] + 429*cos_theta[5]))/2048.;
-
-            data[56]=0;
-
-            data[57]=(std::sqrt(105/pi)*(350 + 675*cos_theta[1] + 594*cos_theta[3] + 429*cos_theta[5])*sin_phi[0])/2048.;
-
-            data[58]=(-3*std::sqrt(35/(2.*pi))*sin_phi[1]*(75*sin_theta[1] + 132*sin_theta[3] + 143*sin_theta[5]))/512.;
-
-            data[59]=(9*std::sqrt(35/pi)*(189 + 308*cos_theta[1] + 143*cos_theta[3])*sin_phi[2]*std::pow(sin_theta[0],2))/512.;
-
-            data[60]=(-3*std::sqrt(385/pi)*cos_theta[0]*(7 + 13*cos_theta[1])*sin_phi[3]*std::pow(sin_theta[0],3))/16.;
-
-            data[61]=(15*std::sqrt(385/pi)*(11 + 13*cos_theta[1])*sin_phi[4]*std::pow(sin_theta[0],4))/128.;
-
-            data[62]=(-9*std::sqrt(5005/(2.*pi))*cos_theta[0]*sin_phi[5]*std::pow(sin_theta[0],5))/16.;
-
-            data[63]=(21*std::sqrt(715/pi)*sin_phi[6]*std::pow(sin_theta[0],6))/64.;
-
-            if (lmax==7) return;
-
-            data[64]=(-3*std::sqrt(12155/pi)*cos_phi[7]*std::pow(sin_theta[0],7))/32.;
-
-            data[65]=(-21*std::sqrt(12155/pi)*cos_phi[6]*cos_theta[0]*std::pow(sin_theta[0],6))/64.;
-
-            data[66]=(-3*std::sqrt(7293/(2.*pi))*cos_phi[5]*(13 + 15*cos_theta[1])*std::pow(sin_theta[0],5))/64.;
-
-            data[67]=(-15*std::sqrt(17017/pi)*cos_phi[4]*(11*cos_theta[0] + 5*cos_theta[2])*std::pow(sin_theta[0],4))/256.;
-
-            data[68]=(-3*std::sqrt(1309/pi)*cos_phi[3]*(99 + 156*cos_theta[1] + 65*cos_theta[3])*std::pow(sin_theta[0],3))/256.;
-
-            data[69]=(-3*std::sqrt(19635/pi)*cos_phi[2]*(126*cos_theta[0] + 91*cos_theta[2] + 39*cos_theta[4])*std::pow(sin_theta[0],2))/1024.;
-
-            data[70]=(-3*std::sqrt(595/(2.*pi))*cos_phi[1]*(35*sin_theta[0] + 11*(9*sin_theta[2] + 13*(sin_theta[4] + sin_theta[6]))))/2048.;
-
-            data[71]=(-3*std::sqrt(17/pi)*cos_phi[0]*(1225*cos_theta[0] + 11*(105*cos_theta[2] + 91*cos_theta[4] + 65*cos_theta[6])))/4096.;
-
-            data[72]=0;
-
-            data[73]=(3*std::sqrt(17/pi)*cos_theta[0]*(178 + 869*cos_theta[1] + 286*cos_theta[3] + 715*cos_theta[5])*sin_phi[0])/2048.;
-
-            data[74]=(-3*std::sqrt(595/(2.*pi))*sin_phi[1]*(35*sin_theta[0] + 11*(9*sin_theta[2] + 13*(sin_theta[4] + sin_theta[6]))))/2048.;
-
-            data[75]=(3*std::sqrt(19635/pi)*cos_theta[0]*(37 + 52*cos_theta[1] + 39*cos_theta[3])*sin_phi[2]*std::pow(sin_theta[0],2))/512.;
-
-            data[76]=(-3*std::sqrt(1309/pi)*(99 + 156*cos_theta[1] + 65*cos_theta[3])*sin_phi[3]*std::pow(sin_theta[0],3))/256.;
-
-            data[77]=(15*std::sqrt(17017/pi)*(11*cos_theta[0] + 5*cos_theta[2])*sin_phi[4]*std::pow(sin_theta[0],4))/256.;
-
-            data[78]=(-3*std::sqrt(7293/(2.*pi))*(13 + 15*cos_theta[1])*sin_phi[5]*std::pow(sin_theta[0],5))/64.;
-
-            data[79]=(21*std::sqrt(12155/pi)*cos_theta[0]*sin_phi[6]*std::pow(sin_theta[0],6))/64.;
-
-            data[80]=(-3*std::sqrt(12155/pi)*sin_phi[7]*std::pow(sin_theta[0],7))/32.;
-        }
-
-        /// convert 3x3 transformation matrix to SU2 2x2 matrix
-        /// Create quaternion components from the 3x3 matrix. The components are just a w = Cos(\Omega/2)
-        /// and {x,y,z} = unit rotation vector multiplied by Sin[\Omega/2]
-        /// see https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation
-        /// and https://en.wikipedia.org/wiki/Rotation_group_SO(3)#Quaternions_of_unit_norm
-        static mdarray<double_complex, 2> rotation_matrix_su2(const matrix3d<double>& m)
-        {
-            double det = m.det() > 0 ? 1.0 : -1.0;
-
-            matrix3d<double> mat = m * det;
-            mdarray<double_complex, 2> su2mat(2, 2);
-
-            su2mat.zero();
-
-            /* make quaternion components*/
-            double w = sqrt( std::max( 0., 1. + mat(0,0) + mat(1,1) + mat(2,2) ) ) / 2.;
-            double x = sqrt( std::max( 0., 1. + mat(0,0) - mat(1,1) - mat(2,2) ) ) / 2.;
-            double y = sqrt( std::max( 0., 1. - mat(0,0) + mat(1,1) - mat(2,2) ) ) / 2.;
-            double z = sqrt( std::max( 0., 1. - mat(0,0) - mat(1,1) + mat(2,2) ) ) / 2.;
-            x = std::copysign( x, mat(2,1) - mat(1,2) );
-            y = std::copysign( y, mat(0,2) - mat(2,0) );
-            z = std::copysign( z, mat(1,0) - mat(0,1) );
-
-            su2mat(0, 0) = double_complex( w, -z);
-            su2mat(1, 1) = double_complex( w,  z);
-            su2mat(0, 1) = double_complex(-y, -x);
-            su2mat(1, 0) = double_complex( y, -x);
-
-            return std::move(su2mat);
-        }
-
-        /// Compute the derivatives of real spherical harmonics over the components of cartesian vector.
-        /** The following derivative is computed:
-         *  \f[
-         *    \frac{\partial R_{\ell m}(\theta_r, \phi_r)}{\partial r_{\mu}} =
-         *      \frac{\partial R_{\ell m}(\theta_r, \phi_r)}{\partial \theta_r} \frac{\partial \theta_r}{\partial r_{\mu}} +
-         *      \frac{\partial R_{\ell m}(\theta_r, \phi_r)}{\partial \phi_r} \frac{\partial \phi_r}{\partial r_{\mu}}
-         *  \f]
-         *  The derivatives of angles are:
-         *  \f[
-         *     \frac{\partial \theta_r}{\partial r_{x}} = \frac{\cos(\phi_r) \cos(\theta_r)}{r} \\
-         *     \frac{\partial \theta_r}{\partial r_{y}} = \frac{\cos(\theta_r) \sin(\phi_r)}{r} \\
-         *     \frac{\partial \theta_r}{\partial r_{z}} = -\frac{\sin(\theta_r)}{r}
-         *  \f]
-         *  and
-         *  \f[
-         *     \frac{\partial \phi_r}{\partial r_{x}} = -\frac{\sin(\phi_r)}{\sin(\theta_r) r} \\
-         *     \frac{\partial \phi_r}{\partial r_{y}} = \frac{\cos(\phi_r)}{\sin(\theta_r) r} \\
-         *     \frac{\partial \phi_r}{\partial r_{z}} = 0
-         *  \f]
-         *  The derivative of \f$ \phi \f$ has discontinuities at \f$ \theta = 0, \theta=\pi \f$. This, however, is not a problem, because
-         *  multiplication by the the derivative of \f$ R_{\ell m} \f$ removes it. The following functions have to be hardcoded:
-         *  \f[
-         *    \frac{\partial R_{\ell m}(\theta, \phi)}{\partial \theta} \\
-         *    \frac{\partial R_{\ell m}(\theta, \phi)}{\partial \phi} \frac{1}{\sin(\theta)}
-         *  \f]
-         *
-         *  Mathematica script for spherical harmonic derivatives:
-            \verbatim
-            Rlm[l_, m_, th_, ph_] :=
-             If[m > 0, Sqrt[2]*ComplexExpand[Re[SphericalHarmonicY[l, m, th, ph]]],
-               If[m < 0, Sqrt[2]*ComplexExpand[Im[SphericalHarmonicY[l, m, th, ph]]],
-                 If[m == 0, ComplexExpand[Re[SphericalHarmonicY[l, 0, th, ph]]]]
-               ]
-             ]
-            Do[Print[FullSimplify[D[Rlm[l, m, theta, phi], theta]]], {l, 0, 4}, {m, -l, l}]
-            Do[Print[FullSimplify[TrigExpand[D[Rlm[l, m, theta, phi], phi]/Sin[theta]]]], {l, 0, 4}, {m, -l, l}]
-            \endverbatim
-        */
-        static void dRlm_dr(int lmax__, vector3d<double>& r__, mdarray<double, 2>& data__)
-        {
-            /* get spherical coordinates of the Cartesian vector */
-            auto vrs = spherical_coordinates(r__);
-
-            if (vrs[0] < 1e-12) {
-                data__.zero();
-                return;
-            }
-
-            int lmmax = (lmax__ + 1) * (lmax__ + 1);
-
-            double theta = vrs[1];
-            double phi   = vrs[2];
-
-            vector3d<double> dtheta_dr({std::cos(phi) * std::cos(theta), std::cos(theta) * std::sin(phi), -std::sin(theta)});
-            vector3d<double> dphi_dr({-std::sin(phi), std::cos(phi), 0.0});
-
-            mdarray<double, 1> dRlm_dt(lmmax);
-            mdarray<double, 1> dRlm_dp_sin_t(lmmax);
-
-            dRlm_dtheta(lmax__, theta, phi, dRlm_dt);
-            dRlm_dphi_sin_theta(lmax__, theta, phi, dRlm_dp_sin_t);
-
-            for (int mu = 0; mu < 3; mu++) {
-                for (int lm = 0; lm < lmmax; lm++) {
-                    data__(lm, mu) = (dRlm_dt[lm] * dtheta_dr[mu] + dRlm_dp_sin_t[lm] * dphi_dr[mu]) / vrs[0];
-                }
-            }
-        }
-
-        /// Generate \f$ \cos(m x) \f$ for m in [1, n] using recursion.
-        static mdarray<double, 1> cosxn(int n__, double x__)
-        {
-            assert(n__ > 0);
-            mdarray<double, 1> data(n__);
-            data[0] = std::cos(x__);
-            if (n__ > 1) {
-                data[1] = std::cos(2 * x__);
-                for (int i = 2; i < n__; i++) {
-                    data[i] = 2 * data[0] * data[i - 1] - data[i - 2];
-                }
-            }
-            return std::move(data);
-        }
-
-        /// Generate \f$ \sin(m x) \f$ for m in [1, n] using recursion.
-        static mdarray<double, 1> sinxn(int n__, double x__)
-        {
-            assert(n__ > 0);
-            mdarray<double, 1> data(n__);
-            auto cosx = std::cos(x__);
-            data[0] = std::sin(x__);
-            if (n__ > 1) {
-                data[1] = std::sin(2 * x__);
-                for (int i = 2; i < n__; i++) {
-                    data[i] = 2 * cosx * data[i - 1] - data[i - 2];
-                }
-            }
-            return std::move(data);
-        }
-};
-
-template <>
-inline void SHT::backward_transform<double>(int ld, double const* flm, int nr, int lmmax, double* ftp)
-{
-    assert(lmmax <= lmmax_);
-    assert(ld >= lmmax);
-    linalg<CPU>::gemm(1, 0, num_points_, nr, lmmax, &rlm_backward_(0, 0), lmmax_, flm, ld, ftp, num_points_);
-}
-
-template <>
-inline void SHT::backward_transform<double_complex>(int ld, double_complex const* flm, int nr, int lmmax, double_complex* ftp)
-{
-    assert(lmmax <= lmmax_);
-    assert(ld >= lmmax);
-    linalg<CPU>::gemm(1, 0, num_points_, nr, lmmax, &ylm_backward_(0, 0), lmmax_, flm, ld, ftp, num_points_);
-}
-
-template <>
-inline void SHT::forward_transform<double>(double const* ftp, int nr, int lmmax, int ld, double* flm)
-{
-    assert(lmmax <= lmmax_);
-    assert(ld >= lmmax);
-    linalg<CPU>::gemm(1, 0, lmmax, nr, num_points_, &rlm_forward_(0, 0), num_points_, ftp, num_points_, flm, ld);
-}
-
-template <>
-inline void SHT::forward_transform<double_complex>(double_complex const* ftp, int nr, int lmmax, int ld, double_complex* flm)
-{
-    assert(lmmax <= lmmax_);
-    assert(ld >= lmmax);
-    linalg<CPU>::gemm(1, 0, lmmax, nr, num_points_, &ylm_forward_(0, 0), num_points_, ftp, num_points_, flm, ld);
-}
-
-}
-
-#endif // __SHT_H__
diff --git a/src/simulation_context.h b/src/simulation_context.h
index 2a774ea9c..6a865ac2b 100644
--- a/src/simulation_context.h
+++ b/src/simulation_context.h
@@ -33,7 +33,7 @@
 #include "radial_integrals.h"
 #include "utils/utils.hpp"
 #include "memory_pool.hpp"
-#include "augmentation_operator.h"
+#include "Density/augmentation_operator.hpp"
 
 #ifdef __GPU
 #include "SDDK/GPU/cuda.hpp"
diff --git a/src/sirius.h b/src/sirius.h
index 47325716b..b0650236e 100644
--- a/src/sirius.h
+++ b/src/sirius.h
@@ -38,11 +38,11 @@ using json = nlohmann::json;
 #include "radial_grid.h"
 #include "spline.h"
 #include "radial_solver.h"
-#include "sht.h"
-#include "gaunt.h"
+#include "SHT/sht.hpp"
+#include "SHT/gaunt.hpp"
 #include "sddk.hpp"
 #include "hdf5_tree.hpp"
-#include "xc_functional.h"
+#include "Potential/xc_functional.hpp"
 #include "descriptors.h"
 #include "mixer.h"
 #include "Unit_cell/free_atom.hpp"
diff --git a/src/spheric_function.h b/src/spheric_function.h
index c58467b83..065cf935c 100644
--- a/src/spheric_function.h
+++ b/src/spheric_function.h
@@ -30,7 +30,7 @@
 #include <typeinfo>
 #include "radial_grid.h"
 #include "spline.h"
-#include "sht.h"
+#include "SHT/sht.hpp"
 
 namespace sirius {
 
@@ -42,7 +42,7 @@ class Spheric_function: public mdarray<T, 2>
 
         /// Radial grid.
         Radial_grid<double> const* radial_grid_{nullptr};
-        
+
         int angular_domain_size_;
 
         Spheric_function(Spheric_function<domain_t, T> const& src__) = delete;