qmlcode · andersx · Jan 22, 2019 · Oct 25, 2018 · Oct 25, 2018 · Jan 21, 2019
diff --git a/qml/kernels/fkpca.f90 b/qml/kernels/fkpca.f90
@@ -0,0 +1,127 @@
+! MIT License
+!
+! Copyright (c) 2018 Anders Steen Christensen
+!
+! Permission is hereby granted, free of charge, to any person obtaining a copy
+! of this software and associated documentation files (the "Software"), to deal
+! in the Software without restriction, including without limitation the rights
+! to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+! copies of the Software, and to permit persons to whom the Software is
+! furnished to do so, subject to the following conditions:
+!
+! The above copyright notice and this permission notice shall be included in all
+! copies or substantial portions of the Software.
+!
+! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+! OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+! SOFTWARE.
+
+
+subroutine fkpca(k, n, centering, kpca)
+
+    implicit none
+
+    double precision, dimension(:,:), intent(in) :: k
+    integer, intent(in) :: n
+    logical, intent(in) :: centering
+    double precision, dimension(n,n), intent(out) :: kpca
+
+    ! Eigenvalues
+    double precision, dimension(n) :: eigenvals
+
+    double precision, allocatable, dimension(:) :: work
+
+    integer :: lwork
+    integer :: info
+
+    integer :: i
+
+    double precision :: inv_n
+    double precision, allocatable, dimension(:) :: temp
+    double precision :: temp_sum
+
+    kpca(:,:) = k(:,:)
+
+    ! This first part centers the matrix,
+    ! basically Kpca = K - G@K - K@G + G@K@G, with G = 1/n
+    ! It is a bit hard to follow, sry, but it is very fast
+    ! and requires very little memory overhead.
+
+    if (centering) then
+
+        inv_n = 1.0d0 / n
+
+        allocate(temp(n))
+        temp(:) = 0.0d0
+
+        !$OMP PARALLEL DO
+        do i = 1, n
+            temp(i) = sum(k(i,:)) * inv_n
+        enddo
+        !$OMP END PARALLEL DO
+
+        temp_sum = sum(temp(:)) * inv_n
+
+        !$OMP PARALLEL DO
+        do i = 1, n
+            kpca(i,:) = kpca(i,:) + temp_sum
+        enddo
+        !$OMP END PARALLEL DO
+
+        !$OMP PARALLEL DO
+        do i = 1, n
+            kpca(:,i) = kpca(:,i) - temp(i)
+        enddo
+        !$OMP END PARALLEL DO
+
+        !$OMP PARALLEL DO
+        do i = 1, n
+            kpca(i,:) = kpca(i,:) - temp(i)
+        enddo
+        !$OMP END PARALLEL DO
+
+        deallocate(temp)
+
+    endif
+
+    ! This 2nd part solves the eigenvalue problem with the least
+    ! memory intensive solver, namely DSYEV(). DSYEVD() is twice
+    ! as fast, but requires a lot more memory, which quickly
+    ! becomes prohibitive.
+
+    ! Dry run which returns the optimal "lwork"
+    allocate(work(1))
+    call dsyev("V", "U", n, kpca, n, eigenvals, work, -1, info)
+    lwork = nint(work(1)) + 1
+    deallocate(work)
+
+    ! Get eigenvectors
+    allocate(work(lwork))
+    call dsyev("V", "U", n, kpca, n, eigenvals, work, lwork, info)
+    deallocate(work)
+
+    if (info < 0) then
+
+        write (*,*) "ERROR: The ", -info, "-th argument to DSYEV() had an illegal value."
+
+    else if (info > 0) then
+
+        write (*,*) "ERROR: DSYEV() failed to compute an eigenvalue."
+
+    end if
+
+    ! This 3rd part sorts the kernel PCA matrix such that the first PCA is kpca(1)
+    kpca = kpca(:,n:1:-1)
+    kpca = transpose(kpca)
+
+    !$OMP PARALLEL DO
+    do i = 1, n
+        kpca(i,:) = kpca(i,:) * sqrt(eigenvals(n - i + 1))
+    enddo
+    !$OMP END PARALLEL DO
+
+end subroutine fkpca
diff --git a/qml/kernels/kernels.py b/qml/kernels/kernels.py
@@ -36,6 +36,8 @@
 from .fkernels import fget_local_kernels_laplacian
 from .fkernels import fget_vector_kernels_gaussian, fget_vector_kernels_gaussian_symmetric
 
+from .fkernels import fkpca
+
 def laplacian_kernel(A, B, sigma):
     """ Calculates the Laplacian kernel matrix K, where :math:`K_{ij}`:
 
@@ -351,3 +353,32 @@ def get_local_kernels_laplacian(A, B, na, nb, sigmas):
     nsigmas = len(sigmas)
 
     return fget_local_kernels_laplacian(A.T, B.T, na, nb, sigmas, nma, nmb, nsigmas)
+
+
+def kpca(K, n=2, centering=True):
+    """ Calculates `n` first principal components for the kernel :math:`K`.
+
+        The PCA is calculated using an OpenMP parallel Fortran routine.
+
+        A square, symmetric kernel matrix is required. Centering of the kernel matrix 
+        is enabled by default, although this isn't a strict requirement.
+
+        :param K: 2D kernel matrix
+        :type K: numpy array
+        :param n: Number of kernel PCAs to return (default=2)
+        :type n: integer
+        :param centering: Whether to center the kernel matrix (default=True)
+        :type centering: bool
+
+        :return: array containing the principal components
+        :rtype: numpy array
+    """
+
+    assert K.shape[0] == K.shape[1], "ERROR: Square matrix required for Kernel PCA."
+    assert np.allclose(K, K.T, atol=1e-8), "ERROR: Symmetric matrix required for Kernel PCA."
+    assert n <= K.shape[0], "ERROR: Requested more principal components than matrix size."
+
+    size = K.shape[0]
+    pca = fkpca(K, size, centering)
+
+    return pca[:n]
diff --git a/setup.py b/setup.py
@@ -47,11 +47,14 @@
 
 
 ext_fkernels = Extension(name = '.kernels.fkernels',
-                          sources = ['qml/kernels/fkernels.f90'],
+                          sources = [
+                          'qml/kernels/fkernels.f90',
+                          'qml/kernels/fkpca.f90',
+                              ],
                           extra_f90_compile_args = COMPILER_FLAGS,
                           extra_f77_compile_args = COMPILER_FLAGS,
                           extra_compile_args = COMPILER_FLAGS,
-                          extra_link_args = LINKER_FLAGS,
+                          extra_link_args = LINKER_FLAGS + MATH_LINKER_FLAGS,
                           language = FORTRAN,
                           f2py_options=['--quiet'])
 

diff --git a/test/test_kernels.py b/test/test_kernels.py
@@ -23,7 +23,13 @@
 from __future__ import print_function
 
 import sys
+import os
 import numpy as np
+import scipy
+
+import sklearn
+from sklearn.decomposition import KernelPCA
+
 import qml
 from qml.kernels import laplacian_kernel
 from qml.kernels import gaussian_kernel
@@ -32,6 +38,27 @@
 from qml.kernels import linear_kernel
 from qml.kernels import matern_kernel
 from qml.kernels import sargan_kernel
+from qml.kernels import kpca
+
+def get_energies(filename):
+    """ Returns a dictionary with heats of formation for each xyz-file.
+    """
+
+    f = open(filename, "r")
+    lines = f.readlines()
+    f.close()
+
+    energies = dict()
+
+    for line in lines:
+        tokens = line.split()
+
+        xyz_name = tokens[0]
+        hof = float(tokens[1])
+
+        energies[xyz_name] = hof
+
+    return energies
 
 def test_laplacian_kernel():
 
@@ -136,7 +163,6 @@ def test_matern_kernel():
 
     for metric in ("l1", "l2"):
         for order in (0, 1, 2):
-            print(metric,order)
             matern(metric, order)
 
 def matern(metric, order):
@@ -220,9 +246,52 @@ def sargan(ngamma):
     # Check for symmetry:
     assert np.allclose(Ksymm, Ksymm.T), "Error in Sargan kernel"
 
+
+def array_nan_close(a, b):
+    # Compares arrays, ignoring nans
+
+    m = np.isfinite(a) & np.isfinite(b)
+    return np.allclose(a[m], b[m], atol=1e-8, rtol=0.0)
+
+
+def test_kpca():
+
+    test_dir = os.path.dirname(os.path.realpath(__file__))
+
+    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenam
+    data = get_energies(test_dir + "/data/hof_qm7.txt")
+
+    # Generate a list of qml.data.Compound() objects
+    mols = []
+
+    keys = sorted(data.keys())
+
+    np.random.seed(666)
+    np.random.shuffle(keys)
+
+    n_mols = 100
+
+    for xyz_file in keys[:n_mols]:
+
+        mol = qml.data.Compound(xyz=test_dir + "/qm7/" + xyz_file)
+        mol.properties = data[xyz_file]
+        mol.generate_bob()
+        mols.append(mol)
+
+    X = np.array([mol.representation for mol in mols])
+    K = laplacian_kernel(X, X, 2e5)
+
+    pcas_qml = kpca(K, n=10)
+    pcas_sklearn = KernelPCA(10, eigen_solver="dense", kernel='precomputed').fit_transform(K)
+
+    assert array_nan_close(np.abs(pcas_sklearn.T), np.abs(pcas_qml)), "Error in Kernel PCA decomposition."
+
+
 if __name__ == "__main__":
+
     test_laplacian_kernel()
     test_gaussian_kernel()
     test_linear_kernel()
     test_matern_kernel()
     test_sargan_kernel()
+    test_kpca()