CGRtools 4.0 port (cimm-kzn#14)

* porting to CGRtools 4.0
Ilnura12 · Oct 15, 2019 · af067e8 · af067e8
1 parent 26760ab
commit af067e8
Show file tree

Hide file tree

Showing 18 changed files with 787 additions and 458 deletions.
diff --git a/CIMtools/applicability_domain/bounding_box.py b/CIMtools/applicability_domain/bounding_box.py
@@ -16,11 +16,11 @@
 #  You should have received a copy of the GNU General Public License
 #  along with this program; if not, see <https://www.gnu.org/licenses/>.
 #
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.utils.validation import check_array, check_is_fitted
 
 
-class Box(BaseEstimator):
+class Box(BaseEstimator, ClassifierMixin):
     """ This approach defines AD as a bounding block, which is an N-dimensional hypercube
     defined on the basis of the maximum and minimum values of each descriptor used to construct the model.
     If test compound is outside of hypercube it is outside of AD model.
@@ -51,8 +51,8 @@ def fit(self, X, y=None):
         # Check that X have correct shape
         X = check_array(X)
 
-        self._x_min = X.min(axis=0) # axis=0 will find the minimum values by columns (for each feature)
-        self._x_max = X.max(axis=0) # axis=0 will find the minimum values by columns (for each feature)
+        self._x_min = X.min(axis=0)  # axis=0 will find the minimum values by columns (for each feature)
+        self._x_max = X.max(axis=0)  # axis=0 will find the maximum values by columns (for each feature)
         return self
 
     def predict(self, X):

diff --git a/CIMtools/applicability_domain/leverage.py b/CIMtools/applicability_domain/leverage.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 #  Copyright 2019 Assima Rakhimbekova <asima.astana@outlook.com>
+#  Copyright 2019 Ramil Nugmanov <stsouko@live.ru>
 #  This file is part of CIMtools.
 #
 #  CIMtools is free software; you can redistribute it and/or modify
@@ -17,15 +18,15 @@
 #  along with this program; if not, see <https://www.gnu.org/licenses/>.
 #
 from numpy import array, column_stack, eye, hstack, linalg, ones, unique
-from sklearn.base import BaseEstimator, clone
+from sklearn.base import BaseEstimator, clone, ClassifierMixin
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import KFold
 from sklearn.utils import safe_indexing
 from sklearn.utils.validation import check_array, check_is_fitted
 from ..metrics.applicability_domain_metrics import balanced_accuracy_score_with_ad, rmse_score_with_ad
 
 
-class Leverage(BaseEstimator):
+class Leverage(BaseEstimator, ClassifierMixin):
     """ Distance-based method
     The model space can be represented by a two-dimensional matrix comprising n chemicals (rows) and
     k variables (columns), called the descriptor matrix (X). The leverage of a chemical provides a measure of the
@@ -63,12 +64,14 @@ def __init__(self, threshold='auto', score='ba_ad', reg_model=None):
         if score not in ('ba_ad', 'rmse_ad'):
             raise ValueError('Invalid value for score. Allowed string values are "ba_ad", "rmse_ad".')
 
-    def __make_inverse_matrix(self, X):
+    @staticmethod
+    def __make_inverse_matrix(X):
         X = column_stack(((ones(X.shape[0])), X))
         influence_matrix = X.T.dot(X) + eye(X.shape[1]).dot(1e-8)
         return linalg.inv(influence_matrix)
 
-    def __find_leverages(self, X, inverse_influence_matrix):
+    @staticmethod
+    def __find_leverages(X, inverse_influence_matrix):
         X = column_stack(((ones(X.shape[0])), X))
         return array([X[i, :].dot(inverse_influence_matrix).dot(X[i, :]) for i in range(X.shape[0])])
 
@@ -106,7 +109,7 @@ def fit(self, X, y=None):
                 y_train = safe_indexing(y, train_index)
                 y_test = safe_indexing(y, test_index)
                 if self.reg_model is None:
-                    reg_model = RandomForestRegressor(n_estimators=500, random_state=1).fit(x_train, y_train)
+                    reg_model = RandomForestRegressor().fit(x_train, y_train)
                 else:
                     reg_model = clone(self.reg_model).fit(x_train, y_train)
                 Y_pred.append(reg_model.predict(x_test))
@@ -119,7 +122,7 @@ def fit(self, X, y=None):
                 AD_new = AD_stack <= z
                 if self.score == 'ba_ad':
                     val = balanced_accuracy_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
-                elif self.score == 'rmse_ad':
+                else:
                     val = rmse_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
                 if val >= score_value:
                     score_value = val
@@ -144,7 +147,7 @@ def predict_proba(self, X):
                    The objects distances to center of the training set.
         """
         # Check if fit has been called
-        check_is_fitted(self, ['inverse_influence_matrix'])
+        check_is_fitted(self, ['inverse_influence_matrix', 'threshold_value'])
         # Check that X have correct shape
         X = check_array(X)
         return self.__find_leverages(X, self.inverse_influence_matrix)

diff --git a/CIMtools/applicability_domain/reaction_type_control.py b/CIMtools/applicability_domain/reaction_type_control.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 #  Copyright 2019 Assima Rakhimbekova <asima.astana@outlook.com>
+#  Copyright 2019 Ramil Nugmanov <stsouko@live.ru>
 #  This file is part of CIMtools.
 #
 #  CIMtools is free software; you can redistribute it and/or modify
@@ -18,11 +19,12 @@
 #
 from CGRtools.containers import ReactionContainer
 from numpy import array
+from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.utils.validation import check_is_fitted
 from ..utils import iter2array
 
 
-class ReactionTypeControl:
+class ReactionTypeControl(BaseEstimator, ClassifierMixin):
     """Reaction Type Control (RTC) is performed using reaction signature.
 
     The signature includes both the reaction centre itself and its 1, 2, and so on the environment
@@ -43,10 +45,9 @@ def __get_signature(self, structure):
             return str(~structure)
         else:
             cgr = ~structure  # Condensed Graph of Reaction
-            cgr.reset_query_marks()  # reset hyb and neighbors marks to atoms
             # get subgraph with atoms and their neighbors
-            aug_center = cgr.augmented_substructure(cgr.center_atoms, deep=self.env)
-            return format(aug_center, 'h')  # String for graph reaction center
+            aug_center = cgr.augmented_substructure(cgr.center_atoms, deep=self.env, as_query=True)
+            return format(aug_center, '!n')  # String for graph reaction center
 
     def fit(self, X):
         """Fit structure-based AD. The training model  memorizes the unique set of reaction signature.
@@ -73,7 +74,7 @@ def predict(self, X):
 
         Returns
         -------
-        self : array contains True (reaction in AD) and False (reaction residing outside AD).
+        a : array contains True (reaction in AD) and False (reaction residing outside AD).
         """
         check_is_fitted(self, ['_train_signatures'])
         X = iter2array(X, dtype=ReactionContainer)

diff --git a/CIMtools/applicability_domain/similarity_distance.py b/CIMtools/applicability_domain/similarity_distance.py
@@ -17,7 +17,7 @@
 #  along with this program; if not, see <https://www.gnu.org/licenses/>.
 #
 from numpy import hstack, mean, sqrt, var, unique
-from sklearn.base import BaseEstimator, clone
+from sklearn.base import BaseEstimator, clone, ClassifierMixin
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.neighbors import BallTree
 from sklearn.model_selection import KFold
@@ -26,7 +26,7 @@
 from ..metrics.applicability_domain_metrics import balanced_accuracy_score_with_ad, rmse_score_with_ad
 
 
-class SimilarityDistance(BaseEstimator):
+class SimilarityDistance(BaseEstimator, ClassifierMixin):
     """ Distance-based method for  defining applicability domain (AD).
 
     In the case of non-linear kNN QSPR method, since the models are based on chemical similarity calculations,
@@ -173,7 +173,7 @@ def predict_proba(self, X):
         y : array, shape (n_samples,)
         """
         # Check if fit has been called
-        check_is_fitted(self, ['tree'])
+        check_is_fitted(self, ['tree', 'threshold_value'])
         # Check data
         X = check_array(X)
         return self.tree.query(X)[0].flatten()

diff --git a/CIMtools/base.py b/CIMtools/base.py
@@ -17,7 +17,7 @@
 #  along with this program; if not, see <https://www.gnu.org/licenses/>.
 #
 from CGRtools.containers import ReactionContainer
-from itertools import tee, chain
+from itertools import tee
 from sklearn.base import TransformerMixin
 from .utils import iter2array
 
@@ -45,7 +45,8 @@ def transform(self, x):
 def reaction_support(_class):
     class ReactionSupport(_class):
         def transform(self, x):
-            assert all(isinstance(s, ReactionContainer) for s in x), 'invalid dtype, olny ReactionContainers acceptable'
+            if not all(isinstance(s, ReactionContainer) for s in x):
+                raise TypeError('invalid dtype, only ReactionContainers acceptable')
 
             shifts = {}
             mols = []
@@ -57,16 +58,12 @@ def transform(self, x):
                     mols.extend(si)
 
             transformed = super().transform(mols)
-            assert len(transformed) == len(mols), 'unexpected transformed molecules amount'
+            if len(transformed) != len(mols):
+                raise ValueError('unexpected transformed molecules amount')
 
-            out = []
-            for s, r, p in zip(x, (transformed[y: z] for y, z in self.__pairwise(shifts['reactants'])),
-                                  (transformed[y: z] for y, z in self.__pairwise(shifts['products']))):
-                if any(i is None for i in chain(r, p)):
-                    out.append(None)
-                else:
-                    out.append(ReactionContainer(r, p, meta=s.meta))
-            return iter2array(out, allow_none=True)
+            return iter2array(ReactionContainer(r, p, meta=s.meta) for s, r, p in
+                              zip(x, (transformed[y: z] for y, z in self.__pairwise(shifts['reactants'])),
+                                     (transformed[y: z] for y, z in self.__pairwise(shifts['products']))))
 
         @staticmethod
         def __pairwise(iterable):

diff --git a/CIMtools/datasets/molconvert_chemaxon.py b/CIMtools/datasets/molconvert_chemaxon.py
@@ -16,7 +16,7 @@
 #  You should have received a copy of the GNU General Public License
 #  along with this program; if not, see <https://www.gnu.org/licenses/>.
 #
-from CGRtools.files import MRVread
+from CGRtools.files import MRVRead
 from io import StringIO, BytesIO
 from pathlib import Path
 from subprocess import run, PIPE
@@ -54,7 +54,7 @@ def molconvert_chemaxon(data):
     if p.returncode != 0:
         raise ConfigurationError(p.stderr.decode())
 
-    with BytesIO(p.stdout) as f, MRVread(f) as r:
+    with BytesIO(p.stdout) as f, MRVRead(f) as r:
         return iter2array(r)
 
 

diff --git a/CIMtools/metrics/applicability_domain_metrics.py b/CIMtools/metrics/applicability_domain_metrics.py
@@ -24,6 +24,7 @@ def balanced_accuracy_score_with_ad(Y_true, Y_pred, AD):
     AD_true = abs(Y_true - Y_pred) <= 3 * sqrt(mean_squared_error(Y_true, Y_pred))
     return balanced_accuracy_score(AD_true, AD)
 
+
 def rmse_score_with_ad(Y_true, Y_pred, AD):
     AD_out_n = ~AD
     s_n = AD.sum()
@@ -37,3 +38,6 @@ def rmse_score_with_ad(Y_true, Y_pred, AD):
     else:
         RMSE_AD_out_n = 0
     return RMSE_AD_out_n - RMSE_AD
+
+
+__all__ = ['balanced_accuracy_score_with_ad', 'rmse_score_with_ad']
diff --git a/CIMtools/pipeline/__init__.py b/CIMtools/pipeline/__init__.py
diff --git a/CIMtools/pipeline/filter.py b/CIMtools/pipeline/filter.py