Skip to content

Commit

Permalink
CGRtools 4.0 port (cimm-kzn#14)
Browse files Browse the repository at this point in the history
* porting to CGRtools 4.0
  • Loading branch information
stsouko authored Oct 15, 2019
1 parent 26760ab commit af067e8
Show file tree
Hide file tree
Showing 18 changed files with 787 additions and 458 deletions.
8 changes: 4 additions & 4 deletions CIMtools/applicability_domain/bounding_box.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <https://www.gnu.org/licenses/>.
#
from sklearn.base import BaseEstimator
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_array, check_is_fitted


class Box(BaseEstimator):
class Box(BaseEstimator, ClassifierMixin):
""" This approach defines AD as a bounding block, which is an N-dimensional hypercube
defined on the basis of the maximum and minimum values of each descriptor used to construct the model.
If a test compound is outside of the hypercube, it is outside of the model's AD.
Expand Down Expand Up @@ -51,8 +51,8 @@ def fit(self, X, y=None):
# Check that X has the correct shape
X = check_array(X)

self._x_min = X.min(axis=0) # axis=0 will find the minimum values by columns (for each feature)
self._x_max = X.max(axis=0) # axis=0 will find the maximum values by columns (for each feature)
self._x_min = X.min(axis=0) # axis=0 will find the minimum values by columns (for each feature)
self._x_max = X.max(axis=0) # axis=0 will find the maximum values by columns (for each feature)
return self

def predict(self, X):
Expand Down
17 changes: 10 additions & 7 deletions CIMtools/applicability_domain/leverage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright 2019 Assima Rakhimbekova <asima.astana@outlook.com>
# Copyright 2019 Ramil Nugmanov <stsouko@live.ru>
# This file is part of CIMtools.
#
# CIMtools is free software; you can redistribute it and/or modify
Expand All @@ -17,15 +18,15 @@
# along with this program; if not, see <https://www.gnu.org/licenses/>.
#
from numpy import array, column_stack, eye, hstack, linalg, ones, unique
from sklearn.base import BaseEstimator, clone
from sklearn.base import BaseEstimator, clone, ClassifierMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.utils import safe_indexing
from sklearn.utils.validation import check_array, check_is_fitted
from ..metrics.applicability_domain_metrics import balanced_accuracy_score_with_ad, rmse_score_with_ad


class Leverage(BaseEstimator):
class Leverage(BaseEstimator, ClassifierMixin):
""" Distance-based method
The model space can be represented by a two-dimensional matrix comprising n chemicals (rows) and
k variables (columns), called the descriptor matrix (X). The leverage of a chemical provides a measure of the
Expand Down Expand Up @@ -63,12 +64,14 @@ def __init__(self, threshold='auto', score='ba_ad', reg_model=None):
if score not in ('ba_ad', 'rmse_ad'):
raise ValueError('Invalid value for score. Allowed string values are "ba_ad", "rmse_ad".')

def __make_inverse_matrix(self, X):
@staticmethod
def __make_inverse_matrix(X):
X = column_stack(((ones(X.shape[0])), X))
influence_matrix = X.T.dot(X) + eye(X.shape[1]).dot(1e-8)
return linalg.inv(influence_matrix)

def __find_leverages(self, X, inverse_influence_matrix):
@staticmethod
def __find_leverages(X, inverse_influence_matrix):
X = column_stack(((ones(X.shape[0])), X))
return array([X[i, :].dot(inverse_influence_matrix).dot(X[i, :]) for i in range(X.shape[0])])

Expand Down Expand Up @@ -106,7 +109,7 @@ def fit(self, X, y=None):
y_train = safe_indexing(y, train_index)
y_test = safe_indexing(y, test_index)
if self.reg_model is None:
reg_model = RandomForestRegressor(n_estimators=500, random_state=1).fit(x_train, y_train)
reg_model = RandomForestRegressor().fit(x_train, y_train)
else:
reg_model = clone(self.reg_model).fit(x_train, y_train)
Y_pred.append(reg_model.predict(x_test))
Expand All @@ -119,7 +122,7 @@ def fit(self, X, y=None):
AD_new = AD_stack <= z
if self.score == 'ba_ad':
val = balanced_accuracy_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
elif self.score == 'rmse_ad':
else:
val = rmse_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
if val >= score_value:
score_value = val
Expand All @@ -144,7 +147,7 @@ def predict_proba(self, X):
The objects distances to center of the training set.
"""
# Check if fit has been called
check_is_fitted(self, ['inverse_influence_matrix'])
check_is_fitted(self, ['inverse_influence_matrix', 'threshold_value'])
# Check that X has the correct shape
X = check_array(X)
return self.__find_leverages(X, self.inverse_influence_matrix)
Expand Down
11 changes: 6 additions & 5 deletions CIMtools/applicability_domain/reaction_type_control.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright 2019 Assima Rakhimbekova <asima.astana@outlook.com>
# Copyright 2019 Ramil Nugmanov <stsouko@live.ru>
# This file is part of CIMtools.
#
# CIMtools is free software; you can redistribute it and/or modify
Expand All @@ -18,11 +19,12 @@
#
from CGRtools.containers import ReactionContainer
from numpy import array
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_is_fitted
from ..utils import iter2array


class ReactionTypeControl:
class ReactionTypeControl(BaseEstimator, ClassifierMixin):
"""Reaction Type Control (RTC) is performed using reaction signature.
The signature includes both the reaction centre itself and its environment of depth 1, 2, and so on
Expand All @@ -43,10 +45,9 @@ def __get_signature(self, structure):
return str(~structure)
else:
cgr = ~structure # Condensed Graph of Reaction
cgr.reset_query_marks() # reset hyb and neighbors marks to atoms
# get subgraph with atoms and their neighbors
aug_center = cgr.augmented_substructure(cgr.center_atoms, deep=self.env)
return format(aug_center, 'h') # String for graph reaction center
aug_center = cgr.augmented_substructure(cgr.center_atoms, deep=self.env, as_query=True)
return format(aug_center, '!n') # String for graph reaction center

def fit(self, X):
"""Fit structure-based AD. The training model memorizes the unique set of reaction signature.
Expand All @@ -73,7 +74,7 @@ def predict(self, X):
Returns
-------
self : array contains True (reaction in AD) and False (reaction residing outside AD).
a : array contains True (reaction in AD) and False (reaction residing outside AD).
"""
check_is_fitted(self, ['_train_signatures'])
X = iter2array(X, dtype=ReactionContainer)
Expand Down
6 changes: 3 additions & 3 deletions CIMtools/applicability_domain/similarity_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# along with this program; if not, see <https://www.gnu.org/licenses/>.
#
from numpy import hstack, mean, sqrt, var, unique
from sklearn.base import BaseEstimator, clone
from sklearn.base import BaseEstimator, clone, ClassifierMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import BallTree
from sklearn.model_selection import KFold
Expand All @@ -26,7 +26,7 @@
from ..metrics.applicability_domain_metrics import balanced_accuracy_score_with_ad, rmse_score_with_ad


class SimilarityDistance(BaseEstimator):
class SimilarityDistance(BaseEstimator, ClassifierMixin):
""" Distance-based method for defining applicability domain (AD).
In the case of non-linear kNN QSPR method, since the models are based on chemical similarity calculations,
Expand Down Expand Up @@ -173,7 +173,7 @@ def predict_proba(self, X):
y : array, shape (n_samples,)
"""
# Check if fit has been called
check_is_fitted(self, ['tree'])
check_is_fitted(self, ['tree', 'threshold_value'])
# Check data
X = check_array(X)
return self.tree.query(X)[0].flatten()
Expand Down
19 changes: 8 additions & 11 deletions CIMtools/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# along with this program; if not, see <https://www.gnu.org/licenses/>.
#
from CGRtools.containers import ReactionContainer
from itertools import tee, chain
from itertools import tee
from sklearn.base import TransformerMixin
from .utils import iter2array

Expand Down Expand Up @@ -45,7 +45,8 @@ def transform(self, x):
def reaction_support(_class):
class ReactionSupport(_class):
def transform(self, x):
assert all(isinstance(s, ReactionContainer) for s in x), 'invalid dtype, olny ReactionContainers acceptable'
if not all(isinstance(s, ReactionContainer) for s in x):
raise TypeError('invalid dtype, only ReactionContainers acceptable')

shifts = {}
mols = []
Expand All @@ -57,16 +58,12 @@ def transform(self, x):
mols.extend(si)

transformed = super().transform(mols)
assert len(transformed) == len(mols), 'unexpected transformed molecules amount'
if len(transformed) != len(mols):
raise ValueError('unexpected transformed molecules amount')

out = []
for s, r, p in zip(x, (transformed[y: z] for y, z in self.__pairwise(shifts['reactants'])),
(transformed[y: z] for y, z in self.__pairwise(shifts['products']))):
if any(i is None for i in chain(r, p)):
out.append(None)
else:
out.append(ReactionContainer(r, p, meta=s.meta))
return iter2array(out, allow_none=True)
return iter2array(ReactionContainer(r, p, meta=s.meta) for s, r, p in
zip(x, (transformed[y: z] for y, z in self.__pairwise(shifts['reactants'])),
(transformed[y: z] for y, z in self.__pairwise(shifts['products']))))

@staticmethod
def __pairwise(iterable):
Expand Down
4 changes: 2 additions & 2 deletions CIMtools/datasets/molconvert_chemaxon.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <https://www.gnu.org/licenses/>.
#
from CGRtools.files import MRVread
from CGRtools.files import MRVRead
from io import StringIO, BytesIO
from pathlib import Path
from subprocess import run, PIPE
Expand Down Expand Up @@ -54,7 +54,7 @@ def molconvert_chemaxon(data):
if p.returncode != 0:
raise ConfigurationError(p.stderr.decode())

with BytesIO(p.stdout) as f, MRVread(f) as r:
with BytesIO(p.stdout) as f, MRVRead(f) as r:
return iter2array(r)


Expand Down
4 changes: 4 additions & 0 deletions CIMtools/metrics/applicability_domain_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def balanced_accuracy_score_with_ad(Y_true, Y_pred, AD):
AD_true = abs(Y_true - Y_pred) <= 3 * sqrt(mean_squared_error(Y_true, Y_pred))
return balanced_accuracy_score(AD_true, AD)


def rmse_score_with_ad(Y_true, Y_pred, AD):
AD_out_n = ~AD
s_n = AD.sum()
Expand All @@ -37,3 +38,6 @@ def rmse_score_with_ad(Y_true, Y_pred, AD):
else:
RMSE_AD_out_n = 0
return RMSE_AD_out_n - RMSE_AD


__all__ = ['balanced_accuracy_score_with_ad', 'rmse_score_with_ad']
19 changes: 0 additions & 19 deletions CIMtools/pipeline/__init__.py

This file was deleted.

107 changes: 0 additions & 107 deletions CIMtools/pipeline/filter.py

This file was deleted.

Loading

0 comments on commit af067e8

Please sign in to comment.