Skip to content

Commit

Permalink
Merge branch 'fix/ap_standardization' into 'dev'
Browse files Browse the repository at this point in the history
Fix a bug where features were passed unordered and the standardizer was refit when checking the applicability domain

See merge request cdd/QSPRpred!175
  • Loading branch information
HellevdM committed Mar 12, 2024
2 parents f6689f7 + 224f035 commit 21c42ec
Show file tree
Hide file tree
Showing 5 changed files with 219 additions and 194 deletions.
8 changes: 4 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Change Log

From v3.0.1 to v3.0.2
From v3.0.2 to v3.0.3

## Fixes

- Fixed a bug with incorrect checking of whether a data set contains descriptors or not.
- Fixed a bug where an attached standardizer would be refit when calling
`QSPRModel.predictMols` with `use_applicability_domain=True`.

## Changes

- If a `MoleculeTable` contains descriptors and `QSPRDataset.fromMolTable` is called, the
descriptors are now automatically loaded as features of the created data set.
None.

## New Features

Expand Down
4 changes: 1 addition & 3 deletions qsprpred/data/processing/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,9 +287,7 @@ def setUp(self):
def testApplicabilityDomain(self):
"""Test the applicability domain fitting, transforming and serialization."""
ad = MLChemADWrapper(
KNNApplicabilityDomain(
dist="rogerstanimoto", scaling=None, hard_threshold=0.75
)
KNNApplicabilityDomain(dist="jaccard", scaling=None, alpha=0.95)
)
ad.fit(self.dataset.X)
self.assertIsInstance(ad.contains(self.dataset.X), pd.DataFrame)
Expand Down
2 changes: 1 addition & 1 deletion qsprpred/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ def predictMols(
# return predictions and, if requested, whether mols are within the applicability domain
if hasattr(self, "applicabilityDomain") and use_applicability_domain:
in_domain = self.applicabilityDomain.contains(
dataset.getFeatures(concat=True)
dataset.getFeatures(concat=True, ordered=True, refit_standardizer=False)
)
in_domain = self.handleInvalidsInPredictions(mols, in_domain, failed_mask)
return predictions, in_domain.values
Expand Down
152 changes: 110 additions & 42 deletions qsprpred/models/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unittest import TestCase

import numpy as np
from mlchemad.applicability_domains import KNNApplicabilityDomain
from parameterized import parameterized
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
Expand All @@ -13,35 +14,31 @@
accuracy_score,
explained_variance_score,
log_loss,
top_k_accuracy_score,
make_scorer,
mean_squared_error,
roc_auc_score,
top_k_accuracy_score,
)
from sklearn.metrics import make_scorer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier, XGBRegressor

from .assessment.classification import create_metrics_summary
from .assessment.regression import create_correlation_summary
from ..data.processing.applicability_domain import MLChemADWrapper
from ..models.early_stopping import EarlyStopping, EarlyStoppingMode, early_stopping
from ..models.metrics import SklearnMetrics
from ..models.monitors import (
BaseMonitor,
FileMonitor,
ListMonitor,
)
from ..models.monitors import BaseMonitor, FileMonitor, ListMonitor
from ..models.scikit_learn import SklearnModel
from ..tasks import TargetTasks
from ..utils.testing.base import QSPRTestCase
from ..utils.testing.check_mixins import ModelCheckMixIn, MonitorsCheckMixIn
from ..utils.testing.path_mixins import ModelDataSetsPathMixIn
from .assessment.classification import create_metrics_summary
from .assessment.regression import create_correlation_summary


class SklearnBaseModelTestCase(ModelDataSetsPathMixIn, ModelCheckMixIn, QSPRTestCase):
"""This class holds the tests for the SklearnModel class."""

def setUp(self):
super().setUp()
self.setUpPaths()
Expand Down Expand Up @@ -78,17 +75,14 @@ def getModel(

class TestSklearnRegression(SklearnBaseModelTestCase):
"""Test the SklearnModel class for regression models."""

@parameterized.expand(
[
(alg_name, TargetTasks.REGRESSION, alg_name, alg, random_state)
for alg, alg_name in (
(RandomForestRegressor, "RFR"),
(XGBRegressor, "XGBR"),
)
for random_state in ([None], [1, 42], [42, 42])
]
+ [
) for random_state in ([None], [1, 42], [42, 42])
] + [
(alg_name, TargetTasks.REGRESSION, alg_name, alg, [None])
for alg, alg_name in (
(PLSRegression, "PLSR"),
Expand All @@ -105,7 +99,10 @@ def testRegressionBasicFit(self, _, task, model_name, model_class, random_state)
parameters = None
# initialize dataset
dataset = self.createLargeTestDataSet(
target_props=[{"name": "CL", "task": task}],
target_props=[{
"name": "CL",
"task": task
}],
preparation_settings=self.getDefaultPrep(),
)
# initialize model for training from class
Expand Down Expand Up @@ -149,7 +146,10 @@ def testPLSRegressionSummaryWithSeed(self):
model_class = PLSRegression
parameters = None
dataset = self.createLargeTestDataSet(
target_props=[{"name": "CL", "task": task}],
target_props=[{
"name": "CL",
"task": task
}],
preparation_settings=self.getDefaultPrep(),
)
model = self.getModel(
Expand Down Expand Up @@ -177,16 +177,14 @@ def testPLSRegressionSummaryWithSeed(self):

class TestSklearnRegressionMultiTask(SklearnBaseModelTestCase):
"""Test the SklearnModel class for multi-task regression models."""

@parameterized.expand(
[
(alg_name, alg_name, alg, random_state)
for alg, alg_name in ((RandomForestRegressor, "RFR"),)
for alg, alg_name in ((RandomForestRegressor, "RFR"), )
for random_state in ([None], [1, 42], [42, 42])
]
+ [
] + [
(alg_name, alg_name, alg, [None])
for alg, alg_name in ((KNeighborsRegressor, "KNNR"),)
for alg, alg_name in ((KNeighborsRegressor, "KNNR"), )
]
)
def testRegressionMultiTaskFit(self, _, model_name, model_class, random_state):
Expand Down Expand Up @@ -246,13 +244,20 @@ def testRegressionMultiTaskFit(self, _, model_name, model_class, random_state):
class TestSklearnSerialization(SklearnBaseModelTestCase):
def testJSON(self):
dataset = self.createLargeTestDataSet(
target_props=[{"name": "CL", "task": TargetTasks.SINGLECLASS, "th": [6.5]}],
target_props=[{
"name": "CL",
"task": TargetTasks.SINGLECLASS,
"th": [6.5]
}],
preparation_settings=self.getDefaultPrep(),
)
model = self.getModel(
name="TestSerialization",
alg=RandomForestClassifier,
parameters={"n_jobs": self.nCPU, "n_estimators": 10},
parameters={
"n_jobs": self.nCPU,
"n_estimators": 10
},
random_state=42,
)
model.save()
Expand All @@ -271,28 +276,23 @@ def testJSON(self):

class TestSklearnClassification(SklearnBaseModelTestCase):
"""Test the SklearnModel class for classification models."""

@parameterized.expand(
[
(f"{alg_name}_{task}", task, th, alg_name, alg, random_state)
for alg, alg_name in (
(RandomForestClassifier, "RFC"),
(XGBClassifier, "XGBC"),
)
for task, th in (
) for task, th in (
(TargetTasks.SINGLECLASS, [6.5]),
(TargetTasks.MULTICLASS, [0, 2, 10, 1100]),
)
for random_state in ([None], [1, 42], [42, 42])
]
+ [
) for random_state in ([None], [1, 42], [42, 42])
] + [
(f"{alg_name}_{task}", task, th, alg_name, alg, [None])
for alg, alg_name in (
(SVC, "SVC"),
(KNeighborsClassifier, "KNNC"),
(GaussianNB, "NB"),
)
for task, th in (
) for task, th in (
(TargetTasks.SINGLECLASS, [6.5]),
(TargetTasks.MULTICLASS, [0, 2, 10, 1100]),
)
Expand All @@ -317,7 +317,11 @@ def testClassificationBasicFit(
parameters = {"subsample": 0.3}
# initialize dataset
dataset = self.createLargeTestDataSet(
target_props=[{"name": "CL", "task": task, "th": th}],
target_props=[{
"name": "CL",
"task": task,
"th": th
}],
preparation_settings=self.getDefaultPrep(),
)
# test classifier
Expand Down Expand Up @@ -361,7 +365,11 @@ def testRandomForestClassifierFitWithSeed(self):
}
# initialize dataset
dataset = self.createLargeTestDataSet(
target_props=[{"name": "CL", "task": TargetTasks.SINGLECLASS, "th": [6.5]}],
target_props=[{
"name": "CL",
"task": TargetTasks.SINGLECLASS,
"th": [6.5]
}],
preparation_settings=self.getDefaultPrep(),
)
# test classifier
Expand Down Expand Up @@ -391,16 +399,14 @@ def testRandomForestClassifierFitWithSeed(self):

class TestSklearnClassificationMultiTask(SklearnBaseModelTestCase):
"""Test the SklearnModel class for multi-task classification models."""

@parameterized.expand(
[
(alg_name, alg_name, alg, random_state)
for alg, alg_name in ((RandomForestClassifier, "RFC"),)
for alg, alg_name in ((RandomForestClassifier, "RFC"), )
for random_state in ([None], [1, 42], [42, 42])
]
+ [
] + [
(alg_name, alg_name, alg, [None])
for alg, alg_name in ((KNeighborsClassifier, "KNNC"),)
for alg, alg_name in ((KNeighborsClassifier, "KNNC"), )
]
)
def testClassificationMultiTaskFit(self, _, model_name, model_class, random_state):
Expand Down Expand Up @@ -470,7 +476,6 @@ def testClassificationMultiTaskFit(self, _, model_name, model_class, random_stat

class TestMetrics(TestCase):
"""Test the SklearnMetrics from the metrics module."""

def test_SklearnMetrics(self):
"""Test the sklearn metrics wrapper."""

Expand Down Expand Up @@ -648,7 +653,6 @@ def test_EarlyStopping(self):

def test_early_stopping_decorator(self):
"""Test the early stopping decorator."""

class test_class:
def __init__(self, support=True):
self.earlyStopping = EarlyStopping(EarlyStoppingMode.RECORDING)
Expand Down Expand Up @@ -752,3 +756,67 @@ def testListMonitor(self):
False,
[BaseMonitor(), FileMonitor()],
)


class TestAttachedApplicabilityDomain(ModelDataSetsPathMixIn, QSPRTestCase):
    """Test an applicability domain that is supplied through the data set.

    The domain is passed in the data set preparation settings; the test checks
    that it ends up on the fitted model, is still there after saving and
    loading, and yields the same in-domain flags as a reference domain fitted
    directly on the data set features.
    """

    def setUp(self):
        """Run the parent set-up and then ``setUpPaths``."""
        super().setUp()
        self.setUpPaths()

    def testAttachedApplicabilityDomain(self):
        """Test the attached applicability domain class."""
        # the same settings are used for the attached and the reference domain
        ad_settings = {"dist": "euclidean", "alpha": 0.9, "scaling": None}

        # create the data set with an applicability domain in its preparation
        prep_settings = dict(self.getDefaultPrep())
        prep_settings["applicability_domain"] = KNNApplicabilityDomain(
            **ad_settings
        )
        dataset = self.createLargeTestDataSet(
            target_props=[{"name": "CL", "task": "REGRESSION"}],
            preparation_settings=prep_settings,
        )

        # fit a random forest regressor on the prepared data set
        fitted_model = SklearnModel(
            base_dir=self.generatedModelsPath,
            alg=RandomForestRegressor,
            name="RFR_with_AD",
            parameters={"n_jobs": self.nCPU},
            random_state=42,
        )
        fitted_model.fitDataset(dataset)

        # the fitted model should now carry the wrapped applicability domain
        self.assertTrue(hasattr(fitted_model, "applicabilityDomain"))
        self.assertIsInstance(fitted_model.applicabilityDomain, MLChemADWrapper)

        # ... and so should a copy restored from the saved meta file
        fitted_model.save()
        reloaded_model = SklearnModel.fromFile(fitted_model.metaFile)
        self.assertTrue(hasattr(reloaded_model, "applicabilityDomain"))
        self.assertIsInstance(reloaded_model.applicabilityDomain, MLChemADWrapper)

        # reference: fit a plain mlchemad domain directly on the data set
        # features, ordered and without refitting the standardizer
        reference_ad = KNNApplicabilityDomain(**ad_settings)
        feature_matrix = dataset.getFeatures(
            concat=True, ordered=True, refit_standardizer=False
        )
        reference_ad.fit(feature_matrix)
        expected = reference_ad.contains(feature_matrix).reshape(-1, 1)

        # the original model is checked first, then the reloaded one; both
        # must reproduce the reference in-domain flags exactly
        for candidate in (fitted_model, reloaded_model):
            _, in_domain = candidate.predictMols(
                dataset.df["SMILES"], use_applicability_domain=True
            )
            self.assertTrue(np.array_equal(expected, in_domain))
Loading

0 comments on commit 21c42ec

Please sign in to comment.