Skip to content

Commit

Permalink
Merge branch 'small_bug_fixes' into 'main'
Browse files Browse the repository at this point in the history
Small bug fixes

See merge request cdd/QSPRpred!93
  • Loading branch information
HellevdM committed Jun 18, 2023
2 parents 981b209 + 4fb629e commit fd59af7
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 6 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ From v1.3.1 to v2.0.0
- fixed serialization issues with `DataFrameDescriptorSet` and saving and loading of MSA for PCM descriptor calculations
- the Papyrus adapter was fixed so that the quality and data set filtering options work properly (before only high quality Papyrus++ data was fetched no matter the options)
- previously, in some cases cross-validation splits might not have been shuffled during hyperparameter optimization and evaluation on cross-validation folds (this might have resulted in suboptimal cross-validation performance and bad choices of hyperparameters), a fix was made in b029e78009d1fa7fdc694e388f244eb0ee1d8cc0
- score_func can now be set in `QSPRModel`.

## Changes

Expand All @@ -38,6 +39,7 @@ From v1.3.1 to v2.0.0
have a variance of 0.1 will be removed.
- Added the ExtendedValenceSignature molecular descriptor based on Jean-Loup Faulon's work.
- removed default parameter setting scikit-learn SVC and SVR `max_iter` 10000.
- added `matthews_corrcoef` to the supported metrics for binary classification.

## New Features
- New feature split `ManualSplit` for splitting data by a user-defined column
Expand Down
38 changes: 36 additions & 2 deletions qsprpred/data/utils/descriptor_utils/fingerprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
from rdkit import DataStructs
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import AllChem, rdMolDescriptors, rdmolops
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors, rdmolops

from .interfaces import Fingerprint

Expand Down Expand Up @@ -45,6 +45,38 @@ def __len__(self):
def getKey(self):
return "MorganFP"

class RDKitMACCSFP(Fingerprint):
    """MACCS keys fingerprint as implemented in RDKit (`MACCSkeys.GenMACCSKeys`)."""

    def getFingerprints(self, mols):
        """Compute the MACCS keys fingerprint for each input molecule.

        Args:
            mols: molecules (RDKit mol objects) to compute fingerprints for

        Returns:
            np.ndarray: array of shape ``(len(mols), 167)`` holding one
            fingerprint per row
        """
        fps = np.zeros((len(mols), len(self)))
        for i, mol in enumerate(mols):
            keys = MACCSkeys.GenMACCSKeys(mol)
            # ConvertToNumpyArray fills a 1D buffer in place; copy it into
            # the corresponding output row afterwards.
            buffer = np.zeros(len(keys))
            DataStructs.ConvertToNumpyArray(keys, buffer)
            fps[i] = buffer
        return fps

    @property
    def settings(self):
        # MACCS keys take no tunable parameters.
        return {}

    def __len__(self):
        # The MACCS key set has a fixed size of 167 bits.
        return 167

    def getKey(self):
        return "RDKitMACCSFP"

class MaccsFP(Fingerprint):
def __init__(self, nBits=167, **kwargs):
Expand Down Expand Up @@ -267,7 +299,6 @@ def getFingerprint(self, fp_type, *args, **kwargs):

def getMorganFP(self, *args, **kwargs):
    """Create a `MorganFP` instance, forwarding all arguments to its constructor."""
    return MorganFP(*args, **kwargs)

def getMaccsFP(self, *args, **kwargs):
    """Create a `MaccsFP` instance, forwarding all arguments to its constructor."""
    return MaccsFP(*args, **kwargs)

Expand All @@ -288,6 +319,9 @@ def getPatternFP(self, *args, **kwargs):

def getLayeredFP(self, *args, **kwargs):
    """Create a `LayeredFP` instance, forwarding all arguments to its constructor."""
    return LayeredFP(*args, **kwargs)

def getRDKitMACCSFP(self, *args, **kwargs):
    """Create a `RDKitMACCSFP` instance, forwarding all arguments to its constructor."""
    return RDKitMACCSFP(*args, **kwargs)

def getCDKFP(self, *args, **kwargs):
from qsprpred.extra.data.utils.descriptor_utils.fingerprints import CDKFP
Expand Down
7 changes: 6 additions & 1 deletion qsprpred/deep/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,9 @@ def evaluate(
save: bool = True,
es_val_size: float = 0.1,
parameters: dict | None = None,
score_func=None
) -> np.ndarray:
"""Make predictions for cross-validation and independent test set.
"""Make predictions for crossvalidation and independent test set.
Args:
save (bool):
Expand All @@ -286,8 +287,12 @@ def evaluate(
Returns:
np.ndarray:
predictions for test set and cross-validation for further analysis
score_func (Metric):
scoring function for the model, if None, the default scoring function
for the task is used
"""
evalparams = self.parameters if parameters is None else parameters
score_func = self.scoreFunc if score_func is None else score_func
X, X_ind = self.data.getFeatures()
y, y_ind = self.data.getTargetPropertiesValues()
last_save_epochs = 0
Expand Down
3 changes: 2 additions & 1 deletion qsprpred/model_CLI.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,8 @@ def QSPR_modelling(args):
counts = mydataset.y.value_counts()
scale_pos_weight = (
counts[0] / counts[1] if
(args.sample_weighing and not mydataset.isMultiClass()) else 1
(args.sample_weighing and len(tasks)==1 and
not tasks[0].isMultiClass()) else 1
)
if alg_dict[model_type] == XGBClassifier:
parameters["scale_pos_weight"] = scale_pos_weight
Expand Down
8 changes: 6 additions & 2 deletions qsprpred/models/hyperparam_optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,9 @@ def objective(self, trial : optuna.trial.Trial, model: QSPRModel) -> float:
# evaluate the model with the current parameters and return the score
y, y_ind = model.data.getTargetPropertiesValues()
score = self.scoreFunc(
y, model.evaluate(save=False, parameters=bayesian_params)
y, model.evaluate(save=False,
parameters=bayesian_params,
score_func=self.scoreFunc)
)
return score

Expand Down Expand Up @@ -182,7 +184,9 @@ def optimize(self, model: QSPRModel, save_params: bool = True) -> dict:
for params in ParameterGrid(self.paramGrid):
logger.info(params)
y, y_ind = model.data.getTargetPropertiesValues()
score = self.scoreFunc(y, model.evaluate(save=False, parameters=params))
score = self.scoreFunc(y, model.evaluate(save=False,
parameters=params,
score_func=self.scoreFunc))
logger.info("Score: %s" % score)
if score > self.bestScore:
self.bestScore = score
Expand Down
2 changes: 2 additions & 0 deletions qsprpred/models/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ class SklearnMetric(Metric):
"jaccard_micro",
"jaccard_macro",
"jaccard_weighted",
'matthews_corrcoef',
]
multiClassMetrics = [
"neg_log_loss",
Expand Down Expand Up @@ -303,6 +304,7 @@ def needsDiscreteToScore(self):
"recall_macro",
"recall_weighted",
"recall_samples",
"matthews_corrcoef",
]

def supportsTask(self, task: ModelTasks):
Expand Down
5 changes: 5 additions & 0 deletions qsprpred/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def evaluate(
self,
save: bool = True,
parameters: dict = None,
score_func=None,
**kwargs
) -> float | np.ndarray:
"""Make predictions for crossvalidation and independent test set.
Expand All @@ -122,6 +123,8 @@ def evaluate(
(don't save predictions when used in bayesian optimization)
parameters (dict):
model parameters, if None, the parameters from the model are used
score_func (Metric):
metric to use for scoring, if None, the metric from the model is used
**kwargs:
additional keyword arguments for the estimator's predict method
Expand All @@ -130,6 +133,8 @@ def evaluate(
predictions for evaluation
"""
evalparams = self.parameters if parameters is None else parameters
score_func = self.scoreFunc if score_func is None else score_func

# check if data is available
self.checkForData()
folds = self.data.createFolds()
Expand Down

0 comments on commit fd59af7

Please sign in to comment.