Skip to content

Commit

Permalink
Merge branch 'fix/qspr_scorer' into 'master'
Browse files Browse the repository at this point in the history
v3.4.2

See merge request cdd/DrugEx!97
  • Loading branch information
martin-sicho committed Mar 3, 2023
2 parents 1069f0e + 2d4fa36 commit 28bb959
Show file tree
Hide file tree
Showing 11 changed files with 20,747 additions and 29,548 deletions.
14 changes: 3 additions & 11 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,21 +1,13 @@
# Change Log
From v3.4.0 to v3.4.1
From v3.4.1 to v3.4.2

## Fixes

- Content of output files during model training and molecule generation (broken due to refactoring in `v3.4.0`):
- During fine-tuning, the training (`train_loss`) and the validation (`valid_loss`) loss, and the ratios of valid (`valid_ratio`) and accurate (`accurate_ratio`, only for transformers) molecules are saved in `_fit.tsv`
- During RL, the ratios of valid (`valid_ratio`), accurate (`accurate_ratio`, only for transformers), unique (`unique_ratio`) and desired (`desired_ratio`) molecules and the average arithmetic (`avg_amean`) and geometric (`avg_gmean`) means of the modified scores are saved in `_fit.tsv`
- In `DrugExEnvironment.getScores()` set all modified scores to 0 for invalid molecules (fixes bug resulting from refactoring in `v3.4.0`)
- Fixed the CLI so that it supports new QSPRPred models.
- Fixed the tutorial for scaffold-based generation.
- The `QSPRPredScorer` now functions properly when presented with rdkit molecules instead of SMILES strings. It also does not modify the input list anymore.

## Changes

- Minimal supported version of QSPRPred compatible with the tutorial and CLI is now `v1.3.0.dev0`.
- The `train` CLI script now uses the `'-p', '--predictor'` option to specify the QSPRPred model to use. It takes a path to the model's `_meta.json` file. More models can be specified this way.
- This changes the original meaning of the `'-ta', '--active_targets'`, `'-ti', '--inactive_targets'` and `'-tw', '--window_targets'` options. These now serve to link the models to the particular type of target. The name of the QSPRPred model is used to determine the type of target it represents. For example, if the QSPRPred model is called `A2AR_RandomForestClassifier`, then the `'-ta', '--active_targets'` option will be used to link to the `A2AR_RandomForestClassifier` as a predictor predicting activity towards a target.
- Standard crowding distance is now the default ranking method for the `train` script (equiv. to `--scheme PRCD`, previously was `--scheme PRTD`).
None.

## Removed Features

Expand Down
2 changes: 1 addition & 1 deletion drugex/about.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
On: 24.06.22, 10:36
"""

VERSION = "3.4.1"
VERSION = "3.4.2"
2 changes: 1 addition & 1 deletion drugex/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def DownloadTutorial(args):
# Link to DrugEx v3 pretrained model (graph-based; Papyrus 05.5)
link_pretrained_model2 = "https://zenodo.org/record/7085421/files/DrugEx_PT_Papyrus05.5.zip?download=1"
# Link to QSAR example model
link_qsar_model = "https://zenodo.org/record/7650233/files/qspr.zip?download=1"
link_qsar_model = "https://zenodo.org/record/7694931/files/qspr.zip?download=1"

# Download model files
pretrained_models_path_rnn = os.path.join(args.out_dir, 'models', 'pretrained', 'smiles-rnn')
Expand Down
2 changes: 1 addition & 1 deletion drugex/training/generators/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def generate(self, input_frags: List[str] = None, input_dataset: DataSet = None,
evaluator=evaluator,
no_multifrag_smiles=no_multifrag_smiles,
unmodified_scores=raw_scores
)
)[evaluator.getScorerKeys()]
], axis=1)

if not keep_frags:
Expand Down
2 changes: 1 addition & 1 deletion drugex/training/generators/sequence_rnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,5 +323,5 @@ def generate(self, num_samples=100, batch_size=32, n_proc=1,
# Post-processing
df = df_all.head(num_samples)
if evaluator:
df = pd.concat([df, self.evaluate(df.SMILES.tolist(), evaluator=evaluator, no_multifrag_smiles=no_multifrag_smiles, unmodified_scores=raw_scores)], axis=1)
df = pd.concat([df, self.evaluate(df.SMILES.tolist(), evaluator=evaluator, no_multifrag_smiles=no_multifrag_smiles, unmodified_scores=raw_scores)[evaluator.getScorerKeys()]], axis=1)
return df.drop('Frags', axis=1).round(decimals=3)
21 changes: 13 additions & 8 deletions drugex/training/scorers/qsprpred.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
from rdkit import Chem

from drugex.logs import logger
from drugex.training.scorers.interfaces import Scorer
from qsprpred.models.tasks import ModelTasks

Expand All @@ -18,26 +19,30 @@ def __init__(self, model, invalids_score=0.0, modifier=None):
self.invalidsScore = invalids_score

def getScores(self, mols, frags=None):
parsed_mols = []
if type(mols[0]) != str:
invalids = 0
for idx, mol in enumerate(mols):
for mol in mols:
try:
mol = Chem.SanitizeMol(mol)
Chem.SanitizeMol(mol)
mol = Chem.MolToSmiles(mol) if mol and mol.GetNumAtoms() > 1 else "INVALID"
except:
except Exception as exp:
logger.debug(f"Error processing molecule: {Chem.MolToSmiles(mol) if mol else mol} -> \n\t {exp}")
mol = "INVALID"
if mol == "INVALID":
invalids += 1
mols[idx] = mol
parsed_mols.append(mol)

if invalids == len(mols):
return np.array([self.invalidsScore] * len(mols))
if invalids == len(parsed_mols):
return np.array([self.invalidsScore] * len(parsed_mols))
else:
parsed_mols = mols

if self.model.task == ModelTasks.REGRESSION:
return self.model.predictMols(mols)
return self.model.predictMols(parsed_mols)
else:
# FIXME: currently we only assume that the model is a binary classifier with the positive class being the last one in the list of probabilities
return np.array([probas[-1] if not np.isnan(probas[-1]) else self.invalidsScore for probas in self.model.predictMols(mols, use_probas=True)])
return np.array([probas[-1] if not np.isnan(probas[-1]) else self.invalidsScore for probas in self.model.predictMols(parsed_mols, use_probas=True)])

def getKey(self):
# Key identifying this scorer: the "QSPRpred_" prefix followed by the wrapped model's name.
return f"QSPRpred_{self.model.name}"
18 changes: 18 additions & 0 deletions drugex/training/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

import numpy as np
import pandas as pd
from rdkit import Chem

from drugex.data.corpus.corpus import SequenceCorpus
from drugex.data.corpus.vocabulary import VocGraph, VocSmiles
from drugex.data.datasets import (GraphFragDataSet, SmilesDataSet,
Expand Down Expand Up @@ -119,6 +121,22 @@ def getPredictor():
ret = MockScorer()
return ret

# Checks that the scorer returned by getPredictor() scores both SMILES strings and RDKit molecules.
class TestScorer(TestCase):

def test_getScores(self):
# Scorer under test, obtained from the getPredictor() helper.
scorer = getPredictor()
# Case 1: molecules supplied as SMILES strings.
mols = ["CCO", "CC"]
scores = scorer.getScores(mols)
# Expect exactly one score per input molecule, each a positive float.
self.assertEqual(len(scores), len(mols))
self.assertTrue(all([isinstance(score, float) and score > 0 for score in scores]))

# Case 2: the same molecules supplied as RDKit Mol objects instead of strings.
mols = [Chem.MolFromSmiles("CCO"), Chem.MolFromSmiles("CC")]
scores = scorer.getScores(mols)
# Same expectations as for the SMILES input.
self.assertEqual(len(scores), len(mols))
self.assertTrue(all([isinstance(score, float) and score > 0 for score in scores]))

# TODO: check with empty and invalid molecules as well

class TrainingTestCase(TestCase):

# input file information
Expand Down
Loading

0 comments on commit 28bb959

Please sign in to comment.