Merge branch 'fix/qspr_scorer' into 'master'

v3.4.2 See merge request cdd/DrugEx!97
CDDLeiden · Mar 3, 2023 · 28bb959 · 28bb959
2 parents 1069f0e + 2d4fa36
commit 28bb959
Show file tree

Hide file tree

Showing 11 changed files with 20,747 additions and 29,548 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,21 +1,13 @@
 # Change Log
-From v3.4.0 to v3.4.1
+From v3.4.1 to v3.4.2
 
 ## Fixes
 
-- Content of output files during model training and molecule generation (broken due to refactoring in `v3.4.0`):
-  - During fine-tuning, the training (`train_loss`) and the validation (`valid_loss`) loss, the rations of valid (`valid_ratio`) and accurate (`accurate_ratio`, only for transformers) molecules are saved in `_fit.tsv`
-  - During RL, the rations of valid (`valid_ratio`), accurate (`accurate_ratio`, only for transformers), unique (`unique_ratio`) and desired (`desired_ratio`) molecules and the average arithmetic (`avg_amean`) and geometric (`avg_gmean`) of the modified scores are saved in `_fit.tsv`
-- In `DrugExEnvironment.getScores()` set all modified scores to 0 for invalid molecules (fixes bug resulting from refactoring in `v3.4.0`)
-- Fixed the CLI so that it supports new QSPRPred models.
-- Fixed the tutorial for scaffold-based generation.
+- The `QSPRPredScorer` now works correctly when given RDKit molecule objects instead of SMILES strings. It also no longer modifies the input list.
 
 ## Changes
 
-- Minimal supported version of QSPRPred compatible with the tutorial and CLI is now `v1.3.0.dev0`.
-- The `train` CLI script now uses the `'-p', '--predictor'` option to specify the QSPRPred model to use. It takes a path to the model's `_meta.json` file. More models can be specified this way.
-  - This changes the original meaning of the `'-ta', '--active_targets'`, `'-ti', '--inactive_targets'` and `'-tw', '--window_targets'` options. These now serve to link the models to the particular type of target. The name of the QSPRPred model is used to determine the type of target it represents. For example, if the QSPRPred model is called `A2AR_RandomForestClassifier`, then the `'-ta', '--active_targets'` option will be used to link to the `A2AR_RandomForestClassifier` as a predictor predicting activity towards a target. 
-- Standard crowding distance is now the default ranking method for the `train` script (equiv. to `--scheme PRCD`, previously was `--scheme PRTD`).
+None.
 
 ## Removed Features
 

diff --git a/drugex/about.py b/drugex/about.py
@@ -5,4 +5,4 @@
 On: 24.06.22, 10:36
 """
 
-VERSION = "3.4.1"
+VERSION = "3.4.2"
diff --git a/drugex/download.py b/drugex/download.py
@@ -32,7 +32,7 @@ def DownloadTutorial(args):
     # Link to DrugEx v3 pretrained model (graph-based; Papyrus 05.5)
     link_pretrained_model2 = "https://zenodo.org/record/7085421/files/DrugEx_PT_Papyrus05.5.zip?download=1"
     # Link to QSAR example model
-    link_qsar_model = "https://zenodo.org/record/7650233/files/qspr.zip?download=1"
+    link_qsar_model = "https://zenodo.org/record/7694931/files/qspr.zip?download=1"
 
     # Download model files
     pretrained_models_path_rnn = os.path.join(args.out_dir, 'models', 'pretrained', 'smiles-rnn')

diff --git a/drugex/training/generators/interfaces.py b/drugex/training/generators/interfaces.py
@@ -445,7 +445,7 @@ def generate(self, input_frags: List[str] = None, input_dataset: DataSet = None,
                     evaluator=evaluator,
                     no_multifrag_smiles=no_multifrag_smiles,
                     unmodified_scores=raw_scores
-                )
+                )[evaluator.getScorerKeys()]
             ], axis=1)
 
         if not keep_frags:

diff --git a/drugex/training/generators/sequence_rnn.py b/drugex/training/generators/sequence_rnn.py
@@ -323,5 +323,5 @@ def generate(self, num_samples=100, batch_size=32, n_proc=1,
         # Post-processing
         df = df_all.head(num_samples)
         if evaluator:
-            df = pd.concat([df, self.evaluate(df.SMILES.tolist(), evaluator=evaluator, no_multifrag_smiles=no_multifrag_smiles, unmodified_scores=raw_scores)], axis=1)    
+            df = pd.concat([df, self.evaluate(df.SMILES.tolist(), evaluator=evaluator, no_multifrag_smiles=no_multifrag_smiles, unmodified_scores=raw_scores)[evaluator.getScorerKeys()]], axis=1)    
         return df.drop('Frags', axis=1).round(decimals=3)
diff --git a/drugex/training/scorers/qsprpred.py b/drugex/training/scorers/qsprpred.py
@@ -7,6 +7,7 @@
 import numpy as np
 from rdkit import Chem
 
+from drugex.logs import logger
 from drugex.training.scorers.interfaces import Scorer
 from qsprpred.models.tasks import ModelTasks
 
@@ -18,26 +19,30 @@ def __init__(self, model, invalids_score=0.0, modifier=None):
         self.invalidsScore = invalids_score
 
     def getScores(self, mols, frags=None):
+        parsed_mols = []
         if type(mols[0]) != str:
             invalids = 0
-            for idx, mol in enumerate(mols):
+            for mol in mols:
                 try:
-                    mol = Chem.SanitizeMol(mol)
+                    Chem.SanitizeMol(mol)
                     mol = Chem.MolToSmiles(mol) if mol and mol.GetNumAtoms() > 1 else "INVALID"
-                except:
+                except Exception as exp:
+                    logger.debug(f"Error processing molecule: {Chem.MolToSmiles(mol) if mol else mol} -> \n\t {exp}")
                     mol = "INVALID"
                 if mol == "INVALID":
                     invalids += 1
-                mols[idx] = mol
+                parsed_mols.append(mol)
 
-            if invalids == len(mols):
-                return np.array([self.invalidsScore] * len(mols))
+            if invalids == len(parsed_mols):
+                return np.array([self.invalidsScore] * len(parsed_mols))
+        else:
+            parsed_mols = mols
 
         if self.model.task == ModelTasks.REGRESSION:
-            return self.model.predictMols(mols)
+            return self.model.predictMols(parsed_mols)
         else:
             # FIXME: currently we only assume that the model is a binary classifier with the positive class being the last one in the list of probabilities
-            return np.array([probas[-1] if not np.isnan(probas[-1]) else self.invalidsScore for probas in  self.model.predictMols(mols, use_probas=True)])
+            return np.array([probas[-1] if not np.isnan(probas[-1]) else self.invalidsScore for probas in  self.model.predictMols(parsed_mols, use_probas=True)])
 
     def getKey(self):
         return f"QSPRpred_{self.model.name}"
diff --git a/drugex/training/tests.py b/drugex/training/tests.py
@@ -13,6 +13,8 @@
 
 import numpy as np
 import pandas as pd
+from rdkit import Chem
+
 from drugex.data.corpus.corpus import SequenceCorpus
 from drugex.data.corpus.vocabulary import VocGraph, VocSmiles
 from drugex.data.datasets import (GraphFragDataSet, SmilesDataSet,
@@ -119,6 +121,22 @@ def getPredictor():
         ret = MockScorer()
     return ret
 
+class TestScorer(TestCase):
+
+    def test_getScores(self):
+        scorer = getPredictor()
+        mols = ["CCO", "CC"]
+        scores = scorer.getScores(mols)
+        self.assertEqual(len(scores), len(mols))
+        self.assertTrue(all([isinstance(score, float) and score > 0 for score in scores]))
+
+        mols = [Chem.MolFromSmiles("CCO"), Chem.MolFromSmiles("CC")]
+        scores = scorer.getScores(mols)
+        self.assertEqual(len(scores), len(mols))
+        self.assertTrue(all([isinstance(score, float) and score > 0 for score in scores]))
+
+        # TODO: check with empty and invalid molecules as well
+
 class TrainingTestCase(TestCase):
 
     # input file information