Skip to content

Commit

Permalink
Merge pull request #840 from choderalab/support-rst7
Browse files Browse the repository at this point in the history
Add support for amber rst7 files and make jobid 1-based
  • Loading branch information
andrrizzi authored Dec 4, 2017
2 parents 462231a + ed4c039 commit 1a80cd5
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 101 deletions.
2 changes: 1 addition & 1 deletion Yank/commands/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
Optional Arguments:
--jobid=INTEGER You can run only a subset of the experiments by specifying jobid and njobs, where
0 <= job_id <= n_jobs-1. In this case, njobs must be specified as well and YANK will
1 <= job_id <= n_jobs. In this case, njobs must be specified as well and YANK will
run only 1/n_jobs of the experiments. This can be used to run several separate YANK
executions in parallel starting from the same script.
--njobs=INTEGER Specify the total number of parallel executions. jobid has to be specified too.
Expand Down
31 changes: 17 additions & 14 deletions Yank/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ class ExperimentBuilder(object):
can load it later by using :func:`parse` (default is None).
job_id : None or int
If you want to split the experiments among different executions,
you can set this to an integer 0 <= job_id <= n_jobs-1, and this
you can set this to an integer 1 <= job_id <= n_jobs, and this
:class:`ExperimentBuilder` will run only 1/n_jobs of the experiments.
n_jobs : None or int
If ``job_id`` is specified, this is the total number of jobs that
Expand Down Expand Up @@ -606,8 +606,8 @@ def __init__(self, script=None, job_id=None, n_jobs=None):
if job_id is not None:
if n_jobs is None:
raise ValueError('n_jobs must be specified together with job_id')
if not 0 <= job_id <= n_jobs:
raise ValueError('job_id must be between 0 and n_jobs ({})'.format(n_jobs))
if not 1 <= job_id <= n_jobs:
raise ValueError('job_id must be between 1 and n_jobs ({})'.format(n_jobs))

self._job_id = job_id
self._n_jobs = n_jobs
Expand Down Expand Up @@ -1021,7 +1021,10 @@ def _expand_experiments(self):

# Loop over all combinations
for name, combination in experiment.named_combinations(separator='_', max_name_length=50):
if self._job_id is None or experiment_id % self._n_jobs == self._job_id:
# Both self._job_id and self._job_id-1 work (self._job_id is 1-based),
# but we use the latter just because it makes it easier to identify in
# advance which job ids are associated to which experiments.
if self._job_id is None or experiment_id % self._n_jobs == self._job_id-1:
yield os.path.join(output_dir, name), combination
experiment_id += 1

Expand Down Expand Up @@ -1437,7 +1440,6 @@ def _to_unit_unless_none(input_quantity):
for solvent_id, solvent_descr in solvents_description.items():
if solvent_validator.validate(solvent_descr):
validated_solvents[solvent_id] = solvent_validator.document
print(solvent_validator.document)
else:
error = "Solvent '{}' did not validate! Check the schema error below for details\n{}"
raise YamlParseError(error.format(solvent_id, yaml.dump(solvent_validator.errors)))
Expand Down Expand Up @@ -1658,15 +1660,21 @@ def _is_pipeline_solvent_with_receptor(field, solvent_id, error):
type: string
validator: file_exists
dependencies: phase2_path
coerce: sort_alphabetically_by_extension
validator: supported_system_files
phase2_path:
required: no
type: list
schema:
type: string
validator: file_exists
validator: is_system_files_matching_phase1
coerce: sort_alphabetically_by_extension
validator: supported_system_files
gromacs_include_dir:
required: no
type: string
dependencies: [phase1_path, phase2_path]
validator: directory_exists
# Solvents
solvent:
required: no
type: string
Expand Down Expand Up @@ -1694,13 +1702,8 @@ def _is_pipeline_solvent_with_receptor(field, solvent_id, error):
oneof:
- dependencies: [phase1_path, phase2_path, solvent1]
- dependencies: [solute, solvent1]
gromacs_include_dir:
required: no
type: string
dependencies: [phase1_path, phase2_path]
validator: directory_exists
# Non-Prebuilt setup
# Automatic pipeline
receptor:
required: no
type: string
Expand Down
22 changes: 15 additions & 7 deletions Yank/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,7 @@ def get_system_files_paths(self, system_id):
"""
Paths = collections.namedtuple('Paths', ['position_path', 'parameters_path'])
system_dir = os.path.join(self.setup_dir, self.SYSTEMS_DIR, system_id)

if 'receptor' in self.systems[system_id]:
system_files_paths = [
Paths(position_path=os.path.join(system_dir, 'complex.inpcrd'),
Expand All @@ -872,13 +873,20 @@ def get_system_files_paths(self, system_id):
parameters_path=os.path.join(system_dir, 'solvent2.prmtop'))
]
else:
system_files_paths = [
Paths(position_path=self.systems[system_id]['phase1_path'][0],
parameters_path=self.systems[system_id]['phase1_path'][1]),
Paths(position_path=self.systems[system_id]['phase2_path'][0],
parameters_path=self.systems[system_id]['phase2_path'][1])
]

parameter_file_extensions = {'prmtop', 'top', 'xml'}
system_files_paths = []
for phase_path_name in ['phase1_path', 'phase2_path']:
file_paths = self.systems[system_id][phase_path_name]
assert len(file_paths) == 2

# Make sure that the position file is first.
first_file_extension = os.path.splitext(file_paths[0])[1][1:]
if first_file_extension in parameter_file_extensions:
file_paths = list(reversed(file_paths))

# Append Paths object.
system_files_paths.append(Paths(position_path=file_paths[0],
parameters_path=file_paths[1]))
return system_files_paths

def is_molecule_setup(self, molecule_id):
Expand Down
68 changes: 26 additions & 42 deletions Yank/schema/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,19 @@ class YANKCerberusValidator(cerberus.Validator):
Custom cerberus.Validator class for YANK extending the Validator to include all the methods needed
by YANK
"""

# ====================================================
# DATA COERCION
# ====================================================

def _normalize_coerce_single_str_to_list(self, value):
"""Cast a single string to a list of string"""
return [value] if isinstance(value, str) else value

def _normalize_coerce_sort_alphabetically_by_extension(self, files):
provided_extensions = [os.path.splitext(filepath)[1][1:] for filepath in files]
return [filepath for (ext, filepath) in sorted(zip(provided_extensions, files))]

# ====================================================
# DATA VALIDATORS
# ====================================================

def _validator_file_exists(self, field, filepath):
"""Assert that the file is in fact, a file!"""
if not os.path.isfile(filepath):
Expand Down Expand Up @@ -61,47 +60,32 @@ def _validator_int_or_all_string(self, field, value):
if value != 'all' and not isinstance(value, int):
self._error(field, "{} must be an int or the string 'all'".format(value))

def _validator_supported_system_files(self, field, file_paths):
    """Ensure the input system files belong to a supported setup type.

    Parameters
    ----------
    field : str
        Name of the schema field being validated (e.g. 'phase1_path').
    file_paths : list of str
        The system file paths provided for this field.

    The files' extensions must match one of the supported pairs:
    AMBER (inpcrd/prmtop or rst7/prmtop), GROMACS (gro/top) or
    OpenMM (pdb/xml). On mismatch, a validation error is recorded
    through ``self._error``.
    """
    # Obtain the extension (without the leading dot) of the system files.
    file_extensions = {os.path.splitext(file_path)[1][1:] for file_path in file_paths}

    # Supported (format, extension set) pairs. 'amber' appears twice
    # because both inpcrd and rst7 are valid AMBER position files.
    expected_extensions = [
        ('amber', {'inpcrd', 'prmtop'}),
        ('amber', {'rst7', 'prmtop'}),
        ('gromacs', {'gro', 'top'}),
        ('openmm', {'pdb', 'xml'})
    ]

    # Find a match for the extensions.
    file_extension_type = None
    for extension_type, valid_extensions in expected_extensions:
        if file_extensions == valid_extensions:
            file_extension_type = extension_type
            break

    # Verify we found a match.
    if file_extension_type is None:
        self._error(field, '{} must have file extensions matching one of the '
                           'following types: {}'.format(field, expected_extensions))
        return

    # Fixed message formatting: original read 'recognized {}files',
    # which rendered without a space after the field name.
    logger.debug('Correctly recognized {} files ({}) as {} '
                 'files'.format(field, file_paths, file_extension_type))

# ====================================================
# DATA TYPES
Expand Down
96 changes: 59 additions & 37 deletions Yank/tests/test_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import mdtraj
import numpy as np

from nose.tools import assert_raises, assert_equal
from nose.tools import assert_raises, assert_equal, assert_raises_regexp
from nose.plugins.attrib import attr

from yank.experiment import *
Expand Down Expand Up @@ -594,48 +594,70 @@ def test_validation_wrong_systems():
""".format(data_paths['lysozyme'])
basic_script = yaml.load(textwrap.dedent(basic_script))

# Each test case is a pair (regexp_error, system_description).
systems = [
{'receptor': 'rec', 'ligand': 'lig'},
{'receptor': 'rec_region', 'ligand': 'lig_region', 'solvent': 'solv'},
{'receptor': 'rec', 'ligand': 1, 'solvent': 'solv'},
{'receptor': 'rec', 'ligand': 'lig', 'solvent': ['solv', 'solv']},
{'receptor': 'rec', 'ligand': 'lig', 'solvent': 'unknown'},
{'receptor': 'rec', 'ligand': 'lig', 'solvent': 'solv4',
'leap': {'parameters': ['leaprc.gaff', 'leaprc.ff14SB']}},
{'receptor': 'rec', 'ligand': 'lig', 'solvent': 'solv3',
'parameters': 'leaprc.ff14SB'},
("'solvent' is required",
{'receptor': 'rec', 'ligand': 'lig'}),

{'phase1_path': data_paths['bentol-complex'][0],
'phase2_path': data_paths['bentol-solvent'],
'ligand_dsl': 'resname BEN', 'solvent': 'solv'},
{'phase1_path': ['nonexistingpath.prmtop', 'nonexistingpath.inpcrd'],
'phase2_path': data_paths['bentol-solvent'],
'ligand_dsl': 'resname BEN', 'solvent': 'solv'},
{'phase1_path': data_paths['bentol-complex'],
'phase2_path': data_paths['bentol-solvent'],
'ligand_dsl': 3.4, 'solvent': 'solv'},
{'phase1_path': data_paths['bentol-complex'],
'phase2_path': data_paths['bentol-solvent'],
'ligand_dsl': 'resname BEN', 'solvent1': 'unknown',
'solvent2': 'solv2'},
("regions\(s\) clashing",
{'receptor': 'rec_region', 'ligand': 'lig_region', 'solvent': 'solv'}),

{'phase1_path': data_paths['bentol-complex'],
'phase2_path': data_paths['pxylene-solvent'],
'ligand_dsl': 'resname p-xylene', 'solvent': 'solv',
'gromacs_include_dir': data_paths['pxylene-gro-include']},
("ligand: \[must be of string type\]",
{'receptor': 'rec', 'ligand': 1, 'solvent': 'solv'}),

{'phase1_path': data_paths['toluene-solvent'],
'phase2_path': data_paths['toluene-vacuum'],
'ligand_dsl': 'resname TOL', 'solvent': 'cantbespecified'},
("solvent: \[must be of string type\]",
{'receptor': 'rec', 'ligand': 'lig', 'solvent': ['solv', 'solv']}),

("unallowed value unknown",
{'receptor': 'rec', 'ligand': 'lig', 'solvent': 'unknown'}),

("solv4 does not specify clearance",
{'receptor': 'rec', 'ligand': 'lig', 'solvent': 'solv4',
'leap': {'parameters': ['leaprc.gaff', 'leaprc.ff14SB']}}),

("parameters: \[unknown field\]",
{'receptor': 'rec', 'ligand': 'lig', 'solvent': 'solv3',
'parameters': 'leaprc.ff14SB'}),

("phase1_path: \[must be of list type\]",
{'phase1_path': data_paths['bentol-complex'][0],
'phase2_path': data_paths['bentol-solvent'],
'ligand_dsl': 'resname BEN', 'solvent': 'solv'}),

("File path nonexistingpath.prmtop does not exist.",
{'phase1_path': ['nonexistingpath.prmtop', 'nonexistingpath.inpcrd'],
'phase2_path': data_paths['bentol-solvent'],
'ligand_dsl': 'resname BEN', 'solvent': 'solv'}),

("ligand_dsl: \[must be of string type\]",
{'phase1_path': data_paths['bentol-complex'],
'phase2_path': data_paths['bentol-solvent'],
'ligand_dsl': 3.4, 'solvent': 'solv'}),

("unallowed value unknown",
{'phase1_path': data_paths['bentol-complex'],
'phase2_path': data_paths['bentol-solvent'],
'ligand_dsl': 'resname BEN', 'solvent1': 'unknown',
'solvent2': 'solv2'}),

{'receptor': 'rec', 'solute': 'lig', 'solvent1': 'solv', 'solvent2': 'solv'},
{'ligand': 'lig', 'solute': 'lig', 'solvent1': 'solv', 'solvent2': 'solv'},
{'solute': 'lig', 'solvent1': 'solv', 'solvent2': 'solv', 'leap': 'leaprc.gaff'}
("unallowed value cantbespecified",
{'phase1_path': data_paths['toluene-solvent'],
'phase2_path': data_paths['toluene-vacuum'],
'ligand_dsl': 'resname TOL', 'solvent': 'cantbespecified'}),

("field 'ligand' is required",
{'receptor': 'rec', 'solute': 'lig', 'solvent1': 'solv', 'solvent2': 'solv'}),

("''ligand'' must not be present with ''solute''",
{'ligand': 'lig', 'solute': 'lig', 'solvent1': 'solv', 'solvent2': 'solv'}),

("leap: \[must be of dict type\]",
{'solute': 'lig', 'solvent1': 'solv', 'solvent2': 'solv', 'leap': 'leaprc.gaff'})
]
for system in systems:
for regexp, system in systems:
modified_script = basic_script.copy()
modified_script['systems'] = {'sys': system}
yield assert_raises, YamlParseError, exp_builder.parse, modified_script
yield assert_raises_regexp, YamlParseError, regexp, exp_builder.parse, modified_script


def test_order_phases():
Expand Down Expand Up @@ -1853,11 +1875,11 @@ def test_expand_experiments():
experiment_systems = utils.CombinatorialLeaf(['explicit-system', 'implicit-system', 'hydration-system'])
template_script['experiments']['system'] = experiment_systems

exp_builder = ExperimentBuilder(script=template_script, job_id=0, n_jobs=2)
exp_builder = ExperimentBuilder(script=template_script, job_id=1, n_jobs=2)
experiments = list(exp_builder._expand_experiments())
assert len(experiments) == 2

exp_builder = ExperimentBuilder(script=template_script, job_id=1, n_jobs=2)
exp_builder = ExperimentBuilder(script=template_script, job_id=2, n_jobs=2)
experiments = list(exp_builder._expand_experiments())
assert len(experiments) == 1

Expand Down

0 comments on commit 1a80cd5

Please sign in to comment.