Extract script and template info from Modeller models

ihmwg · Oct 24, 2023 · 1260a0d · 1260a0d
1 parent da801c8
commit 1260a0d
Show file tree

Hide file tree

Showing 3 changed files with 144 additions and 64 deletions.
diff --git a/ihm/metadata.py b/ihm/metadata.py
@@ -25,6 +25,7 @@
 import warnings
 import sys
 import re
+import collections
 
 # Handle different naming of urllib in Python 2/3
 try:
@@ -47,6 +48,46 @@ def _get_modeller(version, date):
         citation=ihm.citations.modeller)
 
 
+ModellerTemplate = collections.namedtuple(
+    'ModellerTemplate', ['name', 'template_begin', 'template_chain',
+                         'template_end', 'target_begin', 'target_chain',
+                         'target_end', 'pct_seq_id'])
+
+
+def _handle_modeller_template(info, template_path_map, target_dataset,
+                              alnfile):
+    """Create a Template object from Modeller PDB header information."""
+    template_seq_id_range = (int(info.template_begin),
+                             int(info.template_end))
+    seq_id_range = (int(info.target_begin), int(info.target_end))
+    sequence_identity = startmodel.SequenceIdentity(
+        float(info.pct_seq_id), SequenceIdentityDenominator.SHORTER_LENGTH)
+
+    # Assume a code of 1abc, 1abc_N, 1abcX, or 1abcX_N refers
+    # to a real PDB structure
+    m = re.match(r'(\d[a-zA-Z0-9]{3})[a-zA-Z]?(_.*)?$', info.name)
+    if m:
+        template_db_code = m.group(1).upper()
+        loc = location.PDBLocation(template_db_code)
+    else:
+        # Otherwise, look up the PDB file in TEMPLATE PATH remarks
+        fname = template_path_map[info.name]
+        loc = location.InputFileLocation(
+            fname, details="Template for comparative modeling")
+    d = dataset.PDBDataset(loc, details=loc.details)
+
+    # Make the comparative model dataset derive from the template's
+    target_dataset.parents.append(d)
+
+    return (info.target_chain,
+            startmodel.Template(
+                dataset=d, asym_id=info.template_chain,
+                seq_id_range=seq_id_range,
+                template_seq_id_range=template_seq_id_range,
+                sequence_identity=sequence_identity,
+                alignment_file=alnfile))
+
+
 class Parser(object):
     """Base class for all metadata parsers."""
 
@@ -492,12 +533,17 @@ def _get_templates_script(self, pdbname, target_dataset):
                         fname, details="Script for starting comparative model")
                 m = tmpre.match(line)
                 if m:
-                    template_info.append(m)
+                    t = ModellerTemplate(
+                        name=m.group(1), template_begin=m.group(2),
+                        template_chain=m.group(3), template_end=m.group(4),
+                        target_begin=m.group(5), target_chain=m.group(6),
+                        target_end=m.group(7), pct_seq_id=m.group(8))
+                    template_info.append(t)
 
         templates = {}
         for t in template_info:
-            chain, template = self._handle_template(t, template_path_map,
-                                                    target_dataset, alnfile)
+            chain, template = _handle_modeller_template(
+                t, template_path_map, target_dataset, alnfile)
             if chain not in templates:
                 templates[chain] = []
             templates[chain].append(template)
@@ -507,41 +553,6 @@ def _get_templates_script(self, pdbname, target_dataset):
                                       key=operator.attrgetter('seq_id_range'))
         return templates, script
 
-    def _handle_template(self, info, template_path_map, target_dataset,
-                         alnfile):
-        """Create a Template object from Modeller PDB header information."""
-        template_code = info.group(1)
-        template_seq_id_range = (int(info.group(2)), int(info.group(4)))
-        template_asym_id = info.group(3)
-        seq_id_range = (int(info.group(5)), int(info.group(7)))
-        target_asym_id = info.group(6)
-        sequence_identity = startmodel.SequenceIdentity(
-            float(info.group(8)), SequenceIdentityDenominator.SHORTER_LENGTH)
-
-        # Assume a code of 1abc, 1abc_N, 1abcX, or 1abcX_N refers
-        # to a real PDB structure
-        m = re.match(r'(\d[a-zA-Z0-9]{3})[a-zA-Z]?(_.*)?$', template_code)
-        if m:
-            template_db_code = m.group(1).upper()
-            loc = location.PDBLocation(template_db_code)
-        else:
-            # Otherwise, look up the PDB file in TEMPLATE PATH remarks
-            fname = template_path_map[template_code]
-            loc = location.InputFileLocation(
-                fname, details="Template for comparative modeling")
-        d = dataset.PDBDataset(loc, details=loc.details)
-
-        # Make the comparative model dataset derive from the template's
-        target_dataset.parents.append(d)
-
-        return (target_asym_id,
-                startmodel.Template(
-                    dataset=d, asym_id=template_asym_id,
-                    seq_id_range=seq_id_range,
-                    template_seq_id_range=template_seq_id_range,
-                    sequence_identity=sequence_identity,
-                    alignment_file=alnfile))
-
     def _parse_pdb_records(self, fh, first_line):
         """Extract information from an official PDB"""
         metadata = []
@@ -658,6 +669,44 @@ def __call__(self, method):
             self.m['software'].append(s)
 
 
+class _ModellerHandler(ihm.reader.Handler):
+    """Handle the Modeller-specific _modeller category"""
+    def __init__(self, m, filename):
+        self.m = m
+        self.filename = filename
+        self.m['alnfile'] = self.m['script'] = None
+
+    def __call__(self, alignment, script):
+        if alignment:
+            # Paths are relative to that of the mmCIF file
+            fname = util._get_relative_path(self.filename, alignment)
+            self.m['alnfile'] = location.InputFileLocation(
+                fname, details="Alignment for starting comparative model")
+        if script:
+            fname = util._get_relative_path(self.filename, script)
+            self.m['script'] = location.WorkflowFileLocation(
+                fname, details="Script for starting comparative model")
+
+
+class _ModellerTemplateHandler(ihm.reader.Handler):
+    """Handle the Modeller-specific _modeller_template category"""
+    def __init__(self, m):
+        self.m = m
+        self.m['modeller_templates'] = []
+
+    def __call__(self, name, template_begin, template_end, target_begin,
+                 target_end, pct_seq_id):
+        tmp_begin, tmp_chain = template_begin.split(':', 1)
+        tmp_end, tmp_chain = template_end.split(':', 1)
+        tgt_begin, tgt_chain = target_begin.split(':', 1)
+        tgt_end, tgt_chain = target_end.split(':', 1)
+        t = ModellerTemplate(name=name, template_begin=tmp_begin,
+                             template_end=tmp_end, template_chain=tmp_chain,
+                             target_begin=tgt_begin, target_end=tgt_end,
+                             target_chain=tgt_chain, pct_seq_id=pct_seq_id)
+        self.m['modeller_templates'].append(t)
+
+
 class CIFParser(Parser):
     """Extract metadata from an mmCIF file. Currently, this does not handle
        information from comparative modeling packages such as MODELLER
@@ -677,8 +726,15 @@ def parse_file(self, filename):
                     as an entry in the PDB or Model Archive databases if the
                     file contains appropriate headers, otherwise to the
                     file itself;
-                    and 'software' pointing to a list of software used to
-                    generate the file (as :class:`ihm.Software` objects);
+                    'templates' pointing to a dict with keys the asym (chain)
+                    IDs in the mmCIF file and values the list of comparative
+                    model templates used to model that chain as
+                    :class:`ihm.startmodel.Template` objects;
+                    'software' pointing to a list of software used to generate
+                    the file (as :class:`ihm.Software` objects);
+                    'script' pointing to the script used to generate the file,
+                    if any (as a :class:`ihm.location.WorkflowFileLocation`
+                    object).
         """
         m = {'db': {}, 'title': 'Starting model structure',
              'software': []}
@@ -687,13 +743,18 @@ def parse_file(self, filename):
             structh = _StructHandler(m)
             arevhisth = _AuditRevHistHandler(m)
             exptlh = _ExptlHandler(m)
+            modellerh = _ModellerHandler(m, filename)
+            modtmplh = _ModellerTemplateHandler(m)
             r = ihm.format.CifReader(
                 fh, {'_database_2': dbh, '_struct': structh,
                      '_pdbx_audit_revision_history': arevhisth,
-                     '_exptl': exptlh})
+                     '_exptl': exptlh, '_modeller': modellerh,
+                     '_modeller_template': modtmplh})
             r.read_file()
         dset = self._get_dataset(filename, m)
-        return {'dataset': dset, 'software': m['software']}
+        return {'dataset': dset, 'software': m['software'],
+                'templates': self._get_templates(filename, m, dset),
+                'script': m['script']}
 
     def _get_dataset(self, filename, m):
         # Check for known databases. Note that if a file is in multiple
@@ -708,3 +769,19 @@ def _get_dataset(self, filename, m):
         loc = location.InputFileLocation(filename, details=m['title'])
         return dataset.ComparativeModelDataset(
             location=loc, details=loc.details)
+
+    def _get_templates(self, filename, m, dset):
+        alnfile = m['alnfile']
+        template_path_map = {}
+        templates = {}
+        for t in m['modeller_templates']:
+            chain, template = _handle_modeller_template(t, template_path_map,
+                                                        dset, alnfile)
+            if chain not in templates:
+                templates[chain] = []
+            templates[chain].append(template)
+        # Sort templates by starting residue, then ending residue
+        for chain in templates.keys():
+            templates[chain] = sorted(templates[chain],
+                                      key=operator.attrgetter('seq_id_range'))
+        return templates
diff --git a/test/input/modeller_model.cif b/test/input/modeller_model.cif
@@ -7,7 +7,7 @@ _modeller.objective_function        266.4716
 _modeller.best_template_pct_seq_id  37.037
 _modeller.sequence 1fdx
 _modeller.alignment modeller_model.ali
-_modeller.script model-cif.py
+_modeller.script modeller_model.py
 #
 loop_
 _modeller_template.id

diff --git a/test/test_metadata.py b/test/test_metadata.py
@@ -266,8 +266,11 @@ def test_modeller_model_no_aln(self):
             for t in templates:
                 self.assertIsNone(t.alignment_file)
 
-    def check_modeller_model(self, pdbname):
-        p = self._parse_pdb(pdbname)
+    def check_modeller_model(self, pdbname, cif=False):
+        if cif:
+            p = self._parse_cif(pdbname)
+        else:
+            p = self._parse_pdb(pdbname)
         dataset = p['dataset']
         self.assertEqual(sorted(p['templates'].keys()), ['A', 'B'])
         s1, s2 = p['templates']['A']
@@ -304,11 +307,18 @@ def check_modeller_model(self, pdbname):
         self.assertEqual(p3.location.access_code, '1ABC')
         s, = p['software']
         self.assertEqual(s.name, 'MODELLER')
-        self.assertEqual(s.version, '9.18')
-        self.assertEqual(
-            s.description,
-            'Comparative modeling by satisfaction of spatial restraints, '
-            'build 2017/02/10 22:21:34')
+        if cif:
+            self.assertEqual(s.version, '10.4')
+            self.assertEqual(
+                s.description,
+                'Comparative modeling by satisfaction of spatial restraints, '
+                'build 2023/10/23 11:26:12')
+        else:
+            self.assertEqual(s.version, '9.18')
+            self.assertEqual(
+                s.description,
+                'Comparative modeling by satisfaction of spatial restraints, '
+                'build 2017/02/10 22:21:34')
         return p
 
     def test_modeller_local(self):
@@ -471,20 +481,13 @@ def test_cif_unknown(self):
     def test_cif_modeller_model(self):
         """Test CIFParser when given a Modeller model"""
         fname = utils.get_input_file_name(TOPDIR, 'modeller_model.cif')
-        p = self._parse_cif(fname)
-        dataset = p['dataset']
-        self.assertEqual(dataset.data_type, 'Comparative model')
-        self.assertEqual(dataset.location.path, fname)
-        self.assertIsNone(dataset.location.repo)
-        self.assertEqual(dataset.location.details,
-                         'Starting model structure')
-        s, = p['software']
-        self.assertEqual(s.name, 'MODELLER')
-        self.assertEqual(s.version, '10.4')
-        self.assertEqual(
-            s.description,
-            'Comparative modeling by satisfaction of spatial restraints, '
-            'build 2023/10/23 11:26:12')
+        p = self.check_modeller_model(fname, cif=True)
+        aliname = utils.get_input_file_name(TOPDIR, 'modeller_model.ali')
+        script = utils.get_input_file_name(TOPDIR, 'modeller_model.py')
+        self.assertEqual(p['script'].path, script)
+        for templates in p['templates'].values():
+            for t in templates:
+                self.assertEqual(t.alignment_file.path, aliname)
 
 
 if __name__ == '__main__':