Skip to content

Commit

Permalink
Extract script and template info from Modeller models
Browse files Browse the repository at this point in the history
  • Loading branch information
benmwebb committed Oct 24, 2023
1 parent da801c8 commit 1260a0d
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 64 deletions.
161 changes: 119 additions & 42 deletions ihm/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import warnings
import sys
import re
import collections

# Handle different naming of urllib in Python 2/3
try:
Expand All @@ -47,6 +48,46 @@ def _get_modeller(version, date):
citation=ihm.citations.modeller)


# One template used by Modeller to build (part of) a model: the template's
# name, the begin/end residues and chain of the aligned region in the
# template, the same for the target, and the percentage sequence identity.
ModellerTemplate = collections.namedtuple(
    'ModellerTemplate',
    'name template_begin template_chain template_end '
    'target_begin target_chain target_end pct_seq_id')


def _handle_modeller_template(info, template_path_map, target_dataset,
                              alnfile):
    """Build a comparative modeling template from one Modeller record.

    :param info: a :class:`ModellerTemplate` describing the template; its
           begin/end fields must be convertible with ``int`` and
           ``pct_seq_id`` with ``float``.
    :param template_path_map: mapping from template name to file name,
           consulted only when the name does not look like a PDB code.
    :param target_dataset: dataset for the model itself; the dataset
           created for the template is appended to its ``parents``.
    :param alnfile: stored as the template's ``alignment_file``.
    :return: a ``(target chain ID, startmodel.Template)`` pair.
    :raises KeyError: if the name does not look like a PDB code and is
            not a key of `template_path_map`.
    """
    template_range = (int(info.template_begin), int(info.template_end))
    target_range = (int(info.target_begin), int(info.target_end))
    identity = startmodel.SequenceIdentity(
        float(info.pct_seq_id), SequenceIdentityDenominator.SHORTER_LENGTH)

    # Names of the form 1abc, 1abc_N, 1abcX, or 1abcX_N are assumed to
    # refer to a real PDB structure
    pdb_match = re.match(r'(\d[a-zA-Z0-9]{3})[a-zA-Z]?(_.*)?$', info.name)
    if pdb_match is None:
        # Anything else must be a local file listed in the path map
        loc = location.InputFileLocation(
            template_path_map[info.name],
            details="Template for comparative modeling")
    else:
        loc = location.PDBLocation(pdb_match.group(1).upper())
    template_dataset = dataset.PDBDataset(loc, details=loc.details)

    # The comparative model dataset derives from the template's dataset
    target_dataset.parents.append(template_dataset)

    target_chain = info.target_chain
    template = startmodel.Template(
        dataset=template_dataset, asym_id=info.template_chain,
        seq_id_range=target_range,
        template_seq_id_range=template_range,
        sequence_identity=identity,
        alignment_file=alnfile)
    return target_chain, template


class Parser(object):
"""Base class for all metadata parsers."""

Expand Down Expand Up @@ -492,12 +533,17 @@ def _get_templates_script(self, pdbname, target_dataset):
fname, details="Script for starting comparative model")
m = tmpre.match(line)
if m:
template_info.append(m)
t = ModellerTemplate(
name=m.group(1), template_begin=m.group(2),
template_chain=m.group(3), template_end=m.group(4),
target_begin=m.group(5), target_chain=m.group(6),
target_end=m.group(7), pct_seq_id=m.group(8))
template_info.append(t)

templates = {}
for t in template_info:
chain, template = self._handle_template(t, template_path_map,
target_dataset, alnfile)
chain, template = _handle_modeller_template(
t, template_path_map, target_dataset, alnfile)
if chain not in templates:
templates[chain] = []
templates[chain].append(template)
Expand All @@ -507,41 +553,6 @@ def _get_templates_script(self, pdbname, target_dataset):
key=operator.attrgetter('seq_id_range'))
return templates, script

def _handle_template(self, info, template_path_map, target_dataset,
                     alnfile):
    """Build a comparative modeling template from a matched header line.

    :param info: regular expression match object whose groups 1-8 hold,
           in order: template name, template begin residue, template
           chain, template end residue, target begin residue, target
           chain, target end residue, and percentage sequence identity.
    :param template_path_map: mapping from template name to file name,
           consulted only when the name does not look like a PDB code.
    :param target_dataset: dataset for the model itself; the dataset
           created for the template is appended to its ``parents``.
    :param alnfile: stored as the template's ``alignment_file``.
    :return: a ``(target chain ID, startmodel.Template)`` pair.
    :raises KeyError: if the name does not look like a PDB code and is
            not a key of `template_path_map`.
    """
    (name, tmpl_begin, tmpl_chain, tmpl_end,
     tgt_begin, tgt_chain, tgt_end, pct_identity) = info.group(
        1, 2, 3, 4, 5, 6, 7, 8)
    template_range = (int(tmpl_begin), int(tmpl_end))
    target_range = (int(tgt_begin), int(tgt_end))
    identity = startmodel.SequenceIdentity(
        float(pct_identity), SequenceIdentityDenominator.SHORTER_LENGTH)

    # Names of the form 1abc, 1abc_N, 1abcX, or 1abcX_N are assumed to
    # refer to a real PDB structure
    pdb_match = re.match(r'(\d[a-zA-Z0-9]{3})[a-zA-Z]?(_.*)?$', name)
    if pdb_match is None:
        # Anything else must be a local file listed in the path map
        loc = location.InputFileLocation(
            template_path_map[name],
            details="Template for comparative modeling")
    else:
        loc = location.PDBLocation(pdb_match.group(1).upper())
    template_dataset = dataset.PDBDataset(loc, details=loc.details)

    # The comparative model dataset derives from the template's dataset
    target_dataset.parents.append(template_dataset)

    template = startmodel.Template(
        dataset=template_dataset, asym_id=tmpl_chain,
        seq_id_range=target_range,
        template_seq_id_range=template_range,
        sequence_identity=identity,
        alignment_file=alnfile)
    return tgt_chain, template

def _parse_pdb_records(self, fh, first_line):
"""Extract information from an official PDB"""
metadata = []
Expand Down Expand Up @@ -658,6 +669,44 @@ def __call__(self, method):
self.m['software'].append(s)


class _ModellerHandler(ihm.reader.Handler):
    """Reader handler for the Modeller-specific ``_modeller`` category.

    The alignment file and script named in the category are stored as
    location objects in the metadata dict `m`, under the ``'alnfile'``
    and ``'script'`` keys. Both keys are set to None on construction.
    """

    def __init__(self, m, filename):
        self.m = m
        self.filename = filename
        m['alnfile'] = None
        m['script'] = None

    def __call__(self, alignment, script):
        # Both paths are given relative to the mmCIF file, so resolve
        # them against its location
        if alignment:
            self.m['alnfile'] = location.InputFileLocation(
                util._get_relative_path(self.filename, alignment),
                details="Alignment for starting comparative model")
        if script:
            self.m['script'] = location.WorkflowFileLocation(
                util._get_relative_path(self.filename, script),
                details="Script for starting comparative model")


class _ModellerTemplateHandler(ihm.reader.Handler):
    """Reader handler for the Modeller-specific ``_modeller_template``
       category.

    Every row is stored as a :class:`ModellerTemplate` in the list
    ``m['modeller_templates']``, which is created empty on construction.
    """

    def __init__(self, m):
        self.m = m
        m['modeller_templates'] = []

    def __call__(self, name, template_begin, template_end, target_begin,
                 target_end, pct_seq_id):
        # Each begin/end value is split at its first colon into a residue
        # part and a chain part. The chain recorded for the template and
        # for the target is the one from the corresponding *end* value.
        tmp_begin, _ = template_begin.split(':', 1)
        tmp_end, tmp_chain = template_end.split(':', 1)
        tgt_begin, _ = target_begin.split(':', 1)
        tgt_end, tgt_chain = target_end.split(':', 1)
        record = ModellerTemplate(
            name=name,
            template_begin=tmp_begin, template_chain=tmp_chain,
            template_end=tmp_end,
            target_begin=tgt_begin, target_chain=tgt_chain,
            target_end=tgt_end,
            pct_seq_id=pct_seq_id)
        self.m['modeller_templates'].append(record)


class CIFParser(Parser):
"""Extract metadata from an mmCIF file. Currently, this does not handle
information from comparative modeling packages such as MODELLER
Expand All @@ -677,8 +726,15 @@ def parse_file(self, filename):
as an entry in the PDB or Model Archive databases if the
file contains appropriate headers, otherwise to the
file itself;
and 'software' pointing to a list of software used to
generate the file (as :class:`ihm.Software` objects);
'templates' pointing to a dict with keys the asym (chain)
IDs in the PDB file and values the list of comparative
model templates used to model that chain as
:class:`ihm.startmodel.Template` objects;
'software' pointing to a list of software used to generate
the file (as :class:`ihm.Software` objects);
'script' pointing to the script used to generate the
file, if any (as :class:`ihm.location.WorkflowFileLocation`
objects).
"""
m = {'db': {}, 'title': 'Starting model structure',
'software': []}
Expand All @@ -687,13 +743,18 @@ def parse_file(self, filename):
structh = _StructHandler(m)
arevhisth = _AuditRevHistHandler(m)
exptlh = _ExptlHandler(m)
modellerh = _ModellerHandler(m, filename)
modtmplh = _ModellerTemplateHandler(m)
r = ihm.format.CifReader(
fh, {'_database_2': dbh, '_struct': structh,
'_pdbx_audit_revision_history': arevhisth,
'_exptl': exptlh})
'_exptl': exptlh, '_modeller': modellerh,
'_modeller_template': modtmplh})
r.read_file()
dset = self._get_dataset(filename, m)
return {'dataset': dset, 'software': m['software']}
return {'dataset': dset, 'software': m['software'],
'templates': self._get_templates(filename, m, dset),
'script': m['script']}

def _get_dataset(self, filename, m):
# Check for known databases. Note that if a file is in multiple
Expand All @@ -708,3 +769,19 @@ def _get_dataset(self, filename, m):
loc = location.InputFileLocation(filename, details=m['title'])
return dataset.ComparativeModelDataset(
location=loc, details=loc.details)

def _get_templates(self, filename, m, dset):
    """Group the Modeller templates read from the file by target chain.

    :param filename: not used by this method.
    :param m: metadata dict; its ``'alnfile'`` and
           ``'modeller_templates'`` entries are read.
    :param dset: dataset for the model; each template's dataset is
           added as one of its parents by
           ``_handle_modeller_template``.
    :return: dict mapping each target chain ID to a list of templates,
             ordered by ``seq_id_range``.
    """
    alnfile = m['alnfile']
    # No template paths are collected here, so only templates whose
    # names look like PDB codes can be resolved
    no_paths = {}
    by_chain = {}
    for info in m['modeller_templates']:
        chain, template = _handle_modeller_template(
            info, no_paths, dset, alnfile)
        by_chain.setdefault(chain, []).append(template)
    # Order each chain's templates by starting residue, then ending
    # residue
    for chain_templates in by_chain.values():
        chain_templates.sort(key=operator.attrgetter('seq_id_range'))
    return by_chain
2 changes: 1 addition & 1 deletion test/input/modeller_model.cif
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ _modeller.objective_function 266.4716
_modeller.best_template_pct_seq_id 37.037
_modeller.sequence 1fdx
_modeller.alignment modeller_model.ali
_modeller.script model-cif.py
_modeller.script modeller_model.py
#
loop_
_modeller_template.id
Expand Down
45 changes: 24 additions & 21 deletions test/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,11 @@ def test_modeller_model_no_aln(self):
for t in templates:
self.assertIsNone(t.alignment_file)

def check_modeller_model(self, pdbname):
p = self._parse_pdb(pdbname)
def check_modeller_model(self, pdbname, cif=False):
if cif:
p = self._parse_cif(pdbname)
else:
p = self._parse_pdb(pdbname)
dataset = p['dataset']
self.assertEqual(sorted(p['templates'].keys()), ['A', 'B'])
s1, s2 = p['templates']['A']
Expand Down Expand Up @@ -304,11 +307,18 @@ def check_modeller_model(self, pdbname):
self.assertEqual(p3.location.access_code, '1ABC')
s, = p['software']
self.assertEqual(s.name, 'MODELLER')
self.assertEqual(s.version, '9.18')
self.assertEqual(
s.description,
'Comparative modeling by satisfaction of spatial restraints, '
'build 2017/02/10 22:21:34')
if cif:
self.assertEqual(s.version, '10.4')
self.assertEqual(
s.description,
'Comparative modeling by satisfaction of spatial restraints, '
'build 2023/10/23 11:26:12')
else:
self.assertEqual(s.version, '9.18')
self.assertEqual(
s.description,
'Comparative modeling by satisfaction of spatial restraints, '
'build 2017/02/10 22:21:34')
return p

def test_modeller_local(self):
Expand Down Expand Up @@ -471,20 +481,13 @@ def test_cif_unknown(self):
def test_cif_modeller_model(self):
"""Test CIFParser when given a Modeller model"""
fname = utils.get_input_file_name(TOPDIR, 'modeller_model.cif')
p = self._parse_cif(fname)
dataset = p['dataset']
self.assertEqual(dataset.data_type, 'Comparative model')
self.assertEqual(dataset.location.path, fname)
self.assertIsNone(dataset.location.repo)
self.assertEqual(dataset.location.details,
'Starting model structure')
s, = p['software']
self.assertEqual(s.name, 'MODELLER')
self.assertEqual(s.version, '10.4')
self.assertEqual(
s.description,
'Comparative modeling by satisfaction of spatial restraints, '
'build 2023/10/23 11:26:12')
p = self.check_modeller_model(fname, cif=True)
aliname = utils.get_input_file_name(TOPDIR, 'modeller_model.ali')
script = utils.get_input_file_name(TOPDIR, 'modeller_model.py')
self.assertEqual(p['script'].path, script)
for templates in p['templates'].values():
for t in templates:
self.assertEqual(t.alignment_file.path, aliname)


if __name__ == '__main__':
Expand Down

0 comments on commit 1260a0d

Please sign in to comment.