diff --git a/docs/model.rst b/docs/model.rst index 8006d8e..23b024d 100644 --- a/docs/model.rst +++ b/docs/model.rst @@ -17,5 +17,7 @@ The :mod:`modelcif.model` Python module .. autoclass:: AbInitioModel +.. autoclass:: NotModeledResidueRange + .. autoclass:: ModelGroup :members: diff --git a/modelcif/model.py b/modelcif/model.py index 9876813..25d34d3 100644 --- a/modelcif/model.py +++ b/modelcif/model.py @@ -2,6 +2,8 @@ import ihm.representation from ihm.model import Atom, ModelGroup # noqa: F401 import modelcif.data +from ihm.util import _check_residue_range + # Provide ma-specific docs for Atom if sys.version_info[0] >= 3: @@ -62,10 +64,11 @@ def __init__(self, assembly, name=None): [ihm.representation.AtomicSegment(seg, rigid=False) for seg in assembly]) self._atoms = [] - # ModelCIF doesn't support not-modeled residue ranges, but python-ihm - # uses this information to populate the pdbx_poly_seq_scheme table, - # so add an empty range here + + #: List of residue ranges that were explicitly not modeled. See + #: :class:`NotModeledResidueRange`. self.not_modeled_residue_ranges = [] + #: Quality scores for the model or part of it (a simple list of #: metric objects; see :mod:`modelcif.qa_metric`) self.qa_metrics = [] @@ -116,3 +119,20 @@ class AbInitioModel(Model): """ model_type = "Ab initio model" other_details = None + + +class NotModeledResidueRange(object): + """A range of residues that were explicitly not modeled. + See :attr:`Model.not_modeled_residue_ranges`. + These ranges are not explicitly stored in the mmCIF file, + but they will be excluded from the ``pdbx_poly_seq_scheme`` table. + + :param asym_unit: The asymmetric unit to which the residues belong. + :type asym_unit: :class:`~modelcif.AsymUnit` + :param int seq_id_begin: Starting residue in the range. + :param int seq_id_end: Ending residue in the range. + """ + def __init__(self, asym_unit, seq_id_begin, seq_id_end): + self.asym_unit = asym_unit + self.seq_id_begin, self.seq_id_end = seq_id_begin, seq_id_end + _check_residue_range((seq_id_begin, seq_id_end), asym_unit.entity) diff --git a/modelcif/reader.py b/modelcif/reader.py index a13311d..298e1e1 100644 --- a/modelcif/reader.py +++ b/modelcif/reader.py @@ -663,7 +663,7 @@ def __call__(self, assembly_id, asym_id, seq_id_begin, seq_id_end): if seq_id_begin is None and seq_id_end is None: a.append(asym) else: - a.append(asym(int(seq_id_begin), int(seq_id_end), _check=False)) + a.append(asym(int(seq_id_begin), int(seq_id_end))) def finalize(self): # Any AsymUnitRange which covers an entire asym, @@ -899,7 +899,7 @@ def __call__(self, model_id, label_asym_id, label_seq_id, metric_id, model = self.sysr.models.get_by_id(model_id) asym = self.sysr.asym_units.get_by_id(label_asym_id) seq_id = self.get_int(label_seq_id) - residue = asym.residue(seq_id, _check=False) + residue = asym.residue(seq_id) metric_class = self.sysr.qa_by_id[metric_id] model.qa_metrics.append(metric_class(residue, self.get_float(metric_value))) @@ -913,10 +913,10 @@ def __call__(self, model_id, label_asym_id_1, label_seq_id_1, model = self.sysr.models.get_by_id(model_id) asym1 = self.sysr.asym_units.get_by_id(label_asym_id_1) seq_id1 = self.get_int(label_seq_id_1) - residue1 = asym1.residue(seq_id1, _check=False) + residue1 = asym1.residue(seq_id1) asym2 = self.sysr.asym_units.get_by_id(label_asym_id_2) seq_id2 = self.get_int(label_seq_id_2) - residue2 = asym2.residue(seq_id2, _check=False) + residue2 = asym2.residue(seq_id2) metric_class = self.sysr.qa_by_id[metric_id] model.qa_metrics.append(metric_class(residue1, residue2, self.get_float(metric_value))) diff --git a/modelcif/util/__init__.py b/modelcif/util/__init__.py new file mode 100644 index 0000000..2ae2839 --- /dev/null +++ b/modelcif/util/__init__.py @@ -0,0 +1 @@ +pass diff --git a/modelcif/util/make_mmcif.py b/modelcif/util/make_mmcif.py new file mode 100644 index 0000000..9239385 --- /dev/null +++ b/modelcif/util/make_mmcif.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +""" +Add minimal ModelCIF-related tables to an mmCIF file. + +Given any mmCIF file as input, this script will add any missing +ModelCIF-related tables and write out a new file that is minimally compliant +with the ModelCIF dictionary. + +This is done by simply reading in the original file with python-modelcif and +then writing it out again, so + a) any data in the input file that is not understood by python-modelcif + will be lost on output; and + b) input files that aren't compliant with the PDBx dictionary, or that + contain syntax errors or other problems, may crash or otherwise confuse + python-modelcif. +""" + + +import modelcif.reader +import modelcif.dumper +import modelcif.model +import ihm.util +import os +import argparse + + +def add_modelcif_info(s): + if not s.title: + s.title = 'Auto-generated system' + + for model_group in s.model_groups: + for model in model_group: + model.not_modeled_residue_ranges.extend( + _get_not_modeled_residues(model)) + return s + + +def _get_not_modeled_residues(model): + """Yield NotModeledResidueRange objects for all residue ranges in the + Model that are not referenced by Atom, Sphere, or pre-existing + NotModeledResidueRange objects""" + for assem in model.assembly: + asym = assem.asym if hasattr(assem, 'asym') else assem + if not asym.entity.is_polymeric(): + continue + # Make a set of all residue indices of this asym "handled" either + # by being modeled (with Atom or Sphere objects) or by being + # explicitly marked as not-modeled + handled_residues = set() + for rr in model.not_modeled_residue_ranges: + if rr.asym_unit is asym: + for seq_id in range(rr.seq_id_begin, rr.seq_id_end + 1): + handled_residues.add(seq_id) + for atom in model._atoms: + if atom.asym_unit is asym: + handled_residues.add(atom.seq_id) + # Convert set to a list of residue ranges + handled_residues = ihm.util._make_range_from_list( + sorted(handled_residues)) + # Return not-modeled for each non-handled range + for r in ihm.util._invert_ranges(handled_residues, + end=assem.seq_id_range[1], + start=assem.seq_id_range[0]): + yield modelcif.model.NotModeledResidueRange(asym, r[0], r[1]) + + +def get_args(): + p = argparse.ArgumentParser( + description="Add minimal ModelCIF-related tables to an mmCIF file.") + p.add_argument("input", metavar="input.cif", help="input mmCIF file name") + p.add_argument("output", metavar="output.cif", + help="output mmCIF file name", + default="output.cif", nargs="?") + return p.parse_args() + + +def main(): + args = get_args() + + if (os.path.exists(args.input) and os.path.exists(args.output) + and os.path.samefile(args.input, args.output)): + raise ValueError("Input and output are the same file") + + with open(args.input) as fh: + with open(args.output, 'w') as fhout: + modelcif.dumper.write( + fhout, + [add_modelcif_info(s) for s in modelcif.reader.read(fh)]) + + +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py index b66828d..24f2473 100755 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ author='Ben Webb', author_email='ben@salilab.org', url='https://github.com/ihmwg/python-modelcif', - packages=['modelcif'], + packages=['modelcif', 'modelcif.util'], install_requires=['ihm>=1.5'], classifiers=[ "Programming Language :: Python :: 2.7", diff --git a/test/input/no_title.cif b/test/input/no_title.cif new file mode 100644 index 0000000..56db457 --- /dev/null +++ b/test/input/no_title.cif @@ -0,0 +1,3 @@ +data_PDBDEV_00000025 +_entry.id PDBDEV_00000025 +_struct.entry_id PDBDEV_00000025 diff --git a/test/input/not_modeled.cif b/test/input/not_modeled.cif new file mode 100644 index 0000000..367ae17 --- /dev/null +++ b/test/input/not_modeled.cif @@ -0,0 +1,89 @@ +data_model +# +_exptl.method 'model, MODELLER Version 9.24 2020/08/21 11:54:31' +# +_modeller.version 9.24 +# +loop_ +_struct_asym.id +_struct_asym.entity_id +_struct_asym.details +A 1 ? +B 2 ? +# +loop_ +_entity_poly_seq.entity_id +_entity_poly_seq.num +_entity_poly_seq.mon_id +1 1 VAL +1 2 GLY +1 3 GLN +1 4 GLN +1 5 TYR +1 6 SER +1 7 SER +2 1 ASP +2 2 GLU +# +loop_ +_atom_site.group_PDB +_atom_site.type_symbol +_atom_site.label_atom_id +_atom_site.label_alt_id +_atom_site.label_comp_id +_atom_site.label_asym_id +_atom_site.auth_asym_id +_atom_site.label_seq_id +_atom_site.auth_seq_id +_atom_site.pdbx_PDB_ins_code +_atom_site.Cartn_x +_atom_site.Cartn_y +_atom_site.Cartn_z +_atom_site.occupancy +_atom_site.B_iso_or_equiv +_atom_site.label_entity_id +_atom_site.id +_atom_site.pdbx_PDB_model_num +ATOM N N . VAL A A 1 2 ? 115.846 27.965 -26.370 1.000 141.830 1 1 1 +ATOM C CA . VAL A A 1 2 ? 114.370 27.980 -26.088 1.000 143.490 1 2 1 +ATOM C C . VAL A A 1 2 ? 113.517 27.504 -27.287 1.000 143.910 1 3 1 +ATOM O O . VAL A A 1 2 ? 113.885 27.746 -28.441 1.000 146.600 1 4 1 +ATOM C CB . VAL A A 1 2 ? 113.901 29.406 -25.683 1.000 143.750 1 5 1 +ATOM C CG1 . VAL A A 1 2 ? 115.030 30.438 -25.931 1.000 144.590 1 6 1 +ATOM C CG2 . VAL A A 1 2 ? 112.669 29.783 -26.486 1.000 144.500 1 7 1 +ATOM N N . GLY A A 2 3 ? 112.371 26.869 -27.012 1.000 142.200 1 8 1 +ATOM C CA . GLY A A 2 3 ? 111.506 26.368 -28.075 1.000 137.530 1 9 1 +ATOM C C . GLY A A 2 3 ? 111.719 24.869 -28.275 1.000 135.820 1 10 1 +ATOM O O . GLY A A 2 3 ? 110.768 24.093 -28.268 1.000 134.380 1 11 1 +ATOM N N . GLN A A 3 4 ? 112.989 24.479 -28.392 1.000 134.310 1 12 1 +ATOM C CA . GLN A A 3 4 ? 113.468 23.113 -28.639 1.000 128.420 1 13 1 +ATOM C C . GLN A A 3 4 ? 113.556 22.956 -30.163 1.000 121.240 1 14 1 +ATOM O O . GLN A A 3 4 ? 113.552 23.977 -30.840 1.000 127.090 1 15 1 +ATOM C CB . GLN A A 3 4 ? 112.614 22.038 -27.919 1.000 132.340 1 16 1 +ATOM C CG . GLN A A 3 4 ? 113.028 21.943 -26.407 1.000 135.370 1 17 1 +ATOM C CD . GLN A A 3 4 ? 112.604 20.667 -25.677 1.000 138.260 1 18 1 +ATOM O OE1 . GLN A A 3 4 ? 112.836 19.543 -26.150 1.000 141.450 1 19 1 +ATOM N NE2 . GLN A A 3 4 ? 112.006 20.839 -24.497 1.000 139.310 1 20 1 +ATOM N N . GLN A A 4 5 ? 113.648 21.739 -30.710 1.000 124.970 1 21 1 +ATOM C CA . GLN A A 4 5 ? 113.808 21.534 -32.168 1.000 117.620 1 22 1 +ATOM C C . GLN A A 4 5 ? 114.778 22.519 -32.833 1.000 112.980 1 23 1 +ATOM O O . GLN A A 4 5 ? 114.677 23.727 -32.677 1.000 116.850 1 24 1 +ATOM C CB . GLN A A 4 5 ? 112.456 21.545 -32.905 1.000 121.870 1 25 1 +ATOM C CG . GLN A A 4 5 ? 111.763 20.153 -32.917 1.000 123.750 1 26 1 +ATOM C CD . GLN A A 4 5 ? 110.863 19.874 -34.145 1.000 123.650 1 27 1 +ATOM O OE1 . GLN A A 4 5 ? 110.040 20.712 -34.537 1.000 122.500 1 28 1 +ATOM N NE2 . GLN A A 4 5 ? 111.008 18.674 -34.737 1.000 122.090 1 29 1 +ATOM N N . SER A A 7 8 ? 117.999 25.245 -39.224 1.000 89.750 1 48 1 +ATOM C CA . SER A A 7 8 ? 119.165 25.590 -40.036 1.000 87.320 1 49 1 +ATOM C C . SER A A 7 8 ? 119.224 27.089 -40.277 1.000 84.820 1 50 1 +ATOM O O . SER A A 7 8 ? 120.074 27.594 -41.008 1.000 84.020 1 51 1 +ATOM C CB . SER A A 7 8 ? 119.112 24.859 -41.383 1.000 88.180 1 52 1 +ATOM O OG . SER A A 7 8 ? 117.956 25.221 -42.117 1.000 88.850 1 53 1 +ATOM N N . ASP B B 1 3 ? 71.339 57.678 52.031 1.000 152.010 2 54 1 +ATOM C CA . ASP B B 1 3 ? 70.427 58.819 51.717 1.000 152.390 2 55 1 +ATOM C C . ASP B B 1 3 ? 70.144 58.821 50.222 1.000 151.960 2 56 1 +ATOM O O . ASP B B 1 3 ? 70.984 59.245 49.435 1.000 151.590 2 57 1 +ATOM C CB . ASP B B 1 3 ? 71.083 60.142 52.119 1.000 153.250 2 58 1 +ATOM C CG . ASP B B 1 3 ? 71.660 60.105 53.526 1.000 154.120 2 59 1 +ATOM O OD1 . ASP B B 1 3 ? 72.652 59.371 53.741 1.000 154.200 2 60 1 +ATOM O OD2 . ASP B B 1 3 ? 71.119 60.804 54.415 1.000 154.250 2 61 1 diff --git a/test/input/struct_only.cif b/test/input/struct_only.cif new file mode 100644 index 0000000..fd8617c --- /dev/null +++ b/test/input/struct_only.cif @@ -0,0 +1,4 @@ +data_PDBDEV_00000025 +_entry.id PDBDEV_00000025 +_struct.entry_id PDBDEV_00000025 +_struct.title 'Architecture of Pol II(G) and molecular mechanism of transcription regulation by Gdown1' diff --git a/test/test_make_mmcif.py b/test/test_make_mmcif.py index e14c857..2b86065 100644 --- a/test/test_make_mmcif.py +++ b/test/test_make_mmcif.py @@ -7,34 +7,95 @@ TOPDIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) utils.set_search_paths(TOPDIR) import modelcif.reader +import modelcif.util.make_mmcif # Script should also be importable -MAKE_MMCIF = os.path.join(TOPDIR, 'util', 'make-mmcif.py') +MAKE_MMCIF = os.path.join(TOPDIR, 'modelcif', 'util', 'make_mmcif.py') class Tests(unittest.TestCase): - @unittest.skipIf(sys.version_info[0] < 3, "make-mmcif.py needs Python 3") + @unittest.skipIf(sys.version_info[0] < 3, "make_mmcif.py needs Python 3") + def test_simple(self): + """Simple test of make_mmcif utility script""" + incif = utils.get_input_file_name(TOPDIR, 'struct_only.cif') + subprocess.check_call([sys.executable, MAKE_MMCIF, incif]) + with open('output.cif') as fh: + s, = modelcif.reader.read(fh) + self.assertEqual(s.title, + 'Architecture of Pol II(G) and molecular mechanism ' + 'of transcription regulation by Gdown1') + os.unlink('output.cif') + + @unittest.skipIf(sys.version_info[0] < 3, "make_mmcif.py needs Python 3") + def test_non_default_output(self): + """Simple test of make_mmcif with non-default output name""" + incif = utils.get_input_file_name(TOPDIR, 'struct_only.cif') + subprocess.check_call([sys.executable, MAKE_MMCIF, incif, + 'non-default-output.cif']) + with open('non-default-output.cif') as fh: + s, = modelcif.reader.read(fh) + self.assertEqual(s.title, + 'Architecture of Pol II(G) and molecular mechanism ' + 'of transcription regulation by Gdown1') + os.unlink('non-default-output.cif') + + @unittest.skipIf(sys.version_info[0] < 3, "make_mmcif.py needs Python 3") + def test_no_title(self): + """Check that make_mmcif adds missing title""" + incif = utils.get_input_file_name(TOPDIR, 'no_title.cif') + subprocess.check_call([sys.executable, MAKE_MMCIF, incif]) + with open('output.cif') as fh: + s, = modelcif.reader.read(fh) + self.assertEqual(s.title, 'Auto-generated system') + os.unlink('output.cif') + + @unittest.skipIf(sys.version_info[0] < 3, "make_mmcif.py needs Python 3") def test_bad_usage(self): - """Bad usage of make-mmcif utility script""" + """Bad usage of make_mmcif utility script""" ret = subprocess.call([sys.executable, MAKE_MMCIF]) + self.assertEqual(ret, 2) + + @unittest.skipIf(sys.version_info[0] < 3, "make_mmcif.py needs Python 3") + def test_same_file(self): + """Check that make_mmcif fails if input and output are the same""" + incif = utils.get_input_file_name(TOPDIR, 'struct_only.cif') + ret = subprocess.call([sys.executable, MAKE_MMCIF, incif, incif]) self.assertEqual(ret, 1) - @unittest.skipIf(sys.version_info[0] < 3, "make-mmcif.py needs Python 3") - def test_mini(self): - """Check that make-mmcif works given only basic atom info""" - incif = utils.get_input_file_name(TOPDIR, 'mini.cif') + @unittest.skipIf(sys.version_info[0] < 3, "make_mmcif.py needs Python 3") + def test_not_modeled(self): + """Check addition of not-modeled residue information""" + incif = utils.get_input_file_name(TOPDIR, 'not_modeled.cif') subprocess.check_call([sys.executable, MAKE_MMCIF, incif]) with open('output.cif') as fh: - s, = modelcif.reader.read(fh) - self.assertEqual(s.title, 'Auto-generated system') - self.assertEqual(len(s.protocols), 1) - p = s.protocols[0] - self.assertEqual(len(p.steps), 1) - step = p.steps[0] - self.assertIsInstance(step, modelcif.protocol.ModelingStep) - self.assertEqual(step.name, 'modeling') - self.assertEqual(step.input_data, []) - self.assertEqual(step.output_data, []) + contents = fh.readlines() + loop = contents.index("_pdbx_poly_seq_scheme.pdb_ins_code\n") + scheme = "".join(contents[loop-11:loop+11]) + # Residues 5 and 6 in chain A, and 2 in chain B, are missing from + # atom_site, so should now be missing from the scheme table. + self.assertEqual(scheme, """# +loop_ +_pdbx_poly_seq_scheme.asym_id +_pdbx_poly_seq_scheme.entity_id +_pdbx_poly_seq_scheme.seq_id +_pdbx_poly_seq_scheme.mon_id +_pdbx_poly_seq_scheme.pdb_seq_num +_pdbx_poly_seq_scheme.auth_seq_num +_pdbx_poly_seq_scheme.pdb_mon_id +_pdbx_poly_seq_scheme.auth_mon_id +_pdbx_poly_seq_scheme.pdb_strand_id +_pdbx_poly_seq_scheme.pdb_ins_code +A 1 1 VAL 2 2 VAL VAL A ? +A 1 2 GLY 3 3 GLY GLY A ? +A 1 3 GLN 4 4 GLN GLN A ? +A 1 4 GLN 5 5 GLN GLN A ? +A 1 5 TYR 5 ? ? ? A . +A 1 6 SER 6 ? ? ? A . +A 1 7 SER 8 8 SER SER A ? +B 2 1 ASP 3 3 ASP ASP B ? +B 2 2 GLU 2 ? ? ? B . +# +""") os.unlink('output.cif')