Skip to content

Commit

Permalink
Add make_mmcif script, exclude not-modeled residues
Browse files Browse the repository at this point in the history
Add a make_mmcif utility script, similar to that in
python-ihm, to add minimal ModelCIF tables to an
arbitrary mmCIF file. Have this script detect any
non-modeled ranges, add NotModeledResidueRange objects
for them, and use those objects to exclude these
ranges from the pdbx_poly_seq_scheme table, in a similar
fashion to python-ihm (ihmwg/python-ihm#150).
  • Loading branch information
benmwebb committed Sep 27, 2024
1 parent 09fe394 commit c242e39
Show file tree
Hide file tree
Showing 10 changed files with 298 additions and 25 deletions.
2 changes: 2 additions & 0 deletions docs/model.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,7 @@ The :mod:`modelcif.model` Python module

.. autoclass:: AbInitioModel

.. autoclass:: NotModeledResidueRange

.. autoclass:: ModelGroup
:members:
26 changes: 23 additions & 3 deletions modelcif/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import ihm.representation
from ihm.model import Atom, ModelGroup # noqa: F401
import modelcif.data
from ihm.util import _check_residue_range


# Provide ma-specific docs for Atom
if sys.version_info[0] >= 3:
Expand Down Expand Up @@ -62,10 +64,11 @@ def __init__(self, assembly, name=None):
[ihm.representation.AtomicSegment(seg, rigid=False)
for seg in assembly])
self._atoms = []
# ModelCIF doesn't support not-modeled residue ranges, but python-ihm
# uses this information to populate the pdbx_poly_seq_scheme table,
# so add an empty range here

#: List of residue ranges that were explicitly not modeled. See
#: :class:`NotModeledResidueRange`.
self.not_modeled_residue_ranges = []

#: Quality scores for the model or part of it (a simple list of
#: metric objects; see :mod:`modelcif.qa_metric`)
self.qa_metrics = []
Expand Down Expand Up @@ -116,3 +119,20 @@ class AbInitioModel(Model):
"""
model_type = "Ab initio model"
other_details = None


class NotModeledResidueRange(object):
"""A range of residues that were explicitly not modeled.
See :attr:`Model.not_modeled_residue_ranges`.
These ranges are not explicitly stored in the mmCIF file,
but they will be excluded from the ``pdbx_poly_seq_scheme`` table.
:param asym_unit: The asymmetric unit to which the residues belong.
:type asym_unit: :class:`~modelcif.AsymUnit`
:param int seq_id_begin: Starting residue in the range.
:param int seq_id_end: Ending residue in the range.
"""
def __init__(self, asym_unit, seq_id_begin, seq_id_end):
self.asym_unit = asym_unit
self.seq_id_begin, self.seq_id_end = seq_id_begin, seq_id_end
_check_residue_range((seq_id_begin, seq_id_end), asym_unit.entity)
8 changes: 4 additions & 4 deletions modelcif/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ def __call__(self, assembly_id, asym_id, seq_id_begin, seq_id_end):
if seq_id_begin is None and seq_id_end is None:
a.append(asym)
else:
a.append(asym(int(seq_id_begin), int(seq_id_end), _check=False))
a.append(asym(int(seq_id_begin), int(seq_id_end)))

def finalize(self):
# Any AsymUnitRange which covers an entire asym,
Expand Down Expand Up @@ -899,7 +899,7 @@ def __call__(self, model_id, label_asym_id, label_seq_id, metric_id,
model = self.sysr.models.get_by_id(model_id)
asym = self.sysr.asym_units.get_by_id(label_asym_id)
seq_id = self.get_int(label_seq_id)
residue = asym.residue(seq_id, _check=False)
residue = asym.residue(seq_id)
metric_class = self.sysr.qa_by_id[metric_id]
model.qa_metrics.append(metric_class(residue,
self.get_float(metric_value)))
Expand All @@ -913,10 +913,10 @@ def __call__(self, model_id, label_asym_id_1, label_seq_id_1,
model = self.sysr.models.get_by_id(model_id)
asym1 = self.sysr.asym_units.get_by_id(label_asym_id_1)
seq_id1 = self.get_int(label_seq_id_1)
residue1 = asym1.residue(seq_id1, _check=False)
residue1 = asym1.residue(seq_id1)
asym2 = self.sysr.asym_units.get_by_id(label_asym_id_2)
seq_id2 = self.get_int(label_seq_id_2)
residue2 = asym2.residue(seq_id2, _check=False)
residue2 = asym2.residue(seq_id2)
metric_class = self.sysr.qa_by_id[metric_id]
model.qa_metrics.append(metric_class(residue1, residue2,
self.get_float(metric_value)))
Expand Down
1 change: 1 addition & 0 deletions modelcif/util/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pass
93 changes: 93 additions & 0 deletions modelcif/util/make_mmcif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/usr/bin/env python3

"""
Add minimal ModelCIF-related tables to an mmCIF file.
Given any mmCIF file as input, this script will add any missing
ModelCIF-related tables and write out a new file that is minimally compliant
with the ModelCIF dictionary.
This is done by simply reading in the original file with python-modelcif and
then writing it out again, so
a) any data in the input file that is not understood by python-modelcif
will be lost on output; and
b) input files that aren't compliant with the PDBx dictionary, or that
contain syntax errors or other problems, may crash or otherwise confuse
python-modelcif.
"""


import modelcif.reader
import modelcif.dumper
import modelcif.model
import ihm.util
import os
import argparse


def add_modelcif_info(s):
if not s.title:
s.title = 'Auto-generated system'

for model_group in s.model_groups:
for model in model_group:
model.not_modeled_residue_ranges.extend(
_get_not_modeled_residues(model))
return s


def _get_not_modeled_residues(model):
"""Yield NotModeledResidueRange objects for all residue ranges in the
Model that are not referenced by Atom, Sphere, or pre-existing
NotModeledResidueRange objects"""
for assem in model.assembly:
asym = assem.asym if hasattr(assem, 'asym') else assem
if not asym.entity.is_polymeric():
continue
# Make a set of all residue indices of this asym "handled" either
# by being modeled (with Atom or Sphere objects) or by being
# explicitly marked as not-modeled
handled_residues = set()
for rr in model.not_modeled_residue_ranges:
if rr.asym_unit is asym:
for seq_id in range(rr.seq_id_begin, rr.seq_id_end + 1):
handled_residues.add(seq_id)
for atom in model._atoms:
if atom.asym_unit is asym:
handled_residues.add(atom.seq_id)
# Convert set to a list of residue ranges
handled_residues = ihm.util._make_range_from_list(
sorted(handled_residues))
# Return not-modeled for each non-handled range
for r in ihm.util._invert_ranges(handled_residues,
end=assem.seq_id_range[1],
start=assem.seq_id_range[0]):
yield modelcif.model.NotModeledResidueRange(asym, r[0], r[1])


def get_args():
p = argparse.ArgumentParser(
description="Add minimal ModelCIF-related tables to an mmCIF file.")
p.add_argument("input", metavar="input.cif", help="input mmCIF file name")
p.add_argument("output", metavar="output.cif",
help="output mmCIF file name",
default="output.cif", nargs="?")
return p.parse_args()


def main():
args = get_args()

if (os.path.exists(args.input) and os.path.exists(args.output)
and os.path.samefile(args.input, args.output)):
raise ValueError("Input and output are the same file")

with open(args.input) as fh:
with open(args.output, 'w') as fhout:
modelcif.dumper.write(
fhout,
[add_modelcif_info(s) for s in modelcif.reader.read(fh)])


if __name__ == '__main__':
main()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
author='Ben Webb',
author_email='[email protected]',
url='https://github.com/ihmwg/python-modelcif',
packages=['modelcif'],
packages=['modelcif', 'modelcif.util'],
install_requires=['ihm>=1.5'],
classifiers=[
"Programming Language :: Python :: 2.7",
Expand Down
3 changes: 3 additions & 0 deletions test/input/no_title.cif
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
data_PDBDEV_00000025
_entry.id PDBDEV_00000025
_struct.entry_id PDBDEV_00000025
89 changes: 89 additions & 0 deletions test/input/not_modeled.cif
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
data_model
#
_exptl.method 'model, MODELLER Version 9.24 2020/08/21 11:54:31'
#
_modeller.version 9.24
#
loop_
_struct_asym.id
_struct_asym.entity_id
_struct_asym.details
A 1 ?
B 2 ?
#
loop_
_entity_poly_seq.entity_id
_entity_poly_seq.num
_entity_poly_seq.mon_id
1 1 VAL
1 2 GLY
1 3 GLN
1 4 GLN
1 5 TYR
1 6 SER
1 7 SER
2 1 ASP
2 2 GLU
#
loop_
_atom_site.group_PDB
_atom_site.type_symbol
_atom_site.label_atom_id
_atom_site.label_alt_id
_atom_site.label_comp_id
_atom_site.label_asym_id
_atom_site.auth_asym_id
_atom_site.label_seq_id
_atom_site.auth_seq_id
_atom_site.pdbx_PDB_ins_code
_atom_site.Cartn_x
_atom_site.Cartn_y
_atom_site.Cartn_z
_atom_site.occupancy
_atom_site.B_iso_or_equiv
_atom_site.label_entity_id
_atom_site.id
_atom_site.pdbx_PDB_model_num
ATOM N N . VAL A A 1 2 ? 115.846 27.965 -26.370 1.000 141.830 1 1 1
ATOM C CA . VAL A A 1 2 ? 114.370 27.980 -26.088 1.000 143.490 1 2 1
ATOM C C . VAL A A 1 2 ? 113.517 27.504 -27.287 1.000 143.910 1 3 1
ATOM O O . VAL A A 1 2 ? 113.885 27.746 -28.441 1.000 146.600 1 4 1
ATOM C CB . VAL A A 1 2 ? 113.901 29.406 -25.683 1.000 143.750 1 5 1
ATOM C CG1 . VAL A A 1 2 ? 115.030 30.438 -25.931 1.000 144.590 1 6 1
ATOM C CG2 . VAL A A 1 2 ? 112.669 29.783 -26.486 1.000 144.500 1 7 1
ATOM N N . GLY A A 2 3 ? 112.371 26.869 -27.012 1.000 142.200 1 8 1
ATOM C CA . GLY A A 2 3 ? 111.506 26.368 -28.075 1.000 137.530 1 9 1
ATOM C C . GLY A A 2 3 ? 111.719 24.869 -28.275 1.000 135.820 1 10 1
ATOM O O . GLY A A 2 3 ? 110.768 24.093 -28.268 1.000 134.380 1 11 1
ATOM N N . GLN A A 3 4 ? 112.989 24.479 -28.392 1.000 134.310 1 12 1
ATOM C CA . GLN A A 3 4 ? 113.468 23.113 -28.639 1.000 128.420 1 13 1
ATOM C C . GLN A A 3 4 ? 113.556 22.956 -30.163 1.000 121.240 1 14 1
ATOM O O . GLN A A 3 4 ? 113.552 23.977 -30.840 1.000 127.090 1 15 1
ATOM C CB . GLN A A 3 4 ? 112.614 22.038 -27.919 1.000 132.340 1 16 1
ATOM C CG . GLN A A 3 4 ? 113.028 21.943 -26.407 1.000 135.370 1 17 1
ATOM C CD . GLN A A 3 4 ? 112.604 20.667 -25.677 1.000 138.260 1 18 1
ATOM O OE1 . GLN A A 3 4 ? 112.836 19.543 -26.150 1.000 141.450 1 19 1
ATOM N NE2 . GLN A A 3 4 ? 112.006 20.839 -24.497 1.000 139.310 1 20 1
ATOM N N . GLN A A 4 5 ? 113.648 21.739 -30.710 1.000 124.970 1 21 1
ATOM C CA . GLN A A 4 5 ? 113.808 21.534 -32.168 1.000 117.620 1 22 1
ATOM C C . GLN A A 4 5 ? 114.778 22.519 -32.833 1.000 112.980 1 23 1
ATOM O O . GLN A A 4 5 ? 114.677 23.727 -32.677 1.000 116.850 1 24 1
ATOM C CB . GLN A A 4 5 ? 112.456 21.545 -32.905 1.000 121.870 1 25 1
ATOM C CG . GLN A A 4 5 ? 111.763 20.153 -32.917 1.000 123.750 1 26 1
ATOM C CD . GLN A A 4 5 ? 110.863 19.874 -34.145 1.000 123.650 1 27 1
ATOM O OE1 . GLN A A 4 5 ? 110.040 20.712 -34.537 1.000 122.500 1 28 1
ATOM N NE2 . GLN A A 4 5 ? 111.008 18.674 -34.737 1.000 122.090 1 29 1
ATOM N N . SER A A 7 8 ? 117.999 25.245 -39.224 1.000 89.750 1 48 1
ATOM C CA . SER A A 7 8 ? 119.165 25.590 -40.036 1.000 87.320 1 49 1
ATOM C C . SER A A 7 8 ? 119.224 27.089 -40.277 1.000 84.820 1 50 1
ATOM O O . SER A A 7 8 ? 120.074 27.594 -41.008 1.000 84.020 1 51 1
ATOM C CB . SER A A 7 8 ? 119.112 24.859 -41.383 1.000 88.180 1 52 1
ATOM O OG . SER A A 7 8 ? 117.956 25.221 -42.117 1.000 88.850 1 53 1
ATOM N N . ASP B B 1 3 ? 71.339 57.678 52.031 1.000 152.010 2 54 1
ATOM C CA . ASP B B 1 3 ? 70.427 58.819 51.717 1.000 152.390 2 55 1
ATOM C C . ASP B B 1 3 ? 70.144 58.821 50.222 1.000 151.960 2 56 1
ATOM O O . ASP B B 1 3 ? 70.984 59.245 49.435 1.000 151.590 2 57 1
ATOM C CB . ASP B B 1 3 ? 71.083 60.142 52.119 1.000 153.250 2 58 1
ATOM C CG . ASP B B 1 3 ? 71.660 60.105 53.526 1.000 154.120 2 59 1
ATOM O OD1 . ASP B B 1 3 ? 72.652 59.371 53.741 1.000 154.200 2 60 1
ATOM O OD2 . ASP B B 1 3 ? 71.119 60.804 54.415 1.000 154.250 2 61 1
4 changes: 4 additions & 0 deletions test/input/struct_only.cif
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
data_PDBDEV_00000025
_entry.id PDBDEV_00000025
_struct.entry_id PDBDEV_00000025
_struct.title 'Architecture of Pol II(G) and molecular mechanism of transcription regulation by Gdown1'
Loading

0 comments on commit c242e39

Please sign in to comment.