Skip to content

Commit

Permalink
Output struct_ref and related tables
Browse files Browse the repository at this point in the history
The ma_target_ref_db_details table does not
fully capture the database sequence; for example
it does not allow gapped alignments between database
and entity, or sequence differences, both of which are
handled by the struct_ref* tables. Rework our
TargetReference class to inherit from ihm.reference.Sequence
and use this class to output both ma_target_ref_db_details
and struct_ref tables. Relates #34. We still need to
add support to the reader for struct_ref, combining its
information with the ma_target_ref_db_details data.
  • Loading branch information
benmwebb committed Apr 19, 2024
1 parent 2448ed9 commit 56135f1
Show file tree
Hide file tree
Showing 9 changed files with 203 additions and 35 deletions.
6 changes: 6 additions & 0 deletions docs/reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ The :mod:`modelcif.reference` Python module

.. autoclass:: UniProt

.. autoclass:: Alignment
:members:

.. autoclass:: SeqDif
:members:

.. autoclass:: TemplateReference
:members:

Expand Down
3 changes: 2 additions & 1 deletion examples/mkmodbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@

# Next, the target (model) sequence, together with a link to the reference
# sequence (in UniProt):
s = modelcif.reference.UniProt(code='MED1_YEAST', accession='Q12321')
s = modelcif.reference.UniProt(code='MED1_YEAST', accession='Q12321',
sequence='DSYVETLDCC')
model_e = modelcif.Entity('DSYVETLDCC', description='Model subunit',
references=[s])

Expand Down
9 changes: 9 additions & 0 deletions modelcif/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,15 @@ def _before_write(self):
self._all_software_groups()))
self.software = list(_remove_identical(
self._all_ref_software()))
self._add_missing_reference_sequence()

def _add_missing_reference_sequence(self):
"""If any TargetReference has no sequence, use that of the Entity"""
for e in self.entities + self.target_entities:
for r in e.references:
if r.sequence is None:
r.sequence = "".join(comp.code_canonical
for comp in e.sequence)

def _check_after_write(self):
pass
Expand Down
20 changes: 15 additions & 5 deletions modelcif/dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,19 @@ def dump(self, system, writer):
"seq_db_sequence_checksum"]) as lp:
for e in entities:
for r in e.references:
db_begin = (e.seq_id_range[0] if r.align_begin is None
else r.align_begin)
db_end = (e.seq_id_range[1] if r.align_end is None
else r.align_end)
if r.align_begin is None:
db_begin = min(a.db_begin for a in r._get_alignments())
else:
db_begin = r.align_begin
if r.align_end is None:
aligns = [a for a in r._get_alignments()
if a.db_end is not None]
if aligns:
db_end = max(a.db_end for a in aligns)
else:
db_end = len(r.sequence)
else:
db_end = r.align_end
lp.write(target_entity_id=e._id, db_name=r.name,
db_name_other_details=r.other_details,
db_code=r.code, db_accession=r.accession,
Expand Down Expand Up @@ -802,7 +811,8 @@ class ModelCIFVariant(Variant):
ihm.dumper._GrantDumper, _ChemCompDumper, _ChemCompDescriptorDumper,
ihm.dumper._EntityDumper,
ihm.dumper._EntitySrcGenDumper, ihm.dumper._EntitySrcNatDumper,
ihm.dumper._EntitySrcSynDumper, _TargetRefDBDumper,
ihm.dumper._EntitySrcSynDumper, ihm.dumper._StructRefDumper,
_TargetRefDBDumper,
ihm.dumper._EntityPolyDumper, _EntityNonPolyDumper,
ihm.dumper._EntityPolySeqDumper, ihm.dumper._StructAsymDumper,
ihm.dumper._PolySeqSchemeDumper, ihm.dumper._NonPolySchemeDumper,
Expand Down
20 changes: 12 additions & 8 deletions modelcif/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import inspect
import collections
import functools
import warnings


def _get_date(iso_date_str):
Expand Down Expand Up @@ -401,14 +402,17 @@ def __call__(self, target_entity_id, db_name, db_name_other_details,
seq_db_sequence_version_date, seq_db_sequence_checksum):
e = self.sysr.entities.get_by_id(target_entity_id)
typ = self.type_map.get(db_name, db_name_other_details)
ref = typ(code=db_code, accession=db_accession,
align_begin=self.get_int(seq_db_align_begin),
align_end=self.get_int(seq_db_align_end),
isoform=seq_db_isoform, ncbi_taxonomy_id=ncbi_taxonomy_id,
organism_scientific=organism_scientific,
sequence_version_date=_get_date(
seq_db_sequence_version_date),
sequence_crc64=seq_db_sequence_checksum)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
ref = typ(code=db_code, accession=db_accession,
align_begin=self.get_int(seq_db_align_begin),
align_end=self.get_int(seq_db_align_end),
isoform=seq_db_isoform,
ncbi_taxonomy_id=ncbi_taxonomy_id,
organism_scientific=organism_scientific,
sequence_version_date=_get_date(
seq_db_sequence_version_date),
sequence_crc64=seq_db_sequence_checksum)
e.references.append(ref)


Expand Down
37 changes: 30 additions & 7 deletions modelcif/reference.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
"""Classes for linking back to a sequence or structure database."""

import warnings
import ihm.reference
from ihm.reference import Alignment, SeqDif # noqa: F401

class TargetReference(object):

class TargetReference(ihm.reference.Sequence):
"""Point to the sequence of a target :class:`modelcif.Entity` in a sequence
database. Typically a subclass such as :class:`UniProt` is used,
although to use a custom database, make a new subclass and provide
Expand All @@ -14,14 +18,15 @@ class CustomRef(TargetReference):
multiple sequences used in template searches or alignment
construction; this class relates to just the modeled sequence itself.
See also :attr:`alignments` to describe the correspondence between
the database and entity sequences.
:param str code: The name of the sequence in the database.
:param str accession: The database accession.
:param int align_begin: Beginning index of the sequence in the database.
If omitted, it will be assumed that the Entity corresponds to
the full database sequence.
Deprecated; use :attr:`alignments` instead.
:param int align_end: Ending index of the sequence in the database.
If omitted, it will be assumed that the Entity corresponds to
the full database sequence.
Deprecated; use :attr:`alignments` instead.
:param str isoform: Sequence isoform, if applicable.
:param str ncbi_taxonomy_id: Taxonomy identifier provided by NCBI.
:param str organism_scientific: Scientific name of the organism.
Expand All @@ -32,21 +37,39 @@ class CustomRef(TargetReference):
:class:`datetime.datetime`
:param str sequence_crc64: The CRC64 sum of the original database
sequence.
:param str sequence: The complete database sequence, as a string of
one-letter codes. If omitted, will default to the canonical
sequence of the associated :class:`Entity`.
:param str details: Longer text describing the sequence.
"""

name = 'Other'

def __init__(self, code, accession, align_begin=None, align_end=None,
isoform=None, ncbi_taxonomy_id=None,
organism_scientific=None, sequence_version_date=None,
sequence_crc64=None):
self.code, self.accession = code, accession
sequence_crc64=None, sequence=None, details=None):
super(TargetReference, self).__init__(
db_name=self.name, db_code=code, accession=accession,
sequence=sequence, details=details)
self.align_begin, self.align_end = align_begin, align_end
self.isoform = isoform
self.ncbi_taxonomy_id = ncbi_taxonomy_id
self.organism_scientific = organism_scientific
self.sequence_version_date = sequence_version_date
self.sequence_crc64 = sequence_crc64
if align_begin or align_end:
warnings.warn(
"align_begin and align_end are deprecated, and will be "
"removed in a future python-modelcif release. Specify the "
"database sequence and provide one or more "
"modelcif.reference.Alignment objects intead.", stacklevel=2)
if sequence is None:
warnings.warn(
"No sequence provided. The canonical sequence of the Entity "
"will be used instead.", stacklevel=2)

code = property(lambda self: self.db_code)

def _get_other_details(self):
if (type(self) is not TargetReference
Expand Down
126 changes: 116 additions & 10 deletions test/test_dumper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import date
import warnings
import utils
import os
import unittest
Expand Down Expand Up @@ -484,16 +485,36 @@ class CustomRef(modelcif.reference.TargetReference):
"""my custom ref"""

system = modelcif.System()
ref1 = modelcif.reference.UniProt(
code='testcode', accession='testacc', align_begin=4, align_end=8,
isoform='testiso', ncbi_taxonomy_id='1234',
organism_scientific='testorg',
sequence_version_date=date(1979, 11, 22),
sequence_crc64="A123B456C789D1E2")
ref2 = modelcif.reference.UniProt(code='c2', accession='a2')
ref3 = CustomRef(code='c3', accession='a3', isoform=ihm.unknown)

e1 = modelcif.Entity('ACGT', references=[ref1, ref2, ref3])
# Default alignment but with explicit align begin, end
with warnings.catch_warnings():
warnings.simplefilter("ignore")
ref1 = modelcif.reference.UniProt(
code='testcode', accession='testacc', align_begin=4,
align_end=8, isoform='testiso', ncbi_taxonomy_id='1234',
organism_scientific='testorg',
sequence_version_date=date(1979, 11, 22),
sequence_crc64="A123B456C789D1E2",
sequence='ACGT')
# Default alignment (entire sequence)
ref2 = modelcif.reference.UniProt(code='c2', accession='a2',
sequence='ACGT')
ref3 = CustomRef(code='c3', accession='a3', isoform=ihm.unknown,
sequence='ACGT')

# Explicit alignment that extends to the end of the db sequence
ref4 = modelcif.reference.UniProt(code='c4', accession='a4',
sequence='CCACGT')
ref4.alignments.append(modelcif.reference.Alignment(db_begin=3))

# Explicit alignment with explicit db_end
ref5 = modelcif.reference.UniProt(code='c5', accession='a5',
sequence='XXXACXXGTXXX')
ref5.alignments.append(modelcif.reference.Alignment(
db_begin=4, db_end=5))
ref5.alignments.append(modelcif.reference.Alignment(
db_begin=8, db_end=9))

e1 = modelcif.Entity('ACGT', references=[ref1, ref2, ref3, ref4, ref5])
e1._id = 1
system.target_entities.append(e1)

Expand All @@ -517,6 +538,8 @@ class CustomRef(modelcif.reference.TargetReference):
1 UNP . testcode testacc testiso 4 8 1234 testorg 1979-11-22 A123B456C789D1E2
1 UNP . c2 a2 . 1 4 . . . .
1 Other 'my custom ref' c3 a3 ? 1 4 . . . .
1 UNP . c4 a4 . 3 6 . . . .
1 UNP . c5 a5 . 4 9 . . . .
#
""")

Expand Down Expand Up @@ -1178,6 +1201,89 @@ class MockObject(object):
#
""")

def test_struct_ref(self):
"""Test StructRefDumper"""
system = ihm.System()
lpep = ihm.LPeptideAlphabet()
sd = modelcif.reference.SeqDif(
seq_id=2, db_monomer=lpep['W'],
monomer=lpep['S'], details='Test mutation')
# Test non-mandatory db_monomer
sd2 = modelcif.reference.SeqDif(
seq_id=3, db_monomer=None,
monomer=lpep['P'], details='Test mutation')
r1 = modelcif.reference.UniProt(
code='NUP84_YEAST', accession='P52891', sequence='MELWPTYQT',
details='test sequence')
r1.alignments.append(modelcif.reference.Alignment(
db_begin=3, seq_dif=[sd, sd2]))
r2 = modelcif.reference.UniProt(
code='testcode', accession='testacc', sequence='MELSPTYQT',
details='test2')
r2.alignments.append(modelcif.reference.Alignment(
db_begin=4, db_end=5, entity_begin=2, entity_end=3))
r2.alignments.append(modelcif.reference.Alignment(
db_begin=9, db_end=9, entity_begin=4, entity_end=4))
with warnings.catch_warnings():
warnings.simplefilter("ignore")
r3 = modelcif.reference.UniProt(
code='testcode2', accession='testacc2', sequence=None)
r3.alignments.append(modelcif.reference.Alignment(
db_begin=4, db_end=5, entity_begin=2, entity_end=3))
r4 = modelcif.reference.UniProt(
code='testcode3', accession='testacc3', sequence=ihm.unknown)
r4.alignments.append(modelcif.reference.Alignment(
db_begin=4, db_end=5, entity_begin=2, entity_end=3))
system.entities.append(modelcif.Entity(
'LSPT', references=[r1, r2, r3, r4]))
dumper = ihm.dumper._EntityDumper()
dumper.finalize(system) # Assign entity IDs

dumper = ihm.dumper._StructRefDumper()
dumper.finalize(system) # Assign IDs
out = _get_dumper_output(dumper, system)
self.assertEqual(out, """#
loop_
_struct_ref.id
_struct_ref.entity_id
_struct_ref.db_name
_struct_ref.db_code
_struct_ref.pdbx_db_accession
_struct_ref.pdbx_align_begin
_struct_ref.pdbx_seq_one_letter_code
_struct_ref.details
1 1 UNP NUP84_YEAST P52891 3 LWPTYQT 'test sequence'
2 1 UNP testcode testacc 4 SPTYQT test2
3 1 UNP testcode2 testacc2 4 . .
4 1 UNP testcode3 testacc3 4 ? .
#
#
loop_
_struct_ref_seq.align_id
_struct_ref_seq.ref_id
_struct_ref_seq.seq_align_beg
_struct_ref_seq.seq_align_end
_struct_ref_seq.db_align_beg
_struct_ref_seq.db_align_end
1 1 1 4 3 6
2 2 2 3 4 5
3 2 4 4 9 9
4 3 2 3 4 5
5 4 2 3 4 5
#
#
loop_
_struct_ref_seq_dif.pdbx_ordinal
_struct_ref_seq_dif.align_id
_struct_ref_seq_dif.seq_num
_struct_ref_seq_dif.db_mon_id
_struct_ref_seq_dif.mon_id
_struct_ref_seq_dif.details
1 1 2 TRP SER 'Test mutation'
2 1 3 ? PRO 'Test mutation'
#
""")


if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion test/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_mkmodbase_example(self):
# can read it
with open(os.path.join(tmpdir, 'output.cif')) as fh:
contents = fh.readlines()
self.assertEqual(len(contents), 418)
self.assertEqual(len(contents), 440)
with open(os.path.join(tmpdir, 'output.cif')) as fh:
s, = modelcif.reader.read(fh)

Expand Down
15 changes: 12 additions & 3 deletions test/test_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,21 @@ class CustomRef(modelcif.reference.TemplateReference):

def test_target_reference(self):
"""Test TargetReference classes"""
ref = modelcif.reference.UniProt("code", "acc")
ref = modelcif.reference.UniProt("code", "acc", sequence='CC')
self.assertEqual(ref.name, "UNP")
self.assertIsNone(ref.other_details)

# Reference with (deprecated) align begin, end
self.assertWarns(UserWarning, modelcif.reference.UniProt,
"code", "acc", align_begin=1, align_end=10,
sequence='CC')

# Reference without explicit sequence
self.assertWarns(UserWarning, modelcif.reference.UniProt,
"code", "acc")

# generic "other" reference
ref = modelcif.reference.TargetReference("code", "acc")
ref = modelcif.reference.TargetReference("code", "acc", sequence='CC')
self.assertEqual(ref.name, "Other")
self.assertIsNone(ref.other_details)

Expand All @@ -44,7 +53,7 @@ class CustomRef(modelcif.reference.TargetReference):
"""foo
bar"""

ref = CustomRef("code", "acc")
ref = CustomRef("code", "acc", sequence='CC')
self.assertEqual(ref.name, "Other")
self.assertEqual(ref.other_details, "foo")

Expand Down

0 comments on commit 56135f1

Please sign in to comment.