From 56135f1603d2add2ade43132c03907ad6f3d6375 Mon Sep 17 00:00:00 2001 From: Ben Webb Date: Fri, 19 Apr 2024 15:28:02 -0700 Subject: [PATCH] Output struct_ref and related tables The ma_target_ref_db_details table does not fully capture the database sequence; for example it does not allow gapped alignments between database and entity, or sequence differences, both of which are handled by the struct_ref* tables. Rework our TargetReference class to inherit from ihm.reference.Sequence and use this class to output both ma_target_ref_db_details and struct_ref tables. Relates #34. We still need to add support to the reader for struct_ref, combining its information with the ma_target_ref_db_details data. --- docs/reference.rst | 6 ++ examples/mkmodbase.py | 3 +- modelcif/__init__.py | 9 +++ modelcif/dumper.py | 20 +++++-- modelcif/reader.py | 20 ++++--- modelcif/reference.py | 37 +++++++++--- test/test_dumper.py | 126 +++++++++++++++++++++++++++++++++++++---- test/test_examples.py | 2 +- test/test_reference.py | 15 ++++- 9 files changed, 203 insertions(+), 35 deletions(-) diff --git a/docs/reference.rst b/docs/reference.rst index c53484b..a45b66a 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -12,6 +12,12 @@ The :mod:`modelcif.reference` Python module .. autoclass:: UniProt +.. autoclass:: Alignment + :members: + +.. autoclass:: SeqDif + :members: + .. autoclass:: TemplateReference :members: diff --git a/examples/mkmodbase.py b/examples/mkmodbase.py index 95eccff..780b535 100644 --- a/examples/mkmodbase.py +++ b/examples/mkmodbase.py @@ -54,7 +54,8 @@ # Next, the target (model) sequence, together with a link to the reference # sequence (in UniProt): -s = modelcif.reference.UniProt(code='MED1_YEAST', accession='Q12321') +s = modelcif.reference.UniProt(code='MED1_YEAST', accession='Q12321', + sequence='DSYVETLDCC') model_e = modelcif.Entity('DSYVETLDCC', description='Model subunit', references=[s]) diff --git a/modelcif/__init__.py b/modelcif/__init__.py index efd117b..533a64f 100644 --- a/modelcif/__init__.py +++ b/modelcif/__init__.py @@ -120,6 +120,15 @@ def _before_write(self): self._all_software_groups())) self.software = list(_remove_identical( self._all_ref_software())) + self._add_missing_reference_sequence() + + def _add_missing_reference_sequence(self): + """If any TargetReference has no sequence, use that of the Entity""" + for e in self.entities + self.target_entities: + for r in e.references: + if r.sequence is None: + r.sequence = "".join(comp.code_canonical + for comp in e.sequence) def _check_after_write(self): pass diff --git a/modelcif/dumper.py b/modelcif/dumper.py index 0691533..6395f54 100644 --- a/modelcif/dumper.py +++ b/modelcif/dumper.py @@ -117,10 +117,19 @@ def dump(self, system, writer): "seq_db_sequence_checksum"]) as lp: for e in entities: for r in e.references: - db_begin = (e.seq_id_range[0] if r.align_begin is None - else r.align_begin) - db_end = (e.seq_id_range[1] if r.align_end is None - else r.align_end) + if r.align_begin is None: + db_begin = min(a.db_begin for a in r._get_alignments()) + else: + db_begin = r.align_begin + if r.align_end is None: + aligns = [a for a in r._get_alignments() + if a.db_end is not None] + if aligns: + db_end = max(a.db_end for a in aligns) + else: + db_end = len(r.sequence) + else: + db_end = r.align_end lp.write(target_entity_id=e._id, db_name=r.name, db_name_other_details=r.other_details, db_code=r.code, db_accession=r.accession, @@ -802,7 +811,8 @@ class ModelCIFVariant(Variant): ihm.dumper._GrantDumper, _ChemCompDumper, _ChemCompDescriptorDumper, ihm.dumper._EntityDumper, ihm.dumper._EntitySrcGenDumper, ihm.dumper._EntitySrcNatDumper, - ihm.dumper._EntitySrcSynDumper, _TargetRefDBDumper, + ihm.dumper._EntitySrcSynDumper, ihm.dumper._StructRefDumper, + _TargetRefDBDumper, ihm.dumper._EntityPolyDumper, _EntityNonPolyDumper, ihm.dumper._EntityPolySeqDumper, ihm.dumper._StructAsymDumper, ihm.dumper._PolySeqSchemeDumper, ihm.dumper._NonPolySchemeDumper, diff --git a/modelcif/reader.py b/modelcif/reader.py index f300261..bafdebe 100644 --- a/modelcif/reader.py +++ b/modelcif/reader.py @@ -19,6 +19,7 @@ import inspect import collections import functools +import warnings def _get_date(iso_date_str): @@ -401,14 +402,17 @@ def __call__(self, target_entity_id, db_name, db_name_other_details, seq_db_sequence_version_date, seq_db_sequence_checksum): e = self.sysr.entities.get_by_id(target_entity_id) typ = self.type_map.get(db_name, db_name_other_details) - ref = typ(code=db_code, accession=db_accession, - align_begin=self.get_int(seq_db_align_begin), - align_end=self.get_int(seq_db_align_end), - isoform=seq_db_isoform, ncbi_taxonomy_id=ncbi_taxonomy_id, - organism_scientific=organism_scientific, - sequence_version_date=_get_date( - seq_db_sequence_version_date), - sequence_crc64=seq_db_sequence_checksum) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + ref = typ(code=db_code, accession=db_accession, + align_begin=self.get_int(seq_db_align_begin), + align_end=self.get_int(seq_db_align_end), + isoform=seq_db_isoform, + ncbi_taxonomy_id=ncbi_taxonomy_id, + organism_scientific=organism_scientific, + sequence_version_date=_get_date( + seq_db_sequence_version_date), + sequence_crc64=seq_db_sequence_checksum) e.references.append(ref) diff --git a/modelcif/reference.py b/modelcif/reference.py index b377838..d6ca92c 100644 --- a/modelcif/reference.py +++ b/modelcif/reference.py @@ -1,7 +1,11 @@ """Classes for linking back to a sequence or structure database.""" +import warnings +import ihm.reference +from ihm.reference import Alignment, SeqDif # noqa: F401 -class TargetReference(object): + +class TargetReference(ihm.reference.Sequence): """Point to the sequence of a target :class:`modelcif.Entity` in a sequence database. Typically a subclass such as :class:`UniProt` is used, although to use a custom database, make a new subclass and provide @@ -14,14 +18,15 @@ class CustomRef(TargetReference): multiple sequences used in template searches or alignment construction; this class relates to just the modeled sequence itself. + See also :attr:`alignments` to describe the correspondence between + the database and entity sequences. + :param str code: The name of the sequence in the database. :param str accession: The database accession. :param int align_begin: Beginning index of the sequence in the database. - If omitted, it will be assumed that the Entity corresponds to - the full database sequence. + Deprecated; use :attr:`alignments` instead. :param int align_end: Ending index of the sequence in the database. - If omitted, it will be assumed that the Entity corresponds to - the full database sequence. + Deprecated; use :attr:`alignments` instead. :param str isoform: Sequence isoform, if applicable. :param str ncbi_taxonomy_id: Taxonomy identifier provided by NCBI. :param str organism_scientific: Scientific name of the organism. @@ -32,6 +37,10 @@ class CustomRef(TargetReference): :class:`datetime.datetime` :param str sequence_crc64: The CRC64 sum of the original database sequence. + :param str sequence: The complete database sequence, as a string of + one-letter codes. If omitted, will default to the canonical + sequence of the associated :class:`Entity`. + :param str details: Longer text describing the sequence. """ name = 'Other' @@ -39,14 +48,28 @@ class CustomRef(TargetReference): def __init__(self, code, accession, align_begin=None, align_end=None, isoform=None, ncbi_taxonomy_id=None, organism_scientific=None, sequence_version_date=None, - sequence_crc64=None): - self.code, self.accession = code, accession + sequence_crc64=None, sequence=None, details=None): + super(TargetReference, self).__init__( + db_name=self.name, db_code=code, accession=accession, + sequence=sequence, details=details) self.align_begin, self.align_end = align_begin, align_end self.isoform = isoform self.ncbi_taxonomy_id = ncbi_taxonomy_id self.organism_scientific = organism_scientific self.sequence_version_date = sequence_version_date self.sequence_crc64 = sequence_crc64 + if align_begin or align_end: + warnings.warn( + "align_begin and align_end are deprecated, and will be " + "removed in a future python-modelcif release. Specify the " + "database sequence and provide one or more " + "modelcif.reference.Alignment objects intead.", stacklevel=2) + if sequence is None: + warnings.warn( + "No sequence provided. The canonical sequence of the Entity " + "will be used instead.", stacklevel=2) + + code = property(lambda self: self.db_code) def _get_other_details(self): if (type(self) is not TargetReference diff --git a/test/test_dumper.py b/test/test_dumper.py index 3c48c8c..4325a22 100644 --- a/test/test_dumper.py +++ b/test/test_dumper.py @@ -1,4 +1,5 @@ from datetime import date +import warnings import utils import os import unittest @@ -484,16 +485,36 @@ class CustomRef(modelcif.reference.TargetReference): """my custom ref""" system = modelcif.System() - ref1 = modelcif.reference.UniProt( - code='testcode', accession='testacc', align_begin=4, align_end=8, - isoform='testiso', ncbi_taxonomy_id='1234', - organism_scientific='testorg', - sequence_version_date=date(1979, 11, 22), - sequence_crc64="A123B456C789D1E2") - ref2 = modelcif.reference.UniProt(code='c2', accession='a2') - ref3 = CustomRef(code='c3', accession='a3', isoform=ihm.unknown) - - e1 = modelcif.Entity('ACGT', references=[ref1, ref2, ref3]) + # Default alignment but with explicit align begin, end + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + ref1 = modelcif.reference.UniProt( + code='testcode', accession='testacc', align_begin=4, + align_end=8, isoform='testiso', ncbi_taxonomy_id='1234', + organism_scientific='testorg', + sequence_version_date=date(1979, 11, 22), + sequence_crc64="A123B456C789D1E2", + sequence='ACGT') + # Default alignment (entire sequence) + ref2 = modelcif.reference.UniProt(code='c2', accession='a2', + sequence='ACGT') + ref3 = CustomRef(code='c3', accession='a3', isoform=ihm.unknown, + sequence='ACGT') + + # Explicit alignment that extends to the end of the db sequence + ref4 = modelcif.reference.UniProt(code='c4', accession='a4', + sequence='CCACGT') + ref4.alignments.append(modelcif.reference.Alignment(db_begin=3)) + + # Explicit alignment with explicit db_end + ref5 = modelcif.reference.UniProt(code='c5', accession='a5', + sequence='XXXACXXGTXXX') + ref5.alignments.append(modelcif.reference.Alignment( + db_begin=4, db_end=5)) + ref5.alignments.append(modelcif.reference.Alignment( + db_begin=8, db_end=9)) + + e1 = modelcif.Entity('ACGT', references=[ref1, ref2, ref3, ref4, ref5]) e1._id = 1 system.target_entities.append(e1) @@ -517,6 +538,8 @@ class CustomRef(modelcif.reference.TargetReference): 1 UNP . testcode testacc testiso 4 8 1234 testorg 1979-11-22 A123B456C789D1E2 1 UNP . c2 a2 . 1 4 . . . . 1 Other 'my custom ref' c3 a3 ? 1 4 . . . . +1 UNP . c4 a4 . 3 6 . . . . +1 UNP . c5 a5 . 4 9 . . . . # """) @@ -1178,6 +1201,89 @@ class MockObject(object): # """) + def test_struct_ref(self): + """Test StructRefDumper""" + system = ihm.System() + lpep = ihm.LPeptideAlphabet() + sd = modelcif.reference.SeqDif( + seq_id=2, db_monomer=lpep['W'], + monomer=lpep['S'], details='Test mutation') + # Test non-mandatory db_monomer + sd2 = modelcif.reference.SeqDif( + seq_id=3, db_monomer=None, + monomer=lpep['P'], details='Test mutation') + r1 = modelcif.reference.UniProt( + code='NUP84_YEAST', accession='P52891', sequence='MELWPTYQT', + details='test sequence') + r1.alignments.append(modelcif.reference.Alignment( + db_begin=3, seq_dif=[sd, sd2])) + r2 = modelcif.reference.UniProt( + code='testcode', accession='testacc', sequence='MELSPTYQT', + details='test2') + r2.alignments.append(modelcif.reference.Alignment( + db_begin=4, db_end=5, entity_begin=2, entity_end=3)) + r2.alignments.append(modelcif.reference.Alignment( + db_begin=9, db_end=9, entity_begin=4, entity_end=4)) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + r3 = modelcif.reference.UniProt( + code='testcode2', accession='testacc2', sequence=None) + r3.alignments.append(modelcif.reference.Alignment( + db_begin=4, db_end=5, entity_begin=2, entity_end=3)) + r4 = modelcif.reference.UniProt( + code='testcode3', accession='testacc3', sequence=ihm.unknown) + r4.alignments.append(modelcif.reference.Alignment( + db_begin=4, db_end=5, entity_begin=2, entity_end=3)) + system.entities.append(modelcif.Entity( + 'LSPT', references=[r1, r2, r3, r4])) + dumper = ihm.dumper._EntityDumper() + dumper.finalize(system) # Assign entity IDs + + dumper = ihm.dumper._StructRefDumper() + dumper.finalize(system) # Assign IDs + out = _get_dumper_output(dumper, system) + self.assertEqual(out, """# +loop_ +_struct_ref.id +_struct_ref.entity_id +_struct_ref.db_name +_struct_ref.db_code +_struct_ref.pdbx_db_accession +_struct_ref.pdbx_align_begin +_struct_ref.pdbx_seq_one_letter_code +_struct_ref.details +1 1 UNP NUP84_YEAST P52891 3 LWPTYQT 'test sequence' +2 1 UNP testcode testacc 4 SPTYQT test2 +3 1 UNP testcode2 testacc2 4 . . +4 1 UNP testcode3 testacc3 4 ? . +# +# +loop_ +_struct_ref_seq.align_id +_struct_ref_seq.ref_id +_struct_ref_seq.seq_align_beg +_struct_ref_seq.seq_align_end +_struct_ref_seq.db_align_beg +_struct_ref_seq.db_align_end +1 1 1 4 3 6 +2 2 2 3 4 5 +3 2 4 4 9 9 +4 3 2 3 4 5 +5 4 2 3 4 5 +# +# +loop_ +_struct_ref_seq_dif.pdbx_ordinal +_struct_ref_seq_dif.align_id +_struct_ref_seq_dif.seq_num +_struct_ref_seq_dif.db_mon_id +_struct_ref_seq_dif.mon_id +_struct_ref_seq_dif.details +1 1 2 TRP SER 'Test mutation' +2 1 3 ? PRO 'Test mutation' +# +""") + if __name__ == '__main__': unittest.main() diff --git a/test/test_examples.py b/test/test_examples.py index 4232da8..989fa7e 100644 --- a/test/test_examples.py +++ b/test/test_examples.py @@ -59,7 +59,7 @@ def test_mkmodbase_example(self): # can read it with open(os.path.join(tmpdir, 'output.cif')) as fh: contents = fh.readlines() - self.assertEqual(len(contents), 418) + self.assertEqual(len(contents), 440) with open(os.path.join(tmpdir, 'output.cif')) as fh: s, = modelcif.reader.read(fh) diff --git a/test/test_reference.py b/test/test_reference.py index 86f28a5..460b2f3 100644 --- a/test/test_reference.py +++ b/test/test_reference.py @@ -30,12 +30,21 @@ class CustomRef(modelcif.reference.TemplateReference): def test_target_reference(self): """Test TargetReference classes""" - ref = modelcif.reference.UniProt("code", "acc") + ref = modelcif.reference.UniProt("code", "acc", sequence='CC') self.assertEqual(ref.name, "UNP") self.assertIsNone(ref.other_details) + # Reference with (deprecated) align begin, end + self.assertWarns(UserWarning, modelcif.reference.UniProt, + "code", "acc", align_begin=1, align_end=10, + sequence='CC') + + # Reference without explicit sequence + self.assertWarns(UserWarning, modelcif.reference.UniProt, + "code", "acc") + # generic "other" reference - ref = modelcif.reference.TargetReference("code", "acc") + ref = modelcif.reference.TargetReference("code", "acc", sequence='CC') self.assertEqual(ref.name, "Other") self.assertIsNone(ref.other_details) @@ -44,7 +53,7 @@ class CustomRef(modelcif.reference.TargetReference): """foo bar""" - ref = CustomRef("code", "acc") + ref = CustomRef("code", "acc", sequence='CC') self.assertEqual(ref.name, "Other") self.assertEqual(ref.other_details, "foo")