Allow combining multiple mmCIF files into one

ihmwg · Feb 13, 2024 · 3f6543b · 3f6543b
1 parent a3baf8b
commit 3f6543b
Show file tree

Hide file tree

Showing 3 changed files with 232 additions and 3 deletions.
diff --git a/test/input/mini_add.cif b/test/input/mini_add.cif
@@ -0,0 +1,55 @@
+data_model
+#
+_exptl.method 'model, MODELLER Version 9.24 2020/08/21 11:54:31'
+#
+_modeller.version 9.24
+#
+loop_
+_struct_asym.id
+_struct_asym.entity_id
+_struct_asym.details
+A 1 ?
+B 2 ?
+#
+loop_
+_entity_poly_seq.entity_id
+_entity_poly_seq.num
+_entity_poly_seq.mon_id
+1 1 VAL
+1 2 GLY
+1 3 GLN
+1 4 GLN
+1 5 TYR
+1 6 SER
+1 7 SER
+2 1 PRO
+2 2 GLU
+#
+loop_
+_atom_site.group_PDB
+_atom_site.type_symbol
+_atom_site.label_atom_id
+_atom_site.label_alt_id
+_atom_site.label_comp_id
+_atom_site.label_asym_id
+_atom_site.auth_asym_id
+_atom_site.label_seq_id
+_atom_site.auth_seq_id
+_atom_site.pdbx_PDB_ins_code
+_atom_site.Cartn_x
+_atom_site.Cartn_y
+_atom_site.Cartn_z
+_atom_site.occupancy
+_atom_site.B_iso_or_equiv
+_atom_site.label_entity_id
+_atom_site.id
+_atom_site.pdbx_PDB_model_num
+ATOM C CA . VAL A A 1 2 ? 114.370 27.980 -26.088 1.000 143.490 1 2 1
+ATOM C CA . GLY A A 2 3 ? 111.506 26.368 -28.075 1.000 137.530 1 9 1
+ATOM C CA . GLN A A 3 4 ? 113.468 23.113 -28.639 1.000 128.420 1 13 1
+ATOM C CA . GLN A A 4 5 ? 113.808 21.534 -32.168 1.000 117.620 1 22 1
+ATOM C CA . TYR A A 5 6 ? 116.743 22.770 -34.259 1.000 103.700 1 31 1
+ATOM C CA . SER A A 6 7 ? 116.626 25.161 -37.229 1.000 93.490 1 43 1
+ATOM C CA . SER A A 7 8 ? 119.165 25.590 -40.036 1.000 87.320 1 49 1
+ATOM C CA . PRO B B 1 3 ? 70.427 58.819 51.717 1.000 152.390 2 55 1
+ATOM C CA . GLU B B 2 4 ? 68.584 58.274 48.425 1.000 152.090 2 63 1
diff --git a/test/test_make_mmcif.py b/test/test_make_mmcif.py
@@ -104,6 +104,50 @@ def test_pass_through(self):
         self.assertEqual(s.title, 'Output from simple-docking example')
         os.unlink('output.cif')
 
+    @unittest.skipIf(sys.version_info[0] < 3, "make-mmcif.py needs Python 3")
+    def test_add_polymers(self):
+        """Check that make-mmcif combines polymer information"""
+        # mini.cif contains two chains A, B
+        incif = utils.get_input_file_name(TOPDIR, 'mini.cif')
+        # mini_add.cif also contains A, B; A is the same sequence as mini.cif
+        # but B is different (so should be renamed C when we add)
+        addcif = utils.get_input_file_name(TOPDIR, 'mini_add.cif')
+        subprocess.check_call([sys.executable, MAKE_MMCIF, incif,
+                               '--add', addcif])
+        with open('output.cif') as fh:
+            s, = ihm.reader.read(fh)
+        self.assertEqual(len(s.entities), 3)
+        self.assertEqual(len(s.asym_units), 3)
+        self.assertEqual(len(s.state_groups), 2)
+        # Model from mini.cif
+        self.assertEqual(len(s.state_groups[0]), 1)
+        self.assertEqual(len(s.state_groups[0][0]), 1)
+        self.assertEqual(len(s.state_groups[0][0][0]), 1)
+        m = s.state_groups[0][0][0][0]
+        self.assertEqual(m.protocol.name, 'modeling')
+        self.assertEqual(m.assembly.name, 'Modeled assembly')
+        chain_a, chain_b, = m.representation
+        self.assertIs(chain_a.asym_unit.asym, s.asym_units[0])
+        self.assertIs(chain_b.asym_unit.asym, s.asym_units[1])
+        for chain in chain_a, chain_b:
+            self.assertIsInstance(chain, ihm.representation.AtomicSegment)
+            self.assertFalse(chain.rigid)
+        # Model from mini_add.cif
+        self.assertEqual(len(s.state_groups[1]), 1)
+        self.assertEqual(len(s.state_groups[1][0]), 1)
+        self.assertEqual(len(s.state_groups[1][0][0]), 1)
+        m = s.state_groups[1][0][0][0]
+        self.assertEqual(m.protocol.name, 'modeling')
+        self.assertEqual(m.assembly.name, 'Modeled assembly')
+        chain_a, chain_c, = m.representation
+        self.assertIs(chain_a.asym_unit.asym, s.asym_units[0])
+        self.assertIs(chain_c.asym_unit.asym, s.asym_units[2])
+        for chain in chain_a, chain_c:
+            self.assertIsInstance(chain, ihm.representation.AtomicSegment)
+            self.assertFalse(chain.rigid)
+        self.assertEqual(s.title, 'Auto-generated system')
+        os.unlink('output.cif')
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/util/make-mmcif.py b/util/make-mmcif.py
@@ -14,6 +14,11 @@
   b) input files that aren't compliant with the PDBx dictionary, or that
      contain syntax errors or other problems, may crash or otherwise confuse
      python-ihm.
+
+The --add option can also be used to combine multiple input mmCIF files into
+one. This is typically used when the mmCIF files contain models with
+differing composition. Only model (coordinate) information is combined, not
+other IHM information such as starting models or restraints.
 """
 
 
@@ -53,13 +58,129 @@ def add_ihm_info(s):
     return s
 
 
+def add_ihm_info_one_system(fname):
+    """Read mmCIF file `fname`, which must contain a single System, and
+       return it with any missing IHM data added."""
+    with open(fname) as fh:
+        systems = ihm.reader.read(fh)
+    if len(systems) != 1:
+        raise ValueError("mmCIF file %s must contain exactly 1 data block "
+                         "(%d found)" % (fname, len(systems)))
+    return add_ihm_info(systems[0])
+
+
+def combine(s, other_s):
+    """Add models from the System `other_s` into the System `s`.
+       After running this function, `s` will contain all Models from both
+       systems. The models are added to new StateGroup(s) in `s`.
+       Note that this function also modifies `other_s` in place, so that
+       System should no longer be used after calling this function."""
+    # First map all Entity and AsymUnit objects in `other_s` to equivalent
+    # objects in `s`
+    entity_map = combine_entities(s, other_s)
+    asym_map = combine_asyms(s, other_s, entity_map)
+    # Now handle the Models themselves
+    combine_atoms(s, other_s, asym_map)
+
+
+def combine_entities(s, other_s):
+    """Add `other_s` entities into `s`. Returns a dict that maps Entities
+       in `other_s` to equivalent objects in `s`."""
+    entity_map = {}
+    sequences = dict((e.sequence, e) for e in s.entities)
+    for e in other_s.entities:
+        if e.sequence in sequences:
+            # If the `other_s` Entity already exists in `s`, map to it
+            entity_map[e] = sequences[e.sequence]
+        else:
+            # Otherwise, add the `other_s` Entity to `s`
+            s.entities.append(e)
+            entity_map[e] = e
+    return entity_map
+
+
+def combine_asyms(s, other_s, entity_map):
+    """Add `other_s` asyms into `s`. Returns a dict that maps AsymUnits
+       in `other_s` to equivalent objects in `s`."""
+    asym_map = {}
+    # Collect author-provided information for existing asyms. For polymers,
+    # we use the author-provided chain ID; for non-polymers, we also use
+    # the author-provided residue number of the first (only) residue
+    poly_asyms = dict(((a.entity, a.strand_id), a)
+                      for a in s.asym_units if a.entity.is_polymeric())
+    nonpoly_asyms = dict(((a.entity, a.strand_id, a.auth_seq_id_map[1]), a)
+                         for a in s.asym_units
+                         if a.entity.type == 'non-polymer')
+
+    def map_asym(asym, orig_asym):
+        if orig_asym:
+            # If an equivalent asym already exists, use it (and its asym_id)
+            asym_map[asym] = orig_asym
+        else:
+            # Otherwise, add a new asym
+            asym_map[asym] = asym
+            asym.id = None  # Assign new ID
+            s.asym_units.append(asym)
+
+    for asym in other_s.asym_units:
+        # Point to Entity in `s`, not `other_s`
+        asym.entity = entity_map[asym.entity]
+        # For polymers and non-polymers, if an asym in `other_s` has the
+        # same author-provided information and entity_id as an asym in `s`,
+        # reuse the asym_id
+        if asym.entity.is_polymeric():
+            map_asym(asym, poly_asyms.get((asym.entity, asym.strand_id)))
+        elif asym.entity.type == 'non-polymer':
+            map_asym(asym, nonpoly_asyms.get((asym.entity, asym.strand_id,
+                                              asym.auth_seq_id_map[1])))
+        else:
+            # For waters and branched entities, always assign a new asym_id
+            asym_map[asym] = asym
+            asym.id = None  # Assign new ID
+            s.asym_units.append(asym)
+    return asym_map
+
+
+def combine_atoms(s, other_s, asym_map):
+    """Add `other_s` atoms into `s`"""
+    seen_asmb = set()
+    seen_rep = set()
+    for state_group in other_s.state_groups:
+        for state in state_group:
+            for model_group in state:
+                for model in model_group:
+                    # Assembly, Representation and Atom and Sphere objects
+                    # all reference `other_s` asyms. We must map these to
+                    # asyms in `s`.
+                    asmb = model.assembly
+                    if id(asmb) not in seen_asmb:
+                        seen_asmb.add(id(asmb))
+                        # todo: also handle AsymUnitRange
+                        asmb[:] = [asym_map[asym] for asym in asmb]
+                    rep = model.representation
+                    if id(rep) not in seen_rep:
+                        seen_rep.add(id(rep))
+                        for seg in rep:
+                            seg.asym_unit = asym_map[seg.asym_unit]
+                    for atom in model._atoms:
+                        atom.asym_unit = asym_map[atom.asym_unit]
+                    for sphere in model._spheres:
+                        sphere.asym_unit = asym_map[sphere.asym_unit]
+
+    # Add all models as new state groups
+    s.state_groups.extend(other_s.state_groups)
+
+
 def get_args():
     p = argparse.ArgumentParser(
         description="Add minimal IHM-related tables to an mmCIF file.")
     p.add_argument("input", metavar="input.cif", help="input mmCIF file name")
     p.add_argument("output", metavar="output.cif",
                    help="output mmCIF file name",
                    default="output.cif", nargs="?")
+    p.add_argument("--add", "-a", action='append', metavar="add.cif",
+                   help="also add model information from the named mmCIF "
+                        "file to the output file")
     return p.parse_args()
 
 
@@ -69,8 +190,17 @@ def get_args():
         and os.path.samefile(args.input, args.output)):
     raise ValueError("Input and output are the same file")
 
-with open(args.input) as fh:
+if args.add:
+    s = add_ihm_info_one_system(args.input)
+    for other in args.add:
+        other_s = add_ihm_info_one_system(other)
+        combine(s, other_s)
     with open(args.output, 'w') as fhout:
         ihm.dumper.write(
-            fhout, [add_ihm_info(s) for s in ihm.reader.read(fh)],
-            variant=ihm.dumper.IgnoreVariant(['_audit_conform']))
+            fhout, [s], variant=ihm.dumper.IgnoreVariant(['_audit_conform']))
+else:
+    with open(args.input) as fh:
+        with open(args.output, 'w') as fhout:
+            ihm.dumper.write(
+                fhout, [add_ihm_info(s) for s in ihm.reader.read(fh)],
+                variant=ihm.dumper.IgnoreVariant(['_audit_conform']))