Merge branch 'master' of github.com:prody/ProDy into scipion

jamesmkrieger · Oct 25, 2023 · 86114ab · 86114ab
2 parents f35c44d + e5d0c8d
commit 86114ab
Show file tree

Hide file tree

Showing 12 changed files with 17,294 additions and 179 deletions.
diff --git a/prody/atomic/select.py b/prody/atomic/select.py
@@ -1341,6 +1341,7 @@ def _and2(self, sel, loc, tokens, subset=None):
         isDataLabel = atoms.isDataLabel
         append = None
         wasand = False
+        wasdata = False
         while tokens:
             # check whether token is an array to avoid array == str comparison
             token = tokens.pop(0)
@@ -1354,9 +1355,10 @@ def _and2(self, sel, loc, tokens, subset=None):
                             .format(repr('and ... and')), ['and', 'and'])
                     append = None
                     wasand = True
+                    wasdata = False
                     continue
 
-                elif isFlagLabel(token):
+                elif isFlagLabel(token) and not wasdata:
                     flags.append(token)
                     append = None
 
@@ -1396,10 +1398,12 @@ def _and2(self, sel, loc, tokens, subset=None):
                         evals.append([])
                         append = evals[-1].append
                     append(token)
+                    wasdata = True
 
                 elif token in UNARY:
                     unary.append([])
                     append = unary[-1].append
+                    wasdata = False
 
                     if token == 'not':
                         append((token,))

diff --git a/prody/database/dali.py b/prody/database/dali.py
@@ -402,9 +402,9 @@ def filter(self, cutoff_len=None, cutoff_rmsd=None, cutoff_Z=None, cutoff_identi
             cutoff_len = 0
         elif not isinstance(cutoff_len, (float, int)):
             raise TypeError('cutoff_len must be a float or an integer')
-        elif cutoff_len <= 1 and cutoff_len > 0:
+        elif cutoff_len <= 1 and cutoff_len >= 0:
             cutoff_len = int(cutoff_len*self._max_index)
-        elif cutoff_len <= self._max_index and cutoff_len > 0:
+        elif cutoff_len <= self._max_index and cutoff_len > 1:
             cutoff_len = int(cutoff_len)
         else:
             raise ValueError('cutoff_len must be a float between 0 and 1, or an int not greater than the max length')
@@ -455,13 +455,13 @@ def filter(self, cutoff_len=None, cutoff_rmsd=None, cutoff_Z=None, cutoff_identi
         pdbListAll = self._pdbListAll
         missing_ind_dict = dict()
         ref_indices_set = set(range(self._max_index))
+        query = self._pdbId+self._chain
         filterListLen = []
         filterListRMSD = []
         filterListZ = []
         filterListIdentity = []
 
-        # keep the first PDB (query PDB)
-        for pdb_chain in pdbListAll[1:]:
+        for pdb_chain in pdbListAll:
             temp_dict = daliInfo[pdb_chain]
             # filter: len_align, identity, rmsd, Z
             if temp_dict['len_align'] < cutoff_len:
@@ -503,7 +503,10 @@ def filter(self, cutoff_len=None, cutoff_rmsd=None, cutoff_Z=None, cutoff_identi
         filterDict = {'len': filterListLen, 'rmsd': filterListRMSD, 'Z': filterListZ, 'identity': filterListIdentity}
         self._filterList = filterList
         self._filterDict = filterDict
-        self._pdbList = [self._pdbListAll[0]] + [item for item in self._pdbListAll[1:] if not item in filterList]
+        if query in self._pdbListAll:
+            self._pdbList = [query] + [item for item in self._pdbListAll if item not in filterList]
+        else:
+            self._pdbList = [item for item in self._pdbListAll if item not in filterList]
         LOGGER.info(str(len(filterList)) + ' PDBs have been filtered out from '+str(len(pdbListAll))+' Dali hits (remaining: '+str(len(pdbListAll)-len(filterList))+').')
         return self._pdbList
 

diff --git a/prody/dynamics/exanm.py b/prody/dynamics/exanm.py
@@ -183,7 +183,7 @@ def buildMembrane(self, coords, **kwargs):
 
     def buildHessian(self, coords, cutoff=15., gamma=1., **kwargs):
         """Build Hessian matrix for given coordinate set. 
-        **kwargs** are passed to :method:`.buildMembrane`.
+        **kwargs** are passed to :meth:`.buildMembrane`.
 
         :arg coords: a coordinate set or an object with ``getCoords`` method
         :type coords: :class:`numpy.ndarray`

diff --git a/prody/dynamics/exgnm.py b/prody/dynamics/exgnm.py
@@ -181,7 +181,7 @@ def buildMembrane(self, coords, **kwargs):
 
     def buildKirchhoff(self, coords, cutoff=15., gamma=1., **kwargs):
         """Build Kirchhoff matrix for given coordinate set. 
-        **kwargs** are passed to :method:`.buildMembrane`.
+        **kwargs** are passed to :meth:`.buildMembrane`.
 
         :arg coords: a coordinate set or an object with ``getCoords`` method
         :type coords: :class:`numpy.ndarray`

diff --git a/prody/proteins/blastpdb.py b/prody/proteins/blastpdb.py
@@ -297,10 +297,10 @@ def getHits(self, percent_identity=0., percent_overlap=0., chain=False):
             to or higher than this value will be returned, default is ``0.``
         :type percent_identity: float
         :arg percent_overlap: PDB hits with percent coverage of the query
-          sequence equivalent or better will be returned, default is ``0.``
+            sequence equivalent or better will be returned, default is ``0.``
         :type percent_overlap: float
         :arg chain: if chain is **True**, individual chains in a PDB file
-          will be considered as separate hits , default is **False**
+            will be considered as separate hits, default is **False**
         :type chain: bool"""
 
         assert isinstance(percent_identity, (float, int)), \

diff --git a/prody/proteins/ciffile.py b/prody/proteins/ciffile.py
@@ -19,7 +19,7 @@
 from .cifheader import getCIFHeaderDict
 from .header import buildBiomolecules, assignSecstr, isHelix, isSheet
 
-__all__ = ['parseMMCIFStream', 'parseMMCIF']
+__all__ = ['parseMMCIFStream', 'parseMMCIF', 'parseCIF']
 
 
 class MMCIFParseError(Exception):
@@ -36,6 +36,11 @@ class MMCIFParseError(Exception):
         chains are parsed
     :type chain: str
 
+    :arg segment: segment identifiers for parsing specific chains, e.g.
+        ``segment='A'``, ``segment='B'``, ``segment='DE'``, by default all
+        segment are parsed
+    :type segment: str
+
     :arg subset: a predefined keyword to parse subset of atoms, valid keywords
         are ``'calpha'`` (``'ca'``), ``'backbone'`` (``'bb'``), or **None**
         (read all atoms), e.g. ``subset='bb'``
@@ -50,6 +55,13 @@ class MMCIFParseError(Exception):
          alternate locations will be parsed and each will be appended as a
          distinct coordinate set, default is ``"A"``
     :type altloc: str
+
+    :arg unite_chains: unite chains with the same segment name (auth_asym_id), making chain ids be 
+        auth_asym_id instead of label_asym_id. This can be helpful in some cases e.g. alignments, but can 
+        cause some problems too. For example, using :meth:`.buildBiomolecules` afterwards requires original 
+        chain id (label_asym_id). Using biomol=True, inside parseMMCIF is fine.
+        Default is *False*
+    :type unite_chains: bool
     """
 
 _PDBSubsets = {'ca': 'ca', 'calpha': 'ca', 'bb': 'bb', 'backbone': 'bb'}
@@ -66,15 +78,9 @@ def parseMMCIF(pdb, **kwargs):
     :arg pdb: a PDB identifier or a filename
         If needed, mmCIF files are downloaded using :func:`.fetchPDB()` function.
     :type pdb: str
-
-    :arg chain: comma separated string or list-like of chain IDs
-    :type chain: str, tuple, list, :class:`~numpy.ndarray`
-
-    :arg unite_chains: unite chains with the same segment name
-        Default is *False*
-    :type unite_chains: bool
     """
     chain = kwargs.pop('chain', None)
+    segment = kwargs.pop('segment', None)
     title = kwargs.get('title', None)
     unite_chains = kwargs.get('unite_chains', False)
     auto_bonds = SETTINGS.get('auto_bonds')
@@ -119,12 +125,30 @@ def parseMMCIF(pdb, **kwargs):
             title = title[3:]
         kwargs['title'] = title
     cif = openFile(pdb, 'rt')
-    result = parseMMCIFStream(cif, chain=chain, **kwargs)
+    result = parseMMCIFStream(cif, chain=chain, segment=segment, **kwargs)
     cif.close()
     if unite_chains:
-        result.setSegnames(result.getChids())
+        if isinstance(result, AtomGroup):
+            result.setChids(result.getSegnames())
+
+        elif isinstance(result, list):
+            # e.g. multiple biomol assemblies
+            [r.setChids(r.getSegnames()) for r in result if isinstance(r, AtomGroup)]
+
+        elif isinstance(result, tuple):
+            # atoms, header
+            if isinstance(result[0], AtomGroup):
+                result[0].setChids(result[0].getSegnames())
+
+            elif isinstance(result[0], list):
+                # e.g. multiple biomol assemblies
+                [r.setChids(r.getSegnames()) for r in result[0] if isinstance(r, AtomGroup)]
+
+        elif result is not None:
+            raise TypeError('result from parseMMCIFStream should be a tuple, AtomGroup or list')
     return result
 
+parseCIF = parseMMCIF
 
 def parseMMCIFStream(stream, **kwargs):
     """Returns an :class:`.AtomGroup` and/or a class:`.StarDict` 
@@ -138,6 +162,8 @@ def parseMMCIFStream(stream, **kwargs):
     model = kwargs.get('model')
     subset = kwargs.get('subset')
     chain = kwargs.get('chain')
+    segment = kwargs.get('segment')
+    unite_chains = kwargs.get('unite_chains', False)
     altloc = kwargs.get('altloc', 'A')
     header = kwargs.get('header', False)
     assert isinstance(header, bool), 'header must be a boolean'
@@ -150,6 +176,15 @@ def parseMMCIFStream(stream, **kwargs):
             raise TypeError('model must be an integer, {0} is invalid'
                             .format(str(model)))
     title_suffix = ''
+
+
+    if chain is not None:
+        if not isinstance(chain, str):
+            raise TypeError('chain must be a string')
+        elif len(chain) == 0:
+            raise ValueError('chain must not be an empty string')
+        title_suffix = chain + title_suffix
+
     if subset:
         try:
             subset = _PDBSubsets[subset.lower()]
@@ -159,12 +194,13 @@ def parseMMCIFStream(stream, **kwargs):
             raise ValueError('{0} is not a valid subset'
                              .format(repr(subset)))
         title_suffix = '_' + subset
-    if chain is not None:
-        if not isinstance(chain, str):
-            raise TypeError('chain must be a string')
-        elif len(chain) == 0:
-            raise ValueError('chain must not be an empty string')
-        title_suffix = '_' + chain + title_suffix
+
+    if segment is not None:
+        if not isinstance(segment, str):
+            raise TypeError('segment must be a string')
+        elif len(segment) == 0:
+            raise ValueError('segment must not be an empty string')
+        title_suffix = '_' + segment + title_suffix
 
     ag = None
     if 'ag' in kwargs:
@@ -194,7 +230,8 @@ def parseMMCIFStream(stream, **kwargs):
         if header or biomol or secondary:
             hd = getCIFHeaderDict(lines)
 
-        _parseMMCIFLines(ag, lines, model, chain, subset, altloc)
+        _parseMMCIFLines(ag, lines, model, chain, subset, altloc, 
+                         segment, unite_chains)
 
         if ag.numAtoms() > 0:
             LOGGER.report('{0} atoms and {1} coordinate set(s) were '
@@ -239,7 +276,7 @@ def parseMMCIFStream(stream, **kwargs):
 
 
 def _parseMMCIFLines(atomgroup, lines, model, chain, subset,
-                     altloc_torf):
+                     altloc_torf, segment, unite_chains):
     """Returns an AtomGroup. See also :func:`.parsePDBStream()`.
 
     :arg lines: mmCIF lines
@@ -375,15 +412,24 @@ def _parseMMCIFLines(atomgroup, lines, model, chain, subset,
             if not (atomname in subset and resname in protein_resnames):
                 continue
 
-        chID = line.split()[fields['auth_asym_id']]
+        chID = line.split()[fields['label_asym_id']]
+        segID = line.split()[fields['auth_asym_id']]
+
         if chain is not None:
             if isinstance(chain, str):
                 chain = chain.split(',')
             if not chID in chain:
+                if not unite_chains:
+                    continue
+                if not segID in chain:
+                    continue
+
+        if segment is not None:
+            if isinstance(segment, str):
+                segment = segment.split(',')
+            if not segID in segment:
                 continue
 
-        segID = line.split()[fields['label_asym_id']]
-
         alt = line.split()[fields['label_alt_id']]
         if alt not in which_altlocs and which_altlocs != 'all':
             continue

diff --git a/prody/proteins/cifheader.py b/prody/proteins/cifheader.py
@@ -2,6 +2,7 @@
 """This module defines functions for parsing header data from PDB files."""
 
 from collections import defaultdict, OrderedDict
+import numpy as np
 import os.path
 
 from prody import LOGGER
@@ -185,7 +186,7 @@ def _getBiomoltrans(lines):
         currentBiomolecule = item1["_pdbx_struct_assembly_gen.assembly_id"]
         applyToChains = []
 
-        chains = item1["_pdbx_struct_assembly_gen.asym_id_list"].split(',')
+        chains = item1["_pdbx_struct_assembly_gen.asym_id_list"].replace(';','').strip().split(',')
         applyToChains.extend(chains)
 
         biomt = biomolecule[currentBiomolecule]
@@ -777,7 +778,7 @@ def _getPolymers(lines):
             poly = polymers.get(ch, Polymer(ch))
             polymers[ch] = poly
             poly.sequence += ''.join(item[
-                '_entity_poly.pdbx_seq_one_letter_code'][1:-2].split())
+                '_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split())
 
     # DBREF block 1
     items2 = parseSTARSection(lines, '_struct_ref')
@@ -1251,10 +1252,10 @@ def _getUnobservedSeq(lines):
 
     unobs_seqs = OrderedDict()
     for item in unobs:
-        chid = item['_pdbx_unobs_or_zero_occ_residues.label_asym_id']
+        chid = item['_pdbx_unobs_or_zero_occ_residues.auth_asym_id']
         if not chid in unobs_seqs.keys():
             unobs_seqs[chid] = ''
-        unobs_seqs[chid] += AAMAP[item['_pdbx_unobs_or_zero_occ_residues.label_comp_id']]
+        unobs_seqs[chid] += AAMAP[item['_pdbx_unobs_or_zero_occ_residues.auth_comp_id']]
 
     if len(unobs_seqs) == 0:
         return None
@@ -1271,14 +1272,44 @@ def _getUnobservedSeq(lines):
         return None
 
     alns = OrderedDict()
-    for key, seq in full_seqs.items():
+    for k, (key, seq) in enumerate(full_seqs.items()):
         if key in unobs_seqs.keys():
             unobs_seq = unobs_seqs[key]
-            alns[key] = alignBioPairwise(unobs_seq, seq, MATCH_SCORE=1000,
-                                         MISMATCH_SCORE=-1000,
-                                         ALIGNMENT_METHOD='global',
-                                         GAP_PENALTY=GAP_PENALTY,
-                                         GAP_EXT_PENALTY=GAP_EXT_PENALTY)[0][:2]
+            # initialise alignment (quite possibly incorrect)
+            aln = list(alignBioPairwise(unobs_seq, seq, MATCH_SCORE=1000,
+                                   MISMATCH_SCORE=-1000,
+                                   ALIGNMENT_METHOD='global',
+                                   GAP_PENALTY=GAP_PENALTY,
+                                   GAP_EXT_PENALTY=GAP_EXT_PENALTY)[0][:2])
+
+            # fix it
+            prev_chid = unobs[0]['_pdbx_unobs_or_zero_occ_residues.auth_asym_id']
+            i = 0
+            for item in unobs:
+                chid = item['_pdbx_unobs_or_zero_occ_residues.auth_asym_id']
+                if chid != prev_chid:
+                    prev_chid = chid
+                    i = 0
+
+                if chid == key:
+                    one_letter = AAMAP[item['_pdbx_unobs_or_zero_occ_residues.auth_comp_id']]
+                    good_pos = int(item['_pdbx_unobs_or_zero_occ_residues.label_seq_id']) - 1
+
+                    row1_list = list(aln[0])
+
+                    arr_unobs_seq = np.array(list(unobs_seq))
+                    unobs_rep = np.where(arr_unobs_seq[:i+1] == one_letter)[0].shape[0] - 1
+                    actual_pos = np.where(np.array(row1_list) == one_letter)[0][unobs_rep]
+
+                    if actual_pos != good_pos:
+                        row1_list[good_pos] = one_letter
+                        row1_list[actual_pos] = '-'
+
+                    aln[0] = ''.join(row1_list)
+
+                i += 1
+
+            alns[key] = aln
 
     return alns
 
@@ -1303,7 +1334,7 @@ def _getUnobservedSeq(lines):
                                      for line in lines][0],
     'identifier': lambda lines: [line.split('_')[1]
                                  if line.find("data") == 0 else ''
-                                 for line in lines][0],
+                                 for line in lines][0].strip(),
     'title': _getTitle,
     'experiment': lambda lines: [line.split()[1]
                                  if line.find("_exptl.method") != -1 else None

diff --git a/prody/proteins/header.py b/prody/proteins/header.py
@@ -1111,10 +1111,13 @@ def buildBiomolecules(header, atoms, biomol=None):
             translation[2] = line2[3]
             t = Transformation(rotation, translation)
 
-            newag = atoms.select('chain ' + ' or chain '.join(mt[times*4+0])).copy()
+            newag = atoms.select('chain ' + ' or chain '.join(mt[times*4+0]))
             if newag is None:
                 continue
-            newag.all.setSegnames(decToHybrid36(times+1,resnum=True))
+            newag = newag.copy()
+            segnames = newag.all.getSegnames()
+            newag.all.setSegnames(np.array([segname + decToHybrid36(times+1, resnum=True) 
+                                            for segname in segnames]))
             for acsi in range(newag.numCoordsets()):
                 newag.setACSIndex(acsi)
                 newag = t.apply(newag)