Skip to content

Commit

Permalink
Merge branch 'master' of github.com:prody/ProDy into scipion
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesmkrieger committed Oct 25, 2023
2 parents f35c44d + e5d0c8d commit 86114ab
Show file tree
Hide file tree
Showing 12 changed files with 17,294 additions and 179 deletions.
6 changes: 5 additions & 1 deletion prody/atomic/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -1341,6 +1341,7 @@ def _and2(self, sel, loc, tokens, subset=None):
isDataLabel = atoms.isDataLabel
append = None
wasand = False
wasdata = False
while tokens:
# check whether token is an array to avoid array == str comparison
token = tokens.pop(0)
Expand All @@ -1354,9 +1355,10 @@ def _and2(self, sel, loc, tokens, subset=None):
.format(repr('and ... and')), ['and', 'and'])
append = None
wasand = True
wasdata = False
continue

elif isFlagLabel(token):
elif isFlagLabel(token) and not wasdata:
flags.append(token)
append = None

Expand Down Expand Up @@ -1396,10 +1398,12 @@ def _and2(self, sel, loc, tokens, subset=None):
evals.append([])
append = evals[-1].append
append(token)
wasdata = True

elif token in UNARY:
unary.append([])
append = unary[-1].append
wasdata = False

if token == 'not':
append((token,))
Expand Down
13 changes: 8 additions & 5 deletions prody/database/dali.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,9 +402,9 @@ def filter(self, cutoff_len=None, cutoff_rmsd=None, cutoff_Z=None, cutoff_identi
cutoff_len = 0
elif not isinstance(cutoff_len, (float, int)):
raise TypeError('cutoff_len must be a float or an integer')
elif cutoff_len <= 1 and cutoff_len > 0:
elif cutoff_len <= 1 and cutoff_len >= 0:
cutoff_len = int(cutoff_len*self._max_index)
elif cutoff_len <= self._max_index and cutoff_len > 0:
elif cutoff_len <= self._max_index and cutoff_len > 1:
cutoff_len = int(cutoff_len)
else:
raise ValueError('cutoff_len must be a float between 0 and 1, or an int not greater than the max length')
Expand Down Expand Up @@ -455,13 +455,13 @@ def filter(self, cutoff_len=None, cutoff_rmsd=None, cutoff_Z=None, cutoff_identi
pdbListAll = self._pdbListAll
missing_ind_dict = dict()
ref_indices_set = set(range(self._max_index))
query = self._pdbId+self._chain
filterListLen = []
filterListRMSD = []
filterListZ = []
filterListIdentity = []

# keep the first PDB (query PDB)
for pdb_chain in pdbListAll[1:]:
for pdb_chain in pdbListAll:
temp_dict = daliInfo[pdb_chain]
# filter: len_align, identity, rmsd, Z
if temp_dict['len_align'] < cutoff_len:
Expand Down Expand Up @@ -503,7 +503,10 @@ def filter(self, cutoff_len=None, cutoff_rmsd=None, cutoff_Z=None, cutoff_identi
filterDict = {'len': filterListLen, 'rmsd': filterListRMSD, 'Z': filterListZ, 'identity': filterListIdentity}
self._filterList = filterList
self._filterDict = filterDict
self._pdbList = [self._pdbListAll[0]] + [item for item in self._pdbListAll[1:] if not item in filterList]
if query in self._pdbListAll:
self._pdbList = [query] + [item for item in self._pdbListAll if item not in filterList]
else:
self._pdbList = [item for item in self._pdbListAll if item not in filterList]
LOGGER.info(str(len(filterList)) + ' PDBs have been filtered out from '+str(len(pdbListAll))+' Dali hits (remaining: '+str(len(pdbListAll)-len(filterList))+').')
return self._pdbList

Expand Down
2 changes: 1 addition & 1 deletion prody/dynamics/exanm.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def buildMembrane(self, coords, **kwargs):

def buildHessian(self, coords, cutoff=15., gamma=1., **kwargs):
"""Build Hessian matrix for given coordinate set.
**kwargs** are passed to :method:`.buildMembrane`.
**kwargs** are passed to :meth:`.buildMembrane`.
:arg coords: a coordinate set or an object with ``getCoords`` method
:type coords: :class:`numpy.ndarray`
Expand Down
2 changes: 1 addition & 1 deletion prody/dynamics/exgnm.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def buildMembrane(self, coords, **kwargs):

def buildKirchhoff(self, coords, cutoff=15., gamma=1., **kwargs):
"""Build Kirchhoff matrix for given coordinate set.
**kwargs** are passed to :method:`.buildMembrane`.
**kwargs** are passed to :meth:`.buildMembrane`.
:arg coords: a coordinate set or an object with ``getCoords`` method
:type coords: :class:`numpy.ndarray`
Expand Down
4 changes: 2 additions & 2 deletions prody/proteins/blastpdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,10 +297,10 @@ def getHits(self, percent_identity=0., percent_overlap=0., chain=False):
to or higher than this value will be returned, default is ``0.``
:type percent_identity: float
:arg percent_overlap: PDB hits with percent coverage of the query
sequence equivalent or better will be returned, default is ``0.``
sequence equivalent or better will be returned, default is ``0.``
:type percent_overlap: float
:arg chain: if chain is **True**, individual chains in a PDB file
will be considered as separate hits , default is **False**
will be considered as separate hits, default is **False**
:type chain: bool"""

assert isinstance(percent_identity, (float, int)), \
Expand Down
88 changes: 67 additions & 21 deletions prody/proteins/ciffile.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from .cifheader import getCIFHeaderDict
from .header import buildBiomolecules, assignSecstr, isHelix, isSheet

__all__ = ['parseMMCIFStream', 'parseMMCIF']
__all__ = ['parseMMCIFStream', 'parseMMCIF', 'parseCIF']


class MMCIFParseError(Exception):
Expand All @@ -36,6 +36,11 @@ class MMCIFParseError(Exception):
chains are parsed
:type chain: str
:arg segment: segment identifiers for parsing specific chains, e.g.
``segment='A'``, ``segment='B'``, ``segment='DE'``, by default all
segment are parsed
:type segment: str
:arg subset: a predefined keyword to parse subset of atoms, valid keywords
are ``'calpha'`` (``'ca'``), ``'backbone'`` (``'bb'``), or **None**
(read all atoms), e.g. ``subset='bb'``
Expand All @@ -50,6 +55,13 @@ class MMCIFParseError(Exception):
alternate locations will be parsed and each will be appended as a
distinct coordinate set, default is ``"A"``
:type altloc: str
:arg unite_chains: unite chains with the same segment name (auth_asym_id), making chain ids be
auth_asym_id instead of label_asym_id. This can be helpful in some cases e.g. alignments, but can
cause some problems too. For example, using :meth:`.buildBiomolecules` afterwards requires original
chain id (label_asym_id). Using biomol=True, inside parseMMCIF is fine.
Default is *False*
:type unite_chains: bool
"""

_PDBSubsets = {'ca': 'ca', 'calpha': 'ca', 'bb': 'bb', 'backbone': 'bb'}
Expand All @@ -66,15 +78,9 @@ def parseMMCIF(pdb, **kwargs):
:arg pdb: a PDB identifier or a filename
If needed, mmCIF files are downloaded using :func:`.fetchPDB()` function.
:type pdb: str
:arg chain: comma separated string or list-like of chain IDs
:type chain: str, tuple, list, :class:`~numpy.ndarray`
:arg unite_chains: unite chains with the same segment name
Default is *False*
:type unite_chains: bool
"""
chain = kwargs.pop('chain', None)
segment = kwargs.pop('segment', None)
title = kwargs.get('title', None)
unite_chains = kwargs.get('unite_chains', False)
auto_bonds = SETTINGS.get('auto_bonds')
Expand Down Expand Up @@ -119,12 +125,30 @@ def parseMMCIF(pdb, **kwargs):
title = title[3:]
kwargs['title'] = title
cif = openFile(pdb, 'rt')
result = parseMMCIFStream(cif, chain=chain, **kwargs)
result = parseMMCIFStream(cif, chain=chain, segment=segment, **kwargs)
cif.close()
if unite_chains:
result.setSegnames(result.getChids())
if isinstance(result, AtomGroup):
result.setChids(result.getSegnames())

elif isinstance(result, list):
# e.g. multiple biomol assemblies
[r.setChids(r.getSegnames()) for r in result if isinstance(r, AtomGroup)]

elif isinstance(result, tuple):
# atoms, header
if isinstance(result[0], AtomGroup):
result[0].setChids(result[0].getSegnames())

elif isinstance(result[0], list):
# e.g. multiple biomol assemblies
[r.setChids(r.getSegnames()) for r in result[0] if isinstance(r, AtomGroup)]

elif result is not None:
raise TypeError('result from parseMMCIFStream should be a tuple, AtomGroup or list')
return result

parseCIF = parseMMCIF

def parseMMCIFStream(stream, **kwargs):
"""Returns an :class:`.AtomGroup` and/or a class:`.StarDict`
Expand All @@ -138,6 +162,8 @@ def parseMMCIFStream(stream, **kwargs):
model = kwargs.get('model')
subset = kwargs.get('subset')
chain = kwargs.get('chain')
segment = kwargs.get('segment')
unite_chains = kwargs.get('unite_chains', False)
altloc = kwargs.get('altloc', 'A')
header = kwargs.get('header', False)
assert isinstance(header, bool), 'header must be a boolean'
Expand All @@ -150,6 +176,15 @@ def parseMMCIFStream(stream, **kwargs):
raise TypeError('model must be an integer, {0} is invalid'
.format(str(model)))
title_suffix = ''


if chain is not None:
if not isinstance(chain, str):
raise TypeError('chain must be a string')
elif len(chain) == 0:
raise ValueError('chain must not be an empty string')
title_suffix = chain + title_suffix

if subset:
try:
subset = _PDBSubsets[subset.lower()]
Expand All @@ -159,12 +194,13 @@ def parseMMCIFStream(stream, **kwargs):
raise ValueError('{0} is not a valid subset'
.format(repr(subset)))
title_suffix = '_' + subset
if chain is not None:
if not isinstance(chain, str):
raise TypeError('chain must be a string')
elif len(chain) == 0:
raise ValueError('chain must not be an empty string')
title_suffix = '_' + chain + title_suffix

if segment is not None:
if not isinstance(segment, str):
raise TypeError('segment must be a string')
elif len(segment) == 0:
raise ValueError('segment must not be an empty string')
title_suffix = '_' + segment + title_suffix

ag = None
if 'ag' in kwargs:
Expand Down Expand Up @@ -194,7 +230,8 @@ def parseMMCIFStream(stream, **kwargs):
if header or biomol or secondary:
hd = getCIFHeaderDict(lines)

_parseMMCIFLines(ag, lines, model, chain, subset, altloc)
_parseMMCIFLines(ag, lines, model, chain, subset, altloc,
segment, unite_chains)

if ag.numAtoms() > 0:
LOGGER.report('{0} atoms and {1} coordinate set(s) were '
Expand Down Expand Up @@ -239,7 +276,7 @@ def parseMMCIFStream(stream, **kwargs):


def _parseMMCIFLines(atomgroup, lines, model, chain, subset,
altloc_torf):
altloc_torf, segment, unite_chains):
"""Returns an AtomGroup. See also :func:`.parsePDBStream()`.
:arg lines: mmCIF lines
Expand Down Expand Up @@ -375,15 +412,24 @@ def _parseMMCIFLines(atomgroup, lines, model, chain, subset,
if not (atomname in subset and resname in protein_resnames):
continue

chID = line.split()[fields['auth_asym_id']]
chID = line.split()[fields['label_asym_id']]
segID = line.split()[fields['auth_asym_id']]

if chain is not None:
if isinstance(chain, str):
chain = chain.split(',')
if not chID in chain:
if not unite_chains:
continue
if not segID in chain:
continue

if segment is not None:
if isinstance(segment, str):
segment = segment.split(',')
if not segID in segment:
continue

segID = line.split()[fields['label_asym_id']]

alt = line.split()[fields['label_alt_id']]
if alt not in which_altlocs and which_altlocs != 'all':
continue
Expand Down
53 changes: 42 additions & 11 deletions prody/proteins/cifheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""This module defines functions for parsing header data from PDB files."""

from collections import defaultdict, OrderedDict
import numpy as np
import os.path

from prody import LOGGER
Expand Down Expand Up @@ -185,7 +186,7 @@ def _getBiomoltrans(lines):
currentBiomolecule = item1["_pdbx_struct_assembly_gen.assembly_id"]
applyToChains = []

chains = item1["_pdbx_struct_assembly_gen.asym_id_list"].split(',')
chains = item1["_pdbx_struct_assembly_gen.asym_id_list"].replace(';','').strip().split(',')
applyToChains.extend(chains)

biomt = biomolecule[currentBiomolecule]
Expand Down Expand Up @@ -777,7 +778,7 @@ def _getPolymers(lines):
poly = polymers.get(ch, Polymer(ch))
polymers[ch] = poly
poly.sequence += ''.join(item[
'_entity_poly.pdbx_seq_one_letter_code'][1:-2].split())
'_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split())

# DBREF block 1
items2 = parseSTARSection(lines, '_struct_ref')
Expand Down Expand Up @@ -1251,10 +1252,10 @@ def _getUnobservedSeq(lines):

unobs_seqs = OrderedDict()
for item in unobs:
chid = item['_pdbx_unobs_or_zero_occ_residues.label_asym_id']
chid = item['_pdbx_unobs_or_zero_occ_residues.auth_asym_id']
if not chid in unobs_seqs.keys():
unobs_seqs[chid] = ''
unobs_seqs[chid] += AAMAP[item['_pdbx_unobs_or_zero_occ_residues.label_comp_id']]
unobs_seqs[chid] += AAMAP[item['_pdbx_unobs_or_zero_occ_residues.auth_comp_id']]

if len(unobs_seqs) == 0:
return None
Expand All @@ -1271,14 +1272,44 @@ def _getUnobservedSeq(lines):
return None

alns = OrderedDict()
for key, seq in full_seqs.items():
for k, (key, seq) in enumerate(full_seqs.items()):
if key in unobs_seqs.keys():
unobs_seq = unobs_seqs[key]
alns[key] = alignBioPairwise(unobs_seq, seq, MATCH_SCORE=1000,
MISMATCH_SCORE=-1000,
ALIGNMENT_METHOD='global',
GAP_PENALTY=GAP_PENALTY,
GAP_EXT_PENALTY=GAP_EXT_PENALTY)[0][:2]
# initialise alignment (quite possibly incorrect)
aln = list(alignBioPairwise(unobs_seq, seq, MATCH_SCORE=1000,
MISMATCH_SCORE=-1000,
ALIGNMENT_METHOD='global',
GAP_PENALTY=GAP_PENALTY,
GAP_EXT_PENALTY=GAP_EXT_PENALTY)[0][:2])

# fix it
prev_chid = unobs[0]['_pdbx_unobs_or_zero_occ_residues.auth_asym_id']
i = 0
for item in unobs:
chid = item['_pdbx_unobs_or_zero_occ_residues.auth_asym_id']
if chid != prev_chid:
prev_chid = chid
i = 0

if chid == key:
one_letter = AAMAP[item['_pdbx_unobs_or_zero_occ_residues.auth_comp_id']]
good_pos = int(item['_pdbx_unobs_or_zero_occ_residues.label_seq_id']) - 1

row1_list = list(aln[0])

arr_unobs_seq = np.array(list(unobs_seq))
unobs_rep = np.where(arr_unobs_seq[:i+1] == one_letter)[0].shape[0] - 1
actual_pos = np.where(np.array(row1_list) == one_letter)[0][unobs_rep]

if actual_pos != good_pos:
row1_list[good_pos] = one_letter
row1_list[actual_pos] = '-'

aln[0] = ''.join(row1_list)

i += 1

alns[key] = aln

return alns

Expand All @@ -1303,7 +1334,7 @@ def _getUnobservedSeq(lines):
for line in lines][0],
'identifier': lambda lines: [line.split('_')[1]
if line.find("data") == 0 else ''
for line in lines][0],
for line in lines][0].strip(),
'title': _getTitle,
'experiment': lambda lines: [line.split()[1]
if line.find("_exptl.method") != -1 else None
Expand Down
7 changes: 5 additions & 2 deletions prody/proteins/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -1111,10 +1111,13 @@ def buildBiomolecules(header, atoms, biomol=None):
translation[2] = line2[3]
t = Transformation(rotation, translation)

newag = atoms.select('chain ' + ' or chain '.join(mt[times*4+0])).copy()
newag = atoms.select('chain ' + ' or chain '.join(mt[times*4+0]))
if newag is None:
continue
newag.all.setSegnames(decToHybrid36(times+1,resnum=True))
newag = newag.copy()
segnames = newag.all.getSegnames()
newag.all.setSegnames(np.array([segname + decToHybrid36(times+1, resnum=True)
for segname in segnames]))
for acsi in range(newag.numCoordsets()):
newag.setACSIndex(acsi)
newag = t.apply(newag)
Expand Down
Loading

0 comments on commit 86114ab

Please sign in to comment.