Skip to content

Commit

Permalink
Merge pull request prody#1752 from vratins/master
Browse files Browse the repository at this point in the history
MMTF Parser
  • Loading branch information
jamesmkrieger authored Oct 4, 2023
2 parents e7064a4 + b2da750 commit 3eb38b1
Show file tree
Hide file tree
Showing 11 changed files with 11,478 additions and 2 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@ jobs:
source activate test
conda install --yes numpy scipy nose pyparsing requests
if [[ ${{ matrix.python-version }} == "2.7" ]]; then conda install --yes unittest2; fi
pip install mmtf-python
pip install .
python setup.py build_ext --inplace --force
- name: Test with pytest
run: |
source activate test
conda install --yes pytest
pytest
4 changes: 4 additions & 0 deletions prody/proteins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@
from .pdbfile import *
__all__.extend(pdbfile.__all__)

from . import mmtffile
from .mmtffile import *
__all__.extend(mmtffile.__all__)

from . import emdfile
from .emdfile import *
__all__.extend(emdfile.__all__)
Expand Down
349 changes: 349 additions & 0 deletions prody/proteins/mmtffile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
# -*- coding: utf-8 -*-
"""This module defines functions for parsing MMTF files and structure objects.
.. MMTF files: https://mmtf.rcsb.org"""

from numbers import Number
import os.path
from collections import OrderedDict

from prody.atomic import AtomGroup
from prody.atomic import flags
from prody.atomic import ATOMIC_FIELDS

from prody.utilities import openFile, isListLike, copy
from prody import LOGGER
from prody.utilities.misctools import getMasses
from .header import getHeaderDict, buildBiomolecules

import struct as st
import numpy as np

__all__ = ['parseMMTF']

_parseMMTFdoc = """
:arg chain: Chain identifier(s) to parse (e.g., 'A' for chain A). If not provided,
all chains are parsed. If a PDB ID is used, chain can also be specified as part
of the ID (e.g., '1abcA' to parse chain A).
:type chain: str, optional
:arg title: Title to assign to the resulting AtomGroup. If not provided, the
title is extracted from the MMTF structure or set to the PDB ID.
:type title: str, optional
:arg subset: a predefined keyword to parse subset of atoms, valid keywords
are ``'calpha'`` (``'ca'``), ``'backbone'`` (``'bb'``), or **None**
(read all atoms), e.g. ``subset='bb'``
:type subset: str
:arg model: model index or None (read all models), e.g. ``model=10``
:type model: int, list
:arg altloc: if a location indicator is passed, such as ``'A'`` or ``'B'``,
only indicated alternate locations will be parsed as the single
coordinate set of the AtomGroup, if *altloc* is set **True** all
alternate locations will be parsed and each will be appended as a
distinct coordinate set, default is ``"A"``
:type altloc: str
"""

def parseMMTF(mmtf_struc, **kwargs):
"""
Parse an MMTF (Macromolecular Transmission Format) structure or fetch it from the PDB,
and return an AtomGroup containing the parsed data.
:param mmtf_struc: The MMTF structure to parse. It can be provided in one of the following ways:
- A string representing a PDB ID (e.g., '1abc').
- The filename of an MMTF file (ending with '.mmtf' or '.mmtf.gz').
- An MMTF structure object (file parsed through mmtf-python).
:type mmtf_struc: str or MMTF Structure object
:param chain: Chain identifier(s) to parse (e.g., 'A' for chain A). If not provided,
all chains are parsed. If a PDB ID is used, chain can also be specified as part
of the ID (e.g., '1abcA' to parse chain A).
:type chain: str, optional
:return: An AtomGroup containing the parsed atomic data.
:rtype: AtomGroup
"""
try:
from mmtf import fetch, parse
import mmtf
except ImportError:
raise ImportError("Install mmtf to read in mmtf structure objects (e.g. pip install mmtf-python)")

chain = kwargs.pop('chain', None)
title = kwargs.get('title', None)

if type(mmtf_struc)==str:
if not mmtf_struc.endswith(".mmtf") and not mmtf_struc.endswith(".mmtf.gz"): #entering just the pdb id
if len(mmtf_struc) == 5 and mmtf_struc.isalnum():
if chain is None:
chain = mmtf_struc[-1]
mmtf_struc = mmtf_struc[:4]
else:
raise ValueError('Please provide chain as a keyword argument '
'or part of the PDB ID, not both')
else:
chain = chain

if len(mmtf_struc) == 4 and mmtf_struc.isalnum():
if title is None:
title = mmtf_struc
kwargs['title'] = title
structure = fetch(mmtf_struc)
if structure is None:
raise IOError('mmtf file for {0} could not be downloaded.'
.format(mmtf_struc))
mmtf_struc = structure
else:
raise IOError('{0} is not a valid filename or a valid PDB '
'identifier.'.format(mmtf_struc))

else: #entering the mmtf file name
structure=parse(mmtf_struc)
if structure is None:
raise IOError('mmtf file for {0} could not be downloaded.'
.format(mmtf_struc))
mmtf_struc = structure
title = mmtf_struc.structure_id

#if none of the above loops are entered, user should have passed a mmtf structure object
if title is None:
title = mmtf_struc.structure_id

result = _parseMMTF(mmtf_struc, chain=chain, **kwargs)

return result

parseMMTF.__doc__ += _parseMMTFdoc


def _parseMMTF(mmtf_struc, **kwargs):
LOGGER.timeit()
ag = AtomGroup()
model = kwargs.get('model')
subset = kwargs.get('subset')
chain = kwargs.get('chain')
header = kwargs.get('header', False)
get_bonds = kwargs.get('bonds',False)
altloc_sel = kwargs.get('altloc', 'A')

assert isinstance(header, bool), 'header must be a boolean'

if model is not None:
if isinstance(model, int):
if model < 0:
raise ValueError('model must be greater than 0')
else:
raise TypeError('model must be an integer, {0} is invalid'
.format(str(model)))
title_suffix = ''

biomol = kwargs.get('biomol', False)
hd = set_header(mmtf_struc)
ag = set_info(ag, mmtf_struc, get_bonds, altloc_sel)

if ag.numAtoms() > 0:
LOGGER.report('{0} atoms and {1} coordinate set(s) were '
'parsed in %.2fs.'.format(ag.numAtoms(),
ag.numCoordsets()))
else:
ag = None
LOGGER.warn('Atomic data could not be parsed, please '
'check the input file.')



if ag is not None and isinstance(hd, dict):
if biomol:
ag = buildBiomolecules(hd, ag)
if isinstance(ag, list):
LOGGER.info('Biomolecular transformations were applied, {0} '
'biomolecule(s) are returned.'.format(len(ag)))
else:
LOGGER.info('Biomolecular transformations were applied to the '
'coordinate data.')

if header:
return ag, hd
else:
return ag


def set_header(data_api):

#get the transform here and convert it to the format that prody wants
assembly = data_api.bio_assembly[0]
chain_list = data_api.chain_name_list
assembly = bio_transform(assembly, chain_list)

header = {
'r_work': data_api.r_work,
'r_free': data_api.r_free,
'resolution': data_api.resolution,
'title': data_api.title,
'deposition_date': data_api.deposition_date,
'release_date': data_api.release_date,
'experimental_methods': data_api.experimental_methods,
'biomoltrans': assembly
}

return header

def bio_transform(input_data, chain_list):

output_dict = {}
index = input_data['name']
transforms=[]

for item in input_data['transformList']:
chains = list(set([chain_list[i] for i in item['chainIndexList']]))[::-1]
transforms.append(chains)
for i in range(0, len(item['matrix'])-4, 4):
string_slice = item['matrix'][i:i+4]
formatted_string = ' '.join(map(str, string_slice))
transforms.append(formatted_string)

output_dict[index]=transforms

return output_dict

def set_info(atomgroup, mmtf_data,get_bonds=False,altloc_sel='A'):

mmtfHETATMtypes = set([
"D-SACCHARIDE",
"D-SACCHARIDE 1,4 AND 1,4 LINKING",
"D-SACCHARIDE 1,4 AND 1,6 LINKING",
"L-SACCHARIDE",
"L-SACCHARIDE 1,4 AND 1,4 LINKING",
"L-SACCHARIDE 1,4 AND 1,6 LINKING",
"NON-POLYMER",
"OTHER",
"PEPTIDE-LIKE",
"SACCHARIDE"])

asize = mmtf_data.num_atoms

if len(mmtf_data.chains_per_model) > 1:
# get number of atoms in first model
asize = 0
groupIndex = 0
modelChainCount = mmtf_data.chains_per_model[0]
for chainGroupCount in mmtf_data.groups_per_chain[:modelChainCount]:
#traverse groups
for _ in range(chainGroupCount):
group = mmtf_data.group_list[mmtf_data.group_type_list[groupIndex]]
asize += len(group['atomNameList'])
groupIndex += 1

fields = OrderedDict()

x = mmtf_data.x_coord_list
y = mmtf_data.y_coord_list
z = mmtf_data.z_coord_list

if len(x) != mmtf_data.num_models*asize:
LOGGER.warn('Multi-model MMTF files with different molecules not supported. Keeping only first model')
coords = np.array([x, y, z]).T[:asize].reshape(1, asize, 3)
else:
coords = np.array([x, y, z]).T.reshape(mmtf_data.num_models, asize, 3)

# Initialize arrays for atom properties
atom_names = np.empty(asize, dtype=ATOMIC_FIELDS['name'].dtype)
resnames = np.empty(asize, dtype=ATOMIC_FIELDS['resname'].dtype)
resnums = np.empty(asize, dtype=ATOMIC_FIELDS['resnum'].dtype)
chainids = np.empty(asize, dtype=ATOMIC_FIELDS['chain'].dtype)
segnames = np.empty(asize, dtype=ATOMIC_FIELDS['segment'].dtype)
hetero = np.zeros(asize, dtype=bool)
termini = np.zeros(asize, dtype=bool)
altlocs = np.empty(asize, dtype=ATOMIC_FIELDS['altloc'].dtype)
icodes = np.empty(asize, dtype=ATOMIC_FIELDS['icode'].dtype)
serials = np.empty(asize, dtype=ATOMIC_FIELDS['serial'].dtype)
elements = np.empty(asize, dtype=ATOMIC_FIELDS['element'].dtype)
bfactors = np.empty(asize, dtype=ATOMIC_FIELDS['beta'].dtype)
occupancies = np.empty(asize, dtype=ATOMIC_FIELDS['occupancy'].dtype)

modelIndex = 0
chainIndex = 0
groupIndex = 0
atomIndex = 0

#traverse models
for modelChainCount in mmtf_data.chains_per_model:
chain_name_list = mmtf_data.chain_name_list

#traverse chains
for k in range(modelChainCount):
chainGroupCount = mmtf_data.groups_per_chain[chainIndex]

#traverse groups
for _ in range(chainGroupCount):
group = mmtf_data.group_list[mmtf_data.group_type_list[groupIndex]]
groupAtomCount = len(group.get('atomNameList'))

#traverse atoms
for i in range(groupAtomCount):
bfactors[atomIndex] = mmtf_data.b_factor_list[atomIndex]
altlocs[atomIndex] = mmtf_data.alt_loc_list[atomIndex]
occupancies[atomIndex] = mmtf_data.occupancy_list[atomIndex]
atom_names[atomIndex] = group.get('atomNameList')[i]
elements[atomIndex] = group.get('elementList')[i]
resnames[atomIndex] = group.get('groupName')
resnums[atomIndex] = mmtf_data.group_id_list[groupIndex]
icodes[atomIndex] = mmtf_data.ins_code_list[groupIndex]
chainids[atomIndex] = chain_name_list[k]
if group.get('chemCompType') in mmtfHETATMtypes:
hetero[atomIndex] = True
serials[atomIndex] = atomIndex + 1
segnames[atomIndex]=''

atomIndex += 1

groupIndex += 1
chainIndex += 1
modelIndex += 1
break

#detect termini based on chain changes
termini[:-1] = chainids[1:] != chainids[:-1]
termini[-1] = True #the last atom is always a terminus

mask = np.full(asize, True, dtype=bool)
if altloc_sel != 'all':
#mask out any unwanted alternative locations
mask = (altlocs == '') | (altlocs == altloc_sel)

atomgroup.setCoords(coords[:,mask])
atomgroup.setNames(atom_names[mask])
atomgroup.setResnums(resnums[mask])
atomgroup.setResnames(resnames[mask])
atomgroup.setChids(chainids[mask])
atomgroup.setElements(elements[mask])
atomgroup.setMasses(getMasses(elements[mask]))
atomgroup.setBetas(bfactors[mask])
atomgroup.setAltlocs(altlocs[mask])
atomgroup.setOccupancies(occupancies[mask])
atomgroup.setFlags('hetatm', hetero[mask])
atomgroup.setFlags('pdbter', termini[mask])
atomgroup.setFlags('selpdbter', termini[mask])
atomgroup.setSerials(serials[mask])
atomgroup.setIcodes(icodes[mask])
atomgroup.setSegnames(segnames[mask])
atomgroup.setTitle(mmtf_data.structure_id)

if get_bonds and hasattr(mmtf_data,'bond_atom_list'):
#have to remap any masked out atoms
remaining = np.arange(asize)[mask]
remap = np.full(asize,-1)
remap[remaining] = np.arange(len(remaining))
allbonds = np.array(mmtf_data.bond_atom_list).reshape(-1,2)
nonpeptide = []
#irgnore bonds between C and N in adjacent residues
for a,b in allbonds:
if a < asize and b < asize and mask[a] and mask[b] :
if atom_names[a] != 'N' or atom_names[b] != 'C' or resnums[a]-resnums[b] != 1:
nonpeptide.append((remap[a],remap[b]))
atomgroup.setBonds(nonpeptide)

return atomgroup
Binary file added prody/tests/datafiles/1pwc.mmtf
Binary file not shown.
Loading

0 comments on commit 3eb38b1

Please sign in to comment.