Skip to content

Commit

Permalink
Merge pull request #123 from BioPandas/mmtf-gzip
Browse files Browse the repository at this point in the history
Add MMTF gzip reading support
  • Loading branch information
a-r-j authored Apr 17, 2023
2 parents 16172f0 + c554a46 commit 1ff101c
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 9 deletions.
34 changes: 26 additions & 8 deletions biopandas/mmtf/pandas_mmtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import numpy as np
import pandas as pd
from looseversion import LooseVersion
from mmtf import MMTFDecoder, MMTFEncoder, fetch, parse
from mmtf import MMTFDecoder, MMTFEncoder, fetch, parse, parse_gzip

from biopandas.constants import protein_letters_3to1_extended

Expand Down Expand Up @@ -45,7 +45,10 @@ def df(self, value: Any):
# self._df = value

def read_mmtf(self, filename: str):
self.mmtf = parse(filename)
if filename.endswith(".gz"):
self.mmtf = parse_gzip(filename)
else:
self.mmtf = parse(filename)
self.mmtf_path = filename
df = self._mmtf_to_df(self.mmtf)
self._df["ATOM"] = df.loc[df.record_name == "ATOM"]
Expand Down Expand Up @@ -496,7 +499,7 @@ def parse_mmtf(file_path: str) -> pd.DataFrame:
:return: Dataframe of protein structure.
:rtype: pd.DataFrame
"""
df = parse(file_path)
df = parse_gzip(file_path) if file_path.endswith(".gz") else parse(file_path)
return mmtf_to_df(df)


Expand Down Expand Up @@ -530,16 +533,28 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame:
model_indices = mmtf_obj.chains_per_model
model_indices = [sum(model_indices[:i+1]) for i in range(len(model_indices))]
ch_idx = 0

entity_types = {}
for i in mmtf_obj.entity_list:
for chain in i["chainIndexList"]:
entity_types[chain] = i["type"]

for idx, i in enumerate(mmtf_obj.group_type_list):
res = mmtf_obj.group_list[i]
record = "HETATM" if res["chemCompType"] == "NON-POLYMER" else "ATOM"
#record = "HETATM" if res["chemCompType"] == "NON-POLYMER" else "ATOM"
#record = (
# "ATOM"
# if res["chemCompType"] in ["L-PEPTIDE LINKING", "PEPTIDE LINKING"]
# else "HETATM"
#)
if idx == chain_indices[ch_idx]:
ch_idx += 1
record = "ATOM" if entity_types[ch_idx] == "polymer" else "HETATM"

for _ in res["atomNameList"]:
data["residue_name"].append(res["groupName"])
data["residue_number"].append(mmtf_obj.group_id_list[idx])
data["chain_id"].append(mmtf_obj.chain_name_list[ch_idx])
data["chain_id"].append([mmtf_obj.chain_name_list[ch_idx]])
data["model_id"].append(int(np.argwhere(np.array(model_indices)>ch_idx)[0]) + 1)
data["record_name"].append(record)
data["insertion"].append(mmtf_obj.ins_code_list[idx])
Expand All @@ -565,9 +580,13 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame:
continue
data[k] = [i for sublist in v for i in sublist]

return pd.DataFrame.from_dict(data).sort_values(by=["model_id", "atom_number"])
df = pd.DataFrame.from_dict(data).sort_values(by=["model_id", "atom_number"])
df.alt_loc = df.alt_loc.str.replace("\x00", "")
df.insertion = df.insertion.str.replace("\x00", "")
return df

def _seq1(seq, charmap: Dict[str, str], undef_code="X"):
# sourcery skip: dict-assign-update-to-union
"""Convert protein sequence from three-letter to one-letter code.
The single required input argument 'seq' should be a protein sequence
using three-letter codes, either as a Python string or as a Seq or
Expand Down Expand Up @@ -650,7 +669,6 @@ def write_mmtf(df: pd.DataFrame, file_path: str):

node_ids = df.model_id.astype(str) + ":" + df.chain_id + ":" + df.residue_name + ":" + df.residue_number.astype(str) + ":" + df.insertion.astype(str)
df["residue_id"] = node_ids

# Tracks values to replace them at the end
chains_per_model = []
groups_per_chain = []
Expand Down Expand Up @@ -750,7 +768,7 @@ def write_mmtf(df: pd.DataFrame, file_path: str):
encoder.set_atom_info(
atom_name=row.atom_name,
serial_number=row.atom_number,
alternative_location_id="\x00" if row.alt_loc == " " else row.alt_loc,
alternative_location_id="\x00" if row.alt_loc == "" else row.alt_loc,
x=row.x_coord,
y=row.y_coord,
z=row.z_coord,
Expand Down
102 changes: 102 additions & 0 deletions biopandas/mmtf/tests/test_read_mmtf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# BioPandas
# Author: Sebastian Raschka <[email protected]>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas


import os
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
from nose.tools import raises

from biopandas.mmtf import PandasMmtf
from biopandas.pdb import PandasPdb
from biopandas.testutils import assert_raises

MMTF_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "data", "3eiy.mmtf")
MMTF_TESTDATA_FILENAME_GZ = os.path.join(os.path.dirname(__file__), "data", "3eiy.mmtf.gz")

PDB_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "..", "..", "pdb", "tests", "data", "3eiy.pdb")
PDB_TESTDATA_FILENAME_GZ = os.path.join(os.path.dirname(__file__), "..", "..", "pdb", "tests", "data", "3eiy.pdb.gz")


ATOM_DF_COLUMNS = [
"record_name",
"atom_number",
"atom_name",
#"alt_loc",
"residue_name",
"chain_id",
"residue_number",
#"insertion",
"x_coord",
"y_coord",
"z_coord",
"occupancy",
"b_factor",
"element_symbol",
#"charge",
]

def test_fetch_pdb():
"""Test fetch_pdb"""
ppdb = PandasMmtf()
ppdb.fetch_mmtf("3eiy")
assert max(ppdb.df["ATOM"].residue_number) == 175


def test__read_mmtf():
"""Test public _read_pdb with gzip files"""
pmmtf = PandasMmtf()
ppdb = PandasPdb()
pmmtf.read_mmtf(MMTF_TESTDATA_FILENAME)

ppdb = ppdb.read_pdb(PDB_TESTDATA_FILENAME)

pd.testing.assert_frame_equal(
pmmtf.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
ppdb.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
)

ATOM_DF_COLUMNS.remove("atom_number")
ATOM_DF_COLUMNS.remove("element_symbol")
pd.testing.assert_frame_equal(
pmmtf.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
ppdb.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
)


def test__read_mmtf_gz():
"""Test public _read_pdb with gzip files"""
pmmtf = PandasMmtf()
ppdb = PandasPdb()
pmmtf.read_mmtf(MMTF_TESTDATA_FILENAME_GZ)
ppdb = ppdb.read_pdb(PDB_TESTDATA_FILENAME_GZ)


pmmtf.df["ATOM"].alt_loc.replace('\x00', "", inplace=True)
pmmtf.df["HETATM"].alt_loc.replace('\x00', "", inplace=True)

pd.testing.assert_frame_equal(
pmmtf.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
ppdb.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
)
#pd.testing.assert_frame_equal(
# pmmtf.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
# ppdb.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
# )


def test_read_mmtf():
"""Test public read_pdb"""
ppdb = PandasMmtf()
ppdb.read_mmtf(MMTF_TESTDATA_FILENAME)
assert ppdb.mmtf_path == MMTF_TESTDATA_FILENAME




10 changes: 9 additions & 1 deletion docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,15 @@ The CHANGELOG for the current development version is available at
[https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md](https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md).


### 0.5.0dev (UNRELEASED)
### 0.5.0dev1 (11/4/2023)

- Adds supprt for reading Gzipped MMTF files. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
- Improves reliability of parsing polymer/non-polymer entities in MMTF parsing. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
- Improves reliability of parsing multicharacter chain IDs from MMTF files. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
- Replaces null terminator chars in parsed MMTF dataframe with the empty string. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))


### 0.5.0dev0 (3/4/2023)

##### Downloads

Expand Down

0 comments on commit 1ff101c

Please sign in to comment.