diff --git a/biopandas/mmtf/pandas_mmtf.py b/biopandas/mmtf/pandas_mmtf.py index 2585f52..0c36110 100644 --- a/biopandas/mmtf/pandas_mmtf.py +++ b/biopandas/mmtf/pandas_mmtf.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd from looseversion import LooseVersion -from mmtf import MMTFDecoder, MMTFEncoder, fetch, parse +from mmtf import MMTFDecoder, MMTFEncoder, fetch, parse, parse_gzip from biopandas.constants import protein_letters_3to1_extended @@ -45,7 +45,10 @@ def df(self, value: Any): # self._df = value def read_mmtf(self, filename: str): - self.mmtf = parse(filename) + if filename.endswith(".gz"): + self.mmtf = parse_gzip(filename) + else: + self.mmtf = parse(filename) self.mmtf_path = filename df = self._mmtf_to_df(self.mmtf) self._df["ATOM"] = df.loc[df.record_name == "ATOM"] @@ -496,7 +499,7 @@ def parse_mmtf(file_path: str) -> pd.DataFrame: :return: Dataframe of protein structure. :rtype: pd.DataFrame """ - df = parse(file_path) + df = parse_gzip(file_path) if file_path.endswith(".gz") else parse(file_path) return mmtf_to_df(df) @@ -530,16 +533,28 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame: model_indices = mmtf_obj.chains_per_model model_indices = [sum(model_indices[:i+1]) for i in range(len(model_indices))] ch_idx = 0 + + entity_types = {} + for i in mmtf_obj.entity_list: + for chain in i["chainIndexList"]: + entity_types[chain] = i["type"] + for idx, i in enumerate(mmtf_obj.group_type_list): res = mmtf_obj.group_list[i] - record = "HETATM" if res["chemCompType"] == "NON-POLYMER" else "ATOM" + #record = "HETATM" if res["chemCompType"] == "NON-POLYMER" else "ATOM" + #record = ( + # "ATOM" + # if res["chemCompType"] in ["L-PEPTIDE LINKING", "PEPTIDE LINKING"] + # else "HETATM" + #) if idx == chain_indices[ch_idx]: ch_idx += 1 + record = "ATOM" if entity_types[ch_idx] == "polymer" else "HETATM" for _ in res["atomNameList"]: data["residue_name"].append(res["groupName"]) data["residue_number"].append(mmtf_obj.group_id_list[idx]) - data["chain_id"].append(mmtf_obj.chain_name_list[ch_idx]) + data["chain_id"].append([mmtf_obj.chain_name_list[ch_idx]]) data["model_id"].append(int(np.argwhere(np.array(model_indices)>ch_idx)[0]) + 1) data["record_name"].append(record) data["insertion"].append(mmtf_obj.ins_code_list[idx]) @@ -565,9 +580,13 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame: continue data[k] = [i for sublist in v for i in sublist] - return pd.DataFrame.from_dict(data).sort_values(by=["model_id", "atom_number"]) + df = pd.DataFrame.from_dict(data).sort_values(by=["model_id", "atom_number"]) + df.alt_loc = df.alt_loc.str.replace("\x00", "") + df.insertion = df.insertion.str.replace("\x00", "") + return df def _seq1(seq, charmap: Dict[str, str], undef_code="X"): + # sourcery skip: dict-assign-update-to-union """Convert protein sequence from three-letter to one-letter code. The single required input argument 'seq' should be a protein sequence using three-letter codes, either as a Python string or as a Seq or @@ -650,7 +669,6 @@ def write_mmtf(df: pd.DataFrame, file_path: str): node_ids = df.model_id.astype(str) + ":" + df.chain_id + ":" + df.residue_name + ":" + df.residue_number.astype(str) + ":" + df.insertion.astype(str) df["residue_id"] = node_ids - # Tracks values to replace them at the end chains_per_model = [] groups_per_chain = [] @@ -750,7 +768,7 @@ def write_mmtf(df: pd.DataFrame, file_path: str): encoder.set_atom_info( atom_name=row.atom_name, serial_number=row.atom_number, - alternative_location_id="\x00" if row.alt_loc == " " else row.alt_loc, + alternative_location_id="\x00" if row.alt_loc == "" else row.alt_loc, x=row.x_coord, y=row.y_coord, z=row.z_coord, diff --git a/biopandas/mmtf/tests/test_read_mmtf.py b/biopandas/mmtf/tests/test_read_mmtf.py new file mode 100644 index 0000000..35e7c02 --- /dev/null +++ b/biopandas/mmtf/tests/test_read_mmtf.py @@ -0,0 +1,102 @@ +# BioPandas +# Author: Sebastian Raschka +# License: BSD 3 clause +# Project Website: http://rasbt.github.io/biopandas/ +# Code Repository: https://github.com/rasbt/biopandas + + +import os +from urllib.error import HTTPError, URLError +from urllib.request import urlopen + +import numpy as np +import pandas as pd +from nose.tools import raises + +from biopandas.mmtf import PandasMmtf +from biopandas.pdb import PandasPdb +from biopandas.testutils import assert_raises + +MMTF_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "data", "3eiy.mmtf") +MMTF_TESTDATA_FILENAME_GZ = os.path.join(os.path.dirname(__file__), "data", "3eiy.mmtf.gz") + +PDB_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "..", "..", "pdb", "tests", "data", "3eiy.pdb") +PDB_TESTDATA_FILENAME_GZ = os.path.join(os.path.dirname(__file__), "..", "..", "pdb", "tests", "data", "3eiy.pdb.gz") + + +ATOM_DF_COLUMNS = [ + "record_name", + "atom_number", + "atom_name", + #"alt_loc", + "residue_name", + "chain_id", + "residue_number", + #"insertion", + "x_coord", + "y_coord", + "z_coord", + "occupancy", + "b_factor", + "element_symbol", + #"charge", +] + +def test_fetch_pdb(): + """Test fetch_pdb""" + ppdb = PandasMmtf() + ppdb.fetch_mmtf("3eiy") + assert max(ppdb.df["ATOM"].residue_number) == 175 + + +def test__read_mmtf(): + """Test public _read_pdb with gzip files""" + pmmtf = PandasMmtf() + ppdb = PandasPdb() + pmmtf.read_mmtf(MMTF_TESTDATA_FILENAME) + + ppdb = ppdb.read_pdb(PDB_TESTDATA_FILENAME) + + pd.testing.assert_frame_equal( + pmmtf.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True), + ppdb.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True), + ) + + ATOM_DF_COLUMNS.remove("atom_number") + ATOM_DF_COLUMNS.remove("element_symbol") + pd.testing.assert_frame_equal( + pmmtf.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True), + ppdb.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True), + ) + + +def test__read_mmtf_gz(): + """Test public _read_pdb with gzip files""" + pmmtf = PandasMmtf() + ppdb = PandasPdb() + pmmtf.read_mmtf(MMTF_TESTDATA_FILENAME_GZ) + ppdb = ppdb.read_pdb(PDB_TESTDATA_FILENAME_GZ) + + + pmmtf.df["ATOM"].alt_loc.replace('\x00', "", inplace=True) + pmmtf.df["HETATM"].alt_loc.replace('\x00', "", inplace=True) + + pd.testing.assert_frame_equal( + pmmtf.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True), + ppdb.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True), + ) + #pd.testing.assert_frame_equal( + # pmmtf.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True), + # ppdb.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True), + # ) + + +def test_read_mmtf(): + """Test public read_pdb""" + ppdb = PandasMmtf() + ppdb.read_mmtf(MMTF_TESTDATA_FILENAME) + assert ppdb.mmtf_path == MMTF_TESTDATA_FILENAME + + + + diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 565750f..42b4da0 100755 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -4,7 +4,15 @@ The CHANGELOG for the current development version is available at [https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md](https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md). -### 0.5.0dev (UNRELEASED) +### 0.5.0dev1 (11/4/2023) + +- Adds supprt for reading Gzipped MMTF files. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files)) +- Improves reliability of parsing polymer/non-polymer entities in MMTF parsing. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files)) +- Improves reliability of parsing multicharacter chain IDs from MMTF files. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files)) +- Replaces null terminator chars in parsed MMTF dataframe with the empty string. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files)) + + +### 0.5.0dev0 (3/4/2023) ##### Downloads