Skip to content

Commit

Permalink
changes in code.
Browse files (browse the repository at this point in the history)
  • Loading branch information
ypriverol committed Nov 30, 2023
1 parent e999c58 commit 281c4de
Showing 1 changed file with 7 additions and 12 deletions.
19 changes: 7 additions & 12 deletions bin/peptide_normalization_stream.py
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,12 @@
#!/usr/bin/env python

import logging
import os
import random
import uuid
from matplotlib.backends.backend_pdf import PdfPages
import pyarrow.parquet as pq
from ibaq.ibaqpy_commons import *
import swifter


def read_large_parquet(parquet_path: str, batch_size: int = 100000):
Expand Down Expand Up @@ -269,9 +270,7 @@ def peptide_normalization(
else:
msstats_df = msstats_df[FEATURE_COLUMNS]
msstats_df = msstats_df.rename(columns=parquet_map)
msstats_df[PROTEIN_NAME] = msstats_df.apply(
lambda x: ",".join(x[PROTEIN_NAME]), axis=1
)
msstats_df[PROTEIN_NAME] = msstats_df.swifter.apply(lambda x: ",".join(x[PROTEIN_NAME]), axis=1 )
if label == "LFQ":
msstats_df.drop(CHANNEL, inplace=True, axis=1)
else:
Expand All @@ -287,12 +286,9 @@ def peptide_normalization(
lambda x: inner_canonical_dict[x[PEPTIDE_SEQUENCE]], axis=1
)
# Filter out peptides with fewer amino acids than min_aa (default: 7)
msstats_df = msstats_df[
msstats_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1)
msstats_df = msstats_df[msstats_df.swifter.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1)
]
msstats_df[PROTEIN_NAME] = msstats_df[PROTEIN_NAME].apply(
parse_uniprot_accession
)
msstats_df[PROTEIN_NAME] = msstats_df[PROTEIN_NAME].swifter.apply(parse_uniprot_accession)

if FRACTION not in msstats_df.columns:
msstats_df[FRACTION] = 1
Expand Down Expand Up @@ -353,9 +349,8 @@ def peptide_normalization(
file_name = f"{temp}/{sample}.csv"
write_mode = "a" if os.path.exists(file_name) else "w"
header = False if os.path.exists(file_name) else True
result_df[result_df[SAMPLE_ID] == sample].to_csv(
file_name, index=False, header=header, mode=write_mode
)
result_df[result_df[SAMPLE_ID] == sample].to_csv(file_name, index=False, header=header, mode=write_mode)
logging.info("Print the file: {}".format(file_name))
unique_df = result_df.groupby([PEPTIDE_CANONICAL]).filter(
lambda x: len(set(x[PROTEIN_NAME])) == 1
)[[PEPTIDE_CANONICAL, PROTEIN_NAME]]
Expand Down

0 comments on commit 281c4de

Please sign in to comment.