Merge pull request #33 from WangHong007/master

adapt to new pandas
bigbio · Nov 21, 2023 · 3882201 · 3882201
2 parents 5b3e0d5 + bb6bad5
commit 3882201
Show file tree

Hide file tree

Showing 6 changed files with 61 additions and 46 deletions.
diff --git a/README.md b/README.md
@@ -198,15 +198,15 @@ First, peptide intensity dataframe was grouped according to protein name, sample
 If protein-group exists in the peptide intensity dataframe, the intensity of all proteins in the protein-group is summed based on the above steps, and then multiplied by the number of proteins in the protein-group.
 
 #### 3. IBAQ Normalization  
-Normalize the ibaq values using the total ibaq of the sample. The resulted ibaq values are then multiplied by 100'000'000 (PRIDE database noramalization), for the ibaq ppb and log10 shifted by 10 (ProteomicsDB)
+Normalize the ibaq values using the total ibaq of the sample. The resulted ibaq values are then multiplied by 100'000'000 (PRIDE database noramalization), for the ibaq ppb and log10 shifted by 10 (ProteomicsDB).
 
 ### Compute TPA - compute_tpa.py
 The total protein approach (TPA) is a label- and standard-free method for absolute protein quantitation of proteins using large-scale proteomic data. In the current version of the tool, the TPA value is the total intensity of the protein divided by its theoretical molecular mass.
 
 [ProteomicRuler](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4256500/) is a method for protein copy number and concentration estimation that does not require the use of isotope labeled standards. It uses the mass spectrum signal of histones as a "proteomic ruler" because it is proportional to the amount of DNA in the sample, which in turn depends on cell count. Thus, this approach can add an absolute scale to the mass spectrometry readout and allow estimates of the copy number of individual proteins in each cell.
 
 ```asciidoc
-python compute_tpa.py --fasta Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta --contaminants contaminants_ids.tsv --peptides PXD003947-peptides.csv --ruler --ploidy 2 --cpc 200 --output PXD003947-tpa.tsv
+python compute_tpa.py --fasta Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta --organism 'human' --peptides PXD003947-peptides.csv --ruler --ploidy 2 --cpc 200 --output PXD003947-tpa.tsv --verbose
 ```
 
 ```asciidoc
@@ -218,7 +218,7 @@ Usage: compute_tpa.py [OPTIONS]
 
   :param fasta: Fasta file used to perform the peptide identification
   :param peptides: Peptide intensity file
-  :param contaminants: Contaminants file
+  :param organism: Organism source of the data
   :param ruler: Whether to compute protein copy number, weight and concentration.
   :param ploidy: Ploidy number
   :param cpc: Cellular protein concentration(g/L)
@@ -230,7 +230,7 @@ Usage: compute_tpa.py [OPTIONS]
 Options:
   -f, --fasta TEXT      Protein database to compute IBAQ values  [required]
   -p, --peptides TEXT   Peptide identifications with intensities following the peptide intensity output  [required]
-  --contaminants        Contaminants and high abundant proteins to be removed
+  -m, --organism        Organism source of the data.
   -r, --ruler           Calculate protein copy number and concentration according to ProteomicRuler
   -n, --ploidy          Ploidy number (default: 2)
   -c, --cpc             Cellular protein concentration(g/L) (default: 200)

diff --git a/bin/compute_ibaq.py b/bin/compute_ibaq.py
@@ -13,7 +13,7 @@
 from ibaq.ibaqpy_commons import (CONDITION, IBAQ, IBAQ_LOG, IBAQ_NORMALIZED,
                                  IBAQ_PPB, NORM_INTENSITY, PROTEIN_NAME,
                                  SAMPLE_ID, plot_box_plot, plot_distributions,
-                                 print_help_msg)
+                                 print_help_msg, get_accession)
 
 
 def normalize(group):
@@ -43,19 +43,6 @@ def normalize_ibaq(res: DataFrame) -> DataFrame:
     return res
 
 
-def get_accession(identifier: str) -> str:
-    """
-    Get protein accession from the identifier  (e.g. sp|P12345|PROT_NAME)
-    :param identifier: Protein identifier
-    :return: Protein accession
-    """
-    identifier_lst = identifier.split("|")
-    if len(identifier_lst) == 1:
-        return identifier_lst[0]
-    else:
-        return identifier_lst[1]
-
-
 @click.command()
 @click.option("-f", "--fasta", help="Protein database to compute IBAQ values")
 @click.option(

diff --git a/bin/compute_tpa.py b/bin/compute_tpa.py
@@ -10,23 +10,33 @@
 from pyopenms import *
 
 from bin.compute_ibaq import print_help_msg
-from ibaq.ibaqpy_commons import (CONDITION, INTENSITY, PROTEIN_NAME, SAMPLE_ID,
+from ibaq.ibaqpy_commons import (CONDITION, NORM_INTENSITY, PROTEIN_NAME, SAMPLE_ID,
                                  plot_box_plot, plot_distributions,
-                                 remove_contaminants_decoys)
+                                 remove_contaminants_decoys, get_accession)
+
+
+def handle_nonstandard_aa(aa_seq: str) -> (list, str):
+    """Any nonstandard amoni acid will be removed.
+
+    :param aa_seq: Protein sequences from multiple database.
+    :return: One list contains nonstandard amoni acids and one remain sequence.
+    """
+    standard_aa = 'ARNDBCEQZGHILKMFPSTWYV'
+    nonstandard_aa_lst = [aa for aa in aa_seq if aa not in standard_aa]
+    considered_seq = ''.join([aa for aa in aa_seq if aa in standard_aa])
+    return nonstandard_aa_lst, considered_seq
 
 
 @click.command()
 @click.option("-f", "--fasta", help="Protein database")
-@click.option(
-    "--contaminants", help="Contaminants and high abundant proteins to be removed"
-)
 @click.option(
     "-p",
     "--peptides",
     help="Peptide identifications with intensities following the peptide intensity output",
 )
 @click.option("-r", "--ruler", help="Whether to use ProteomicRuler", is_flag=True)
 @click.option("-n", "--ploidy", help="Ploidy number", default=2)
+@click.option("-m", "--organism", help="Organism source of the data", default="human")
 @click.option("-c", "--cpc", help="Cellular protein concentration(g/L)", default=200)
 @click.option("-o", "--output", help="Output file with the proteins and other values")
 @click.option(
@@ -42,9 +52,9 @@
 )
 def tpa_compute(
     fasta: str,
-    contaminants: str,
     peptides: str,
     ruler: bool,
+    organism: str,
     ploidy: int,
     cpc: float,
     output: str,
@@ -55,9 +65,9 @@ def tpa_compute(
     This command computes the protein copies and concentrations according to a file output of peptides with the
     format described in peptide_contaminants_file_generation.py.
     :param fasta: Fasta file used to perform the peptide identification.
-    :param contaminants: Contaminants file.
     :param peptides: Peptide intensity file without normalization.
     :param ruler: Whether to compute protein copies, weight and concentration.
+    :param organism: Organism source of the data.
     :param ploidy: Ploidy number.
     :param cpc: Cellular protein concentration(g/L).
     :param output: Output format containing the TPA values, protein copy numbers and concentrations.
@@ -70,17 +80,15 @@ def tpa_compute(
         exit(1)
 
     data = pd.read_csv(
-        peptides, sep=",", usecols=[PROTEIN_NAME, INTENSITY, SAMPLE_ID, CONDITION]
+        peptides, sep=",", usecols=[PROTEIN_NAME, NORM_INTENSITY, SAMPLE_ID, CONDITION]
     )
-    print("Remove contaminants...")
-    data = remove_contaminants_decoys(data, contaminants)
-    data[INTENSITY] = data[INTENSITY].astype("float")
-    data = data.dropna(subset=[INTENSITY])
-    data = data[data[INTENSITY] > 0]
+    data[NORM_INTENSITY] = data[NORM_INTENSITY].astype("float")
+    data = data.dropna(subset=[NORM_INTENSITY])
+    data = data[data[NORM_INTENSITY] > 0]
     print(data.head())
 
     res = pd.DataFrame(
-        data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[INTENSITY].sum()
+        data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[NORM_INTENSITY].sum()
     )
     res = res.reset_index()
     proteins = res[PROTEIN_NAME].unique().tolist()
@@ -91,10 +99,16 @@ def tpa_compute(
     fasta_proteins = list()  # type: list[FASTAEntry]
     FASTAFile().load(fasta, fasta_proteins)
     for entry in fasta_proteins:
-        accession, name = entry.identifier.split("|")[1:]
-        if name in proteins:
-            mw = AASequence().fromString(entry.sequence).getMonoWeight()
-            mw_dict[name] = mw
+        accession = get_accession(entry.identifier)
+        if accession in proteins:
+            try:
+                mw = AASequence().fromString(entry.sequence).getMonoWeight()
+                mw_dict[accession] = mw
+            except:
+                error_aa, seq = handle_nonstandard_aa(entry.sequence)
+                mw = AASequence().fromString(seq).getMonoWeight()
+                mw_dict[accession] = mw
+                print(f"Nonstandard amimo acids found in {accession}: {error_aa}, ignored!")
 
     res = res[res[PROTEIN_NAME].isin(mw_dict.keys())]
 
@@ -108,7 +122,7 @@ def get_protein_group_mw(group: str) -> float:
     )
     res["MolecularWeight"] = res["MolecularWeight"].fillna(1)
     res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1)
-    res["TPA"] = res[INTENSITY] / res["MolecularWeight"]
+    res["TPA"] = res[NORM_INTENSITY] / res["MolecularWeight"]
     # Print the distribution of the protein TPA values
     if verbose:
         plot_width = len(set(res[SAMPLE_ID])) * 0.5 + 10
@@ -135,7 +149,7 @@ def get_protein_group_mw(group: str) -> float:
         avogadro = 6.02214129e23
         average_base_pair_mass = 617.96  # 615.8771
 
-        organism = res.loc[0, PROTEIN_NAME].split("_")[1].lower()
+        organism = organism.lower()
         histone_df = pd.read_json(
             open(os.path.split(__file__)[0] + os.sep + "histones.json", "rb")
         ).T
@@ -153,12 +167,12 @@ def calculate(protein_intensity, histone_intensity, mw):
 
         def proteomic_ruler(df):
             histone_intensity = df[df[PROTEIN_NAME].isin(histones_list)][
-                INTENSITY
+                NORM_INTENSITY
             ].sum()
             histone_intensity = histone_intensity if histone_intensity > 0 else 1
             df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply(
                 lambda x: calculate(
-                    x[INTENSITY], histone_intensity, x["MolecularWeight"]
+                    x[NORM_INTENSITY], histone_intensity, x["MolecularWeight"]
                 ),
                 axis=1,
                 result_type="expand",
@@ -171,14 +185,15 @@ def proteomic_ruler(df):
 
         if verbose:
             density = plot_distributions(
-                res, "Copy", SAMPLE_ID, log2=True, title="Copy numbers Distribution"
+                res, "Copy", SAMPLE_ID, width=plot_width, log2=True, title="Copy numbers Distribution"
             )
             plt.show()
             pdf.savefig(density)
             box = plot_box_plot(
                 res,
                 "Copy",
                 SAMPLE_ID,
+                width=plot_width,
                 log2=True,
                 title="Copy numbers Distribution",
                 violin=False,
@@ -190,6 +205,7 @@ def proteomic_ruler(df):
                 res,
                 "Concentration[nM]",
                 SAMPLE_ID,
+                width=plot_width,
                 log2=True,
                 title="Concentration[nM] Distribution",
             )
@@ -199,6 +215,7 @@ def proteomic_ruler(df):
                 res,
                 "Concentration[nM]",
                 SAMPLE_ID,
+                width=plot_width,
                 log2=True,
                 title="Concentration[nM] Distribution",
                 violin=False,

diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 import gc
-import uuid
 import click
 import matplotlib.pyplot as plt
 import numpy as np
@@ -357,7 +356,7 @@ def impute_peptide_intensities(dataset_df, field, class_field):
 @click.option(
     "--qc_report",
     help="PDF file to store multiple QC images",
-    default=f"peptideNorm-QCprofile-{str(uuid.uuid4())}.pdf",
+    default="peptideNorm-QCprofile.pdf",
 )
 def peptide_normalization(
     msstats: str,

diff --git a/bin/peptide_normalization_stream.py b/bin/peptide_normalization_stream.py
@@ -140,7 +140,7 @@ def extract_label_from_sdrf(sdrf_path: str, compression: bool) -> tuple:
 @click.option(
     "--qc_report",
     help="PDF file to store multiple QC images",
-    default=f"StreamPeptideNorm-QCprofile-{str(uuid.uuid4())}.pdf",
+    default=f"StreamPeptideNorm-QCprofile.pdf",
 )
 def peptide_normalization(
     msstats: str,
@@ -201,7 +201,6 @@ def peptide_normalization(
         msstats_chunks = read_large_parquet(parquet, batch_size=chunksize)
 
     # TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides
-    # if not os.path.exists("ibaqpy_temp/"):
     temp = f"Temp-{str(uuid.uuid4())}/"
     os.mkdir(temp)
     print(f"IBAQPY WARNING: Writing files into {temp}...")

diff --git a/ibaq/ibaqpy_commons.py b/ibaq/ibaqpy_commons.py
@@ -151,6 +151,19 @@ def print_help_msg(command: click.Command):
         click.echo(command.get_help(ctx))
 
 
+def get_accession(identifier: str) -> str:
+    """
+    Get protein accession from the identifier  (e.g. sp|P12345|PROT_NAME)
+    :param identifier: Protein identifier
+    :return: Protein accession
+    """
+    identifier_lst = identifier.split("|")
+    if len(identifier_lst) == 1:
+        return identifier_lst[0]
+    else:
+        return identifier_lst[1]
+
+
 def remove_protein_by_ids(
     dataset: DataFrame, protein_file: str, protein_field=PROTEIN_NAME
 ) -> DataFrame:
@@ -217,7 +230,7 @@ def plot_distributions(
     :return:
     """
     pd.set_option("mode.chained_assignment", None)
-    normalize = dataset[[field, class_field]]
+    normalize = dataset[[field, class_field]].reset_index(drop=True)
     if log2:
         normalize[field] = np.log2(normalize[field])
     normalize.dropna(subset=[field], inplace=True)