Skip to content

Commit

Permalink
Merge pull request #33 from WangHong007/master
Browse files Browse the repository at this point in the history
adapt to new pandas
  • Loading branch information
ypriverol authored Nov 21, 2023
2 parents 5b3e0d5 + bb6bad5 commit 3882201
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 46 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,15 +198,15 @@ First, peptide intensity dataframe was grouped according to protein name, sample
If protein-group exists in the peptide intensity dataframe, the intensity of all proteins in the protein-group is summed based on the above steps, and then multiplied by the number of proteins in the protein-group.

#### 3. IBAQ Normalization
Normalize the ibaq values using the total ibaq of the sample. The resulted ibaq values are then multiplied by 100'000'000 (PRIDE database noramalization), for the ibaq ppb and log10 shifted by 10 (ProteomicsDB)
Normalize the ibaq values using the total ibaq of the sample. The resulted ibaq values are then multiplied by 100'000'000 (PRIDE database noramalization), for the ibaq ppb and log10 shifted by 10 (ProteomicsDB).

### Compute TPA - compute_tpa.py
The total protein approach (TPA) is a label- and standard-free method for absolute protein quantitation of proteins using large-scale proteomic data. In the current version of the tool, the TPA value is the total intensity of the protein divided by its theoretical molecular mass.

[ProteomicRuler](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4256500/) is a method for protein copy number and concentration estimation that does not require the use of isotope labeled standards. It uses the mass spectrum signal of histones as a "proteomic ruler" because it is proportional to the amount of DNA in the sample, which in turn depends on cell count. Thus, this approach can add an absolute scale to the mass spectrometry readout and allow estimates of the copy number of individual proteins in each cell.

```asciidoc
python compute_tpa.py --fasta Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta --contaminants contaminants_ids.tsv --peptides PXD003947-peptides.csv --ruler --ploidy 2 --cpc 200 --output PXD003947-tpa.tsv
python compute_tpa.py --fasta Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta --organism 'human' --peptides PXD003947-peptides.csv --ruler --ploidy 2 --cpc 200 --output PXD003947-tpa.tsv --verbose
```

```asciidoc
Expand All @@ -218,7 +218,7 @@ Usage: compute_tpa.py [OPTIONS]

:param fasta: Fasta file used to perform the peptide identification
:param peptides: Peptide intensity file
:param contaminants: Contaminants file
:param organism: Organism source of the data
:param ruler: Whether to compute protein copy number, weight and concentration.
:param ploidy: Ploidy number
:param cpc: Cellular protein concentration(g/L)
Expand All @@ -230,7 +230,7 @@ Usage: compute_tpa.py [OPTIONS]
Options:
-f, --fasta TEXT Protein database to compute IBAQ values [required]
-p, --peptides TEXT Peptide identifications with intensities following the peptide intensity output [required]
--contaminants Contaminants and high abundant proteins to be removed
-m, --organism Organism source of the data.
-r, --ruler Calculate protein copy number and concentration according to ProteomicRuler
-n, --ploidy Ploidy number (default: 2)
-c, --cpc Cellular protein concentration(g/L) (default: 200)
Expand Down
15 changes: 1 addition & 14 deletions bin/compute_ibaq.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ibaq.ibaqpy_commons import (CONDITION, IBAQ, IBAQ_LOG, IBAQ_NORMALIZED,
IBAQ_PPB, NORM_INTENSITY, PROTEIN_NAME,
SAMPLE_ID, plot_box_plot, plot_distributions,
print_help_msg)
print_help_msg, get_accession)


def normalize(group):
Expand Down Expand Up @@ -43,19 +43,6 @@ def normalize_ibaq(res: DataFrame) -> DataFrame:
return res


def get_accession(identifier: str) -> str:
"""
Get protein accession from the identifier (e.g. sp|P12345|PROT_NAME)
:param identifier: Protein identifier
:return: Protein accession
"""
identifier_lst = identifier.split("|")
if len(identifier_lst) == 1:
return identifier_lst[0]
else:
return identifier_lst[1]


@click.command()
@click.option("-f", "--fasta", help="Protein database to compute IBAQ values")
@click.option(
Expand Down
63 changes: 40 additions & 23 deletions bin/compute_tpa.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,33 @@
from pyopenms import *

from bin.compute_ibaq import print_help_msg
from ibaq.ibaqpy_commons import (CONDITION, INTENSITY, PROTEIN_NAME, SAMPLE_ID,
from ibaq.ibaqpy_commons import (CONDITION, NORM_INTENSITY, PROTEIN_NAME, SAMPLE_ID,
plot_box_plot, plot_distributions,
remove_contaminants_decoys)
remove_contaminants_decoys, get_accession)


def handle_nonstandard_aa(aa_seq: str) -> (list, str):
"""Any nonstandard amoni acid will be removed.
:param aa_seq: Protein sequences from multiple database.
:return: One list contains nonstandard amoni acids and one remain sequence.
"""
standard_aa = 'ARNDBCEQZGHILKMFPSTWYV'
nonstandard_aa_lst = [aa for aa in aa_seq if aa not in standard_aa]
considered_seq = ''.join([aa for aa in aa_seq if aa in standard_aa])
return nonstandard_aa_lst, considered_seq


@click.command()
@click.option("-f", "--fasta", help="Protein database")
@click.option(
"--contaminants", help="Contaminants and high abundant proteins to be removed"
)
@click.option(
"-p",
"--peptides",
help="Peptide identifications with intensities following the peptide intensity output",
)
@click.option("-r", "--ruler", help="Whether to use ProteomicRuler", is_flag=True)
@click.option("-n", "--ploidy", help="Ploidy number", default=2)
@click.option("-m", "--organism", help="Organism source of the data", default="human")
@click.option("-c", "--cpc", help="Cellular protein concentration(g/L)", default=200)
@click.option("-o", "--output", help="Output file with the proteins and other values")
@click.option(
Expand All @@ -42,9 +52,9 @@
)
def tpa_compute(
fasta: str,
contaminants: str,
peptides: str,
ruler: bool,
organism: str,
ploidy: int,
cpc: float,
output: str,
Expand All @@ -55,9 +65,9 @@ def tpa_compute(
This command computes the protein copies and concentrations according to a file output of peptides with the
format described in peptide_contaminants_file_generation.py.
:param fasta: Fasta file used to perform the peptide identification.
:param contaminants: Contaminants file.
:param peptides: Peptide intensity file without normalization.
:param ruler: Whether to compute protein copies, weight and concentration.
:param organism: Organism source of the data.
:param ploidy: Ploidy number.
:param cpc: Cellular protein concentration(g/L).
:param output: Output format containing the TPA values, protein copy numbers and concentrations.
Expand All @@ -70,17 +80,15 @@ def tpa_compute(
exit(1)

data = pd.read_csv(
peptides, sep=",", usecols=[PROTEIN_NAME, INTENSITY, SAMPLE_ID, CONDITION]
peptides, sep=",", usecols=[PROTEIN_NAME, NORM_INTENSITY, SAMPLE_ID, CONDITION]
)
print("Remove contaminants...")
data = remove_contaminants_decoys(data, contaminants)
data[INTENSITY] = data[INTENSITY].astype("float")
data = data.dropna(subset=[INTENSITY])
data = data[data[INTENSITY] > 0]
data[NORM_INTENSITY] = data[NORM_INTENSITY].astype("float")
data = data.dropna(subset=[NORM_INTENSITY])
data = data[data[NORM_INTENSITY] > 0]
print(data.head())

res = pd.DataFrame(
data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[INTENSITY].sum()
data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[NORM_INTENSITY].sum()
)
res = res.reset_index()
proteins = res[PROTEIN_NAME].unique().tolist()
Expand All @@ -91,10 +99,16 @@ def tpa_compute(
fasta_proteins = list() # type: list[FASTAEntry]
FASTAFile().load(fasta, fasta_proteins)
for entry in fasta_proteins:
accession, name = entry.identifier.split("|")[1:]
if name in proteins:
mw = AASequence().fromString(entry.sequence).getMonoWeight()
mw_dict[name] = mw
accession = get_accession(entry.identifier)
if accession in proteins:
try:
mw = AASequence().fromString(entry.sequence).getMonoWeight()
mw_dict[accession] = mw
except:
error_aa, seq = handle_nonstandard_aa(entry.sequence)
mw = AASequence().fromString(seq).getMonoWeight()
mw_dict[accession] = mw
print(f"Nonstandard amimo acids found in {accession}: {error_aa}, ignored!")

res = res[res[PROTEIN_NAME].isin(mw_dict.keys())]

Expand All @@ -108,7 +122,7 @@ def get_protein_group_mw(group: str) -> float:
)
res["MolecularWeight"] = res["MolecularWeight"].fillna(1)
res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1)
res["TPA"] = res[INTENSITY] / res["MolecularWeight"]
res["TPA"] = res[NORM_INTENSITY] / res["MolecularWeight"]
# Print the distribution of the protein TPA values
if verbose:
plot_width = len(set(res[SAMPLE_ID])) * 0.5 + 10
Expand All @@ -135,7 +149,7 @@ def get_protein_group_mw(group: str) -> float:
avogadro = 6.02214129e23
average_base_pair_mass = 617.96 # 615.8771

organism = res.loc[0, PROTEIN_NAME].split("_")[1].lower()
organism = organism.lower()
histone_df = pd.read_json(
open(os.path.split(__file__)[0] + os.sep + "histones.json", "rb")
).T
Expand All @@ -153,12 +167,12 @@ def calculate(protein_intensity, histone_intensity, mw):

def proteomic_ruler(df):
histone_intensity = df[df[PROTEIN_NAME].isin(histones_list)][
INTENSITY
NORM_INTENSITY
].sum()
histone_intensity = histone_intensity if histone_intensity > 0 else 1
df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply(
lambda x: calculate(
x[INTENSITY], histone_intensity, x["MolecularWeight"]
x[NORM_INTENSITY], histone_intensity, x["MolecularWeight"]
),
axis=1,
result_type="expand",
Expand All @@ -171,14 +185,15 @@ def proteomic_ruler(df):

if verbose:
density = plot_distributions(
res, "Copy", SAMPLE_ID, log2=True, title="Copy numbers Distribution"
res, "Copy", SAMPLE_ID, width=plot_width, log2=True, title="Copy numbers Distribution"
)
plt.show()
pdf.savefig(density)
box = plot_box_plot(
res,
"Copy",
SAMPLE_ID,
width=plot_width,
log2=True,
title="Copy numbers Distribution",
violin=False,
Expand All @@ -190,6 +205,7 @@ def proteomic_ruler(df):
res,
"Concentration[nM]",
SAMPLE_ID,
width=plot_width,
log2=True,
title="Concentration[nM] Distribution",
)
Expand All @@ -199,6 +215,7 @@ def proteomic_ruler(df):
res,
"Concentration[nM]",
SAMPLE_ID,
width=plot_width,
log2=True,
title="Concentration[nM] Distribution",
violin=False,
Expand Down
3 changes: 1 addition & 2 deletions bin/peptide_normalization.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python
import gc
import uuid
import click
import matplotlib.pyplot as plt
import numpy as np
Expand Down Expand Up @@ -357,7 +356,7 @@ def impute_peptide_intensities(dataset_df, field, class_field):
@click.option(
"--qc_report",
help="PDF file to store multiple QC images",
default=f"peptideNorm-QCprofile-{str(uuid.uuid4())}.pdf",
default="peptideNorm-QCprofile.pdf",
)
def peptide_normalization(
msstats: str,
Expand Down
3 changes: 1 addition & 2 deletions bin/peptide_normalization_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def extract_label_from_sdrf(sdrf_path: str, compression: bool) -> tuple:
@click.option(
"--qc_report",
help="PDF file to store multiple QC images",
default=f"StreamPeptideNorm-QCprofile-{str(uuid.uuid4())}.pdf",
default=f"StreamPeptideNorm-QCprofile.pdf",
)
def peptide_normalization(
msstats: str,
Expand Down Expand Up @@ -201,7 +201,6 @@ def peptide_normalization(
msstats_chunks = read_large_parquet(parquet, batch_size=chunksize)

# TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides
# if not os.path.exists("ibaqpy_temp/"):
temp = f"Temp-{str(uuid.uuid4())}/"
os.mkdir(temp)
print(f"IBAQPY WARNING: Writing files into {temp}...")
Expand Down
15 changes: 14 additions & 1 deletion ibaq/ibaqpy_commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,19 @@ def print_help_msg(command: click.Command):
click.echo(command.get_help(ctx))


def get_accession(identifier: str) -> str:
"""
Get protein accession from the identifier (e.g. sp|P12345|PROT_NAME)
:param identifier: Protein identifier
:return: Protein accession
"""
identifier_lst = identifier.split("|")
if len(identifier_lst) == 1:
return identifier_lst[0]
else:
return identifier_lst[1]


def remove_protein_by_ids(
dataset: DataFrame, protein_file: str, protein_field=PROTEIN_NAME
) -> DataFrame:
Expand Down Expand Up @@ -217,7 +230,7 @@ def plot_distributions(
:return:
"""
pd.set_option("mode.chained_assignment", None)
normalize = dataset[[field, class_field]]
normalize = dataset[[field, class_field]].reset_index(drop=True)
if log2:
normalize[field] = np.log2(normalize[field])
normalize.dropna(subset=[field], inplace=True)
Expand Down

0 comments on commit 3882201

Please sign in to comment.