diff --git a/README.md b/README.md
index e61c58e..97a2827 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ E.g. http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute
 ### Peptide Normalization - peptide_normalization.py
 
 ```asciidoc
-python peptide_normalization.py --peptides PXD003947-peptides.csv --impute --normalize --contaminants data/contaminants_ids.tsv --output PXD003947-peptides-norm.tsv
+python peptide_normalization.py --msstats PXD003947.sdrf_openms_design_msstats_in.csv --sdrf PXD003947.sdrf.tsv --remove_decoy_contaminants data/contaminants_ids.tsv --remove_low_frequency_peptides --output PXD003947-peptides-norm.tsv
 ```
 
 The command provides an additional `flag` for skip_normalization, pnormalization, compress, log2, violin, verbose.
@@ -70,31 +70,40 @@ Usage: peptide_normalization.py [OPTIONS]
 
 Options:
-  -m, --msstats TEXT      MsStats file import generated by quantms [required]
-  -s, --sdrf TEXT         SDRF file import generated by quantms [required]
-  --min_aa INTEGER        Minimum number of amino acids to filter peptides
-  --min_unique INTEGER    Minimum number of unique peptides to filter proteins
-  --contaminants TEXT     Contaminants and high abundant proteins to be removed
-  --output TEXT           Peptide intensity file including other all properties
-                          for normalization
-  --skip_normalization    Skip normalization step
-  --nmethod TEXT          Normalization method used to normalize intensities for
-                          all samples (options: qnorm)
-  --pnormalization        Normalize the peptide intensities using different
-                          methods (options: qnorm)
-  --compress              Read the input peptides file in compress gzip file
-  --log2                  Transform to log2 the peptide intensity values before
-                          normalization
-  --violin                Use violin plot instead of boxplot for distribution
-                          representations
-  --verbose               Print addition information about the distributions of
-                          the intensities, number of peptides remove after
-                          normalization, etc.
-  --qc_report TEXT        PDF file to store multiple QC images
-  --help                  Show this message and exit.
+  -m, --msstats TEXT               MsStats file import generated by quantms
+  -p, --parquet TEXT               Parquet file import generated by quantms
+  -s, --sdrf TEXT                  SDRF file import generated by quantms
+  --min_aa INTEGER                 Minimum number of amino acids to filter
+                                   peptides
+  --min_unique INTEGER             Minimum number of unique peptides to filter
+                                   proteins
+  --remove_ids TEXT                Remove specific protein ids from the
+                                   analysis using a file with one id per line
+  --remove_decoy_contaminants      Remove decoy and contaminants proteins from
+                                   the analysis
+  --remove_low_frequency_peptides  Remove peptides that are present in less
+                                   than 20% of the samples
+  --output TEXT                    Peptide intensity file including all other
+                                   properties for normalization
+  --skip_normalization             Skip normalization step
+  --nmethod TEXT                   Normalization method used to normalize
+                                   intensities for all samples (options: qnorm)
+  --pnormalization                 Normalize the peptide intensities using
+                                   different methods (options: qnorm)
+  --compress                       Read the input peptides file in compressed
+                                   gzip format
+  --log2                           Transform to log2 the peptide intensity
+                                   values before normalization
+  --violin                         Use violin plot instead of boxplot for
+                                   distribution representations
+  --verbose                        Print additional information about the
+                                   distributions of the intensities, number of
+                                   peptides removed after normalization, etc.
+  --qc_report TEXT                 PDF file to store multiple QC images
+  --help                           Show this message and exit.
 ```
 
-Peptide normalization starts from the output file from script `peptide_file_generation.py`. The structure of the input contains the following columns:
+Peptide normalization starts from the peptides dataframe. The input contains the following columns:
 
 - ProteinName: Protein name
 - PeptideSequence: Peptide sequence including post-translation modifications `(e.g. .(Acetyl)ASPDWGYDDKN(Deamidated)GPEQWSK)`
@@ -245,6 +254,60 @@ For cellular protein copy number calculation, the uniprot accession of histones
 
 In the calculation of protein concentration, the volume is calculated according to the cell protein concentration first, and then the protein mass is divided by the volume to calculate the intracellular protein concentration.
 
+### Datasets Merge - datasets_merge.py
+Batch effects can occur in protein identification and quantification results across different studies, caused by differences in experimental techniques, experimental conditions, data analysis, etc. Here we provide a method to correct such batch effects: first impute the ibaq data, then remove outliers using `hdbscan`, and finally apply batch effect correction using `pycombat`.
+
+```asciidoc
+python datasets_merge.py datasets_merge --data_folder ../ibaqpy_test/ --output datasets-merge.csv --verbose
+```
+
+```asciidoc
+python datasets_merge.py --help
+Usage: datasets_merge.py [OPTIONS]
+
+  Merge ibaq results from compute_ibaq.py.
+
+  :param data_folder: Data folder containing SDRFs and ibaq CSVs.
+  :param output: Output file after batch effect removal.
+  :param covariate: Indicators included in covariate consideration when datasets are merged.
+  :param organism: Organism to keep in input data.
+  :param covariate_to_keep: Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.
+  :param non_missing_percent_to_keep: Minimum fraction of non-missing values required in each group.
+  :param n_components: Number of principal components to be computed.
+  :param min_cluster: The minimum size of clusters.
+  :param min_sample_num: The minimum number of samples in a neighborhood for a point to be considered as a core point.
+  :param n_iter: Number of iterations to be performed.
+  :param verbose/quiet: Output debug information.
+
+Options:
+  -d, --data_folder TEXT        Data folder containing SDRFs and ibaq CSVs. [required]
+  -o, --output TEXT             Output file after batch effect removal. [required]
+  -c, --covariate TEXT          Indicators included in covariate consideration when datasets are merged.
+  --organism TEXT               Organism to keep in input data.
+  --covariate_to_keep TEXT      Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.
+  --non_missing_percent_to_keep FLOAT
+                                Minimum fraction of non-missing values required in each group.
+  --n_components TEXT           Number of principal components to be computed.
+  --min_cluster TEXT            The minimum size of clusters.
+  --min_sample_num TEXT         The minimum number of samples in a neighborhood for a point to be considered as a core point.
+  --n_iter TEXT                 Number of iterations to be performed.
+  -v, --verbose / -q, --quiet   Output debug information.
+  --help                        Show this message and exit.
+```
+
+#### Impute Missing Values
+Proteins missing in more than 30% of the samples are removed. Users can keep only the tissue parts of interest; the data are then converted to an expression matrix (samples as columns, proteins as rows) and the remaining missing values are imputed with `KNNImputer`.
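+
+The snippet below is only an illustration of this step (it is not the exact ibaqpy implementation) and runs on a small, made-up IBAQ matrix:
+
+```python
+import numpy as np
+import pandas as pd
+from sklearn.impute import KNNImputer
+
+# Hypothetical IBAQ matrix: proteins as rows, samples as columns.
+ibaq_matrix = pd.DataFrame(
+    {
+        "sample_1": [7.1, 6.3, np.nan, 5.8],
+        "sample_2": [7.0, np.nan, 4.9, 5.9],
+        "sample_3": [6.8, 6.1, 5.0, np.nan],
+    },
+    index=["P12345", "P67890", "Q11111", "Q22222"],
+)
+
+# Keep proteins quantified in at least 70% of the samples.
+min_observed = int(0.7 * ibaq_matrix.shape[1])
+filtered = ibaq_matrix.dropna(thresh=min_observed)
+
+# Impute the remaining missing values from the k nearest proteins.
+imputed = pd.DataFrame(
+    KNNImputer(n_neighbors=2).fit_transform(filtered),
+    index=filtered.index,
+    columns=filtered.columns,
+)
+print(imputed)
+```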
+
+#### Remove Outliers
+For outlier removal, hierarchical density-based clustering is performed with `hdbscan.HDBSCAN`, and samples labeled -1 (outliers) are highlighted in the PCA plot. By default, the minimum cluster size and the minimum number of neighbors required for a core point are both set to the minimum number of samples across all datasets.
+
+*Users can skip this step (see the `--skip_outliers_removal` flag) and perform outlier removal manually.*
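+
+Purely as an illustration (the pipeline itself does this inside `Combiner.outlier_removal`; the matrix and parameter values below are made up):
+
+```python
+import numpy as np
+import hdbscan
+from sklearn.decomposition import PCA
+
+# Hypothetical expression matrix: rows are samples, columns are proteins.
+rng = np.random.default_rng(42)
+expression = rng.normal(size=(40, 100))
+
+# Project the samples into a low-dimensional PCA space.
+components = PCA(n_components=5).fit_transform(expression)
+
+# Density-based clustering; samples assigned to no cluster get the label -1.
+labels = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=5).fit_predict(components)
+
+outliers = np.where(labels == -1)[0]
+print(f"Potential outlier samples: {outliers}")
+```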
+
+#### Batch Effect Correction
+`pycombat` is used for batch effect correction. The batch is defined by the dataset (i.e. the PXD identifier of each study), and the covariate should be `tissue_part`.
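+
+As an illustration, the snippet below applies ComBat via the standalone pyComBat package (`combat.pycombat`) to a made-up matrix; ibaqpy itself performs this step through its `apply_batch_correction` helper:
+
+```python
+import pandas as pd
+from combat.pycombat import pycombat
+
+# Hypothetical IBAQ matrix: proteins as rows, samples as columns.
+expression = pd.DataFrame(
+    {
+        "S1": [7.2, 5.1, 6.4, 8.0, 4.9, 6.0],
+        "S2": [7.0, 5.3, 6.2, 8.1, 5.1, 6.2],
+        "S3": [7.3, 5.0, 6.5, 7.9, 5.0, 5.9],
+        "S4": [8.1, 6.0, 7.3, 9.0, 5.8, 7.1],
+        "S5": [7.9, 6.2, 7.1, 9.2, 6.0, 7.0],
+        "S6": [8.0, 6.1, 7.2, 9.1, 5.9, 7.2],
+    },
+    index=["P1", "P2", "P3", "P4", "P5", "P6"],
+)
+
+# One batch label per sample column; each batch corresponds to one dataset (PXD accession).
+batch = [1, 1, 1, 2, 2, 2]
+
+corrected = pycombat(expression, batch)
+print(corrected)
+```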
+
 ### Credits
 
 - Julianus Pfeuffer
diff --git a/bin/datasets_merger.py b/bin/datasets_merger.py
index 1c82b9e..687d99f 100644
--- a/bin/datasets_merger.py
+++ b/bin/datasets_merger.py
@@ -2,6 +2,7 @@ import re
 
 import click
 from ibaq.combiner import Combiner
+from ibaq import __version__
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
 
@@ -13,7 +14,7 @@ def cli():
     """
     pass
 
-
+@click.version_option(version=__version__, package_name="ibaqpy", message="%(package)s %(version)s")
 @click.command("datasets_merge", short_help="Merge ibaq results from compute_ibaq")
 @click.option("--data_folder", "-d", help="Data dolfer contains SDRFs and ibaq CSVs.", required=True)
 @click.option("--output", "-o", help="Output file after batch effect removal.", required=True)
@@ -21,11 +22,12 @@ def cli():
     "--covariate",
     "-c",
     default=None,
-    help="Indicators included in covariate consideration when datasets are merged.",
+    help="Indicator included in covariate consideration when datasets are merged.",
 )
 @click.option("--organism", help="Organism to keep in input data.", default="HUMAN")
 @click.option("--covariate_to_keep", "-k", help="Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.", default=None)
 @click.option("--non_missing_percent_to_keep", "-m", help="non-missing values in each group.", default=0.3)
+@click.option("--skip_outliers_removal", help="Skip removing outliers in all datasets.", default=False, is_flag=True)
 @click.option("--n_components", help="Number of principal components to be computed.", default=None)
 @click.option("--min_cluster", help="The minimum size of clusters.", default=None)
 @click.option(
@@ -44,6 +46,7 @@ def datasets_merge(
     organism: str,
     covariate_to_keep: list,
     non_missing_percent_to_keep: float,
+    skip_outliers_removal: bool,
     n_components: int,
     min_cluster: int,
     min_sample_num: int,
@@ -54,7 +57,8 @@ def datasets_merge(
         covariate_to_keep = re.split(",\s*", covariate_to_keep)
     combiner = Combiner(data_folder = data_folder, covariate = covariate)
     combiner.imputer(covariate_to_keep)
-    combiner.outlier_removal(n_components, min_cluster, min_sample_num, n_iter)
+    if not skip_outliers_removal:
+        combiner.outlier_removal(n_components, min_cluster, min_sample_num, n_iter)
     combiner.batch_correction(n_components, covariate_to_keep)
     result = combiner.df_corrected
     result.to_csv(output, sep=",", index=True)
diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py
index 6440724..f78dee4 100644
--- a/bin/peptide_normalization.py
+++ b/bin/peptide_normalization.py
@@ -248,15 +248,13 @@ def peptide_normalization(msstats: str, parquet: str, sdrf: str, min_aa: int, mi
                           pnormalization: bool, compress: bool, log2: bool, violin: bool, verbose: bool,
                           qc_report: str) -> None:
 
-    def quit():
+    if output is None:
         print_help_msg(peptide_normalization)
         exit(1)
 
-    if output is None:
-        quit()
-
     if parquet is None and (msstats is None or sdrf is None):
-        quit()
+        print_help_msg(peptide_normalization)
+        exit(1)
 
     compression_method = 'gzip' if compress else None
 
diff --git a/ibaq/__init__.py b/ibaq/__init__.py
index e69de29..e344246 100644
--- a/ibaq/__init__.py
+++ b/ibaq/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.3"
\ No newline at end of file
diff --git a/ibaq/combiner.py b/ibaq/combiner.py
index 0cc6195..0f71a5f 100644
--- a/ibaq/combiner.py
+++ b/ibaq/combiner.py
@@ -14,7 +14,7 @@ class Combiner:
 
-    def __init__(self, data_folder: os.PathLike, covariate: str, organism: str="HUMAN"):
+    def __init__(self, data_folder: os.PathLike, covariate: str=None, organism: str="HUMAN"):
         """Generate concated IbaqNorm and metadata.
         """
         logger.info("Combining SDRFs and ibaq results ...")
@@ -174,14 +174,13 @@ def batch_correction(self, n_components: int=None, tissue_parts_to_keep: int=Non
         self.metadata = self.metadata.reset_index(drop=True)
         self.metadata = self.metadata.set_index("sample_id").reindex(columns, axis=0).reset_index()
         if self.covariate:
-            # get the covariates from metadata as list
+            # get the covariates from metadata as a list
             covariates_index = self.metadata[self.covariate].tolist()
-            print(covariates_index)
         else:
-            covariates_index = None
+            covariates_index = []
 
         # apply batch correction
-        self.df_corrected = apply_batch_correction(self.df, self.batch_index, covs=["tissue_part"] * len(columns))
+        self.df_corrected = apply_batch_correction(self.df, self.batch_index, covs=covariates_index)
         print(self.df_corrected)
 
         # plot PCA of corrected data
diff --git a/ibaq/utils.py b/ibaq/utils.py
index cf92992..d8c04bd 100644
--- a/ibaq/utils.py
+++ b/ibaq/utils.py
@@ -352,7 +352,7 @@ def apply_batch_correction(
         raise ValueError("Every batch factor should have at least 2 samples.")
 
     # If not None, check if the number of covariates match the number of samples
-    if covs is not None:
+    if covs:
         if len(df.columns) != len(covs):
             raise ValueError(
                 "The number of samples should match the number of covariates."
diff --git a/testdata/PXD003947-featrue.parquet b/testdata/PXD003947-featrue.parquet
new file mode 100644
index 0000000..e73c5e5
Binary files /dev/null and b/testdata/PXD003947-featrue.parquet differ