Merge pull request #29 from WangHong007/master

ibaqpy update
bigbio · Sep 8, 2023 · 3e33b5b · 3e33b5b
2 parents 6d2da52 + e79b5dc
commit 3e33b5b
Show file tree

Hide file tree

Showing 15 changed files with 1,036 additions and 931 deletions.
diff --git a/README.md b/README.md
@@ -61,7 +61,7 @@ E.g. http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute
 ### Peptide Normalization - peptide_normalization.py
 
 ```asciidoc
-python peptide_normalization.py --peptides PXD003947-peptides.csv --impute --normalize --contaminants data/contaminants_ids.tsv --output PXD003947-peptides-norm.tsv
+python peptide_normalization.py --msstats PXD003947.sdrf_openms_design_msstats_in.csv --sdrf PXD003947.sdrf.tsv --remove_decoy_contaminants data/contaminants_ids.tsv --remove_low_frequency_peptides --output PXD003947-peptides-norm.tsv
 ``` 
 
 The command provides an additional `flag` for skip_normalization, pnormalization, compress, log2, violin, verbose.
@@ -70,31 +70,40 @@ The command provides an additional `flag` for skip_normalization, pnormalization
 Usage: peptide_normalization.py [OPTIONS]
 
 Options:
-  -m, --msstats TEXT    MsStats file import generated by quantms  [required]
-  -s, --sdrf TEXT       SDRF file import generated by quantms  [required]
-  --min_aa INTEGER      Minimum number of amino acids to filter peptides
-  --min_unique INTEGER  Minimum number of unique peptides to filter proteins
-  --contaminants TEXT   Contaminants and high abundant proteins to be removed
-  --output TEXT         Peptide intensity file including other all properties
-                        for normalization
-  --skip_normalization  Skip normalization step
-  --nmethod TEXT        Normalization method used to normalize intensities for
-                        all samples (options: qnorm)
-  --pnormalization      Normalize the peptide intensities using different
-                        methods (options: qnorm)
-  --compress            Read the input peptides file in compress gzip file
-  --log2                Transform to log2 the peptide intensity values before
-                        normalization
-  --violin              Use violin plot instead of boxplot for distribution
-                        representations
-  --verbose             Print addition information about the distributions of
-                        the intensities, number of peptides remove after
-                        normalization, etc.
-  --qc_report TEXT      PDF file to store multiple QC images
-  --help                Show this message and exit.
+  -m, --msstats TEXT              MsStats file import generated by quantms
+  -p, --parquet TEXT              Parquet file import generated by quantms
+  -s, --sdrf TEXT                 SDRF file import generated by quantms
+  --min_aa INTEGER                Minimum number of amino acids to filter
+                                  peptides
+  --min_unique INTEGER            Minimum number of unique peptides to filter
+                                  proteins
+  --remove_ids TEXT               Remove specific protein ids from the
+                                  analysis using a file with one id per line
+  --remove_decoy_contaminants     Remove decoy and contaminants proteins from
+                                  the analysis
+  --remove_low_frequency_peptides Remove peptides that are present in less
+                                  than 20% of the samples
+  --output TEXT                   Peptide intensity file including other all
+                                  properties for normalization
+  --skip_normalization            Skip normalization step
+  --nmethod TEXT                  Normalization method used to normalize
+                                  intensities for all samples (options: qnorm)
+  --pnormalization                Normalize the peptide intensities using
+                                  different methods (options: qnorm)
+  --compress                      Read the input peptides file in compress
+                                  gzip file
+  --log2                          Transform to log2 the peptide intensity
+                                  values before normalization
+  --violin                        Use violin plot instead of boxplot for
+                                  distribution representations
+  --verbose                       Print addition information about the
+                                  distributions of the intensities, number of
+                                  peptides remove after normalization, etc.
+  --qc_report TEXT                PDF file to store multiple QC images
+  --help                          Show this message and exit.
 ```
 
-Peptide normalization starts from the output file from script `peptide_file_generation.py`. The structure of the input contains the following columns: 
+Peptide normalization starts from the peptides dataframe. The structure of the input contains the following columns: 
 
 - ProteinName: Protein name
 - PeptideSequence: Peptide sequence including post-translation modifications `(e.g. .(Acetyl)ASPDWGYDDKN(Deamidated)GPEQWSK)`
@@ -245,6 +254,60 @@ For cellular protein copy number calculation, the uniprot accession of histones
 
 In the calculation of protein concentration, the volume is calculated according to the cell protein concentration first, and then the protein mass is divided by the volume to calculate the intracellular protein concentration.
 
+### Datasets Merge - datasets_merge.py
+There are batch effects in protein identification and quantitative results between different studies, which may be caused by differences in experimental techniques, conditional methods, data analysis, etc. Here we provide a method to apply batch effect correction.  First to impute ibaq data, then remove outliers using `hdbscan`, and apply batch effect correction using `pycombat`.
+
+
+```asciidoc
+python datasets_merge.py datasets_merge --data_folder ../ibaqpy_test/ --output datasets-merge.csv --verbose
+``` 
+
+```asciidoc
+python datasets_merge.py --help
+Usage: datasets_merge.py [OPTIONS]
+
+  Merge ibaq results from compute_ibaq.py.
+
+  :param data_folder: Data dolfer contains SDRFs and ibaq CSVs.
+  :param output: Output file after batch effect removal.
+  :param covariate: Indicators included in covariate consideration when datasets are merged.
+  :param organism: Organism to keep in input data.
+  :param covariate_to_keep: Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.
+  :param non_missing_percent_to_keep: non-missing values in each group.
+  :param n_components: Number of principal components to be computed.
+  :param min_cluster: The minimum size of clusters.
+  :param min_sample_num: The minimum number of samples in a neighborhood for a point to be considered as a core point.
+  :param n_iter: Number of iterations to be performed.
+  :param verbose/quiet: Output debug information.
+
+Options:
+  Options:
+  -d, --data_folder TEXT          Data dolfer contains SDRFs and ibaq CSVs. [required]
+  -o, --output TEXT               Output file after batch effect removal. [required]
+  -c, --covariate TEXT            Indicators included in covariate consideration when datasets are merged.
+  --organism TEXT                 Organism to keep in input data.
+  --covariate_to_keep TEXT        Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.
+  --non_missing_percent_to_keep FLOAT
+                                  non-missing values in each group.
+  --n_components TEXT             Number of principal components to be computed.
+  --min_cluster TEXT              The minimum size of clusters.
+  --min_sample_num TEXT           The minimum number of samples in a neighborhood for a point to be considered as a core point.
+  --n_iter TEXT                   Number of iterations to be performed.
+  -v, --verbose / -q, --quiet     Output debug information.
+  --help                          Show this message and exit.
+```
+
+#### Impute Missing Values
+Remove proteins missing more than 30% of all samples. Users can keep tissue parts interested, and data will be converted to a expression matrix (samples as columns, proteins as rows), then missing values will be imputed with `KNNImputer`. 
+
+#### Remove Outliers
+When outliers are removed, multiple hierarchical clustering is performed using `hdbscan.HDBSCAN`, where outliers are labeled -1 in the PCA plot. When clustering is performed, the default cluster minimum value and the minimum number of neighbors of the core point are the minimum number of samples in all datasets.
+
+**Users can skip this step and do outliers removal manually.*
+
+#### Batch Effect Correction
+Using `pycombat` for batch effect correction, and batch is set to `datasets` (refers specifically to PXD ids) and the covariate should be `tissue_part`.
+
 ### Credits 
 
 - Julianus Pfeuffer

diff --git a/bin/datasets_merger.py b/bin/datasets_merger.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+import re
+import click
+from ibaq.combiner import Combiner
+from ibaq import __version__
+
+CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
+
+
+@click.group(context_settings=CONTEXT_SETTINGS)
+def cli():
+    """
+    This is the main tool that gives access to all commands.
+    """
+    pass
+
+@click.version_option(version=__version__, package_name="ibaqpy", message="%(package)s %(version)s")
+@click.command("datasets_merge", short_help="Merge ibaq results from compute_ibaq")
+@click.option("--data_folder", "-d", help="Data dolfer contains SDRFs and ibaq CSVs.", required=True)
+@click.option("--output", "-o", help="Output file after batch effect removal.", required=True)
+@click.option(
+    "--covariate",
+    "-c",
+    default=None,
+    help="Indicator included in covariate consideration when datasets are merged.",
+)
+@click.option("--organism", help="Organism to keep in input data.", default="HUMAN")
+@click.option("--covariate_to_keep", "-k", help="Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.", default=None)
+@click.option("--non_missing_percent_to_keep", "-m", help="non-missing values in each group.", default=0.3)
+@click.option("--skip_outliers_removal", help="Skip removing outliers in all datasets.", default=False, is_flag=True)
+@click.option("--n_components", help="Number of principal components to be computed.", default=None)
+@click.option("--min_cluster", help="The minimum size of clusters.", default=None)
+@click.option(
+    "--min_sample_num",
+    help="The minimum number of samples in a neighborhood for a point to be considered as a core point.",
+    default=None,
+)
+@click.option("--n_iter", help="Number of iterations to be performed.", default=None)
+@click.option("--verbose/--quiet", "-v/-q", help="Output debug information.", default=False, is_flag=True)
+@click.pass_context
+def datasets_merge(
+    ctx,
+    data_folder: str,
+    output: str,
+    covariate: str,
+    organism: str,
+    covariate_to_keep: list,
+    non_missing_percent_to_keep: float,
+    skip_outliers_removal: bool,
+    n_components: int,
+    min_cluster: int,
+    min_sample_num: int,
+    n_iter: int,
+    verbose: bool,
+):
+    if covariate_to_keep:
+        covariate_to_keep = re.split(",\s*", covariate_to_keep)
+    combiner = Combiner(data_folder = data_folder, covariate = covariate)
+    combiner.imputer(covariate_to_keep)
+    if not skip_outliers_removal:
+        combiner.outlier_removal(n_components, min_cluster, min_sample_num, n_iter)
+    combiner.batch_correction(n_components, covariate_to_keep)
+    result = combiner.df_corrected
+    result.to_csv(output, sep=",", index=True)
+
+cli.add_command(datasets_merge)
+
+
+def main():
+    cli()
+
+
+if __name__ == "__main__":
+    main()