ibaqpy update
WangHong007 committed Sep 7, 2023
1 parent 2432672 commit 3f0be3a
Showing 12 changed files with 892 additions and 855 deletions.
70 changes: 70 additions & 0 deletions bin/datasets_merger.py
@@ -0,0 +1,70 @@
#!/usr/bin/env python
import re
import click
from ibaq.combiner import Combiner

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


@click.group(context_settings=CONTEXT_SETTINGS)
def cli():
"""
This is the main tool that gives access to all commands.
"""
pass


@click.command("datasets_merge", short_help="Merge ibaq results from compute_ibaq")
@click.option("--data_folder", "-d", help="Data dolfer contains SDRFs and ibaq CSVs.", required=True)
@click.option("--output", "-o", help="Output file after batch effect removal.", required=True)
@click.option(
"--covariate",
"-c",
default=None,
help="Indicators included in covariate consideration when datasets are merged.",
)
@click.option("--organism", help="Organism to keep in input data.", default="HUMAN")
@click.option("--covariate_to_keep", "-k", help="Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.", default=None)
@click.option("--non_missing_percent_to_keep", "-m", help="non-missing values in each group.", default=0.3)
@click.option("--n_components", help="Number of principal components to be computed.", default=None)
@click.option("--min_cluster", help="The minimum size of clusters.", default=None)
@click.option(
"--min_sample_num",
help="The minimum number of samples in a neighborhood for a point to be considered as a core point.",
default=None,
)
@click.option("--n_iter", help="Number of iterations to be performed.", default=None)
@click.option("--verbose/--quiet", "-v/-q", help="Output debug information.", default=False, is_flag=True)
@click.pass_context
def datasets_merge(
    ctx,
    data_folder: str,
    output: str,
    covariate: str,
    organism: str,
    covariate_to_keep: list,
    non_missing_percent_to_keep: float,
    n_components: int,
    min_cluster: int,
    min_sample_num: int,
    n_iter: int,
    verbose: bool,
):
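    """Merge iBAQ results produced by compute_ibaq across datasets: impute, remove outliers, and batch-correct."""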
    if covariate_to_keep:
        covariate_to_keep = re.split(r",\s*", covariate_to_keep)
    combiner = Combiner(data_folder=data_folder, covariate=covariate)
    combiner.imputer(covariate_to_keep)
    combiner.outlier_removal(n_components, min_cluster, min_sample_num, n_iter)
    combiner.batch_correction(n_components, covariate_to_keep)
    result = combiner.df_corrected
    result.to_csv(output, sep=",", index=True)

cli.add_command(datasets_merge)


def main():
    cli()


if __name__ == "__main__":
    main()
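The new command wires the Combiner pipeline end to end (note that --organism is parsed but not yet used in the command body above). For reference, a minimal sketch of the equivalent programmatic call; the folder path, covariate value, tissue list, and output name are hypothetical placeholders, and Combiner's behavior is assumed only from this file:

from ibaq.combiner import Combiner

# hypothetical inputs; mirrors the command body above with CLI defaults
combiner = Combiner(data_folder="./ibaq_data", covariate="tissue")
combiner.imputer(["LV", "RV", "LA", "RA"])                 # covariate values to keep
combiner.outlier_removal(None, None, None, None)           # n_components, min_cluster, min_sample_num, n_iter
combiner.batch_correction(None, ["LV", "RV", "LA", "RA"])  # n_components, covariate_to_keep
combiner.df_corrected.to_csv("ibaq_corrected.csv", sep=",", index=True)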
71 changes: 41 additions & 30 deletions bin/peptide_normalization.py
@@ -13,7 +13,7 @@
    PROTEIN_NAME, STUDY_ID, CONDITION, get_canonical_peptide, plot_distributions, plot_box_plot, \
    sum_peptidoform_intensities, CHANNEL, REFERENCE, remove_extension_file, ISOTOPE_LABEL_TYPE, FRAGMENT_ION, TMT16plex, \
    TMT11plex, TMT10plex, TMT6plex, get_reference_name, get_study_accession, ITRAQ4plex, ITRAQ8plex, \
-    parse_uniprot_accession, remove_protein_by_ids
+    parse_uniprot_accession, remove_protein_by_ids, FEATURE_COLUMNS, parquet_map


def print_dataset_size(dataset: DataFrame, message: str, verbose: bool) -> None:
@@ -122,7 +122,7 @@ def average_peptide_intensities(dataset: DataFrame) -> DataFrame:
    return dataset_df


-def remove_low_frequency_peptides(dataset_df: DataFrame, percentage_samples: float = 0.20):
+def remove_low_frequency_peptides_(dataset_df: DataFrame, percentage_samples: float = 0.20):
    """
    Remove peptides that are present in less than 20% of the samples.
    :param dataset_df: dataframe with the data
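The trailing underscore in the new name avoids a clash inside peptide_normalization below, where the boolean remove_low_frequency_peptides CLI flag shadows the old function name.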
@@ -220,7 +220,8 @@ def impute_peptide_intensities(dataset_df, field, class_field):


@click.command()
@click.option("-m", "--msstats", help="MsStats file import generated by quantms", required=True)
@click.option("-m", "--msstats", help="MsStats file import generated by quantms", default=None)
@click.option("-p", "--parquet", help="Parquet file import generated by quantms", default=None)
@click.option("-s", "--sdrf", help="SDRF file import generated by quantms", required=True)
@click.option("--min_aa", help="Minimum number of amino acids to filter peptides", default=7)
@click.option("--min_unique", help="Minimum number of unique peptides to filter proteins", default=2)
@@ -241,57 +242,65 @@ def impute_peptide_intensities(dataset_df, field, class_field):
"after normalization, etc.",
is_flag=True)
@click.option("--qc_report", help="PDF file to store multiple QC images", default="peptideNorm-QCprofile.pdf")
-def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int, remove_ids: str,
+def peptide_normalization(msstats: str, parquet: str, sdrf: str, min_aa: int, min_unique: int, remove_ids: str,
                          remove_decoy_contaminants: bool, remove_low_frequency_peptides: bool,
                          output: str, skip_normalization: bool, nmethod: str,
                          pnormalization: bool, compress: bool, log2: bool,
                          violin: bool, verbose: bool, qc_report: str) -> None:


-    if msstats is None or sdrf is None or output is None:
+    if (msstats is None and parquet is None) or sdrf is None or output is None:
        print_help_msg(peptide_normalization)
        exit(1)

    compression_method = 'gzip' if compress else None

-    # Read the msstats file
-    msstats_df = pd.read_csv(msstats, sep=',', compression=compression_method, dtype={CONDITION: 'category', ISOTOPE_LABEL_TYPE: 'category'})
+    if parquet is not None:
+        feature_df = pd.read_parquet(parquet)[FEATURE_COLUMNS]
+        feature_df = feature_df.rename(columns=parquet_map)
+        feature_df[PROTEIN_NAME] = feature_df.apply(lambda x: ",".join(x[PROTEIN_NAME]), axis=1)
+        label_type = feature_df[CHANNEL].unique().tolist()
+        if len(label_type) == 1:
+            feature_df.drop(CHANNEL, inplace=True, axis=1)
+    else:
+        # Read the msstats file
+        feature_df = pd.read_csv(msstats, sep=',', compression=compression_method, dtype={CONDITION: 'category', ISOTOPE_LABEL_TYPE: 'category'})
    # Remove 0 intensity signals from the msstats file
-    msstats_df = msstats_df[msstats_df[INTENSITY] > 0]
-    msstats_df[PEPTIDE_CANONICAL] = msstats_df.apply(lambda x: get_canonical_peptide(x[PEPTIDE_SEQUENCE]), axis=1)
+    print(feature_df)
+    feature_df = feature_df[feature_df[INTENSITY] > 0]
+    feature_df[PEPTIDE_CANONICAL] = feature_df.apply(lambda x: get_canonical_peptide(x[PEPTIDE_SEQUENCE]), axis=1)
    # Only peptides with more than min_aa (default: 7) amino acids are retained
-    msstats_df = msstats_df[msstats_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1)]
+    feature_df = feature_df[feature_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1)]
    # Only proteins with unique peptides number greater than min_unique (default: 2) are retained
-    unique_peptides = set(msstats_df.groupby(PEPTIDE_CANONICAL).filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[
+    unique_peptides = set(feature_df.groupby(PEPTIDE_CANONICAL).filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[
        PEPTIDE_CANONICAL].tolist())
-    strong_proteins = set(msstats_df[msstats_df[PEPTIDE_CANONICAL].isin(unique_peptides)].groupby(PROTEIN_NAME).filter(
+    strong_proteins = set(feature_df[feature_df[PEPTIDE_CANONICAL].isin(unique_peptides)].groupby(PROTEIN_NAME).filter(
        lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[PROTEIN_NAME].tolist())
-    msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)]
+    feature_df = feature_df[feature_df[PROTEIN_NAME].isin(strong_proteins)]

-    msstats_df.rename(
+    feature_df.rename(
        columns={'ProteinName': PROTEIN_NAME, 'PeptideSequence': PEPTIDE_SEQUENCE, 'PrecursorCharge': PEPTIDE_CHARGE,
                 'Run': RUN,
                 'Condition': CONDITION, 'Intensity': INTENSITY}, inplace=True)

-    print(msstats_df)
-    msstats_df[PROTEIN_NAME] = msstats_df[PROTEIN_NAME].apply(parse_uniprot_accession)
+    feature_df[PROTEIN_NAME] = feature_df[PROTEIN_NAME].apply(parse_uniprot_accession)

    # Read the sdrf file
    sdrf_df = pd.read_csv(sdrf, sep='\t', compression=compression_method)
    sdrf_df[REFERENCE] = sdrf_df['comment[data file]'].apply(remove_extension_file)
+    print(sdrf_df)

-    if FRACTION not in msstats_df.columns:
-        msstats_df[FRACTION] = 1
-    msstats_df = msstats_df[
+    if FRACTION not in feature_df.columns:
+        feature_df[FRACTION] = 1
+    feature_df = feature_df[
        [PROTEIN_NAME, PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, INTENSITY, REFERENCE, CONDITION, RUN,
         BIOREPLICATE, FRACTION, FRAGMENT_ION, ISOTOPE_LABEL_TYPE]]

    # Merge the SDRF with the resulting features
    labels = set(sdrf_df['comment[label]'])
-    if CHANNEL not in msstats_df.columns:
-        msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(remove_extension_file)
-        dataset_df = pd.merge(msstats_df, sdrf_df[['source name', REFERENCE]], how='left', on=[REFERENCE])
+    if CHANNEL not in feature_df.columns:
+        feature_df[REFERENCE] = feature_df[REFERENCE].apply(remove_extension_file)
+        dataset_df = pd.merge(feature_df, sdrf_df[['source name', REFERENCE]], how='left', on=[REFERENCE])
    elif 'TMT' in ','.join(labels) or 'tmt' in ','.join(labels):
        if (len(labels) > 11 or "TMT134N" in labels or "TMT133C" in labels
                or "TMT133N" in labels or "TMT132C" in labels or "TMT132N" in labels):
@@ -305,8 +314,8 @@ def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int,
        choice = pd.DataFrame.from_dict(choice, orient='index', columns=[CHANNEL]).reset_index().rename(
            columns={'index': 'comment[label]'})
        sdrf_df = sdrf_df.merge(choice, on='comment[label]', how='left')
-        msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(get_reference_name)
-        dataset_df = pd.merge(msstats_df, sdrf_df[['source name', REFERENCE, CHANNEL]], how='left',
+        feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_reference_name)
+        dataset_df = pd.merge(feature_df, sdrf_df[['source name', REFERENCE, CHANNEL]], how='left',
                              on=[REFERENCE, CHANNEL])
        # result_df.drop(CHANNEL, axis=1, inplace=True)
        dataset_df = dataset_df[dataset_df["Condition"] != "Empty"]
@@ -319,8 +328,8 @@ def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int,
        choice = pd.DataFrame.from_dict(choice, orient='index', columns=[CHANNEL]).reset_index().rename(
            columns={'index': 'comment[label]'})
        sdrf_df = sdrf_df.merge(choice, on='comment[label]', how='left')
-        msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(get_reference_name)
-        dataset_df = pd.merge(msstats_df, sdrf_df[['source name', REFERENCE, CHANNEL]], how='left',
+        feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_reference_name)
+        dataset_df = pd.merge(feature_df, sdrf_df[['source name', REFERENCE, CHANNEL]], how='left',
                              on=[REFERENCE, CHANNEL])
        dataset_df = dataset_df[dataset_df["Condition"] != "Empty"]
        dataset_df.rename(columns={'Charge': PEPTIDE_CHARGE}, inplace=True)
@@ -329,11 +338,12 @@ def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int,
        exit(1)
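Both labeled branches attach a quantification channel to each SDRF row through the same choice merge. A self-contained illustration, with a hypothetical two-channel map standing in for the TMT*/ITRAQ* dicts imported at the top (the "Channel" column name is likewise a stand-in for the CHANNEL constant):

import pandas as pd

channel_map = {"TMT126": "126", "TMT127N": "127N"}  # hypothetical subset
choice = (pd.DataFrame.from_dict(channel_map, orient="index", columns=["Channel"])
          .reset_index()
          .rename(columns={"index": "comment[label]"}))
sdrf = pd.DataFrame({"comment[label]": ["TMT126", "TMT127N"],
                     "source name": ["sample 1", "sample 2"]})
# each SDRF row gains the channel matching its label annotation
print(sdrf.merge(choice, on="comment[label]", how="left"))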

    # Remove the intermediate variables and free the memory
-    del msstats_df, sdrf_df
+    del feature_df, sdrf_df
    gc.collect()

-    dataset_df.rename(columns={'source name': SAMPLE_ID}, inplace=True)

+    if msstats:
+        dataset_df.rename(columns={'source name': SAMPLE_ID}, inplace=True)
+    print(dataset_df)
    dataset_df[STUDY_ID] = dataset_df[SAMPLE_ID].apply(get_study_accession)
    dataset_df = dataset_df.filter(
        items=[PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, FRACTION, RUN, BIOREPLICATE, PROTEIN_NAME, STUDY_ID, CONDITION,
@@ -426,7 +436,8 @@ def peptide_normalization(msstats: str, sdrf: str, min_aa: int, min_unique: int,

    if remove_low_frequency_peptides:
        print("Peptides before removing low frequency peptides: " + str(len(dataset_df.index)))
-        dataset_df = remove_low_frequency_peptides(dataset_df, 0.20)
+        print(dataset_df)
+        dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20)
        print_dataset_size(dataset_df, "Peptides after remove low frequency peptides: ", verbose)

    # Perform imputation using Random Forest on peptide intensities
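For anyone trying the new input path, a minimal sketch of the parquet branch in isolation. It assumes only the FEATURE_COLUMNS and parquet_map constants imported at the top of this file; the file name is a hypothetical placeholder, and the list-per-row protein layout is inferred from the ",".join call above:

import pandas as pd

feature_df = pd.read_parquet("features.parquet")[FEATURE_COLUMNS]  # needs pyarrow
feature_df = feature_df.rename(columns=parquet_map)
# protein accessions appear to arrive as a list per row; join them to match the MSstats format
feature_df[PROTEIN_NAME] = feature_df[PROTEIN_NAME].apply(",".join)
# label-free runs carry a single channel value, so the column is dropped
if feature_df[CHANNEL].nunique() == 1:
    feature_df = feature_df.drop(columns=[CHANNEL])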
1 change: 1 addition & 0 deletions conda-environment.yaml
@@ -11,6 +11,7 @@ dependencies:
  - pandas
  - scikit-learn
  - numpy
+  - pyarrow
  - click
  - matplotlib
  - pip:
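The new pyarrow entry backs the pd.read_parquet call added in peptide_normalization.py; pandas picks it up as its default parquet engine. A quick sanity check for the created environment:

import pandas as pd
import pyarrow

print(pyarrow.__version__)  # pd.read_parquet uses this engine automatically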