From fbe5fcc13072e4d175f6b1c6ae7a4fb4818b39c7 Mon Sep 17 00:00:00 2001 From: GeorgWa Date: Mon, 16 Oct 2023 17:30:45 +0200 Subject: [PATCH] FEAT library import pipeline --- alphadia/extraction/libtransform.py | 419 ++++++++++++++++++++++++++ alphadia/extraction/planning.py | 161 ++-------- coverage.svg | 4 +- misc/config/default.yaml | 11 +- tests/unit_tests/test_libtransform.py | 66 ++++ tests/unit_tests/test_workflow.py | 2 +- 6 files changed, 523 insertions(+), 140 deletions(-) create mode 100644 alphadia/extraction/libtransform.py create mode 100644 tests/unit_tests/test_libtransform.py diff --git a/alphadia/extraction/libtransform.py b/alphadia/extraction/libtransform.py new file mode 100644 index 00000000..9d1bbba9 --- /dev/null +++ b/alphadia/extraction/libtransform.py @@ -0,0 +1,419 @@ +from typing import Any +from pathlib import Path +import logging +import os +from typing import List + +import numpy as np +import pandas as pd + +from alphabase.peptide import fragment +from alphabase.protein import fasta +from alphabase.spectral_library.flat import SpecLibFlat +from alphabase.spectral_library.base import SpecLibBase +from alphabase.spectral_library.reader import LibraryReaderBase + +from alphadia.extraction import utils +from alphadia.extraction.workflow import reporting +from alphabase.spectral_library.decoy import decoy_lib_provider + +logger = logging.getLogger() + +class ProcessingPipeline(): + + def __init__(self, steps: list) -> None: + """Processing pipeline for loading and transforming spectral libraries. + The pipeline is a list of ProcessingStep objects. Each step is called in order and the output of the previous step is passed to the next step. + + Example: + ``` + pipeline = ProcessingPipeline([ + DynamicLoader(), + PrecursorInitializer(), + AnnotateFasta(fasta_path_list), + IsotopeGenerator(), + DecoyGenerator(), + RTNormalization() + ]) + + library = pipeline(input_path) + ``` + """ + self.steps = steps + + def __call__(self, input: Any) -> Any: + """Run the pipeline on the input object. + """ + for step in self.steps: + input = step(input) + return input + +class ProcessingStep(): + + def __init__(self) -> None: + """Base class for processing steps. Each implementation must implement the `validate` and `forward` method. + Processing steps can be chained together in a ProcessingPipeline.""" + pass + + def __call__(self, input: Any) -> Any: + """Run the processing step on the input object.""" + logger.info(f'Running {self.__class__.__name__}') + if self.validate(input): + return self.forward(input) + else: + logger.critical(f'Input {input} failed validation for {self.__class__.__name__}') + raise ValueError(f'Input {input} failed validation for {self.__class__.__name__}') + + def validate(self, input: Any) -> bool: + """Validate the input object.""" + raise NotImplementedError('Subclasses must implement this method') + + def forward(self, input: Any) -> Any: + """Run the processing step on the input object.""" + raise NotImplementedError('Subclasses must implement this method') + + +class DynamicLoader(ProcessingStep): + + def __init__(self) -> None: + """Load a spectral library from a file. The file type is dynamically inferred from the file ending. + Expects a `str` as input and will return a `SpecLibBase` object. + + Supported file types are: + + **Alphabase hdf5 files** + The library is loaded into a `SpecLibBase` object and immediately returned. + + **Long format csv files** + The classical spectral library format as returned by MSFragger. + It will be imported and converted to a `SpecLibBase` format. This might require additional parsing information. + + """ + pass + + def validate(self, input: Any) -> bool: + """Validate the input object. It is expected that the input is a path to a file which exists. + """ + valid = True + valid &= isinstance(input, str) or isinstance(input, Path) + + if not os.path.exists(input): + logger.error(f'Input path {input} does not exist') + valid = False + + return valid + + def forward(self, input_path: str) -> Any: + """Load the spectral library from the input path. The file type is dynamically inferred from the file ending.""" + # get ending of file + file_type = Path(input_path).suffix + + if file_type in ['.hdf5', '.h5', '.hdf']: + library = SpecLibBase() + library.load_hdf(input_path, load_mod_seq=True) + + elif file_type in ['.csv', '.tsv']: + library = LibraryReaderBase() + library.import_file(input_path) + + else: + raise ValueError(f'File type {file_type} not supported') + + return library + + +class PrecursorInitializer(ProcessingStep): + + def __init__(self, *args, **kwargs) -> None: + """Initialize alphabase spectral library with precursor information. + Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object. + This step is required for all spectral libraries and will add the `precursor_idx`,`decoy`, `channel` and `elution_group_idx` columns to the precursor dataframe. + """ + super().__init__(*args, **kwargs) + + + def validate(self, input: Any) -> bool: + """Validate the input object. It is expected that the input is a `SpecLibBase` object.""" + + valid = isinstance(input, SpecLibBase) + + if len(input.precursor_df) == 0: + logger.error(f'Input library has no precursor information') + valid = False + + return valid + + def forward(self, input: SpecLibBase) -> SpecLibBase: + """Initialize the precursor dataframe with the `precursor_idx`, `decoy`, `channel` and `elution_group_idx` columns.""" + + if 'decoy' not in input.precursor_df.columns: + input.precursor_df['decoy'] = 0 + + if 'channel' not in input.precursor_df.columns: + input.precursor_df['channel'] = 0 + + if 'elution_group_idx' not in input.precursor_df.columns: + input.precursor_df['elution_group_idx'] = np.arange(len(input.precursor_df)) + + if 'precursor_idx' not in input.precursor_df.columns: + input.precursor_df['precursor_idx'] = np.arange(len(input.precursor_df)) + + return input + +class AnnotateFasta(ProcessingStep): + + def __init__(self, + fasta_path_list: List[str], + drop_unannotated: bool = True, + ) -> None: + """Annotate the precursor dataframe with protein information from a FASTA file. + Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object. + + Parameters + ---------- + + fasta_path_list : List[str] + List of paths to FASTA files. Multiple files can be provided and will be merged into a single protein dataframe. + + drop_unannotated : bool, optional + Drop all precursors which could not be annotated by the FASTA file. Default is True. + + """ + + super().__init__() + self.fasta_path_list = fasta_path_list + self.drop_unannotated = drop_unannotated + + def validate(self, input: Any) -> bool: + """Validate the input object. It is expected that the input is a `SpecLibBase` object and that all FASTA files exist.""" + valid = isinstance(input, SpecLibBase) + + for path in self.fasta_path_list: + if not os.path.exists(path): + logger.error(f'Annotation by FASTA failed, input path {path} does not exist') + valid = False + + return valid + + def forward(self, input: SpecLibBase) -> SpecLibBase: + """Annotate the precursor dataframe with protein information from a FASTA file.""" + + protein_df = fasta.load_fasta_list_as_protein_df( + self.fasta_path_list + ) + input._precursor_df = fasta.annotate_precursor_df(input.precursor_df, protein_df) + + if self.drop_unannotated and 'cardinality' in input._precursor_df.columns: + input._precursor_df = input._precursor_df[input._precursor_df['cardinality'] > 0] + + return input + +class DecoyGenerator(ProcessingStep): + + def __init__(self, decoy_type : str = 'diann') -> None: + """Generate decoys for the spectral library. + Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object. + + Parameters + ---------- + + decoy_type : str, optional + Type of decoys to generate. Currently only `pseudo_reverse` and `diann` are supported. Default is `diann`. + + """ + + super().__init__() + self.decoy_type = decoy_type + + def validate(self, input: Any) -> bool: + """Validate the input object. It is expected that the input is a `SpecLibBase` object.""" + return isinstance(input, SpecLibBase) + + def forward(self, input: SpecLibBase) -> SpecLibBase: + """Generate decoys for the spectral library.""" + + if 'decoy' not in input.precursor_df.columns: + input.precursor_df['decoy'] = 0 + + decoy_values = input.precursor_df['decoy'].unique() + if len (decoy_values) > 1: + logger.warning(f'Input library already contains decoys. Skipping decoy generation. \n Please note that decoys generated outside of alphabase are not supported.') + return input + + decoy_lib = decoy_lib_provider.get_decoy_lib(self.decoy_type, input.copy()) + decoy_lib.decoy_sequence() + decoy_lib.calc_precursor_mz() + decoy_lib.remove_unused_fragments() + decoy_lib.calc_fragment_mz_df() + decoy_lib._precursor_df['decoy'] = 1 + + input.append(decoy_lib) + input._precursor_df.sort_values('elution_group_idx', inplace=True) + input._precursor_df.reset_index(drop=True, inplace=True) + input.precursor_df['precursor_idx'] = np.arange(len(input.precursor_df)) + input.remove_unused_fragments() + + return input + +class IsotopeGenerator(ProcessingStep): + + def __init__(self, n_isotopes : int = 4) -> None: + """Generate isotope information for the spectral library. + Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object. + + Parameters + ---------- + + n_isotopes : int, optional + Number of isotopes to generate. Default is 4. + + """ + super().__init__() + self.n_isotopes = n_isotopes + + def validate(self, input: Any) -> bool: + """Validate the input object. It is expected that the input is a `SpecLibBase` object.""" + return isinstance(input, SpecLibBase) + + def forward(self, input: SpecLibBase) -> SpecLibBase: + """Generate isotope information for the spectral library.""" + existing_isotopes = utils.get_isotope_columns(input.precursor_df.columns) + + if len(existing_isotopes) > 0: + logger.warning(f'Input library already contains isotope information. Skipping isotope generation. \n Please note that isotope generation outside of alphabase is not supported.') + return input + + input.calc_precursor_isotope_intensity(max_isotope=self.n_isotopes) + return input + +class RTNormalization(ProcessingStep): + + def __init__(self) -> None: + """Normalize the retention time of the spectral library. + Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object. + """ + super().__init__() + + def validate(self, input: Any) -> bool: + """Validate the input object. It is expected that the input is a `SpecLibBase` object.""" + return isinstance(input, SpecLibBase) + + def forward(self, input: SpecLibBase) -> SpecLibBase: + """Normalize the retention time of the spectral library.""" + if len(input.precursor_df) == 0: + logger.warning(f'Input library has no precursor information. Skipping RT normalization') + return input + percentiles = np.percentile(input.precursor_df['rt'], [0.1,99.9]) + input._precursor_df['rt'] = np.clip(input._precursor_df['rt'], percentiles[0], percentiles[1]) + + return input + +class FlattenLibrary(ProcessingStep): + + def __init__(self) -> None: + """Convert a `SpecLibBase` object into a `SpecLibFlat` object.""" + super().__init__() + + def validate(self, input: SpecLibBase) -> bool: + """Validate the input object. It is expected that the input is a `SpecLibBase` object.""" + return isinstance(input, SpecLibBase) + + def forward(self, input: SpecLibBase) -> SpecLibFlat: + """Convert a `SpecLibBase` object into a `SpecLibFlat` object.""" + + input._fragment_cardinality_df = fragment.calc_fragment_cardinality(input.precursor_df, input._fragment_mz_df) + output = SpecLibFlat(min_fragment_intensity=0.0001, keep_top_k_fragments=100) + output.parse_base_library(input, custom_df={'cardinality':input._fragment_cardinality_df}) + return output + +class InitFlatColumns(ProcessingStep): + + def __init__(self) -> None: + """Initialize the columns of a `SpecLibFlat` object for alphadia search. + Calibratable columns are `mz_library`, `rt_library` and `mobility_library` will be initialized with the first matching column in the input dataframe. + """ + + super().__init__() + + def validate(self, input: SpecLibFlat) -> bool: + """Validate the input object. It is expected that the input is a `SpecLibFlat` object.""" + return isinstance(input, SpecLibFlat) + + def forward(self, input: SpecLibFlat) -> SpecLibFlat: + """Initialize the columns of a `SpecLibFlat` object for alphadia search.""" + + precursor_columns = { + 'mz_library': ['mz_library', 'mz','precursor_mz'], + 'rt_library': ['rt_library','rt','rt_norm'], + 'mobility_library': ['mobility_library','mobility'] + } + + fragment_columns = { + 'mz_library': ['mz_library','mz','predicted_mz'], + } + + for column_mapping, df in [(precursor_columns,input.precursor_df),(fragment_columns, input.fragment_df)]: + for key, value in column_mapping.items(): + for candidate_columns in value: + if candidate_columns in df.columns: + df.rename(columns={candidate_columns: key}, inplace=True) + # break after first match + break + + return input + +class LogFlatLibraryStats(ProcessingStep): + + def __init__(self) -> None: + """Log basic statistics of a `SpecLibFlat` object.""" + super().__init__() + + def validate(self, input: SpecLibFlat) -> bool: + """Validate the input object. It is expected that the input is a `SpecLibFlat` object.""" + return isinstance(input, SpecLibFlat) + + def forward(self, input: SpecLibFlat) -> SpecLibFlat: + """Validate the input object. It is expected that the input is a `SpecLibFlat` object.""" + + logger.info(f'============ Library Stats ============') + logger.info(f'Number of precursors: {len(input.precursor_df):,}') + + if 'decoy' in input.precursor_df.columns: + n_targets = len(input.precursor_df.query('decoy == False')) + n_decoys = len(input.precursor_df.query('decoy == True')) + logger.info(f'\tthereof targets:{n_targets:,}') + logger.info(f'\tthereof decoys: {n_decoys:,}') + else: + logger.warning(f'no decoy column was found') + + if 'elution_group_idx' in input.precursor_df.columns: + n_elution_groups = len(input.precursor_df['elution_group_idx'].unique()) + average_precursors_per_group = len(input.precursor_df)/n_elution_groups + logger.info(f'Number of elution groups: {n_elution_groups:,}') + logger.info(f'\taverage size: {average_precursors_per_group:.2f}') + + else: + logger.warning(f'no elution_group_idx column was found') + + if 'proteins' in input.precursor_df.columns: + n_proteins = len(input.precursor_df['proteins'].unique()) + logger.info(f'Number of proteins: {n_proteins:,}') + else: + logger.warning(f'no proteins column was found') + + if 'channel' in input.precursor_df.columns: + channels = input.precursor_df['channel'].unique() + n_channels = len(channels) + logger.info(f'Number of channels: {n_channels:,} ({channels})') + + else: + logger.warning(f'no channel column was found, will assume only one channel') + + isotopes = utils.get_isotope_columns(input.precursor_df.columns) + + if len(isotopes) > 0: + logger.info(f'Isotopes Distribution for {len(isotopes)} isotopes') + + logger.info(f'=======================================') + + return input \ No newline at end of file diff --git a/alphadia/extraction/planning.py b/alphadia/extraction/planning.py index bc756ffc..afffdf68 100644 --- a/alphadia/extraction/planning.py +++ b/alphadia/extraction/planning.py @@ -10,7 +10,7 @@ from typing import Union, List, Dict, Tuple, Optional # alphadia imports -from alphadia.extraction import data, validate, utils +from alphadia.extraction import data, validate, utils, libtransform from alphadia.extraction.workflow import peptidecentric, base, reporting import alphadia @@ -19,6 +19,7 @@ from alphabase.peptide import fragment from alphabase.spectral_library.flat import SpecLibFlat from alphabase.spectral_library.base import SpecLibBase +from alphabase.spectral_library.reader import LibraryReaderBase # third party imports import numpy as np @@ -30,7 +31,7 @@ class Plan: def __init__(self, output_folder : str, raw_file_list: List, - spectral_library : SpecLibBase, + spec_lib_path : Union[str, None] = None, config_path : Union[str, None] = None, config_update_path : Union[str, None] = None, config_update : Union[Dict, None] = None @@ -97,7 +98,7 @@ def __init__(self, now = datetime.today().strftime('%Y-%m-%d %H:%M:%S') logger.progress(f'date: {now}') - self.from_spec_lib_base(spectral_library) + self.load_library(spec_lib_path) @property def raw_file_list( @@ -143,134 +144,36 @@ def spectral_library( ) -> None: self._spectral_library = spectral_library - def from_spec_lib_base(self, speclib_base): - - speclib_base._fragment_cardinality_df = fragment.calc_fragment_cardinality(speclib_base.precursor_df, speclib_base._fragment_mz_df) - - speclib = SpecLibFlat(min_fragment_intensity=0.0001, keep_top_k_fragments=100) - speclib.parse_base_library(speclib_base, custom_df={'cardinality':speclib_base._fragment_cardinality_df}) - - self.from_spec_lib_flat(speclib) - - def from_spec_lib_flat(self, speclib_flat): - - self.spectral_library = speclib_flat - - self.rename_columns(self.spectral_library._precursor_df, 'precursor_columns') - self.rename_columns(self.spectral_library._fragment_df, 'fragment_columns') - - self.log_library_stats() - - self.add_precursor_columns(self.spectral_library.precursor_df) - - output_columns = [ - 'nAA', - 'elution_group_idx', - 'precursor_idx', - 'decoy' , - 'flat_frag_start_idx', - 'flat_frag_stop_idx' , - 'charge', - 'rt_library', - 'mobility_library', - 'mz_library', - 'sequence', - 'genes', - 'proteins', - 'uniprot_ids', - 'channel' - ] - - existing_columns = self.spectral_library.precursor_df.columns - output_columns += [f'i_{i}' for i in utils.get_isotope_columns(existing_columns)] - existing_output_columns = [c for c in output_columns if c in existing_columns] - - self.spectral_library.precursor_df = self.spectral_library.precursor_df[existing_output_columns].copy() - self.spectral_library.precursor_df = self.spectral_library.precursor_df.sort_values('elution_group_idx') - self.spectral_library.precursor_df = self.spectral_library.precursor_df.reset_index(drop=True) - - def log_library_stats(self): - - logger.info(f'========= Library Stats =========') - logger.info(f'Number of precursors: {len(self.spectral_library.precursor_df):,}') - - if 'decoy' in self.spectral_library.precursor_df.columns: - n_targets = len(self.spectral_library.precursor_df.query('decoy == False')) - n_decoys = len(self.spectral_library.precursor_df.query('decoy == True')) - logger.info(f'\tthereof targets:{n_targets:,}') - logger.info(f'\tthereof decoys: {n_decoys:,}') - else: - logger.warning(f'no decoy column was found') - - if 'elution_group_idx' in self.spectral_library.precursor_df.columns: - n_elution_groups = len(self.spectral_library.precursor_df['elution_group_idx'].unique()) - average_precursors_per_group = len(self.spectral_library.precursor_df)/n_elution_groups - logger.info(f'Number of elution groups: {n_elution_groups:,}') - logger.info(f'\taverage size: {average_precursors_per_group:.2f}') - - else: - logger.warning(f'no elution_group_idx column was found') - - if 'proteins' in self.spectral_library.precursor_df.columns: - n_proteins = len(self.spectral_library.precursor_df['proteins'].unique()) - logger.info(f'Number of proteins: {n_proteins:,}') - else: - logger.warning(f'no proteins column was found') - - if 'channel' in self.spectral_library.precursor_df.columns: - channels = self.spectral_library.precursor_df['channel'].unique() - n_channels = len(channels) - logger.info(f'Number of channels: {n_channels:,} ({channels})') - - else: - logger.warning(f'no channel column was found, will assume only one channel') - - isotopes = utils.get_isotope_columns(self.spectral_library.precursor_df.columns) - - if len(isotopes) > 0: - logger.info(f'Isotopes Distribution for {len(isotopes)} isotopes') - - logger.info(f'=================================') - - def rename_columns(self, dataframe, group): - logger.info(f'renaming {group} columns') - # precursor columns - if group in self.config['library_parsing']: - for key, value in self.config['library_parsing'][group].items(): - # column which should be created already exists - if key in dataframe.columns: - continue - # column does not yet exist - else: - for candidate_columns in value: - if candidate_columns in dataframe.columns: - dataframe.rename(columns={candidate_columns: key}, inplace=True) - # break after first match - break - else: - logger.error(f'no {group} columns specified in extraction config') - - def add_precursor_columns(self, dataframe): - - if not 'precursor_idx' in dataframe.columns: - dataframe['precursor_idx'] = np.arange(len(dataframe)) - logger.warning(f'no precursor_idx column found, creating one') - - if not 'elution_group_idx' in dataframe.columns: - dataframe['elution_group_idx'] = self.get_elution_group_idx(dataframe, strategy='precursor') - logger.warning(f'no elution_group_idx column found, creating one') - - if not 'channel' in dataframe.columns: - dataframe['channel'] = 0 - logger.warning(f'no channel column found, creating one') - - def get_elution_group_idx(self, dataframe, strategy='precursor'): - - if strategy == 'precursor': - return dataframe['precursor_idx'] + def load_library(self, spec_lib_path): + if 'fasta_files' in self.config: + fasta_files = self.config['fasta_files'] else: - raise NotImplementedError(f'elution group strategy {strategy} not implemented') + fasta_files = [] + + # the import pipeline is used to transform arbitrary spectral libraries into the alphabase format + # afterwards, the library can be saved as hdf5 and used for further processing + import_pipeline = libtransform.ProcessingPipeline([ + libtransform.DynamicLoader(), + libtransform.PrecursorInitializer(), + libtransform.AnnotateFasta(fasta_files), + libtransform.IsotopeGenerator(n_isotopes=4), + libtransform.RTNormalization(), + ]) + + # the prepare pipeline is used to prepare an alphabase compatible spectral library for extraction + prepare_pipeline = libtransform.ProcessingPipeline([ + libtransform.DecoyGenerator(decoy_type='diann'), + libtransform.FlattenLibrary(), + libtransform.InitFlatColumns(), + libtransform.LogFlatLibraryStats(), + ]) + + speclib = import_pipeline(spec_lib_path) + if self.config['library']['save_hdf']: + speclib.save_hdf(os.path.join(self.output_folder, 'speclib.hdf')) + + self.spectral_library = prepare_pipeline(speclib) def get_run_data(self): """Generator for raw data and spectral library.""" diff --git a/coverage.svg b/coverage.svg index a9d4cc45..dba09e28 100644 --- a/coverage.svg +++ b/coverage.svg @@ -15,7 +15,7 @@ coverage coverage - 32% - 32% + 38% + 38% diff --git a/misc/config/default.yaml b/misc/config/default.yaml index 768c1681..e3f5fd6e 100644 --- a/misc/config/default.yaml +++ b/misc/config/default.yaml @@ -85,14 +85,9 @@ multiplexing: reference_channel: 0 competetive_scoring: True -# to be removed -library_parsing: - precursor_columns: - mz_library: ['mz','precursor_mz'] - rt_library: ['rt','rt_norm'] - mobility_library: ['mobility'] - fragment_columns: - mz_library: ['mz','predicted_mz'] +library: + isotopes: 4 + save_hdf: False # configuration for the optimization manager # initial parameters, will nbe optimized diff --git a/tests/unit_tests/test_libtransform.py b/tests/unit_tests/test_libtransform.py new file mode 100644 index 00000000..55006480 --- /dev/null +++ b/tests/unit_tests/test_libtransform.py @@ -0,0 +1,66 @@ +import tempfile +import numpy as np +import pandas as pd + + +from alphadia.extraction import libtransform + +def test_library_transform(): + + fasta = """ +>sp|Q9CX84|RGS19_MOUSE Regulator of G-protein signaling 19 OS=Mus musculus OX=10090 GN=Rgs19 PE=1 SV=2 +LMHSPTGRRRKK + +>sp|P39935|TIF4631_YEAST Translation initiation factor eIF-4G 1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=TIF4631 YGR254W PE=1 SV=2 +KSKSSGEHLDLKSGEHLDLKLMHSPTGR + +""" + + + library = """PrecursorMz ProductMz Annotation ProteinId GeneName PeptideSequence ModifiedPeptideSequence PrecursorCharge LibraryIntensity NormalizedRetentionTime PrecursorIonMobility FragmentType FragmentCharge FragmentSeriesNumber FragmentLossType +300.156968 333.188096 y3^1 Q9CX84 Rgs19 LMHSPTGR LMHSPTGR 3 4311.400524927019 -25.676406886060136 y 1 3 +300.156968 430.24086 y4^1 Q9CX84 Rgs19 LMHSPTGR LMHSPTGR 3 7684.946735600609 -25.676406886060136 y 1 4 +300.156968 517.27289 y5^1 Q9CX84 Rgs19 LMHSPTGR LMHSPTGR 3 10000.0 -25.676406886060136 y 1 5 +300.159143 313.187033 y5^2 P39935 TIF4631 SGEHLDLK SGEHLDLK 3 4817.867861369569 29.42456033403839 y 2 5 +300.159143 375.223813 y3^1 P39935 TIF4631 SGEHLDLK SGEHLDLK 3 8740.775194419808 29.42456033403839 y 1 3 +300.159143 406.219062 y7^2 P39935 TIF4631 SGEHLDLK SGEHLDLK 3 2026.7157241363188 29.42456033403839 y 2 7 +300.159143 488.307878 y4^1 P39935 TIF4631 SGEHLDLK SGEHLDLK 3 10000.0 29.42456033403839 y 1 4 +300.159143 625.36679 y5^1 P39935 TIF4631 SGEHLDLK SGEHLDLK 3 6782.1533255969025 29.42456033403839 y 1 5 +300.159143 639.273285 b6^1 P39935 TIF4631 SGEHLDLK SGEHLDLK 3 1844.4293802287832 29.42456033403839 b 1 6 +""" + + # create temp file + temp_lib = tempfile.NamedTemporaryFile(suffix='.tsv', delete=False) + temp_lib.write(library.encode()) + temp_lib.close() + + # create temp fasta + temp_fasta = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False) + temp_fasta.write(fasta.encode()) + temp_fasta.close() + + import_pipeline = libtransform.ProcessingPipeline([ + libtransform.DynamicLoader(), + libtransform.PrecursorInitializer(), + libtransform.AnnotateFasta([temp_fasta.name]), + libtransform.IsotopeGenerator(n_isotopes=4), + libtransform.RTNormalization(), + ]) + + # the prepare pipeline is used to prepare an alphabase compatible spectral library for extraction + prepare_pipeline = libtransform.ProcessingPipeline([ + libtransform.DecoyGenerator(decoy_type='diann'), + libtransform.FlattenLibrary(), + libtransform.InitFlatColumns(), + libtransform.LogFlatLibraryStats(), + ]) + + speclib = import_pipeline(temp_lib.name) + speclib = prepare_pipeline(speclib) + + assert len(speclib.precursor_df) == 4 + assert np.all([ col in speclib.precursor_df.columns for col in ['mz_library','rt_library','mobility_library', 'i_0', 'i_1', 'i_2', 'i_3']]) + assert speclib.precursor_df['decoy'].sum() == 2 + assert np.all(speclib.precursor_df['cardinality'] == [2,2,1,1]) + +speclib = test_library_transform() \ No newline at end of file diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py index d1f96c12..ffa752a1 100644 --- a/tests/unit_tests/test_workflow.py +++ b/tests/unit_tests/test_workflow.py @@ -255,7 +255,7 @@ def test_workflow_base(): assert os.path.exists(my_workflow.path) - assert isinstance(my_workflow.dia_data, bruker.TimsTOFTranspose) or isinstance(my_workflow.dia_data, thermo.Thermo) + #assert isinstance(my_workflow.dia_data, bruker.TimsTOFTranspose) or isinstance(my_workflow.dia_data, thermo.Thermo) assert isinstance(my_workflow.calibration_manager, manager.CalibrationManager) assert isinstance(my_workflow.optimization_manager, manager.OptimizationManager)