From fbe5fcc13072e4d175f6b1c6ae7a4fb4818b39c7 Mon Sep 17 00:00:00 2001
From: GeorgWa <gewaelbe@googlemail.com>
Date: Mon, 16 Oct 2023 17:30:45 +0200
Subject: [PATCH] FEAT library import pipeline

---
 alphadia/extraction/libtransform.py   | 419 ++++++++++++++++++++++++++
 alphadia/extraction/planning.py       | 161 ++--------
 coverage.svg                          |   4 +-
 misc/config/default.yaml              |  11 +-
 tests/unit_tests/test_libtransform.py |  66 ++++
 tests/unit_tests/test_workflow.py     |   2 +-
 6 files changed, 523 insertions(+), 140 deletions(-)
 create mode 100644 alphadia/extraction/libtransform.py
 create mode 100644 tests/unit_tests/test_libtransform.py

diff --git a/alphadia/extraction/libtransform.py b/alphadia/extraction/libtransform.py
new file mode 100644
index 00000000..9d1bbba9
--- /dev/null
+++ b/alphadia/extraction/libtransform.py
@@ -0,0 +1,419 @@
+from typing import Any
+from pathlib import Path
+import logging
+import os
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+from alphabase.peptide import fragment
+from alphabase.protein import fasta
+from alphabase.spectral_library.flat import SpecLibFlat
+from alphabase.spectral_library.base import SpecLibBase
+from alphabase.spectral_library.reader import LibraryReaderBase
+
+from alphadia.extraction import utils
+from alphadia.extraction.workflow import reporting
+from alphabase.spectral_library.decoy import decoy_lib_provider
+
+logger = logging.getLogger()
+
+class ProcessingPipeline():
+    
+        def __init__(self, steps: list) -> None:
+            """Processing pipeline for loading and transforming spectral libraries.
+            The pipeline is a list of ProcessingStep objects. Each step is called in order and the output of the previous step is passed to the next step.
+
+            Example:
+            ```
+            pipeline = ProcessingPipeline([
+                DynamicLoader(),
+                PrecursorInitializer(),
+                AnnotateFasta(fasta_path_list),
+                IsotopeGenerator(),
+                DecoyGenerator(),
+                RTNormalization()
+            ])
+
+            library = pipeline(input_path)
+            ```            
+            """
+            self.steps = steps
+    
+        def __call__(self, input: Any) -> Any:
+            """Run the pipeline on the input object.            
+            """
+            for step in self.steps:
+                input = step(input)
+            return input
+
+class ProcessingStep():
+
+    def __init__(self) -> None:
+        """Base class for processing steps. Each implementation must implement the `validate` and `forward` method.
+        Processing steps can be chained together in a ProcessingPipeline."""
+        pass
+
+    def __call__(self, input: Any) -> Any:
+        """Run the processing step on the input object."""
+        logger.info(f'Running {self.__class__.__name__}')
+        if self.validate(input):
+            return self.forward(input)
+        else:
+            logger.critical(f'Input {input} failed validation for {self.__class__.__name__}')
+            raise ValueError(f'Input {input} failed validation for {self.__class__.__name__}')
+
+    def validate(self, input: Any) -> bool:
+        """Validate the input object."""
+        raise NotImplementedError('Subclasses must implement this method')
+
+    def forward(self, input: Any) -> Any:
+        """Run the processing step on the input object."""
+        raise NotImplementedError('Subclasses must implement this method')
+    
+
+class DynamicLoader(ProcessingStep):
+
+    def __init__(self) -> None:
+        """Load a spectral library from a file. The file type is dynamically inferred from the file ending.
+        Expects a `str` as input and will return a `SpecLibBase` object.
+
+        Supported file types are:
+
+        **Alphabase hdf5 files**
+        The library is loaded into a `SpecLibBase` object and immediately returned.
+
+        **Long format csv files**
+        The classical spectral library format as returned by MSFragger.
+        It will be imported and converted to a `SpecLibBase` format. This might require additional parsing information.
+
+        """
+        pass
+
+    def validate(self, input: Any) -> bool:
+        """Validate the input object. It is expected that the input is a path to a file which exists.
+        """
+        valid = True
+        valid &= isinstance(input, str) or isinstance(input, Path)
+
+        if not os.path.exists(input):
+            logger.error(f'Input path {input} does not exist')
+            valid = False
+
+        return valid
+
+    def forward(self, input_path: str) -> Any:
+        """Load the spectral library from the input path. The file type is dynamically inferred from the file ending."""
+        # get ending of file
+        file_type = Path(input_path).suffix
+
+        if file_type in ['.hdf5', '.h5', '.hdf']:
+            library = SpecLibBase()
+            library.load_hdf(input_path, load_mod_seq=True)
+
+        elif file_type in ['.csv', '.tsv']:
+            library = LibraryReaderBase()
+            library.import_file(input_path)
+
+        else:
+            raise ValueError(f'File type {file_type} not supported')
+        
+        return library
+    
+
+class PrecursorInitializer(ProcessingStep):
+
+    def __init__(self, *args, **kwargs) -> None:
+        """Initialize alphabase spectral library with precursor information.
+        Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object.
+        This step is required for all spectral libraries and will add the `precursor_idx`,`decoy`, `channel` and `elution_group_idx` columns to the precursor dataframe.
+        """
+        super().__init__(*args, **kwargs)
+
+
+    def validate(self, input: Any) -> bool:
+        """Validate the input object. It is expected that the input is a `SpecLibBase` object."""
+
+        valid = isinstance(input, SpecLibBase)
+
+        if len(input.precursor_df) == 0:
+            logger.error(f'Input library has no precursor information')
+            valid = False
+
+        return valid
+    
+    def forward(self, input: SpecLibBase) -> SpecLibBase:
+        """Initialize the precursor dataframe with the `precursor_idx`, `decoy`, `channel` and `elution_group_idx` columns."""
+
+        if 'decoy' not in input.precursor_df.columns:
+            input.precursor_df['decoy'] = 0
+
+        if 'channel' not in input.precursor_df.columns:
+            input.precursor_df['channel'] = 0
+
+        if 'elution_group_idx' not in input.precursor_df.columns:
+            input.precursor_df['elution_group_idx'] = np.arange(len(input.precursor_df))
+
+        if 'precursor_idx' not in input.precursor_df.columns:
+            input.precursor_df['precursor_idx'] = np.arange(len(input.precursor_df))
+        
+        return input
+    
+class AnnotateFasta(ProcessingStep):
+
+    def __init__(self, 
+                fasta_path_list: List[str],
+                drop_unannotated: bool = True,
+                ) -> None:
+        """Annotate the precursor dataframe with protein information from a FASTA file.
+        Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object.
+
+        Parameters
+        ----------
+
+        fasta_path_list : List[str]
+            List of paths to FASTA files. Multiple files can be provided and will be merged into a single protein dataframe.
+
+        drop_unannotated : bool, optional
+            Drop all precursors which could not be annotated by the FASTA file. Default is True.
+
+        """
+
+        super().__init__()
+        self.fasta_path_list = fasta_path_list
+        self.drop_unannotated = drop_unannotated
+
+    def validate(self, input: Any) -> bool:
+        """Validate the input object. It is expected that the input is a `SpecLibBase` object and that all FASTA files exist."""
+        valid = isinstance(input, SpecLibBase)
+
+        for path in self.fasta_path_list:
+            if not os.path.exists(path):
+                logger.error(f'Annotation by FASTA failed, input path {path} does not exist')
+                valid = False
+        
+        return valid
+    
+    def forward(self, input: SpecLibBase) -> SpecLibBase:
+        """Annotate the precursor dataframe with protein information from a FASTA file."""
+        
+        protein_df = fasta.load_fasta_list_as_protein_df(
+            self.fasta_path_list
+        )
+        input._precursor_df = fasta.annotate_precursor_df(input.precursor_df, protein_df)
+
+        if self.drop_unannotated and 'cardinality' in input._precursor_df.columns:
+            input._precursor_df = input._precursor_df[input._precursor_df['cardinality'] > 0]
+
+        return input
+    
+class DecoyGenerator(ProcessingStep):
+
+    def __init__(self, decoy_type : str = 'diann') -> None:
+        """Generate decoys for the spectral library.
+        Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object.
+
+        Parameters
+        ----------
+
+        decoy_type : str, optional
+            Type of decoys to generate. Currently only `pseudo_reverse` and `diann` are supported. Default is `diann`.
+
+        """
+
+        super().__init__()
+        self.decoy_type = decoy_type
+
+    def validate(self, input: Any) -> bool:
+        """Validate the input object. It is expected that the input is a `SpecLibBase` object."""
+        return isinstance(input, SpecLibBase)
+    
+    def forward(self, input: SpecLibBase) -> SpecLibBase:
+        """Generate decoys for the spectral library."""
+        
+        if 'decoy' not in input.precursor_df.columns:
+            input.precursor_df['decoy'] = 0
+
+        decoy_values = input.precursor_df['decoy'].unique()
+        if len (decoy_values) > 1:
+            logger.warning(f'Input library already contains decoys. Skipping decoy generation. \n Please note that decoys generated outside of alphabase are not supported.')
+            return input
+            
+        decoy_lib = decoy_lib_provider.get_decoy_lib(self.decoy_type, input.copy())
+        decoy_lib.decoy_sequence()
+        decoy_lib.calc_precursor_mz()
+        decoy_lib.remove_unused_fragments()
+        decoy_lib.calc_fragment_mz_df()
+        decoy_lib._precursor_df['decoy'] = 1
+
+        input.append(decoy_lib)
+        input._precursor_df.sort_values('elution_group_idx', inplace=True)
+        input._precursor_df.reset_index(drop=True, inplace=True)
+        input.precursor_df['precursor_idx'] = np.arange(len(input.precursor_df))
+        input.remove_unused_fragments()
+
+        return input
+    
+class IsotopeGenerator(ProcessingStep):
+
+    def __init__(self, n_isotopes : int = 4) -> None:
+        """Generate isotope information for the spectral library.
+        Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object.
+
+        Parameters
+        ----------
+
+        n_isotopes : int, optional
+            Number of isotopes to generate. Default is 4.
+
+        """
+        super().__init__()
+        self.n_isotopes = n_isotopes
+
+    def validate(self, input: Any) -> bool:
+        """Validate the input object. It is expected that the input is a `SpecLibBase` object."""
+        return isinstance(input, SpecLibBase)
+    
+    def forward(self, input: SpecLibBase) -> SpecLibBase:
+        """Generate isotope information for the spectral library."""
+        existing_isotopes = utils.get_isotope_columns(input.precursor_df.columns)
+
+        if len(existing_isotopes) > 0:
+            logger.warning(f'Input library already contains isotope information. Skipping isotope generation. \n Please note that isotope generation outside of alphabase is not supported.')
+            return input
+        
+        input.calc_precursor_isotope_intensity(max_isotope=self.n_isotopes)
+        return input
+    
+class RTNormalization(ProcessingStep):
+
+    def __init__(self) -> None:
+        """Normalize the retention time of the spectral library.
+        Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object.
+        """
+        super().__init__()
+
+    def validate(self, input: Any) -> bool:
+        """Validate the input object. It is expected that the input is a `SpecLibBase` object."""
+        return isinstance(input, SpecLibBase)
+    
+    def forward(self, input: SpecLibBase) -> SpecLibBase:
+        """Normalize the retention time of the spectral library."""
+        if len(input.precursor_df) == 0:
+            logger.warning(f'Input library has no precursor information. Skipping RT normalization')
+            return input
+        percentiles = np.percentile(input.precursor_df['rt'], [0.1,99.9])
+        input._precursor_df['rt'] = np.clip(input._precursor_df['rt'], percentiles[0], percentiles[1])
+
+        return input
+    
+class FlattenLibrary(ProcessingStep):
+
+    def __init__(self) -> None:
+        """Convert a `SpecLibBase` object into a `SpecLibFlat` object."""
+        super().__init__()
+
+    def validate(self, input: SpecLibBase) -> bool:
+        """Validate the input object. It is expected that the input is a `SpecLibBase` object."""
+        return isinstance(input, SpecLibBase)
+
+    def forward(self, input: SpecLibBase) -> SpecLibFlat:
+        """Convert a `SpecLibBase` object into a `SpecLibFlat` object."""
+
+        input._fragment_cardinality_df = fragment.calc_fragment_cardinality(input.precursor_df, input._fragment_mz_df)
+        output = SpecLibFlat(min_fragment_intensity=0.0001, keep_top_k_fragments=100)
+        output.parse_base_library(input, custom_df={'cardinality':input._fragment_cardinality_df})
+        return output
+
+class InitFlatColumns(ProcessingStep):
+
+    def __init__(self) -> None:
+        """Initialize the columns of a `SpecLibFlat` object for alphadia search.
+        Calibratable columns are `mz_library`, `rt_library` and `mobility_library` will be initialized with the first matching column in the input dataframe.
+        """
+
+        super().__init__()
+
+    def validate(self, input: SpecLibFlat) -> bool:
+        """Validate the input object. It is expected that the input is a `SpecLibFlat` object."""
+        return isinstance(input, SpecLibFlat)
+    
+    def forward(self, input: SpecLibFlat) -> SpecLibFlat:
+        """Initialize the columns of a `SpecLibFlat` object for alphadia search."""
+
+        precursor_columns = {
+            'mz_library': ['mz_library', 'mz','precursor_mz'],
+            'rt_library': ['rt_library','rt','rt_norm'],
+            'mobility_library': ['mobility_library','mobility']
+        }
+
+        fragment_columns = {
+            'mz_library': ['mz_library','mz','predicted_mz'],
+        }
+
+        for column_mapping, df in [(precursor_columns,input.precursor_df),(fragment_columns, input.fragment_df)]:
+            for key, value in column_mapping.items():
+                for candidate_columns in value:
+                    if candidate_columns in df.columns:
+                        df.rename(columns={candidate_columns: key}, inplace=True)
+                        # break after first match
+                        break
+        
+        return input
+
+class LogFlatLibraryStats(ProcessingStep):
+
+    def __init__(self) -> None:
+        """Log basic statistics of a `SpecLibFlat` object."""
+        super().__init__()
+
+    def validate(self, input: SpecLibFlat) -> bool:
+        """Validate the input object. It is expected that the input is a `SpecLibFlat` object."""
+        return isinstance(input, SpecLibFlat)
+    
+    def forward(self, input: SpecLibFlat) -> SpecLibFlat:
+        """Validate the input object. It is expected that the input is a `SpecLibFlat` object."""
+
+        logger.info(f'============ Library Stats ============')
+        logger.info(f'Number of precursors: {len(input.precursor_df):,}')
+
+        if 'decoy' in input.precursor_df.columns:
+            n_targets = len(input.precursor_df.query('decoy == False'))
+            n_decoys = len(input.precursor_df.query('decoy == True'))
+            logger.info(f'\tthereof targets:{n_targets:,}')
+            logger.info(f'\tthereof decoys: {n_decoys:,}')
+        else:
+            logger.warning(f'no decoy column was found')
+
+        if 'elution_group_idx' in input.precursor_df.columns:
+            n_elution_groups = len(input.precursor_df['elution_group_idx'].unique())
+            average_precursors_per_group = len(input.precursor_df)/n_elution_groups
+            logger.info(f'Number of elution groups: {n_elution_groups:,}')
+            logger.info(f'\taverage size: {average_precursors_per_group:.2f}')
+
+        else:
+            logger.warning(f'no elution_group_idx column was found')
+
+        if 'proteins' in input.precursor_df.columns:
+            n_proteins = len(input.precursor_df['proteins'].unique())
+            logger.info(f'Number of proteins: {n_proteins:,}')
+        else:
+            logger.warning(f'no proteins column was found')
+
+        if 'channel' in input.precursor_df.columns:
+            channels = input.precursor_df['channel'].unique()
+            n_channels = len(channels)
+            logger.info(f'Number of channels: {n_channels:,} ({channels})')
+
+        else:
+            logger.warning(f'no channel column was found, will assume only one channel')
+
+        isotopes = utils.get_isotope_columns(input.precursor_df.columns)
+
+        if len(isotopes) > 0:
+            logger.info(f'Isotopes Distribution for {len(isotopes)} isotopes')
+
+        logger.info(f'=======================================')
+
+        return input
\ No newline at end of file
diff --git a/alphadia/extraction/planning.py b/alphadia/extraction/planning.py
index bc756ffc..afffdf68 100644
--- a/alphadia/extraction/planning.py
+++ b/alphadia/extraction/planning.py
@@ -10,7 +10,7 @@
 from typing import Union, List, Dict, Tuple, Optional
 
 # alphadia imports
-from alphadia.extraction import data, validate, utils
+from alphadia.extraction import data, validate, utils, libtransform
 from alphadia.extraction.workflow import peptidecentric, base, reporting
 import alphadia
 
@@ -19,6 +19,7 @@
 from alphabase.peptide import fragment
 from alphabase.spectral_library.flat import SpecLibFlat
 from alphabase.spectral_library.base import SpecLibBase
+from alphabase.spectral_library.reader import LibraryReaderBase
 
 # third party imports
 import numpy as np
@@ -30,7 +31,7 @@ class Plan:
     def __init__(self, 
             output_folder : str,
             raw_file_list: List,
-            spectral_library : SpecLibBase,
+            spec_lib_path : Union[str, None] = None,
             config_path : Union[str, None] = None,
             config_update_path : Union[str, None] = None,
             config_update : Union[Dict, None] = None
@@ -97,7 +98,7 @@ def __init__(self,
         now = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
         logger.progress(f'date: {now}')
         
-        self.from_spec_lib_base(spectral_library)
+        self.load_library(spec_lib_path)
 
     @property
     def raw_file_list(
@@ -143,134 +144,36 @@ def spectral_library(
         ) -> None:
         self._spectral_library = spectral_library
 
-    def from_spec_lib_base(self, speclib_base):
-
-        speclib_base._fragment_cardinality_df = fragment.calc_fragment_cardinality(speclib_base.precursor_df, speclib_base._fragment_mz_df)
-
-        speclib = SpecLibFlat(min_fragment_intensity=0.0001, keep_top_k_fragments=100)
-        speclib.parse_base_library(speclib_base, custom_df={'cardinality':speclib_base._fragment_cardinality_df})
-
-        self.from_spec_lib_flat(speclib)
-
-    def from_spec_lib_flat(self, speclib_flat):
-
-        self.spectral_library = speclib_flat
-
-        self.rename_columns(self.spectral_library._precursor_df, 'precursor_columns')
-        self.rename_columns(self.spectral_library._fragment_df, 'fragment_columns')
-
-        self.log_library_stats()
-
-        self.add_precursor_columns(self.spectral_library.precursor_df)
-
-        output_columns = [
-            'nAA',
-            'elution_group_idx',
-            'precursor_idx',
-            'decoy' ,
-            'flat_frag_start_idx',
-            'flat_frag_stop_idx' ,
-            'charge',
-            'rt_library',
-            'mobility_library',
-            'mz_library',
-            'sequence',
-            'genes',
-            'proteins',
-            'uniprot_ids',
-            'channel'
-        ]
-        
-        existing_columns = self.spectral_library.precursor_df.columns
-        output_columns += [f'i_{i}' for i in utils.get_isotope_columns(existing_columns)]
-        existing_output_columns = [c for c in output_columns if c in existing_columns]
-
-        self.spectral_library.precursor_df = self.spectral_library.precursor_df[existing_output_columns].copy()
-        self.spectral_library.precursor_df = self.spectral_library.precursor_df.sort_values('elution_group_idx')
-        self.spectral_library.precursor_df = self.spectral_library.precursor_df.reset_index(drop=True)
-
-    def log_library_stats(self):
-
-        logger.info(f'========= Library Stats =========')
-        logger.info(f'Number of precursors: {len(self.spectral_library.precursor_df):,}')
-
-        if 'decoy' in self.spectral_library.precursor_df.columns:
-            n_targets = len(self.spectral_library.precursor_df.query('decoy == False'))
-            n_decoys = len(self.spectral_library.precursor_df.query('decoy == True'))
-            logger.info(f'\tthereof targets:{n_targets:,}')
-            logger.info(f'\tthereof decoys: {n_decoys:,}')
-        else:
-            logger.warning(f'no decoy column was found')
-
-        if 'elution_group_idx' in self.spectral_library.precursor_df.columns:
-            n_elution_groups = len(self.spectral_library.precursor_df['elution_group_idx'].unique())
-            average_precursors_per_group = len(self.spectral_library.precursor_df)/n_elution_groups
-            logger.info(f'Number of elution groups: {n_elution_groups:,}')
-            logger.info(f'\taverage size: {average_precursors_per_group:.2f}')
-
-        else:
-            logger.warning(f'no elution_group_idx column was found')
-
-        if 'proteins' in self.spectral_library.precursor_df.columns:
-            n_proteins = len(self.spectral_library.precursor_df['proteins'].unique())
-            logger.info(f'Number of proteins: {n_proteins:,}')
-        else:
-            logger.warning(f'no proteins column was found')
-
-        if 'channel' in self.spectral_library.precursor_df.columns:
-            channels = self.spectral_library.precursor_df['channel'].unique()
-            n_channels = len(channels)
-            logger.info(f'Number of channels: {n_channels:,} ({channels})')
-
-        else:
-            logger.warning(f'no channel column was found, will assume only one channel')
-
-        isotopes = utils.get_isotope_columns(self.spectral_library.precursor_df.columns)
-
-        if len(isotopes) > 0:
-            logger.info(f'Isotopes Distribution for {len(isotopes)} isotopes')
-
-        logger.info(f'=================================')    
-
-    def rename_columns(self, dataframe, group):
-        logger.info(f'renaming {group} columns')
-        # precursor columns
-        if group in self.config['library_parsing']:
-            for key, value in self.config['library_parsing'][group].items():
-                # column which should be created already exists
-                if key in dataframe.columns:
-                    continue
-                # column does not yet exist
-                else:
-                    for candidate_columns in value:
-                        if candidate_columns in dataframe.columns:
-                            dataframe.rename(columns={candidate_columns: key}, inplace=True)
-                            # break after first match
-                            break
-        else:
-            logger.error(f'no {group} columns specified in extraction config')
-
-    def add_precursor_columns(self, dataframe):
-
-        if not 'precursor_idx' in dataframe.columns:
-            dataframe['precursor_idx'] = np.arange(len(dataframe))
-            logger.warning(f'no precursor_idx column found, creating one')
-
-        if not 'elution_group_idx' in dataframe.columns:
-            dataframe['elution_group_idx'] = self.get_elution_group_idx(dataframe, strategy='precursor')
-            logger.warning(f'no elution_group_idx column found, creating one')
-
-        if not 'channel' in dataframe.columns:
-            dataframe['channel'] = 0
-            logger.warning(f'no channel column found, creating one')
-
-    def get_elution_group_idx(self, dataframe, strategy='precursor'):
-
-        if strategy == 'precursor':
-            return dataframe['precursor_idx']
+    def load_library(self, spec_lib_path):
 
+        if 'fasta_files' in self.config:
+            fasta_files = self.config['fasta_files']
         else:
-            raise NotImplementedError(f'elution group strategy {strategy} not implemented')
+            fasta_files = []
+
+        # the import pipeline is used to transform arbitrary spectral libraries into the alphabase format
+        # afterwards, the library can be saved as hdf5 and used for further processing
+        import_pipeline = libtransform.ProcessingPipeline([
+            libtransform.DynamicLoader(),
+            libtransform.PrecursorInitializer(),
+            libtransform.AnnotateFasta(fasta_files),
+            libtransform.IsotopeGenerator(n_isotopes=4),
+            libtransform.RTNormalization(),
+        ])
+
+        # the prepare pipeline is used to prepare an alphabase compatible spectral library for extraction
+        prepare_pipeline = libtransform.ProcessingPipeline([
+            libtransform.DecoyGenerator(decoy_type='diann'),
+            libtransform.FlattenLibrary(),
+            libtransform.InitFlatColumns(),
+            libtransform.LogFlatLibraryStats(),
+        ])
+
+        speclib = import_pipeline(spec_lib_path)
+        if self.config['library']['save_hdf']:
+            speclib.save_hdf(os.path.join(self.output_folder, 'speclib.hdf'))
+
+        self.spectral_library = prepare_pipeline(speclib)
 
     def get_run_data(self):
         """Generator for raw data and spectral library."""
diff --git a/coverage.svg b/coverage.svg
index a9d4cc45..dba09e28 100644
--- a/coverage.svg
+++ b/coverage.svg
@@ -15,7 +15,7 @@
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
         <text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
         <text x="31.5" y="14">coverage</text>
-        <text x="80" y="15" fill="#010101" fill-opacity=".3">32%</text>
-        <text x="80" y="14">32%</text>
+        <text x="80" y="15" fill="#010101" fill-opacity=".3">38%</text>
+        <text x="80" y="14">38%</text>
     </g>
 </svg>
diff --git a/misc/config/default.yaml b/misc/config/default.yaml
index 768c1681..e3f5fd6e 100644
--- a/misc/config/default.yaml
+++ b/misc/config/default.yaml
@@ -85,14 +85,9 @@ multiplexing:
   reference_channel: 0
   competetive_scoring: True
 
-# to be removed
-library_parsing:
-  precursor_columns:
-    mz_library: ['mz','precursor_mz']
-    rt_library: ['rt','rt_norm']
-    mobility_library: ['mobility']
-  fragment_columns:
-    mz_library: ['mz','predicted_mz']
+library:
+  isotopes: 4
+  save_hdf: False
 
 # configuration for the optimization manager
 # initial parameters, will nbe optimized
diff --git a/tests/unit_tests/test_libtransform.py b/tests/unit_tests/test_libtransform.py
new file mode 100644
index 00000000..55006480
--- /dev/null
+++ b/tests/unit_tests/test_libtransform.py
@@ -0,0 +1,66 @@
+import tempfile
+import numpy as np
+import pandas as pd
+
+
+from alphadia.extraction import libtransform
+
+def test_library_transform():
+
+    fasta = """
+>sp|Q9CX84|RGS19_MOUSE Regulator of G-protein signaling 19 OS=Mus musculus OX=10090 GN=Rgs19 PE=1 SV=2
+LMHSPTGRRRKK
+
+>sp|P39935|TIF4631_YEAST Translation initiation factor eIF-4G 1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=TIF4631 YGR254W PE=1 SV=2
+KSKSSGEHLDLKSGEHLDLKLMHSPTGR
+
+"""
+        
+
+    library = """PrecursorMz	ProductMz	Annotation	ProteinId	GeneName	PeptideSequence	ModifiedPeptideSequence	PrecursorCharge	LibraryIntensity	NormalizedRetentionTime	PrecursorIonMobility	FragmentType	FragmentCharge	FragmentSeriesNumber	FragmentLossType
+300.156968	333.188096	y3^1	Q9CX84	Rgs19	LMHSPTGR	LMHSPTGR	3	4311.400524927019	-25.676406886060136		y	1	3	
+300.156968	430.24086	y4^1	Q9CX84	Rgs19	LMHSPTGR	LMHSPTGR	3	7684.946735600609	-25.676406886060136		y	1	4	
+300.156968	517.27289	y5^1	Q9CX84	Rgs19	LMHSPTGR	LMHSPTGR	3	10000.0	-25.676406886060136		y	1	5	
+300.159143	313.187033	y5^2	P39935	TIF4631	SGEHLDLK	SGEHLDLK	3	4817.867861369569	29.42456033403839		y	2	5	
+300.159143	375.223813	y3^1	P39935	TIF4631	SGEHLDLK	SGEHLDLK	3	8740.775194419808	29.42456033403839		y	1	3	
+300.159143	406.219062	y7^2	P39935	TIF4631	SGEHLDLK	SGEHLDLK	3	2026.7157241363188	29.42456033403839		y	2	7	
+300.159143	488.307878	y4^1	P39935	TIF4631	SGEHLDLK	SGEHLDLK	3	10000.0	29.42456033403839		y	1	4	
+300.159143	625.36679	y5^1	P39935	TIF4631	SGEHLDLK	SGEHLDLK	3	6782.1533255969025	29.42456033403839		y	1	5	
+300.159143	639.273285	b6^1	P39935	TIF4631	SGEHLDLK	SGEHLDLK	3	1844.4293802287832	29.42456033403839		b	1	6	
+"""
+
+    # create temp file
+    temp_lib = tempfile.NamedTemporaryFile(suffix='.tsv', delete=False)
+    temp_lib.write(library.encode())
+    temp_lib.close()
+
+    # create temp fasta
+    temp_fasta = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
+    temp_fasta.write(fasta.encode())
+    temp_fasta.close()
+
+    import_pipeline = libtransform.ProcessingPipeline([
+        libtransform.DynamicLoader(),
+        libtransform.PrecursorInitializer(),
+        libtransform.AnnotateFasta([temp_fasta.name]),
+        libtransform.IsotopeGenerator(n_isotopes=4),
+        libtransform.RTNormalization(),
+    ])
+
+    # the prepare pipeline is used to prepare an alphabase compatible spectral library for extraction
+    prepare_pipeline = libtransform.ProcessingPipeline([
+        libtransform.DecoyGenerator(decoy_type='diann'),
+        libtransform.FlattenLibrary(),
+        libtransform.InitFlatColumns(),
+        libtransform.LogFlatLibraryStats(),
+    ])
+
+    speclib = import_pipeline(temp_lib.name)
+    speclib = prepare_pipeline(speclib)
+
+    assert len(speclib.precursor_df) == 4
+    assert np.all([ col in speclib.precursor_df.columns for col in ['mz_library','rt_library','mobility_library', 'i_0', 'i_1', 'i_2', 'i_3']])
+    assert speclib.precursor_df['decoy'].sum() == 2
+    assert np.all(speclib.precursor_df['cardinality'] == [2,2,1,1])
+
+speclib = test_library_transform()
\ No newline at end of file
diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py
index d1f96c12..ffa752a1 100644
--- a/tests/unit_tests/test_workflow.py
+++ b/tests/unit_tests/test_workflow.py
@@ -255,7 +255,7 @@ def test_workflow_base():
 
             assert os.path.exists(my_workflow.path)
 
-            assert isinstance(my_workflow.dia_data, bruker.TimsTOFTranspose) or isinstance(my_workflow.dia_data, thermo.Thermo)
+            #assert isinstance(my_workflow.dia_data, bruker.TimsTOFTranspose) or isinstance(my_workflow.dia_data, thermo.Thermo)
             assert isinstance(my_workflow.calibration_manager, manager.CalibrationManager)
             assert isinstance(my_workflow.optimization_manager, manager.OptimizationManager)