From 34b41b9b998b26316cc158b8706655d92cb38f89 Mon Sep 17 00:00:00 2001
From: GeorgWa <gewaelbe@googlemail.com>
Date: Tue, 17 Oct 2023 00:43:22 +0200
Subject: [PATCH] CHORE major refactoring

---
 alphadia/__init__.py                          |    2 +-
 alphadia/analysis/__init__.py                 |    1 -
 alphadia/analysis/actions.py                  |  144 -
 alphadia/annotation/__init__.py               |   57 -
 alphadia/annotation/identification.py         |  288 --
 alphadia/annotation/library.py                |   32 -
 alphadia/annotation/percolation.py            |  433 ---
 alphadia/annotation/psm_stats.py              |  266 --
 alphadia/calibration.py                       |  943 +++++-
 alphadia/cli.py                               |  202 +-
 alphadia/{extraction => }/data/bruker.py      |   27 +-
 alphadia/{extraction => }/data/thermo.py      |   14 +-
 alphadia/dia.py                               |   82 -
 alphadia/extraction/__init__.py               |    0
 alphadia/extraction/calibration.py            |  811 -----
 alphadia/{extraction => }/fdr.py              |   17 +-
 alphadia/{extraction => }/fdrexperimental.py  |   73 +-
 alphadia/{extraction => }/features.py         |   10 +-
 alphadia/gui.py                               |    1 -
 alphadia/{extraction => }/hybridselection.py  |   38 +-
 alphadia/library.py                           | 1851 -----------
 alphadia/{extraction => }/libtransform.py     |   94 +-
 alphadia/{extraction => }/numba/config.py     |   12 +-
 alphadia/{extraction => }/numba/fragments.py  |    9 +-
 alphadia/{extraction => }/numba/numeric.py    |    7 +
 alphadia/{extraction => }/planning.py         |   35 +-
 alphadia/{extraction => }/plexscoring.py      |   34 +-
 alphadia/{extraction => }/plotting/cycle.py   |   26 +-
 alphadia/{extraction => }/plotting/debug.py   |   12 +-
 alphadia/{extraction => }/plotting/utils.py   |    8 +-
 alphadia/prefilter.py                         | 1265 --------
 alphadia/preprocessing/__init__.py            |  186 --
 alphadia/preprocessing/calibration.py         |  191 --
 alphadia/preprocessing/connecting.py          |  142 -
 alphadia/preprocessing/deisotoping.py         |  212 --
 alphadia/preprocessing/msmsgeneration.py      |  394 ---
 alphadia/preprocessing/peakfinding.py         |  363 ---
 alphadia/preprocessing/peakstats.py           |  636 ----
 alphadia/preprocessing/smoothing.py           |  210 --
 alphadia/{extraction => }/quadrupole.py       |    8 +-
 alphadia/smoothing.py                         | 2858 -----------------
 alphadia/{extraction => }/testing.py          |   25 +-
 alphadia/thermo.py                            |  215 --
 alphadia/{extraction => }/utils.py            |    9 -
 alphadia/{extraction => }/validate.py         |   13 +-
 alphadia/venn.py                              |  473 ---
 alphadia/{extraction => }/workflow/base.py    |   14 +-
 alphadia/{extraction => }/workflow/manager.py |   25 +-
 .../workflow/peptidecentric.py                |   23 +-
 .../{extraction => }/workflow/reporting.py    |   68 +-
 nbs/search/library_search.ipynb               |   41 +-
 51 files changed, 1129 insertions(+), 11771 deletions(-)
 delete mode 100644 alphadia/analysis/__init__.py
 delete mode 100644 alphadia/analysis/actions.py
 delete mode 100644 alphadia/annotation/__init__.py
 delete mode 100644 alphadia/annotation/identification.py
 delete mode 100644 alphadia/annotation/library.py
 delete mode 100644 alphadia/annotation/percolation.py
 delete mode 100644 alphadia/annotation/psm_stats.py
 rename alphadia/{extraction => }/data/bruker.py (98%)
 rename alphadia/{extraction => }/data/thermo.py (99%)
 delete mode 100644 alphadia/dia.py
 delete mode 100644 alphadia/extraction/__init__.py
 delete mode 100644 alphadia/extraction/calibration.py
 rename alphadia/{extraction => }/fdr.py (98%)
 rename alphadia/{extraction => }/fdrexperimental.py (87%)
 rename alphadia/{extraction => }/features.py (99%)
 rename alphadia/{extraction => }/hybridselection.py (99%)
 delete mode 100644 alphadia/library.py
 rename alphadia/{extraction => }/libtransform.py (94%)
 rename alphadia/{extraction => }/numba/config.py (97%)
 rename alphadia/{extraction => }/numba/fragments.py (98%)
 rename alphadia/{extraction => }/numba/numeric.py (99%)
 rename alphadia/{extraction => }/planning.py (91%)
 rename alphadia/{extraction => }/plexscoring.py (98%)
 rename alphadia/{extraction => }/plotting/cycle.py (94%)
 rename alphadia/{extraction => }/plotting/debug.py (98%)
 rename alphadia/{extraction => }/plotting/utils.py (97%)
 delete mode 100644 alphadia/prefilter.py
 delete mode 100644 alphadia/preprocessing/__init__.py
 delete mode 100644 alphadia/preprocessing/calibration.py
 delete mode 100644 alphadia/preprocessing/connecting.py
 delete mode 100644 alphadia/preprocessing/deisotoping.py
 delete mode 100644 alphadia/preprocessing/msmsgeneration.py
 delete mode 100644 alphadia/preprocessing/peakfinding.py
 delete mode 100644 alphadia/preprocessing/peakstats.py
 delete mode 100644 alphadia/preprocessing/smoothing.py
 rename alphadia/{extraction => }/quadrupole.py (98%)
 delete mode 100644 alphadia/smoothing.py
 rename alphadia/{extraction => }/testing.py (96%)
 delete mode 100644 alphadia/thermo.py
 rename alphadia/{extraction => }/utils.py (98%)
 rename alphadia/{extraction => }/validate.py (99%)
 delete mode 100644 alphadia/venn.py
 rename alphadia/{extraction => }/workflow/base.py (96%)
 rename alphadia/{extraction => }/workflow/manager.py (98%)
 rename alphadia/{extraction => }/workflow/peptidecentric.py (98%)
 rename alphadia/{extraction => }/workflow/reporting.py (93%)

diff --git a/alphadia/__init__.py b/alphadia/__init__.py
index 17161df3..fcf5ab7d 100644
--- a/alphadia/__init__.py
+++ b/alphadia/__init__.py
@@ -40,4 +40,4 @@
 }
 __extra_requirements__ = {
     "development": "requirements_development.txt",
-}
+}
\ No newline at end of file
diff --git a/alphadia/analysis/__init__.py b/alphadia/analysis/__init__.py
deleted file mode 100644
index 8dbe83e5..00000000
--- a/alphadia/analysis/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-import alphadia.analysis.actions as actions
diff --git a/alphadia/analysis/actions.py b/alphadia/analysis/actions.py
deleted file mode 100644
index ff2cd179..00000000
--- a/alphadia/analysis/actions.py
+++ /dev/null
@@ -1,144 +0,0 @@
-"""A module to analyse timsTOF DIA data."""
-
-import logging
-import collections
-import abc
-
-import alphatims.bruker
-import alphadia.smoothing
-
-class ActionDeque(collections.deque):
-
-    def run_all_consecutive_actions(self) -> None:
-        if len(self) == 0:
-            logging.info("No actions in ActionDeque")
-        else:
-            if len(self) == 1:
-                logging.info("Running 1 action in ActionDeque")
-            else:
-                logging.info(f"Running {len(self)} actions in ActionDeque")
-            for action_to_take in self:
-                action_to_take.run()
-
-
-class Action(abc.ABC):
-
-    def __init__(self, **parameters):
-        self.update_parameters(**parameters)
-
-    @property
-    @abc.abstractmethod
-    def default_parameters(self) -> dict:
-        pass
-
-    @property
-    def parameters(self) -> dict:
-        if not hasattr(self, "_parameters"):
-            self._parameters = self.default_parameters
-        return self._parameters
-
-    def update_parameters(self, **parameters) -> None:
-        self._parameters = self.parse_valid_parameters(**parameters)
-
-    def parse_valid_parameters(self, **parameters) -> None:
-        current_parameters = self.parameters
-        for parameter_key, parameter_value in parameters.items():
-            current_parameters[parameter_key] = parameter_value
-        return current_parameters
-
-    def set_output(self, output: type) -> type:
-        self._output = output
-
-    @property
-    def output(self) -> type:
-        if not hasattr(self, "_output"):
-            raise ValueError("No output has been defined for this action")
-        return self._output
-
-    @property
-    def is_completed(self) -> bool:
-        return hasattr(self, "_output")
-
-    def run(self, redo_completed: bool = False, **parameters) -> None:
-        if redo_completed or not self.is_completed:
-            if len(parameters) > 0:
-                self.update_parameters(**parameters)
-            logging.info(f"Running '{self.__class__.__name__}'")
-            try:
-                output = self._run()
-                self.set_output(output)
-            except Exception as raised_exception:
-                if hasattr(self, "_output"):
-                    del self._output
-                raise raised_exception
-        else:
-            logging.info(
-                f"'{self.__class__.__name__}' is already completed"
-            )
-        return self.output
-
-    @property
-    @abc.abstractmethod
-    def runnable_function(self) -> callable:
-        pass
-
-    def _run(self) -> type:
-        return self.runnable_function(**self.parameters)
-
-    # @staticmethod
-    # def create(name):
-    #     if name == "import":
-    #         return ImportAction()
-
-class ImportAction(Action):
-
-    @property
-    def default_parameters(self) -> dict:
-        return {
-            "bruker_d_folder_name": None,
-        }
-
-    @property
-    def runnable_function(self) -> callable:
-        return alphatims.bruker.TimsTOF
-
-class ConnectAction(Action):
-
-    @property
-    def default_parameters(self) -> dict:
-        return {
-            "scan_tolerance": 6,
-            "dia_data": None,
-            "multiple_frames": False,
-            "ms1": True,
-            "ms2": True,
-        }
-
-    @property
-    def runnable_function(self) -> callable:
-        # import functools
-        # _func = functools.partial(
-        #     alphadia.smoothing.get_connections_within_cycle,
-        #     scan_max_index=self.parameters["dia_data"].scan_max_index,
-        #     dia_mz_cycle=self.parameters["dia_data"].dia_mz_cycle
-        # )
-        def _func2(**kwargs):
-            parameters = self.parameters.copy()
-            dia_data = parameters.pop("dia_data")
-            return alphadia.smoothing.get_connections_within_cycle(
-                scan_max_index=dia_data.scan_max_index,
-                dia_mz_cycle=dia_data.dia_mz_cycle,
-                **parameters,
-            )
-        # parameters = self.parameters.copy()
-        # dia_data = parameters.pop("dia_data")
-        # _func = functools.partial(
-        #     alphadia.smoothing.get_connections_within_cycle,
-        #     scan_max_index=dia_data.scan_max_index,
-        #     dia_mz_cycle=dia_data.dia_mz_cycle,
-        #     **parameters,
-        # )
-        # result = _func()
-        # def _func2(**kwargs):
-        #     return result
-        return _func2
diff --git a/alphadia/annotation/__init__.py b/alphadia/annotation/__init__.py
deleted file mode 100644
index 34bd5279..00000000
--- a/alphadia/annotation/__init__.py
+++ /dev/null
@@ -1,57 +0,0 @@
-"""Identify pseudo MSMS data data."""
-
-from . import identification
-from . import psm_stats
-from . import library
-from . import percolation
-
-
-class Annotator:
-
-    def set_ions(self, precursor_df, fragment_df):
-        # self.preprocessing_workflow = preprocessing_workflow
-        self.precursor_df = precursor_df
-        self.fragment_df = fragment_df
-
-    def set_library(self, library):
-        self.library = library
-
-    def set_msms_identifier(self):
-        self.msms_identifier = identification.MSMSIdentifier()
-        # self.msms_identifier.set_preprocessor(self.preprocessing_workflow)
-        self.msms_identifier.set_ions(
-            self.precursor_df,
-            self.fragment_df,
-        )
-        self.msms_identifier.set_library(self.library)
-        self.msms_identifier.identify()
-
-    def set_psm_stats_calculator(self):
-        self.psm_stats_calculator = psm_stats.PSMStatsCalculator()
-        # self.psm_stats_calculator.set_preprocessor(self.preprocessing_workflow)
-        self.psm_stats_calculator.set_ions(self.precursor_df, self.fragment_df)
-        self.psm_stats_calculator.set_library(self.library)
-        self.psm_stats_calculator.set_annotation(
-            self.msms_identifier.annotation
-        )
-        self.psm_stats_calculator.estimate_mz_tolerance()
-
-    def set_percolator(self):
-        self.percolator = percolation.Percolator()
-        self.percolator.set_annotation(
-            self.psm_stats_calculator.annotation
-        )
-        self.percolator.percolate()
-
-    def run_default(self):
-        self.set_msms_identifier()
-        self.set_psm_stats_calculator()
-        self.msms_identifier.update_ppm_values_from_stats_calculator(
-            self.psm_stats_calculator
-        )
-        self.msms_identifier.identify()
-        self.psm_stats_calculator.set_annotation(
-            self.msms_identifier.annotation
-        )
-        self.psm_stats_calculator.update_annotation_stats()
-        self.set_percolator()
diff --git a/alphadia/annotation/identification.py b/alphadia/annotation/identification.py
deleted file mode 100644
index 2f6742cb..00000000
--- a/alphadia/annotation/identification.py
+++ /dev/null
@@ -1,288 +0,0 @@
-"""Annotate pseudo MSMS spectra."""
-
-import logging
-
-import numpy as np
-
-import alphatims.utils
-
-
-class MSMSIdentifier:
-
-    def __init__(
-        self,
-        precursor_ppm=50,
-        fragment_ppm=50,
-        min_size=10,
-        ppm_mean=0,
-        min_hit_count=1,
-        append_stats=True,
-        top_n_hits=1,
-    ):
-        self.precursor_ppm = precursor_ppm
-        self.fragment_ppm = fragment_ppm
-        self.min_size = min_size
-        self.ppm_mean = ppm_mean
-        self.min_hit_count = min_hit_count
-        self.append_stats = append_stats
-        self.top_n_hits = top_n_hits
-
-    def set_ions(self, precursor_df, fragment_df):
-        self.precursor_df = precursor_df
-        self.fragment_df = fragment_df
-
-    def set_library(self, library):
-        self.library = library
-
-    def update_ppm_values_from_stats_calculator(
-        self,
-        psm_stats_calculator
-    ):
-        self.ppm_mean = psm_stats_calculator.ppm_mean
-        self.fragment_ppm = psm_stats_calculator.ppm_width
-        self.precursor_ppm = psm_stats_calculator.ppm_width
-
-    def identify(
-        self,
-    ):
-        logging.info(
-            f"Quick library annotation of mono isotopes with {self.ppm_mean=} and {self.precursor_ppm=}"
-        )
-        spectrum_sizes = (self.precursor_df.fragment_end - self.precursor_df.fragment_start).values
-        o = np.argsort(self.precursor_df.tof_indices.values)
-        p_mzs = self.precursor_df.mz_average.values[o]
-        lower = np.empty(
-            len(self.precursor_df),
-            dtype=np.int64
-        )
-        upper = np.empty(
-            len(self.precursor_df),
-            dtype=np.int64
-        )
-        lower[o] = np.searchsorted(
-            self.library.predicted_library_df.precursor_mz.values,
-            p_mzs / (1 + self.precursor_ppm * 10**-6)
-        )
-        upper[o] = np.searchsorted(
-            self.library.predicted_library_df.precursor_mz.values,
-            p_mzs * (1 + self.precursor_ppm * 10**-6)
-        )
-        logging.info(
-            f"PSMs to test: {np.sum(((upper - lower) * (spectrum_sizes >= self.min_size)))}"
-        )
-        (
-            precursor_indices,
-            precursor_indptr,
-            hit_counts,
-            frequency_counts,
-            db_indices,
-        ) = annotate(
-            range(len(lower)),
-            # range(100),
-            self.library.predicted_library_df.frag_start_idx.values,
-            self.library.predicted_library_df.frag_end_idx.values,
-            self.precursor_df.fragment_start.values,
-            self.precursor_df.fragment_end.values,
-            self.fragment_df.mz_average.values * (1 + self.ppm_mean * 10**-6),
-            self.fragment_df[
-                [i for i in self.fragment_df.columns if "correlation" in i]
-            ].prod(axis=1).values, # TODO
-            self.fragment_ppm,
-            lower,
-            upper,
-            self.library.y_mzs,
-            self.library.b_mzs,
-            self.min_size,
-            self.min_hit_count,
-            self.top_n_hits,
-        )
-
-        precursor_selection = np.repeat(precursor_indices, precursor_indptr)
-        hits = self.precursor_df.iloc[precursor_selection].reset_index()
-        hits["inet_index"] = precursor_selection
-        hits["candidates"] = (upper - lower)[precursor_selection]
-        hits["total_peaks"] = spectrum_sizes[precursor_selection]
-        hits["db_index"] = db_indices.astype(np.int64)
-        # hits["counts"] = np.repeat(hit_counts, precursor_indptr)
-        hits["counts"] = hit_counts
-        hits["frequency_counts"] = frequency_counts
-        self.annotation = hits.rename(columns={"charge": "precursor_charge"})
-        self.annotation = self.annotation.join(self.library.predicted_library_df, on="db_index")
-        self.annotation["im_diff"] = self.annotation.mobility_pred - self.annotation.mobility_values
-        self.annotation["mz_diff"] = self.annotation.precursor_mz - self.annotation.mz_values
-        self.annotation["ppm_diff"] = self.annotation.mz_diff / self.annotation.precursor_mz * 10**6
-        self.annotation["target"] = ~self.annotation.decoy
-        self.annotation.reset_index(drop=True, inplace=True)
-
-
-def annotate(
-    iterable,
-    frag_start_idx,
-    frag_end_idx,
-    frag_start,
-    frag_end,
-    frag_mzs,
-    frag_weights,
-    fragment_ppm,
-    lower,
-    upper,
-    y_mzs,
-    b_mzs,
-    min_size,
-    min_hit_count,
-    top_n_hits,
-):
-    import multiprocessing
-
-    def starfunc(index):
-        # return alphadia.prefilter.annotate_pool(
-        return annotate_pool2(
-            index,
-            frag_start_idx,
-            frag_end_idx,
-            frag_start,
-            frag_end,
-            frag_mzs,
-            frag_weights,
-            fragment_ppm,
-            lower,
-            upper,
-            y_mzs,
-            b_mzs,
-            min_size,
-            min_hit_count,
-            top_n_hits,
-        )
-    precursor_indices = []
-    max_hit_counts = []
-    max_frequency_counts = []
-    db_indices = []
-    precursor_indptr = []
-    with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-        for (
-            precursor_index,
-            hit_count,
-            frequency_count,
-            db_indices_,
-        ) in alphatims.utils.progress_callback(
-            pool.imap(starfunc, iterable),
-            total=len(iterable),
-            include_progress_callback=True
-        ):
-            # if hit_count >= min_hit_count:
-            if True:
-                precursor_indices.append(precursor_index)
-                precursor_indptr.append(len(db_indices_))
-                max_hit_counts.append(hit_count)
-                max_frequency_counts.append(frequency_count)
-                db_indices.append(db_indices_)
-    return (
-        np.array(precursor_indices),
-        np.array(precursor_indptr),
-        # np.array(max_hit_counts),
-        np.concatenate(max_hit_counts),
-        np.concatenate(max_frequency_counts),
-        np.concatenate(db_indices),
-    )
-
-
-@alphatims.utils.njit(nogil=True)
-def annotate_pool2(
-    index,
-    frag_start_idx,
-    frag_end_idx,
-    frag_start,
-    frag_end,
-    frag_mzs,
-    frag_weights,
-    fragment_ppm,
-    lower,
-    upper,
-    y_mzs,
-    b_mzs,
-    min_size,
-    min_hit_count,
-    top_n_hits,
-):
-    start = frag_start[index]
-    end = frag_end[index]
-    results = [0][1:] # this defines the type
-    hit_counts = [0][1:] # this defines the type
-    frequency_counts = [0.0][1:] # this defines the type
-    if (end - start) < min_size:
-        return index, hit_counts, frequency_counts, results
-    if (end - start) < min_hit_count:
-        return index, hit_counts, frequency_counts, results
-    frequencies = frag_weights[start: end]
-    fragment_mzs = frag_mzs[start: end]
-    max_hit_count = min_hit_count
-    for db_index in range(lower[index], upper[index]):
-        frag_start = frag_start_idx[db_index]
-        frag_end = frag_end_idx[db_index]
-        y_hits, y_frequency = hit_and_frequency_count(
-            fragment_mzs,
-            frequencies,
-            y_mzs[frag_start: frag_end][::-1],
-            fragment_ppm,
-        )
-        b_hits, b_frequency = hit_and_frequency_count(
-            fragment_mzs,
-            frequencies,
-            b_mzs[frag_start: frag_end],
-            fragment_ppm,
-        )
-        hit_count = b_hits + y_hits
-        frequency_count = b_frequency + y_frequency
-        if top_n_hits == 1:
-            if frequency_count == max_hit_count:
-                results.append(db_index)
-                hit_counts.append(hit_count)
-                frequency_counts.append(frequency_count)
-            elif frequency_count > max_hit_count:
-                results = [db_index]
-                hit_counts = [hit_count]
-                frequency_counts = [frequency_count]
-                max_hit_count = hit_count
-        elif frequency_count >= min_hit_count:
-            if len(results) >= top_n_hits:
-                for min_index, freq_count in enumerate(frequency_counts):
-                    if freq_count == min_hit_count:
-                        results[min_index] = db_index
-                        hit_counts[min_index] = hit_count
-                        frequency_counts[min_index] = frequency_count
-                        break
-                min_hit_count = min(frequency_counts)
-            else:
-                results.append(db_index)
-                hit_counts.append(hit_count)
-                frequency_counts.append(frequency_count)
-    # return index, max_hit_count, results
-    return index, hit_counts, frequency_counts, results
-
-
-
-@alphatims.utils.njit(nogil=True)
-def hit_and_frequency_count(
-    fragment_mzs,
-    frequencies,
-    database_mzs,
-    fragment_ppm,
-):
-    fragment_index = 0
-    database_index = 0
-    hits = 0
-    summed_frequency = 0
-    while (fragment_index < len(fragment_mzs)) and (database_index < len(database_mzs)):
-        fragment_mz = fragment_mzs[fragment_index]
-        database_mz = database_mzs[database_index]
-        frequency = frequencies[fragment_index]
-        if fragment_mz < (database_mz / (1 + 10**-6 * fragment_ppm)):
-            fragment_index += 1
-        elif database_mz < (fragment_mz / (1 + 10**-6 * fragment_ppm)):
-            database_index += 1
-        else:
-            hits += 1
-            summed_frequency += frequency
-            fragment_index += 1
-            database_index += 1
-    return hits, summed_frequency
diff --git a/alphadia/annotation/library.py b/alphadia/annotation/library.py
deleted file mode 100644
index 5548a77d..00000000
--- a/alphadia/annotation/library.py
+++ /dev/null
@@ -1,32 +0,0 @@
-"""Import library."""
-
-import logging
-
-import numpy as np
-
-import alphabase.io.hdf
-
-
-class Library:
-
-    def import_from_file(self, library_file_name, is_already_mmapped=True):
-        logging.info("Loading library")
-        self.library_file_name = library_file_name
-        self.lib = alphabase.io.hdf.HDF_File(
-            self.library_file_name,
-            read_only=is_already_mmapped,
-        )
-
-        predicted_library_df = self.lib.library.precursor_df[...]
-        # predicted_library_df.sort_values(by=["rt_pred", "mobility_pred"], inplace=True)
-        predicted_library_df.sort_values(by="precursor_mz", inplace=True)
-        predicted_library_df.reset_index(level=0, inplace=True)
-        predicted_library_df.rename(columns={"index": "original_index"}, inplace=True)
-        predicted_library_df.decoy = predicted_library_df.decoy.astype(np.bool_)
-
-        self.y_mzs = self.lib.library.fragment_mz_df.y_z1.mmap
-        self.b_mzs = self.lib.library.fragment_mz_df.b_z1.mmap
-        self.y_ions_intensities = self.lib.library.fragment_intensity_df.y_z1.mmap
-        self.b_ions_intensities = self.lib.library.fragment_intensity_df.b_z1.mmap
-
-        self.predicted_library_df = predicted_library_df
diff --git a/alphadia/annotation/percolation.py b/alphadia/annotation/percolation.py
deleted file mode 100644
index 823fcd41..00000000
--- a/alphadia/annotation/percolation.py
+++ /dev/null
@@ -1,433 +0,0 @@
-"""Percolate results."""
-
-import logging
-
-import numpy as np
-import pandas as pd
-import sklearn
-import sklearn.model_selection
-import sklearn.decomposition
-import sklearn.neighbors
-import sklearn.preprocessing
-import sklearn.ensemble
-import sklearn.pipeline
-
-
-import alphatims.utils
-
-
-class Percolator:
-
-    def __init__(
-        self,
-        fdr=0.01,
-        train_fdr_level_pre_calibration=0.1,
-        train_fdr_level_post_calibration=0.33,
-        n_neighbors=4,
-        test_size=0.5,
-        random_state=0,
-    ):
-        self.fdr = fdr
-        self.train_fdr_level_pre_calibration = train_fdr_level_pre_calibration
-        self.train_fdr_level_post_calibration = train_fdr_level_post_calibration
-        self.n_neighbors = n_neighbors
-        self.test_size = test_size
-        self.random_state = random_state
-
-    def set_annotation(self, annotation):
-        self.annotation = annotation
-
-    def percolate(self):
-        logging.info("Percolating PSMs")
-        val_names = [
-            "counts",
-            "frequency_counts",
-            "ppm_diff",
-            "im_diff",
-            "charge",
-            "total_peaks",
-            "nAA",
-            "b_hit_counts",
-            "y_hit_counts",
-            "b_mean_ppm",
-            "y_mean_ppm",
-            "relative_found_b_int",
-            "relative_missed_b_int",
-            "relative_found_y_int",
-            "relative_missed_y_int",
-            "relative_found_int",
-            "relative_missed_int",
-            "pearsons",
-            "pearsons_log",
-            "candidates",
-        ]
-        logging.info("Calculating quick log odds")
-        score_df = self.annotation.copy()
-        log_odds = calculate_log_odds_product(
-            score_df,
-            val_names,
-        )
-        # log_odds = score_df["frequency_counts"].values
-        score_df["log_odds"] = log_odds
-        # score_df = alphadia.prefilter.train_and_score(
-        #     score_df,
-        #     val_names,
-        #     ini_score="log_odds",
-        #     train_fdr_level=train_fdr_level_pre_calibration,
-        # ).reset_index(drop=True)
-        score_df = get_q_values(score_df, "log_odds", 'decoy', drop=True)
-        score_df_above_fdr = score_df[
-            (score_df.q_value < self.fdr) & (score_df.target)
-        ].reset_index(drop=True)
-        logging.info(
-            f"Found {len(score_df_above_fdr)} targets for calibration"
-        )
-        score_df_above_fdr["im_pred"] = score_df_above_fdr.mobility_pred
-        score_df_above_fdr["im_values"] = score_df_above_fdr.mobility_values
-        self.predictors = {}
-        for dimension in ["rt", "im"]:
-            X = score_df_above_fdr[f"{dimension}_pred"].values.reshape(-1, 1)
-            y = score_df_above_fdr[f"{dimension}_values"].values
-            (
-                X_train,
-                X_test,
-                y_train,
-                y_test
-            ) = sklearn.model_selection.train_test_split(
-                X,
-                y,
-                test_size=self.test_size,
-                random_state=self.random_state,
-            )
-            self.predictors[dimension] = sklearn.neighbors.KNeighborsRegressor(
-                n_neighbors=self.n_neighbors,
-                # weights="distance",
-                n_jobs=alphatims.utils.set_threads(alphatims.utils.MAX_THREADS)
-            )
-            self.predictors[dimension].fit(X_train, y_train)
-            score_df_above_fdr[f"{dimension}_calibrated"] = self.predictors[dimension].predict(
-                score_df_above_fdr[f"{dimension}_pred"].values.reshape(-1, 1)
-            )
-            score_df_above_fdr[f"{dimension}_diff"] = score_df_above_fdr[f"{dimension}_values"] - score_df_above_fdr[f"{dimension}_calibrated"]
-        score_df["rt_calibrated"] = self.predictors["rt"].predict(
-            score_df.rt_pred.values.reshape(-1, 1)
-        )
-        score_df["im_calibrated"] = self.predictors["im"].predict(
-            score_df.mobility_pred.values.reshape(-1, 1)
-        )
-        ppm_mean = np.mean(score_df_above_fdr.ppm_diff.values)
-        score_df["mz_calibrated"] = score_df.precursor_mz * (
-            1 - ppm_mean * 10**-6
-        )
-
-        score_df["ppm_diff_calibrated"] = (score_df.mz_calibrated - score_df.mz_values) / score_df.mz_calibrated * 10**6
-        score_df["rt_diff_calibrated"] = score_df.rt_calibrated - score_df.rt_values
-        score_df["im_diff_calibrated"] = score_df.im_calibrated - score_df.mobility_values
-        # self.score_df = score_df.reset_index(drop=True)
-        self.score_df = train_and_score(
-            # score_df[np.abs(score_df.rt_diff_calibrated) < 250].reset_index(drop=True),
-            score_df,
-            [
-                "counts",
-                "frequency_counts",
-                "ppm_diff_calibrated",
-                "im_diff_calibrated",
-                "rt_diff_calibrated",
-                "charge",
-                "total_peaks",
-                "nAA",
-                "b_hit_counts",
-                "y_hit_counts",
-                "b_mean_ppm",
-                "y_mean_ppm",
-                "relative_found_b_int",
-                "relative_missed_b_int",
-                "relative_found_y_int",
-                "relative_missed_y_int",
-                "relative_found_int",
-                "relative_missed_int",
-                "pearsons",
-                "pearsons_log",
-                "candidates",
-                # "log_odds",
-            ],
-            ini_score="log_odds",
-            train_fdr_level=self.train_fdr_level_post_calibration,
-        ).reset_index(drop=True)
-
-        self.score_df["target_type"] = np.array([-1, 0])[
-            self.score_df.target.astype(np.int)
-        ]
-        self.score_df["target_type"][
-            (self.score_df.q_value < self.fdr) & (self.score_df.target)
-        ] = 1
-
-
-@alphatims.utils.njit(nogil=True)
-def fdr_to_q_values(fdr_values):
-    q_values = np.zeros_like(fdr_values)
-    min_q_value = np.max(fdr_values)
-    for i in range(len(fdr_values) - 1, -1, -1):
-        fdr = fdr_values[i]
-        if fdr < min_q_value:
-            min_q_value = fdr
-        q_values[i] = min_q_value
-    return q_values
-
-
-def get_q_values(_df, score_column, decoy_column, drop=False):
-    _df = _df.reset_index(drop=drop)
-    _df = _df.sort_values([score_column, score_column], ascending=False)
-    target_values = 1-_df['decoy'].values
-    decoy_cumsum = np.cumsum(_df['decoy'].values)
-    target_cumsum = np.cumsum(target_values)
-    fdr_values = decoy_cumsum/target_cumsum
-    _df['q_value'] = fdr_to_q_values(fdr_values)
-    return _df
-
-
-def calculate_odds(
-    df,
-    column_name,
-    *,
-    target_name="target",
-    smooth=1,
-    plot=False
-):
-    negatives, positives = np.bincount(df.target.values)
-    if negatives > positives:
-        raise ValueError(
-            f"Found more decoys ({negatives}) than targets ({positives})"
-        )
-        tp_count = 1000
-    else:
-        tp_count = positives - negatives
-    n = int(tp_count * smooth)
-    order = np.argsort(df[column_name].values)
-    forward = np.cumsum(df[target_name].values[order])
-    odds = np.zeros_like(forward, dtype=np.float)
-    odds[n:-n] = forward[2*n:] - forward[:-2*n]
-    odds[:n] = forward[n:2*n]
-    odds[-n:] = forward[-1] - forward[-2*n:-n]
-    odds[n:-n] /= 2*n
-    odds[:n] /= np.arange(n, 2*n)
-    odds[-n:] /= np.arange(n, 2*n)[::-1]
-    odds /= (1 - odds)
-    odds = odds[np.argsort(order)]
-    if plot:
-        import matplotlib.pyplot as plt
-        plt.scatter(df[column_name], odds, marker=".")
-    return odds
-
-
-def calculate_log_odds_product(
-    df_,
-    val_names
-):
-    df = df_[val_names]
-    df = sklearn.preprocessing.StandardScaler().fit_transform(df)
-    pca = sklearn.decomposition.PCA(n_components=df.shape[1])
-    pca.fit(df)
-    df = pd.DataFrame(pca.transform(df))
-    df["target"] = df_.target
-    negative, positive = np.bincount(df.target)
-    log_odds = np.zeros(len(df))
-    for val_name in range(df.shape[1] - 1):
-        odds = calculate_odds(df, val_name, smooth=1)
-        log_odds += np.log2(odds) * pca.explained_variance_[val_name]
-    return log_odds
-    # new_df = analysis1.score_df[["decoy", "target"]]
-    # new_df['odds'] = log_odds
-    # new_df = alphadia.library.get_q_values(new_df, "odds", 'decoy', drop=True)
-    # new_df.reset_index(drop=True, inplace=True)
-
-
-def train_and_score(
-    scores_df,
-    features,
-    train_fdr_level: float = 0.1,
-    ini_score: str = "count",
-    min_train: int = 1000,
-    test_size: float = 0.8,
-    max_depth: list = [5, 25, 50],
-    max_leaf_nodes: list = [150, 200, 250],
-    n_jobs: int = -1,
-    scoring: str = 'accuracy',
-    plot: bool = False,
-    random_state: int = 42,
-):
-    df = scores_df.copy()
-    cv = train_RF(
-        df,
-        features,
-        train_fdr_level=train_fdr_level,
-        ini_score=ini_score,
-        min_train=min_train,
-        test_size=test_size,
-        max_depth=max_depth,
-        max_leaf_nodes=max_leaf_nodes,
-        n_jobs=n_jobs,
-        scoring=scoring,
-        plot=plot,
-        random_state=random_state,
-    )
-    df['score'] = cv.predict_proba(df[features])[:, 1]
-    return get_q_values(df, "score", 'decoy', drop=True)
-
-
-def train_RF(
-    df: pd.DataFrame,
-    features: list,
-    train_fdr_level:  float = 0.1,
-    ini_score: str = None,
-    min_train: int = 1000,
-    test_size: float = 0.8,
-    max_depth: list = [5, 25, 50],
-    max_leaf_nodes: list = [150, 200, 250],
-    n_jobs: int = -1,
-    scoring: str = 'accuracy',
-    plot: bool = False,
-    random_state: int = 42,
-):
-    # Setup ML pipeline
-    scaler = sklearn.preprocessing.StandardScaler()
-    rfc = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
-    ## Initiate scaling + classification pipeline
-    pipeline = sklearn.pipeline.Pipeline([('scaler', scaler), ('clf', rfc)])
-    parameters = {
-        'clf__max_depth': (max_depth),
-        'clf__max_leaf_nodes': (max_leaf_nodes)
-    }
-    ## Setup grid search framework for parameter selection and internal cross validation
-    cv = sklearn.model_selection.GridSearchCV(
-        pipeline,
-        param_grid=parameters,
-        cv=5,
-        scoring=scoring,
-        verbose=0,
-        return_train_score=True,
-        n_jobs=n_jobs
-    )
-    # Prepare target and decoy df
-    dfD = df[df.decoy.values]
-    # Select high scoring targets (<= train_fdr_level)
-    # df_prescore = filter_score(df)
-    # df_prescore = filter_precursor(df_prescore)
-    # scored = cut_fdr(df_prescore, fdr_level = train_fdr_level, plot=False)[1]
-    # highT = scored[scored.decoy==False]
-    # dfT_high = dfT[dfT['query_idx'].isin(highT.query_idx)]
-    # dfT_high = dfT_high[dfT_high['db_idx'].isin(highT.db_idx)]
-    if ini_score is None:
-        selection = None
-        best_hit_count = 0
-        best_feature = ""
-        for feature in features:
-            new_df = get_q_values(df, feature, 'decoy')
-            hits = (
-                new_df['q_value'] <= train_fdr_level
-            ) & (
-                new_df['decoy'] == 0
-            )
-            hit_count = np.sum(hits)
-            if hit_count > best_hit_count:
-                best_hit_count = hit_count
-                selection = hits
-                best_feature = feature
-        logging.info(f'Using optimal "{best_feature}" as initial_feature')
-        dfT_high = df[selection]
-    else:
-        logging.info(f'Using selected "{ini_score}" as initial_feature')
-        new_df = get_q_values(df, ini_score, 'decoy')
-        dfT_high = df[
-            (new_df['q_value'] <= train_fdr_level) & (new_df['decoy'] == 0)
-        ]
-
-    # Determine the number of psms for semi-supervised learning
-    n_train = int(dfT_high.shape[0])
-    if dfD.shape[0] < n_train:
-        n_train = int(dfD.shape[0])
-        logging.info(
-            "The total number of available decoys is lower than "
-            "the initial set of high scoring targets."
-        )
-    if n_train < min_train:
-        raise ValueError(
-            "There are fewer high scoring targets or decoys than "
-            "required by 'min_train'."
-        )
-
-    # Subset the targets and decoys datasets to result in a balanced dataset
-    df_training = dfT_high.append(
-        dfD.sample(n=n_train, random_state=random_state)
-    )
-    # df_training = dfT_high.append(dfD)
-
-    # Select training and test sets
-    X = df_training[features]
-    y = df_training['target'].astype(int)
-    (
-        X_train,
-        X_test,
-        y_train,
-        y_test
-    ) = sklearn.model_selection.train_test_split(
-        X.values,
-        y.values,
-        test_size=test_size,
-        random_state=random_state,
-        stratify=y.values
-    )
-
-    # Train the classifier on the training set via 5-fold cross-validation and subsequently test on the test set
-    logging.info(
-        'Training & cross-validation on {} targets and {} decoys'.format(
-            # np.sum(y_train), X_train.shape[0] - np.sum(y_train)
-            *np.bincount(y_train)[::-1]
-        )
-    )
-    cv.fit(X_train, y_train)
-
-    logging.info(
-        'The best parameters selected by 5-fold cross-validation were {}'.format(
-            cv.best_params_
-        )
-    )
-    logging.info(
-        'The train {} was {}'.format(scoring, cv.score(X_train, y_train))
-    )
-    logging.info(
-        'Testing on {} targets and {} decoys'.format(
-            np.sum(y_test),
-            X_test.shape[0] - np.sum(y_test)
-        )
-    )
-    logging.info(
-        'The test {} was {}'.format(scoring, cv.score(X_test, y_test))
-    )
-
-    feature_importances = cv.best_estimator_.named_steps['clf'].feature_importances_
-    indices = np.argsort(feature_importances)[::-1][:40]
-
-    top_features = X.columns[indices][:40]
-    top_score = feature_importances[indices][:40]
-
-    feature_dict = dict(zip(top_features, top_score))
-    logging.info(f"Top features {feature_dict}")
-
-    # Inspect feature importances
-    if plot:
-        import matplotlib.pyplot as plt
-        import seaborn as sns
-        g = sns.barplot(
-            y=X.columns[indices][:40],
-            x=feature_importances[indices][:40],
-            orient='h',
-            palette='RdBu'
-        )
-        g.set_xlabel("Relative importance", fontsize=12)
-        g.set_ylabel("Features", fontsize=12)
-        g.tick_params(labelsize=9)
-        g.set_title("Feature importance")
-        plt.show()
-
-    return cv
diff --git a/alphadia/annotation/psm_stats.py b/alphadia/annotation/psm_stats.py
deleted file mode 100644
index cd0fde33..00000000
--- a/alphadia/annotation/psm_stats.py
+++ /dev/null
@@ -1,266 +0,0 @@
-"""Calculate PSM stats."""
-
-import logging
-
-import numpy as np
-
-import alphatims.utils
-
-
-class PSMStatsCalculator:
-
-    def __init__(
-        self,
-        pseudo_int=10**-6,
-    ):
-        self.pseudo_int = pseudo_int
-
-    def set_ions(self, precursor_df, fragment_df):
-        self.precursor_df = precursor_df
-        self.fragment_df = fragment_df
-
-    def set_library(self, library):
-        self.library = library
-
-    def set_annotation(self, annotation):
-        self.annotation = annotation
-
-    def estimate_mz_tolerance(self):
-        logging.info("Estimating ppm values")
-        ppm_diffs = self.annotation.ppm_diff
-        order = np.argsort(ppm_diffs.values)
-
-        decoys, targets = np.bincount(self.annotation.decoy.values)
-        distribution = np.cumsum(
-            [
-                1 / targets if i else -1 / decoys for i in self.annotation.decoy.values[order]
-            ]
-        )
-        low = ppm_diffs[order[np.argmin(distribution)]]
-        high = ppm_diffs[order[np.argmax(distribution)]]
-        self.ppm_mean = (low + high) / 2
-        self.ppm_width = abs(high - low)
-        # plt.plot(
-        #     ppm_diffs[order],
-        #     distribution,
-        # )
-        # sns.histplot(
-        #     data=self.annotation,
-        #     x="ppm_diff",
-        #     hue="decoy",
-        # )
-
-    def update_annotation_stats(self):
-        logging.info("Appending stats to quick annotation")
-        b_hit_counts = np.zeros(len(self.annotation))
-        y_hit_counts = np.zeros(len(self.annotation))
-        b_mean_ppm = np.zeros(len(self.annotation))
-        y_mean_ppm = np.zeros(len(self.annotation))
-        relative_found_b_int = np.zeros(len(self.annotation))
-        relative_missed_b_int = np.zeros(len(self.annotation))
-        relative_found_y_int = np.zeros(len(self.annotation))
-        relative_missed_y_int = np.zeros(len(self.annotation))
-        relative_found_int = np.zeros(len(self.annotation))
-        relative_missed_int = np.zeros(len(self.annotation))
-        pearsons = np.zeros(len(self.annotation))
-        pearsons_log = np.zeros(len(self.annotation))
-        update_annotation(
-            range(len(self.annotation)),
-            # 1000,
-            self.annotation.db_index.values,
-            self.library.predicted_library_df.frag_start_idx.values,
-            self.library.predicted_library_df.frag_end_idx.values,
-            self.library.y_mzs,
-            self.library.b_mzs,
-            self.library.y_ions_intensities,
-            self.library.b_ions_intensities,
-            self.annotation.inet_index.values,
-
-            self.precursor_df.fragment_start.values,
-            self.precursor_df.fragment_end.values,
-            self.fragment_df.summed_intensity_values.values,
-            self.fragment_df.mz_average.values * (1 + self.ppm_mean * 10**-6),
-            # self.precursor_indptr,
-            # self.fragment_indices,
-            # self.tof_indices,
-            # self.smooth_intensity_values, #.astype(np.float64),
-            # self.mz_values * (1 + self.ppm_mean * 10**-6),
-
-
-            self.ppm_width,
-            b_hit_counts,
-            y_hit_counts,
-            b_mean_ppm,
-            y_mean_ppm,
-            relative_found_b_int,
-            relative_missed_b_int,
-            relative_found_y_int,
-            relative_missed_y_int,
-            relative_found_int,
-            relative_missed_int,
-            pearsons,
-            pearsons_log,
-            np.float32(self.pseudo_int),
-        )
-        self.annotation["b_hit_counts"] = b_hit_counts
-        self.annotation["y_hit_counts"] = y_hit_counts
-        self.annotation["b_mean_ppm"] = b_mean_ppm
-        self.annotation["y_mean_ppm"] = y_mean_ppm
-        self.annotation["relative_found_b_int"] = relative_found_b_int
-        self.annotation["relative_missed_b_int"] = relative_missed_b_int
-        self.annotation["relative_found_y_int"] = relative_found_y_int
-        self.annotation["relative_missed_y_int"] = relative_missed_y_int
-        self.annotation["relative_found_int"] = relative_found_int
-        self.annotation["relative_missed_int"] = relative_missed_int
-        pearsons[~np.isfinite(pearsons)] = 0
-        self.annotation["pearsons"] = pearsons
-        pearsons_log[~np.isfinite(pearsons_log)] = 0
-        self.annotation["pearsons_log"] = pearsons_log
-
-
-@alphatims.utils.pjit
-# @alphatims.utils.njit(nogil=True)
-def update_annotation(
-    index,
-    database_indices,
-    database_frag_starts,
-    database_frag_ends,
-    database_y_mzs,
-    database_b_mzs,
-    database_y_ints,
-    database_b_ints,
-    inet_indices,
-    fragment_start,
-    fragment_end,
-    fragment_intensities,
-    fragment_mzs,
-    # precursor_indptr,
-    # fragment_indices,
-    # tof_indices,
-    # intensity_values,
-    # mz_values,
-    fragment_ppm,
-    b_hit_counts,
-    y_hit_counts,
-    b_mean_ppm,
-    y_mean_ppm,
-    relative_found_b_int,
-    relative_missed_b_int,
-    relative_found_y_int,
-    relative_missed_y_int,
-    relative_found_int,
-    relative_missed_int,
-    pearsons,
-    pearsons_log,
-    pseudo_int,
-):
-    if index >= len(database_indices):
-        return
-    database_index = database_indices[index]
-    db_frag_start_idx = database_frag_starts[database_index]
-    db_frag_end_idx = database_frag_ends[database_index]
-    db_y_mzs = database_y_mzs[db_frag_start_idx: db_frag_end_idx][::-1]
-    db_b_mzs = database_b_mzs[db_frag_start_idx: db_frag_end_idx]
-    db_y_ints = database_y_ints[db_frag_start_idx: db_frag_end_idx][::-1]
-    db_b_ints = database_b_ints[db_frag_start_idx: db_frag_end_idx]
-    if pseudo_int > 0:
-        db_y_ints = db_y_ints + pseudo_int
-        db_b_ints = db_b_ints + pseudo_int
-    precursor_index = inet_indices[index]
-    frag_start_idx = fragment_start[precursor_index]
-    frag_end_idx = fragment_end[precursor_index]
-    fragment_mzs = fragment_mzs[frag_start_idx: frag_end_idx]
-    fragment_ints = fragment_intensities[frag_start_idx: frag_end_idx]
-    fragment_b_hits, db_b_hits = find_hits(
-        fragment_mzs,
-        db_b_mzs,
-        fragment_ppm,
-    )
-    total_b_int = np.sum(db_b_ints)
-    if total_b_int == 0:
-        total_b_int = 1
-    if len(db_b_hits) > 0:
-        b_ppm = np.mean(
-            (db_b_mzs[db_b_hits] - fragment_mzs[fragment_b_hits]) / db_b_mzs[db_b_hits] * 10**6
-        )
-        found_b_int = np.sum(db_b_ints[db_b_hits])
-        min_b_int = np.min(db_b_ints[db_b_hits])
-    else: # TODO defaults are not reflective of good/bad scores
-        b_ppm = fragment_ppm
-        found_b_int = 0
-        min_b_int = -1
-    fragment_y_hits, db_y_hits = find_hits(
-        fragment_mzs,
-        db_y_mzs,
-        fragment_ppm,
-    )
-    total_y_int = np.sum(db_y_ints)
-    if total_y_int == 0:
-        total_y_int = 1
-    if len(db_y_hits) > 0:
-        y_ppm = np.mean(
-            (db_y_mzs[db_y_hits] - fragment_mzs[fragment_y_hits]) / db_y_mzs[db_y_hits] * 10**6
-        )
-        found_y_int = np.sum(db_y_ints[db_y_hits])
-        min_y_int = np.min(db_y_ints[db_y_hits])
-    else: # TODO defaults are not reflective of good/bad scores
-        y_ppm = fragment_ppm
-        found_y_int = 0
-        min_y_int = -1
-    missed_b_int = np.sum(
-        np.array([intsy for i, intsy in enumerate(db_b_ints) if (i not in db_b_hits) and (intsy > min_b_int)])
-    )
-    missed_y_int = np.sum(
-        np.array([intsy for i, intsy in enumerate(db_y_ints) if (i not in db_y_hits) and (intsy > min_y_int)])
-    )
-    # all_frags = fragment_ints
-    b_hit_counts[index] = len(db_b_hits)
-    y_hit_counts[index] = len(db_y_hits)
-    b_mean_ppm[index] = b_ppm
-    y_mean_ppm[index] = y_ppm
-    relative_found_b_int[index] = found_b_int / total_b_int
-    relative_missed_b_int[index] = missed_b_int / total_b_int
-    relative_found_y_int[index] = found_y_int / total_y_int
-    relative_missed_y_int[index] = missed_y_int / total_y_int
-    relative_found_int[index] = (found_b_int + found_y_int) / (total_b_int + total_y_int)
-    relative_missed_int[index] = (missed_b_int + missed_y_int) / (total_b_int + total_y_int)
-    all_db_ints = []
-    all_frag_ints = []
-    for b_int in db_b_ints[db_b_hits]:
-        all_db_ints.append(b_int)
-    for y_int in db_y_ints[db_y_hits]:
-        all_db_ints.append(y_int)
-    for frag_int in fragment_ints[fragment_b_hits]:
-        all_frag_ints.append(frag_int)
-    for frag_int in fragment_ints[fragment_y_hits]:
-        all_frag_ints.append(frag_int)
-    pearsons[index] = np.corrcoef(all_db_ints, all_frag_ints)[0, 1]
-    pearsons_log[index] = np.corrcoef(
-        np.log(np.array(all_db_ints)),
-        np.log(np.array(all_frag_ints)),
-    )[0, 1]
-
-
-@alphatims.utils.njit(nogil=True)
-def find_hits(
-    fragment_mzs,
-    database_mzs,
-    fragment_ppm,
-):
-    fragment_index = 0
-    database_index = 0
-    fragment_hits = []
-    db_hits = []
-    while (fragment_index < len(fragment_mzs)) and (database_index < len(database_mzs)):
-        fragment_mz = fragment_mzs[fragment_index]
-        database_mz = database_mzs[database_index]
-        if fragment_mz < (database_mz / (1 + 10**-6 * fragment_ppm)):
-            fragment_index += 1
-        elif database_mz < (fragment_mz / (1 + 10**-6 * fragment_ppm)):
-            database_index += 1
-        else:
-            fragment_hits.append(fragment_index)
-            db_hits.append(database_index)
-            fragment_index += 1
-            database_index += 1
-    return np.array(fragment_hits), np.array(db_hits)
diff --git a/alphadia/calibration.py b/alphadia/calibration.py
index c77050a6..5297a896 100644
--- a/alphadia/calibration.py
+++ b/alphadia/calibration.py
@@ -1,143 +1,808 @@
-"""Calibrate quad"""
+# native imports
+import os
+import logging
+import typing
+import pickle
 
-import alphatims.bruker
+# alphadia imports
+from alphadia.plotting.utils import density_scatter
+
+# alpha family imports
 import alphatims.utils
-import numpy as np
+from alphabase.statistics.regression import LOESSRegression
+
+# third party imports
 import pandas as pd
-import alphatims.plotting
-
-
-@alphatims.utils.njit(nogil=True, cache=False)
-def merge_cyclic_pushes(
-    cyclic_push_index,
-    intensity_values,
-    tof_indices,
-    push_indptr,
-    zeroth_frame,
-    cycle_length,
-    tof_max_index,
-    scan_max_index,
-    return_sparse=False,
-):
-    offset = scan_max_index * zeroth_frame + cyclic_push_index
-    intensity_buffer = np.zeros(tof_max_index)
-    tofs = []
-    for push_index in range(offset, len(push_indptr) - 1, cycle_length):
-        start = push_indptr[push_index]
-        end = push_indptr[push_index + 1]
-        for index in range(start, end):
-            tof = tof_indices[index]
-            intensity = intensity_values[index]
-            if intensity_buffer[tof] == 0:
-                tofs.append(tof)
-            intensity_buffer[tof] += intensity
-    tofs = np.array(tofs, dtype=tof_indices.dtype)
-    if return_sparse:
-        tofs = np.sort(tofs)
-        intensity_buffer = intensity_buffer[tofs]
-    return tofs, intensity_buffer
-
-
-def guesstimate_quad_settings(
-    dia_data,
-    smooth_window=100,
-    gaussian_blur=5,
-    percentile=50,
-    regresion_mz_lower_cutoff=400,
-    regresion_mz_upper_cutoff=1000,
-):
-    dia_mz_cycle = np.empty_like(dia_data.dia_mz_cycle)
-    weights = np.zeros(len(dia_mz_cycle))
-    for cyclic_push_index, (low_quad, high_quad) in alphatims.utils.progress_callback(
-        enumerate(dia_data.dia_mz_cycle),
-        total=len(dia_data.dia_mz_cycle)
-    ):
-        if (low_quad == -1) and (high_quad == -1):
-            dia_mz_cycle[cyclic_push_index] = (low_quad, high_quad)
-            continue
-        tofs, intensity_buffer = merge_cyclic_pushes(
-            cyclic_push_index=cyclic_push_index,
-            intensity_values=dia_data.intensity_values,
-            tof_indices=dia_data.tof_indices,
-            push_indptr=dia_data.push_indptr,
-            zeroth_frame=dia_data.zeroth_frame,
-            cycle_length=len(dia_data.dia_mz_cycle),
-            tof_max_index=dia_data.tof_max_index,
-            scan_max_index=dia_data.scan_max_index,
-            return_sparse=True,
-        )
-        if len(tofs) > 0:
-            cum_int = np.cumsum(intensity_buffer)
-            low_threshold = cum_int[-1] * percentile / 100 / 2
-            high_threshold = cum_int[-1] * (1 - (percentile / 100 / 2))
-            low_index = np.searchsorted(cum_int, low_threshold)
-            high_index = np.searchsorted(cum_int, high_threshold, "right")
-            low_quad_estimate = dia_data.mz_values[tofs[low_index]]
-            high_quad_estimate = dia_data.mz_values[tofs[high_index]]
+import numpy as np
+from matplotlib import pyplot as plt
+
+import sklearn.base
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.pipeline import Pipeline
+
+class Calibration():
+    def __init__(self, 
+                name : str = '',
+                function : object = None,
+                input_columns : typing.List[str] = [],
+                target_columns : typing.List[str] = [],
+                output_columns : typing.List[str] = [],
+                transform_deviation : typing.Union[None, float] = None,
+                **kwargs):
+        """A single estimator for a property (mz, rt, etc.).
+
+        Calibration is performed by modeling the deviation of an input value (e.g. mz_library) from an observed property (e.g. mz_observed) using a function (e.g. LinearRegression). Once calibrated, calibrated values (e.g. mz_calibrated) can be predicted from input values (e.g. mz_library). Additional explaining variables can be added to the input values (e.g. rt_library) to improve the calibration.
+
+        Parameters
+        ----------
+
+        name : str
+            Name of the estimator for logging and plotting e.g. 'mz'
+        
+        function : object
+            The estimator object instance which must have a fit and predict method.
+            This will usually be a sklearn estimator or a custom estimator.
+
+        input_columns : list of str
+            The columns of the dataframe that are used as input for the estimator e.g. ['mz_library']. 
+            The first column is the property which should be calibrated, additional columns can be used as explaining variables e.g. ['mz_library', 'rt_library'].
+
+        target_columns : list of str
+            The columns of the dataframe that are used as target for the estimator e.g. ['mz_observed'].
+            At the moment only one target column is supported.
+
+        output_columns : list of str
+            The columns of the dataframe that are used as output for the estimator e.g. ['mz_calibrated'].
+            At the moment only one output column is supported.
+        
+        transform_deviation : typing.Union[None, float]
+            If set to a valid float, the deviation is expressed as a fraction of the input value e.g. 1e6 for ppm.
+            If set to None, the deviation is expressed in absolute units.
+
+        """
+        
+        self.name = name
+        self.function = function
+        self.input_columns = input_columns
+        self.target_columns = target_columns
+        self.output_columns = output_columns
+        self.transform_deviation = float(transform_deviation) if transform_deviation is not None else None
+        self.is_fitted = False
+
+    def __repr__(self) -> str:
+        return f'<Calibration {self.name}, is_fitted: {self.is_fitted}>'
+
+    def save(self, file_name: str):
+        """Save the estimator to pickle file.
+
+        Parameters
+        ----------
+
+        file_name : str
+            Path to the pickle file
+
+        """
+
+        with open(file_name, 'wb') as f:
+            pickle.dump(self, f)
+
+    def load(self, file_name: str):
+        """Load the estimator from pickle file.
+
+        Parameters
+        ----------
+
+        file_name : str
+            Path to the pickle file
+
+        """
+
+        with open(file_name, 'rb') as f:
+            loaded_calibration = pickle.load(f)
+            self.__dict__.update(loaded_calibration.__dict__)
+
+    def validate_columns(
+            self, 
+            dataframe : pd.DataFrame
+        ):
+        """Validate that the input and target columns are present in the dataframe.
+
+        Parameters
+        ----------
+        dataframe : pandas.DataFrame
+            Dataframe containing the input and target columns
+
+        Returns
+        -------
+        bool
+            True if all columns are present and at most one target column is configured, False otherwise
+
+        """
+
+        valid = True
+
+        if len(self.target_columns) > 1 :
+            logging.warning('Only one target column supported')
+            valid = False
+
+        required_columns = set(self.input_columns + self.target_columns)
+        if not required_columns.issubset(dataframe.columns):
+            logging.warning(f'{self.name}, at least one column {required_columns} not found in dataframe')
+            valid = False
+
+        return valid
+
+    def fit(
+            self, 
+            dataframe : pd.DataFrame,
+            plot : bool = False, 
+            **kwargs
+        ):
+        """Fit the estimator based on the input and target columns of the dataframe.
+
+        Parameters
+        ----------
+
+        dataframe : pandas.DataFrame
+            Dataframe containing the input and target columns
+
+        plot : bool, default=False
+            If True, a plot of the calibration is generated.
+        
+        Returns
+        -------
+
+        None
+            The estimator is fitted in place and `is_fitted` is set to True on success; nothing is returned.
+
+        """
+
+        if not self.validate_columns(dataframe):
+            logging.warning(f'{self.name} calibration was skipped')
+            return
+
+        if self.function is None:
+            raise ValueError('No estimator function provided')
+
+        input_values = dataframe[self.input_columns].values
+        target_value = dataframe[self.target_columns].values
+
+        try:
+            self.function.fit(input_values, target_value)
+            self.is_fitted = True
+        except Exception as e:
+            logging.error(f'Could not fit estimator {self.name}: {e}')
+            return
+
+        if plot == True:
+            self.plot(dataframe, **kwargs)
+
+
+    def predict(self, dataframe, inplace=True):
+        """Perform a prediction based on the input columns of the dataframe.
+
+        Parameters
+        ----------
+        dataframe : pandas.DataFrame
+            Dataframe containing the input and target columns
+
+        inplace : bool, default=True
+            If True, the prediction is added as a new column to the dataframe. If False, the prediction is returned as a numpy array.
+
+        Returns
+        -------
+        np.ndarray or None
+            Array of shape (n_samples, ) containing the prediction if inplace is False. None if inplace is True or the prediction was skipped.
+
+        """
+
+        if self.is_fitted == False:
+            logging.warning(f'{self.name} prediction was skipped as it has not been fitted yet')
+            return
+        
+        if not set(self.input_columns).issubset(dataframe.columns):
+            logging.warning(f'{self.name} calibration was skipped as input column {self.input_columns} not found in dataframe')
+            return
+
+        input_values = dataframe[self.input_columns].values
+        
+        if inplace:
+            dataframe[self.output_columns[0]] = self.function.predict(input_values)
         else:
-            low_quad_estimate, high_quad_estimate = -1, -1
-        dia_mz_cycle[cyclic_push_index] = (
-            low_quad_estimate,
-            high_quad_estimate
-        )
-        weights[cyclic_push_index] = np.sum(intensity_buffer)
-    predicted_dia_mz_cycle = predict_dia_mz_cycle(
-        dia_mz_cycle,
-        dia_data,
-        weights,
-    )
-    return dia_mz_cycle, predicted_dia_mz_cycle
-
-
-
-def predict_dia_mz_cycle(
-    dia_mz_cycle,
-    dia_data,
-    weights,
-):
-    import sklearn.linear_model
-    df = pd.DataFrame(
-        {
-            "detected_lower": dia_mz_cycle[:, 0],
-            "detected_upper": dia_mz_cycle[:, 1],
-            "frame": np.arange(len(dia_mz_cycle)) // dia_data.scan_max_index,
-            "scan": np.arange(len(dia_mz_cycle)) % dia_data.scan_max_index,
-            "weights": weights,
-        }
-    )
-    frame_reg_lower = {}
-    frame_reg_upper = {}
-    model = sklearn.linear_model.HuberRegressor
-    for frame in np.unique(df.frame):
-        if np.all(dia_data.dia_mz_cycle[df.frame == frame] == -1):
-            continue
-        selection = df[df.frame == frame]
-        frame_reg_lower[frame] = model().fit(
-            selection.scan.values.reshape(-1, 1),
-            selection.detected_lower.values.reshape(-1, 1),
-            selection.weights.values,
-        )
-        frame_reg_upper[frame] = model().fit(
-            selection.scan.values.reshape(-1, 1),
-            selection.detected_upper.values.reshape(-1, 1),
-            selection.weights.values,
-        )
-    predicted_upper = []
-    predicted_lower = []
-    for index, frame in enumerate(df.frame.values):
-        if frame not in frame_reg_upper:
-            predicted_upper.append(-1)
-            predicted_lower.append(-1)
-            continue
-        predicted_lower_ = frame_reg_lower[frame].predict(
-            df.scan.values[index: index + 1].reshape(-1, 1)
-        )
-        predicted_upper_ = frame_reg_upper[frame].predict(
-            df.scan.values[index: index + 1].reshape(-1, 1)
-        )
-        predicted_lower.append(predicted_lower_[0])
-        predicted_upper.append(predicted_upper_[0])
-    predicted_dia_mz_cycle = np.vstack(
-        [predicted_lower, predicted_upper]
-    ).T
-    return predicted_dia_mz_cycle
+            return self.function.predict(input_values)
+        
+    def fit_predict(
+        self,
+        dataframe : pd.DataFrame,
+        plot : bool = False,
+        inplace : bool = True
+        ):
+        """Fit the estimator and perform a prediction based on the input columns of the dataframe.
+
+        Parameters
+        ----------
+
+        dataframe : pandas.DataFrame
+            Dataframe containing the input and target columns
+        
+        plot : bool, default=False
+            If True, a plot of the calibration is generated.
+
+        inplace : bool, default=True
+            If True, the prediction is added as a new column to the dataframe. If False, the prediction is returned as a numpy array.
+
+        """
+        self.fit(dataframe, plot=plot)
+        return self.predict(dataframe, inplace=inplace)
+
+    def deviation(self, dataframe : pd.DataFrame):
+        """ Calculate the deviations between the input, target and calibrated values.
+
+        Parameters
+        ----------
+        dataframe : pandas.DataFrame
+            Dataframe containing the input and target columns
+
+        Returns 
+        -------
+        np.ndarray 
+            Array of shape (n_samples, 3 + n_input_columns). 
+            The second dimension contains the observed deviation, calibrated deviation, residual deviation and the input values.
+        
+        """
+
+        # the first column is the uncalibrated input property
+        # all other columns are explaining variables
+        input_values = dataframe[self.input_columns].values
+
+        # the first column is the uncalibrated input property
+        uncalibrated_values = input_values[:, [0]]
+
+        # only one target column is supported
+        target_values = dataframe[self.target_columns].values[:, [0]]
+        input_transform = self.transform_deviation
+
+        calibrated_values = self.predict(dataframe, inplace=False)
+        if calibrated_values.ndim == 1:
+            calibrated_values = calibrated_values[:, np.newaxis]
+
+        # only one output column is supported
+        calibrated_dim = calibrated_values[:, [0]]
+
+        # deviation is the difference between the (observed) target value and the uncalibrated input value
+        observed_deviation = target_values - uncalibrated_values
+        if input_transform is not None:
+            observed_deviation = observed_deviation/uncalibrated_values * float(input_transform)
+
+        # calibrated deviation is the explained difference between the (calibrated) target value and the uncalibrated input value
+        calibrated_deviation = calibrated_dim - uncalibrated_values
+        if input_transform is not None:
+            calibrated_deviation = calibrated_deviation/uncalibrated_values * float(input_transform)
+
+        # residual deviation is the unexplained difference between the (observed) target value and the (calibrated) target value
+        residual_deviation = observed_deviation - calibrated_deviation
+
+        return np.concatenate([observed_deviation, calibrated_deviation, residual_deviation, input_values], axis=1)
+
+    def ci(self, dataframe, ci : float = 0.95):
+        """Calculate the residual deviation at the given confidence interval.
+
+        Parameters
+        ----------
+        dataframe : pandas.DataFrame
+            Dataframe containing the input and target columns
+
+        ci : float, default=0.95
+            Fraction covered by the central interval, must be strictly between 0 and 1, otherwise a ValueError is raised.
+
+        Returns
+        -------
+        float
+            Mean of the absolute lower and upper percentile of the residual deviation.
+            0 is returned if the estimator has not been fitted.
+
+        """
+
+        if not 0 < ci < 1:
+            raise ValueError('Confidence interval must be between 0 and 1')
+
+        if not self.is_fitted:
+            return 0
+        # lower and upper percentile of the central interval, e.g. [2.5, 97.5] for ci=0.95
+        ci_percentile = [100*(1-ci)/2, 100*(1+ci)/2]
+
+        deviation = self.deviation(dataframe)
+        residual_deviation = deviation[:, 2]
+        return np.mean(np.abs(np.percentile(residual_deviation, ci_percentile)))
+
+    def get_transform_unit(
+            self,
+            transform_deviation : typing.Union[None, float]
+        ):
+        """Return the unit label which matches a deviation transform.
+
+        Parameters
+        ----------
+        transform_deviation : typing.Union[None, float]
+            Scaling factor of the relative deviation, e.g. 1e6 for ppm or 1e2 for percent.
+            None means that deviations are absolute.
+
+        Returns
+        -------
+        str
+            '(ppm)', '(%)', the factor itself in parentheses or '(absolute)'
+
+        """
+        # no transform: deviations are reported as absolute differences
+        if transform_deviation is None:
+            return '(absolute)'
+
+        # well known scaling factors get a readable unit
+        if np.isclose(transform_deviation, 1e6):
+            return '(ppm)'
+        if np.isclose(transform_deviation, 1e2):
+            return '(%)'
+        return f'({transform_deviation})'
+
+
+    def plot(
+            self,
+            dataframe : pd.DataFrame,
+            figure_path : str = None,
+            #neptune_run : str = None,
+            #neptune_key :str = None,
+            **kwargs
+        ):
+
+        """Plot the data and calibration model.
+
+        For every input column the observed deviation together with the calibration model (left)
+        and the residual deviation after calibration (right) are plotted against the input values.
+
+        Parameters
+        ----------
+
+        dataframe : pandas.DataFrame
+            Dataframe containing the input and target columns
+
+        figure_path : str, default=None
+            Currently not used, saving the figure is disabled.
+
+        **kwargs
+            Currently not used. Logging to neptune (`neptune_run`, `neptune_key`) is disabled.
+
+        """
+
+        deviation = self.deviation(dataframe)
+
+        n_input_properties = deviation.shape[1] - 3
+
+        transform_unit = self.get_transform_unit(self.transform_deviation)
+
+        fig, axs = plt.subplots(n_input_properties, 2, figsize=(6.5, 3.5*n_input_properties), squeeze=False)
+
+        for input_property in range(n_input_properties):
+
+            # plot the relative observed deviation
+            density_scatter(
+                deviation[:, 3+input_property],
+                deviation[:, 0],
+                axis=axs[input_property, 0],
+                s=1
+            )
+
+            # plot the calibration model, sorted by the input value so the line is drawn left to right
+            x_values = deviation[:, 3+input_property]
+            y_values = deviation[:, 1]
+            order = np.argsort(x_values)
+            x_values = x_values[order]
+            y_values = y_values[order]
+
+            axs[input_property, 0].plot(x_values, y_values, color='red')
+
+            # plot the residual deviation which remains after calibration
+
+            density_scatter(
+                deviation[:, 3+input_property],
+                deviation[:, 2],
+                axis=axs[input_property, 1],
+                s=1
+            )
+
+            for ax, dim, kind in zip(axs[input_property, :], [0, 2], ['observed', 'residual']):
+                ax.set_xlabel(self.input_columns[input_property])
+                ax.set_ylabel(f'{kind} deviation {transform_unit}')
+
+                # get absolute y value and set limits to plus minus absolute y
+                y = deviation[:, dim]
+                y_abs = np.abs(y)
+                ax.set_ylim(-y_abs.max()*1.05, y_abs.max()*1.05)
+
+        fig.tight_layout()
+
+        # log figure to neptune ai
+        #if neptune_run is not None and neptune_key is not None:
+        #    neptune_run[f'calibration/{neptune_key}'].log(fig)
+
+        #if figure_path is not None:
+
+        #    i = 0
+        #    file_name = os.path.join(figure_path, f'calibration_{neptune_key}_{i}.png')
+        #    while os.path.exists(file_name):
+        #        file_name = os.path.join(figure_path, f'calibration_{neptune_key}_{i}.png')
+        #        i += 1
+
+        #    fig.savefig(file_name)
+
+        plt.show()
+
+        plt.close()
+        
+class CalibrationManager():
+
+    def __init__(
+            self,
+            config : typing.Union[None, dict] = None,
+            path : typing.Union[None, str] = None,
+            load_calibration : bool = True):
+
+        """Contains, updates and applies all calibrations for a single run.
+
+        Calibrations are grouped into calibration groups. Each calibration group is applied to a single data structure (precursor dataframe, fragment dataframe, etc.). Each calibration group contains multiple estimators which each calibrate a single property (mz, rt, etc.). Each estimator is a `Calibration` object which contains the estimator function.
+
+        Parameters
+        ----------
+
+        config : typing.Union[None, dict], default=None
+            Calibration config which is passed on to `load_config`. NOTE(review): no default config is applied in this constructor.
+
+        path : str, default=None
+            Path where the current parameter set is saved to and loaded from.
+
+        load_calibration : bool, default=True
+            If True, `load` is called after the config was loaded, which restores a previously saved state from `path` if the file exists.
+
+        """
+        self._is_loaded_from_file = False
+        self.estimator_groups = []
+        self.path = path
+
+        logging.info('========= Initializing Calibration Manager =========')
+        # the config defines the estimators first, a saved state (if one is loaded) is applied on top of it
+        self.load_config(config)
+        if load_calibration:
+            self.load()
+
+        logging.info('====================================================')
+
+    @property
+    def is_loaded_from_file(self):
+        """True if `load` restored the manager state from a pickle file, False otherwise.
+        """
+        return self._is_loaded_from_file
+    
+    @property
+    def is_fitted(self):
+        """True if at least one calibration group exists and every estimator is fitted.
+        """
+        # without any calibration group there is nothing which could be fitted
+        if len(self.estimator_groups) == 0:
+            return False
+
+        return all(
+            estimator.is_fitted
+            for group in self.estimator_groups
+            for estimator in group['estimators']
+        )
+
+    def load_config(self, config : dict):
+        """Load calibration config from config Dict.
+
+        each calibration config is a list of calibration groups which consist of multiple estimators.
+        For each estimator the `model` and `model_args` are used to request a model from the calibration_model_provider and to initialize it.
+        The estimator is then initialized with the `Calibration` class and added to the group.
+        The passed config is not modified. If a model can not be created, the error is logged and the estimator is created without it.
+
+        Parameters
+        ----------
+        config : list of dict or None
+            Calibration config with one dict per calibration group. None is treated as an empty config.
+
+        Example
+        -------
+        Create a calibration manager with a single group and a single estimator:
+
+        .. code-block:: python
+
+            calibration_manager = calibration.CalibrationManager()
+            calibration_manager.load_config([{
+                'name': 'mz_calibration',
+                'estimators': [
+                    {
+                        'name': 'mz',
+                        'model': 'LOESSRegression',
+                        'model_args': {
+                            'n_kernels': 2
+                        },
+                        'input_columns': ['mz_library'],
+                        'target_columns': ['mz_observed'],
+                        'output_columns': ['mz_calibrated'],
+                        'transform_deviation': 1e6
+                    },
+                ]
+            }])
+        """
+        # the constructor passes None by default, which must not crash on len() below
+        if config is None:
+            config = []
+
+        logging.info('loading calibration config')
+        logging.info(f'found {len(config)} calibration groups')
+        for group in config:
+            logging.info(f'Calibration group :{group["name"]}, found {len(group["estimators"])} estimator(s)')
+            estimators = []
+            for estimator in group['estimators']:
+                # work on a shallow copy so the model instance does not end up in the callers config
+                estimator_args = dict(estimator)
+                try:
+                    template = calibration_model_provider.get_model(estimator['model'])
+                    estimator_args['function'] = template(**estimator.get('model_args', {}))
+                except Exception as e:
+                    logging.error(f'Could not load estimator {estimator["name"]}: {e}')
+                estimators.append(Calibration(**estimator_args))
+            self.estimator_groups.append({'name': group['name'], 'estimators': estimators})
+
+    def save(self):
+        """Pickle the calibration manager to `self.path`; nothing happens if no path is set."""
+        if self.path is None:
+            return
+        with open(self.path, 'wb') as file_handle:
+            pickle.dump(self, file_handle)
+
+    def load(self):
+        """Load the calibration manager state from the pickle file at `self.path`; failures are logged, not raised."""
+        if self.path is not None and os.path.exists(self.path):
+            try:
+                # NOTE: unpickling can execute arbitrary code, only load calibration files from a trusted source
+                with open(self.path, 'rb') as f:
+                    loaded_state = pickle.load(f)
+                    self.__dict__.update(loaded_state.__dict__)
+                    self._is_loaded_from_file = True
+            except Exception as e:
+                logging.warning(f'Could not load calibration manager from {self.path}: {e}')
+            else:
+                logging.info(f'Loaded calibration manager from {self.path}')
+        else:
+            logging.warning(f'Calibration manager path {self.path} does not exist')
+
+    def get_group_names(self):
+        """Return the names of all calibration groups in the order of `estimator_groups`.
+
+        Returns
+        -------
+        list of str
+            List of calibration group names
+        """
+
+        return [group['name'] for group in self.estimator_groups]
+
+    def get_group(self, group_name : str):
+        """Look up a calibration group by its name.
+
+        Parameters
+        ----------
+
+        group_name : str
+            Name of the calibration group
+
+        Returns
+        -------
+        dict
+            The first calibration group dict (`name` and `estimators` keys) with a matching name.
+            None is returned and an error is logged if no group matches.
+        """
+        matching_groups = (group for group in self.estimator_groups if group['name'] == group_name)
+        found_group = next(matching_groups, None)
+
+        if found_group is None:
+            logging.error(f'could not get_group: {group_name}')
+        return found_group
+    
+    def get_estimator_names(self, group_name : str):
+        """List the names of all estimators which belong to a calibration group.
+
+        Parameters
+        ----------
+
+        group_name : str
+            Name of the calibration group
+
+        Returns
+        -------
+        list of str
+            List of estimator names, None if the group does not exist
+        """
+
+        group = self.get_group(group_name)
+        if group is None:
+            logging.error(f'could not get_estimator_names: {group_name}')
+            return None
+        return [estimator.name for estimator in group['estimators']]
+
+    def get_estimator(self, group_name : str, estimator_name : str):
+
+        """Look up a single estimator by group name and estimator name.
+
+        Parameters
+        ----------
+
+        group_name : str
+            Name of the calibration group
+
+        estimator_name : str
+            Name of the estimator
+
+        Returns
+        -------
+        Calibration
+            The first estimator of the group with a matching name.
+            None is returned and an error is logged if the group or the estimator does not exist.
+        """
+        group = self.get_group(group_name)
+        candidates = [] if group is None else group['estimators']
+        for candidate in candidates:
+            if candidate.name == estimator_name:
+                return candidate
+        logging.error(f'could not get_estimator: {group_name}, {estimator_name}')
+        return None
+
+    def fit(
+        self,
+        df : pd.DataFrame,
+        group_name : str,
+        *args,
+        **kwargs
+        ):
+        """Fit every estimator of the calibration group with the given name.
+
+        Parameters
+        ----------
+
+        df : pandas.DataFrame
+            Dataframe containing the input and target columns
+
+        group_name : str
+            Name of the calibration group
+
+        *args, **kwargs
+            Forwarded to the `fit` method of every estimator
+        """
+        if len(self.estimator_groups) == 0:
+            raise ValueError('No estimators defined')
+        selected_groups = [group for group in self.estimator_groups if group['name'] == group_name]
+        if len(selected_groups) == 0:
+            raise ValueError(f'No group named {group_name} found')
+        for selected_group in selected_groups:
+            for estimator in selected_group['estimators']:
+                logging.info(f'calibration group: {group_name}, fitting {estimator.name} estimator ')
+                estimator.fit(df, *args, neptune_key=f'{group_name}_{estimator.name}', **kwargs)
+
+    def predict(
+            self,
+            df : pd.DataFrame,
+            group_name : str,
+            *args,
+            **kwargs):
+        """Apply every estimator of the calibration group with the given name to the dataframe.
+
+        Every estimator is called with `inplace=True`.
+
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            Dataframe containing the input and target columns
+
+        group_name : str
+            Name of the calibration group
+
+        *args, **kwargs
+            Forwarded to the `predict` method of every estimator
+        """
+        if len(self.estimator_groups) == 0:
+            raise ValueError('No estimators defined')
+        selected_groups = [group for group in self.estimator_groups if group['name'] == group_name]
+        if len(selected_groups) == 0:
+            raise ValueError(f'No group named {group_name} found')
+        for selected_group in selected_groups:
+            for estimator in selected_group['estimators']:
+                logging.info(f'calibration group: {group_name}, predicting {estimator.name}')
+                estimator.predict(df, *args, inplace=True, **kwargs)
+
+    def fit_predict(
+            self,
+            df : pd.DataFrame,
+            group_name : str,
+            plot : bool = True,
+        ):
+        """Fit and predict all estimators in a calibration group.
+
+        Parameters
+        ----------
+
+        df : pandas.DataFrame
+            Dataframe containing the input and target columns. It is passed to `fit` and then to `predict`.
+
+        group_name : str
+            Name of the calibration group
+
+        plot : bool, default=True
+            Forwarded as `plot` keyword to the `fit` of every estimator. If True, a plot of the calibration is generated.
+
+        """
+        self.fit(df, group_name, plot=plot)
+        self.predict(df, group_name)
+
+class CalibrationModelProvider:
+    def __init__(self):
+        """Registry which maps model names to scikit-learn compatible model templates.
+        """
+        # model name -> model template
+        self.model_dict = {}
+
+    def __repr__(self) -> str:
+        # one line per registered model between the opening and closing bracket
+        entries = ''.join(
+            f' \t {key}: {value}\n' for key, value in self.model_dict.items()
+        )
+        return '<CalibrationModelProvider, \n[\n' + entries + ']>'
+
+    def register_model(
+            self,
+            model_name : str,
+            model_template : sklearn.base.BaseEstimator
+        ):
+        """Store a model template under the given name, an existing entry is replaced.
+
+        Parameters
+        ----------
+        model_name : str
+            Name of the model
+
+        model_template : sklearn.base.BaseEstimator
+            The model template which must have a fit and predict method.
+
+        """
+        self.model_dict[model_name] = model_template
+
+    def get_model(self, model_name : str):
+        """Return the model template which was registered under the given name.
+
+        Parameters
+        ----------
+        model_name : str
+            Name of the model
+
+        Returns
+        -------
+        sklearn.base.BaseEstimator
+            The model template which must have a fit and predict method.
+
+        Raises
+        ------
+        ValueError
+            If no model was registered under the given name
+        """
+        if model_name in self.model_dict:
+            return self.model_dict[model_name]
+        raise ValueError(f'Unknown model {model_name}')
+
+def PolynomialRegression(degree=2, include_bias=False):
+    """Build a pipeline of polynomial feature expansion ('poly') followed by linear regression ('linear')."""
+    polynomial_features = PolynomialFeatures(degree=degree, include_bias=include_bias)
+    steps = [('poly', polynomial_features), ('linear', LinearRegression())]
+    return Pipeline(steps)
+
+calibration_model_provider = CalibrationModelProvider()
+calibration_model_provider.register_model('LinearRegression', LinearRegression)
+calibration_model_provider.register_model('LOESSRegression', LOESSRegression)
+calibration_model_provider.register_model('PolynomialRegression', PolynomialRegression)
\ No newline at end of file
diff --git a/alphadia/cli.py b/alphadia/cli.py
index ed9b62bb..2cfef77d 100644
--- a/alphadia/cli.py
+++ b/alphadia/cli.py
@@ -1,18 +1,18 @@
 #!python
 
-
-# builtin
+# native imports
 import logging
 import time
-import sys
 import yaml
 
-# external
-import click
-
-# local
+# alphadia imports
 import alphadia
-from alphadia.extraction.workflow import reporting
+from alphadia.workflow import reporting
+
+# alpha family imports
+
+# third party imports
+import click
 
 @click.group(
     context_settings=dict(
@@ -163,7 +163,7 @@ def extract(**kwargs):
         matplotlib.use('Agg')
 
         from alphabase.spectral_library.base import SpecLibBase
-        from alphadia.extraction.planning import Plan
+        from alphadia.planning import Plan
 
         lib = SpecLibBase()
         lib.load_hdf(library, load_mod_seq=True)
@@ -185,186 +185,4 @@ def extract(**kwargs):
         )
 
     except Exception as e:
-        logging.exception(e)
-
-@run.group(
-    "spectrum",
-    help="Process DIA data spectrum-centric."
-)
-def spectrum(*args, **kwargs):
-    pass
-
-@spectrum.command(
-    "create",
-    help="Create pseudo MSMS spectra from a DIA file."
-)
-@click.argument(
-    "file_names",
-    type=click.Path(exists=True, file_okay=True, dir_okay=True),
-    required=True,
-    nargs=-1,
-)
-@click.option(
-    "--folder",
-    help="If set, the input arguments are considered folders and all .d folders in them will be processed",
-    is_flag=True,
-    default=False,
-)
-@click.option(
-    "--thread_count",
-    help="The number of threads to use. 0 for all, negative to keep available.",
-    type=int,
-    default=-1,
-    show_default=True,
-)
-def create_spectrum(file_names, folder, thread_count, **kwargs):
-    logging.basicConfig(
-        format='%(asctime)s> %(message)s',
-        datefmt="%Y-%m-%d %H:%M:%S",
-        level=logging.INFO
-    )
-    start_time = time.time()
-    try:
-        import os
-        import alphatims.bruker
-        import alphadia.preprocessing
-        import alphatims.utils
-        alphatims.utils.set_threads(thread_count)
-        # alphatims.utils.set_logger()
-        if folder:
-            dia_file_names = []
-            for directory in file_names:
-                for file_name in os.listdir(directory):
-                    full_file_name = os.path.join(directory, file_name)
-                    dia_file_names.append(full_file_name)
-        else:
-            dia_file_names = file_names
-        for dia_file_name in dia_file_names:
-            if not dia_file_name.endswith(".d") or os.path.isfile(dia_file_name):
-                continue
-            dia_data = alphatims.bruker.TimsTOF(dia_file_name)
-            preprocessing_workflow = alphadia.preprocessing.Workflow()
-            preprocessing_workflow.set_dia_data(dia_data)
-            preprocessing_workflow.run_default()
-            # preprocessing_workflow.save_to_hdf()
-            # preprocessing_workflow.load_from_hdf()
-            preprocessing_workflow.msms_generator.write_to_hdf_file()
-    except Exception:
-        logging.exception("Something went wrong, execution incomplete!")
-    else:
-        logging.info(
-            f"Analysis done in {time.time() - start_time:.2f} seconds."
-        )
-
-
-
-@spectrum.command(
-    "annotate",
-    help="Annotate pseudo MSMS spectra from a DIA file."
-)
-@click.argument(
-    "library_file_name",
-    type=click.Path(exists=True, file_okay=True, dir_okay=False),
-    required=True,
-)
-@click.argument(
-    "file_names",
-    type=click.Path(exists=True, file_okay=True, dir_okay=True),
-    required=True,
-    nargs=-1,
-)
-@click.option(
-    "--folder",
-    help="If set, the input arguments are considered folders and all .d folders in them will be processed",
-    is_flag=True,
-    default=False,
-)
-@click.option(
-    "--thread_count",
-    help="The number of threads to use. 0 for all, negative to keep available.",
-    type=int,
-    default=-1,
-    show_default=True,
-)
-def annotate_spectrum(library_file_name, file_names, folder, thread_count, **kwargs):
-    logging.basicConfig(
-        format='%(asctime)s> %(message)s',
-        datefmt="%Y-%m-%d %H:%M:%S",
-        level=logging.INFO
-    )
-    start_time = time.time()
-    try:
-        import os
-        import alphatims.utils
-        import alphadia.annotation
-        import alphabase.io.hdf
-        import numpy as np
-        import pandas as pd
-        alphatims.utils.set_threads(thread_count)
-        # alphatims.utils.set_logger()
-        if folder:
-            dia_file_names = []
-            for directory in file_names:
-                for file_name in os.listdir(directory):
-                    full_file_name = os.path.join(directory, file_name)
-                    dia_file_names.append(full_file_name)
-        else:
-            dia_file_names = file_names
-        library = None
-        for dia_file_name in dia_file_names:
-            print(dia_file_name)
-            if not (dia_file_name.endswith("pseudo_spectra.hdf") and os.path.isfile(dia_file_name)):
-                continue
-            if library is None:
-                library = alphadia.annotation.library.Library()
-                library.import_from_file(
-                    # "/mnt/a54a8df1-78df-4788-bd29-6fca4115f5c0/software_development_data/fastas/phospho_jon/predict.speclib.hdf",
-                    # "/mnt/a54a8df1-78df-4788-bd29-6fca4115f5c0/software_development_data/fastas/diann_entrapment/predict.speclib.hdf",
-                    library_file_name,
-                    is_already_mmapped=False,
-                )
-                all_seqs = library.lib.library.mod_seq_df.sequence.values
-                all_mods = library.lib.library.mod_seq_df.mods.values
-                all_mod_sites = library.lib.library.mod_seq_df.mod_sites.values
-                all_msch = library.lib.library.mod_seq_df.mod_seq_charge_hash.values
-                all_prot_idxs = library.lib.library.mod_seq_df.protein_idxes.values
-                all_prots = library.lib.library.protein_df.values
-            hdf = alphabase.io.hdf.HDF_File(
-                # f"{self.dia_data.sample_name}.pseudo_spectra.hdf",
-                dia_file_name,
-            #     read_only=False,
-            )
-            fragment_df = hdf.fragments.values
-            precursor_df = hdf.precursors.values
-            annotator = alphadia.annotation.Annotator()
-            # annotator.set_preprocessor(preprocessing_workflow)
-            annotator.set_ions(precursor_df, fragment_df)
-            annotator.set_library(library)
-            annotator.run_default()
-            mod_seq_charge_hash_pointer = annotator.percolator.score_df.original_index.values
-            annotator.percolator.score_df["sequence"] = all_seqs[mod_seq_charge_hash_pointer].astype('U')
-            annotator.percolator.score_df["mods"] = all_mods[mod_seq_charge_hash_pointer].astype('U')
-            annotator.percolator.score_df["mod_sites"] = all_mod_sites[mod_seq_charge_hash_pointer].astype('U')
-            annotator.percolator.score_df["protein_idxs"] = all_prot_idxs[mod_seq_charge_hash_pointer].astype('U')
-            vals = annotator.percolator.score_df.protein_idxs.str.split(";", expand=True)[0]
-            # Might be more
-            annotated_prots = all_prots.iloc[vals.values.astype(np.int64)].reset_index(drop=True)
-            annotator.percolator.score_df = pd.concat([annotator.percolator.score_df, annotated_prots], axis=1)
-            annotator.percolator.score_df["unique_protein"] = ~annotator.percolator.score_df.protein_idxs.str.contains(";").values
-            annotations = annotator.percolator.score_df
-            annotations = annotator.percolator.score_df[
-                annotator.percolator.score_df.target_type==1
-            #     :
-            ]
-
-            annotations.to_csv(
-            #     f"{dia_data.bruker_d_folder_name[:-2]}_annotation.csv",
-                f"{hdf.file_name[:-4]}_annotation.csv",
-                index=False,
-            )
-    except Exception:
-        logging.exception("Something went wrong, execution incomplete!")
-    else:
-        logging.info(
-            f"Analysis done in {time.time() - start_time:.2f} seconds."
-        )
+        logging.exception(e)
\ No newline at end of file
diff --git a/alphadia/extraction/data/bruker.py b/alphadia/data/bruker.py
similarity index 98%
rename from alphadia/extraction/data/bruker.py
rename to alphadia/data/bruker.py
index c44eea66..df7af14b 100644
--- a/alphadia/extraction/data/bruker.py
+++ b/alphadia/data/bruker.py
@@ -1,24 +1,21 @@
 # native imports
 import math
 import os
+import logging
+logger = logging.getLogger()
 
 # alphadia imports
-from alphadia.extraction.numba import numeric
-from alphadia.extraction import utils
+from alphadia import utils
 
 # alpha family imports
-
-# third party imports
-import h5py
-
 import alphatims.utils
 import alphatims.bruker
 import alphatims.tempmmap as tm
+
+# third party imports
 import numpy as np
 import numba as nb
-import logging
 from numba.core import types
-from numba.typed import Dict
 from numba.experimental import jitclass
 
 class TimsTOFTranspose(alphatims.bruker.TimsTOF):
@@ -42,7 +39,7 @@ def __init__(self,
 
         if bruker_d_folder_name.endswith("/"):
             bruker_d_folder_name = bruker_d_folder_name[:-1]
-        logging.info(f"Importing data from {bruker_d_folder_name}")
+        logger.info(f"Importing data from {bruker_d_folder_name}")
         if bruker_d_folder_name.endswith(".d"):
             bruker_hdf_file_name = f"{bruker_d_folder_name[:-2]}.hdf"
             hdf_file_exists = os.path.exists(bruker_hdf_file_name)
@@ -66,7 +63,7 @@ def __init__(self,
                 )
 
                 if self._cycle.shape[0] != 1:
-                    logging.error('Unexpected cycle shape. Will only retain first frame group')
+                    logger.error('Unexpected cycle shape. Will only retain first frame group')
                     raise ValueError('Unexpected cycle shape. Will only retain first frame group')
                 
                 self.transpose()
@@ -83,7 +80,7 @@ def __init__(self,
         if not hasattr(self, "version"):
             self._version = "N.A."
         if self.version != alphatims.__version__:
-            logging.info(
+            logger.info(
                 "WARNING: "
                 f"AlphaTims version {self.version} was used to initialize "
                 f"{bruker_d_folder_name}, while the current version of "
@@ -95,22 +92,22 @@ def __init__(self,
         )
 
         # Precompile
-        logging.info(f"Successfully imported data from {bruker_d_folder_name}")
+        logger.info(f"Successfully imported data from {bruker_d_folder_name}")
 
     def transpose(self):
 
         # abort if transposed data is already present
         if hasattr(self, "_push_indices") and hasattr(self, "_tof_indptr"):
-            logging.info("Transposed data already present, aborting")
+            logger.info("Transposed data already present, aborting")
             return
         
-        logging.info('Transposing detector events')
+        logger.info('Transposing detector events')
         push_indices, tof_indptr, intensity_values = transpose(
             self._tof_indices,
             self._push_indptr,
             self._intensity_values
         )
-        logging.info('Finished transposing data')
+        logger.info('Finished transposing data')
 
         self._tof_indices = np.zeros(1, np.uint32)
         self._push_indptr = np.zeros(1, np.int64)
diff --git a/alphadia/extraction/data/thermo.py b/alphadia/data/thermo.py
similarity index 99%
rename from alphadia/extraction/data/thermo.py
rename to alphadia/data/thermo.py
index b35b2e53..ce3f5650 100644
--- a/alphadia/extraction/data/thermo.py
+++ b/alphadia/data/thermo.py
@@ -1,11 +1,15 @@
-
-import numpy as np
-import numba as nb
-import pandas as pd
+# native imports
 import math
 import os
 
-from alphadia.extraction import utils
+# alphadia imports
+from alphadia import utils
+
+# alpha family imports
+
+# third party imports
+import numpy as np
+import numba as nb
 
 def normed_auto_correlation(x):
     """Calculate the normalized auto correlation of a 1D array.
diff --git a/alphadia/dia.py b/alphadia/dia.py
deleted file mode 100644
index 7b5e7b3c..00000000
--- a/alphadia/dia.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#!python
-
-
-# builtin
-import logging
-
-# import external
-import alphatims.bruker
-import numpy as np
-
-# import local
-import alphadia.library
-import alphadia.thermo
-
-
-def run_analysis(
-    dia_file_name: str,
-    alphapept_library_file_name: str,
-    output_file_name: str,
-    ppm: float,
-    rt_tolerance: float,
-    mobility_tolerance: float,
-    max_scan_difference: int,
-    max_cycle_difference: int,
-    thread_count: int,
-    fdr_rate: float,
-):
-    alphatims.utils.set_threads(thread_count)
-    logging.info("Loading DIA data")
-    if dia_file_name.endswith(".raw"):
-        dia_data = alphadia.thermo.RawFile(dia_file_name, False)
-    else:
-        dia_data = alphatims.bruker.TimsTOF(dia_file_name)
-    logging.info("Loading target library")
-    library = alphadia.library.Library(
-        alphapept_library_file_name,
-        decoy=False
-    )
-    logging.info(f"Target library contains {len(library)} targets")
-    logging.info("Loading decoy library")
-    decoy_library = alphadia.library.Library(
-        alphapept_library_file_name,
-        decoy=True
-    )
-    logging.info(f"Decoy library contains {len(decoy_library)} decoys")
-    logging.info("Scoring target library")
-    scores_df = library.score(
-        dia_data,
-        max_scan_difference=max_scan_difference,
-        max_cycle_difference=max_cycle_difference,
-        ppm=ppm,
-        rt_tolerance=rt_tolerance,  # seconds
-        mobility_tolerance=mobility_tolerance,  # 1/k0
-    )
-    logging.info("Scoring decoy library")
-    decoy_scores_df = decoy_library.score(
-        dia_data,
-        max_scan_difference=max_scan_difference,
-        max_cycle_difference=max_cycle_difference,
-        ppm=ppm,
-        rt_tolerance=rt_tolerance,  # seconds
-        mobility_tolerance=mobility_tolerance,  # 1/k0
-    )
-    logging.info("Calculating FDR")
-    fdr_df = alphadia.library.train_and_score(
-        scores_df,
-        decoy_scores_df,
-        train_fdr_level=0.5,
-        # min_train=100,
-        # plot=True,
-    )
-    reachable = np.sum(fdr_df.target)
-    hits = np.sum((fdr_df['q_value'] <= fdr_rate) & fdr_df.target.values)
-    hit_rate = hits / reachable
-    logging.info(
-        f"Found {reachable} reachable targets"
-    )
-    logging.info(
-        f"Found {hits} ({100 * hit_rate:.2f}%) targets at FDR {fdr_rate}"
-    )
-    logging.info("Exporting results")
-    fdr_df.to_csv(output_file_name)
diff --git a/alphadia/extraction/__init__.py b/alphadia/extraction/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/alphadia/extraction/calibration.py b/alphadia/extraction/calibration.py
deleted file mode 100644
index 18806fc1..00000000
--- a/alphadia/extraction/calibration.py
+++ /dev/null
@@ -1,811 +0,0 @@
-# native imports
-import os
-import logging
-import yaml 
-import typing
-import pickle
-
-# alphadia imports
-from alphadia.extraction.plotting.utils import density_scatter
-
-# alpha family imports
-import alphatims.bruker
-import alphatims.utils
-from alphabase.statistics.regression import LOESSRegression
-
-# third party imports
-import pandas as pd
-import numpy as np
-from matplotlib import pyplot as plt
-
-import sklearn.base
-from sklearn.linear_model import LinearRegression
-from sklearn.preprocessing import PolynomialFeatures
-from sklearn.pipeline import Pipeline
-
-
-class Calibration():
-    def __init__(self, 
-                name : str = '',
-                function : object = None,
-                input_columns : typing.List[str] = [],
-                target_columns : typing.List[str] = [],
-                output_columns : typing.List[str] = [],
-                transform_deviation : typing.Union[None, float] = None,
-                **kwargs):
-        """A single estimator for a property (mz, rt, etc.).
-
-        Calibration is performed by modeling the deviation of an input values (e.g. mz_library) from an observed property (e.g. mz_observed) using a function (e.g. LinearRegression). Once calibrated, calibrated values (e.g. mz_calibrated) can be predicted from input values (e.g. mz_library). Additional explaining variabels can be added to the input values (e.g. rt_library) to improve the calibration.
-
-        Parameters
-        ----------
-
-        name : str
-            Name of the estimator for logging and plotting e.g. 'mz'
-        
-        function : object
-            The estimator object instance which must have a fit and predict method.
-            This will usually be a sklearn estimator or a custom estimator.
-
-        input_columns : list of str
-            The columns of the dataframe that are used as input for the estimator e.g. ['mz_library']. 
-            The first column is the property which should be calibrated, additional columns can be used as explaining variables e.g. ['mz_library', 'rt_library'].
-
-        target_columns : list of str
-            The columns of the dataframe that are used as target for the estimator e.g. ['mz_observed'].
-            At the moment only one target column is supported.
-
-        output_columns : list of str
-            The columns of the dataframe that are used as output for the estimator e.g. ['mz_calibrated'].
-            At the moment only one output column is supported.
-        
-        transform_deviation : typing.List[Union[None, float]]
-            If set to a valid float, the deviation is expressed as a fraction of the input value e.g. 1e6 for ppm.
-            If set to None, the deviation is expressed in absolute units.
-
-        """
-        
-        self.name = name
-        self.function = function
-        self.input_columns = input_columns
-        self.target_columns = target_columns
-        self.output_columns = output_columns
-        self.transform_deviation = float(transform_deviation) if transform_deviation is not None else None
-        self.is_fitted = False
-
-    def __repr__(self) -> str:
-        return f'<Calibration {self.name}, is_fitted: {self.is_fitted}>'
-
-    def save(self, file_name: str):
-        """Save the estimator to pickle file.
-
-        Parameters
-        ----------
-
-        file_name : str
-            Path to the pickle file
-
-        """
-
-        with open(file_name, 'wb') as f:
-            pickle.dump(self, f)
-
-    def load(self, file_name: str):
-        """Load the estimator from pickle file.
-
-        Parameters
-        ----------
-
-        file_name : str
-            Path to the pickle file
-
-        """
-
-        with open(file_name, 'rb') as f:
-            loaded_calibration = pickle.load(f)
-            self.__dict__.update(loaded_calibration.__dict__)
-
-    def validate_columns(
-            self, 
-            dataframe : pd.DataFrame
-        ):
-        """Validate that the input and target columns are present in the dataframe.
-
-        Parameters
-        ----------
-        dataframe : pandas.DataFrame
-            Dataframe containing the input and target columns
-
-        Returns
-        -------
-        bool
-            True if all columns are present, False otherwise
-
-        """
-
-        valid = True
-
-        if len(self.target_columns) > 1 :
-            logging.warning('Only one target column supported')
-            valid = False
-
-        required_columns = set(self.input_columns + self.target_columns)
-        if not required_columns.issubset(dataframe.columns):
-            logging.warning(f'{self.name}, at least one column {required_columns} not found in dataframe')
-            valid = False
-
-        return valid
-
-    def fit(
-            self, 
-            dataframe : pd.DataFrame,
-            plot : bool = False, 
-            **kwargs
-        ):
-        """Fit the estimator based on the input and target columns of the dataframe.
-
-        Parameters
-        ----------
-
-        dataframe : pandas.DataFrame
-            Dataframe containing the input and target columns
-
-        plot : bool, default=False
-            If True, a plot of the calibration is generated.
-        
-        Returns
-        -------
-
-        np.ndarray
-            Array of shape (n_input_columns, ) containing the mean absolute deviation of the residual deviation at the given confidence interval
-
-        """
-
-        if not self.validate_columns(dataframe):
-            logging.warning(f'{self.name} calibration was skipped')
-            return
-
-        if self.function is None:
-            raise ValueError('No estimator function provided')
-
-        input_values = dataframe[self.input_columns].values
-        target_value = dataframe[self.target_columns].values
-
-        try:
-            self.function.fit(input_values, target_value)
-            self.is_fitted = True
-        except Exception as e:
-            logging.error(f'Could not fit estimator {self.name}: {e}')
-            return
-
-        if plot == True:
-            self.plot(dataframe, **kwargs)
-
-
-    def predict(self, dataframe, inplace=True):
-        """Perform a prediction based on the input columns of the dataframe.
-
-        Parameters
-        ----------
-        dataframe : pandas.DataFrame
-            Dataframe containing the input and target columns
-
-        inplace : bool, default=True
-            If True, the prediction is added as a new column to the dataframe. If False, the prediction is returned as a numpy array.
-
-        Returns
-        -------
-        np.ndarray
-            Array of shape (n_samples, ) containing the prediction
-
-        """
-
-        if self.is_fitted == False:
-            logging.warning(f'{self.name} prediction was skipped as it has not been fitted yet')
-            return
-        
-        if not set(self.input_columns).issubset(dataframe.columns):
-            logging.warning(f'{self.name} calibration was skipped as input column {self.input_columns} not found in dataframe')
-            return
-
-        input_values = dataframe[self.input_columns].values
-        
-        if inplace:
-            dataframe[self.output_columns[0]] = self.function.predict(input_values)
-        else:
-            return self.function.predict(input_values)
-        
-    def fit_predict(
-        self,
-        dataframe : pd.DataFrame,
-        plot : bool = False,
-        inplace : bool = True
-        ):
-        """Fit the estimator and perform a prediction based on the input columns of the dataframe.
-
-        Parameters
-        ----------
-
-        dataframe : pandas.DataFrame
-            Dataframe containing the input and target columns
-        
-        plot : bool, default=False
-            If True, a plot of the calibration is generated.
-
-        inplace : bool, default=True
-            If True, the prediction is added as a new column to the dataframe. If False, the prediction is returned as a numpy array.
-
-        """
-        self.fit(dataframe, plot=plot)
-        return self.predict(dataframe, inplace=inplace)
-
-    def deviation(self, dataframe : pd.DataFrame):
-        """ Calculate the deviations between the input, target and calibrated values.
-
-        Parameters
-        ----------
-        dataframe : pandas.DataFrame
-            Dataframe containing the input and target columns
-
-        Returns 
-        -------
-        np.ndarray 
-            Array of shape (n_samples, 3 + n_input_columns). 
-            The second dimension contains the observed deviation, calibrated deviation, residual deviation and the input values.
-        
-        """
-
-        # the first column is the unclaibrated input property
-        # all other columns are explaining variables
-        input_values = dataframe[self.input_columns].values
-
-        # the first column is the unclaibrated input property
-        uncalibrated_values = input_values[:, [0]]
-
-        # only one target column is supported
-        target_values = dataframe[self.target_columns].values[:, [0]]
-        input_transform = self.transform_deviation
-
-        calibrated_values = self.predict(dataframe, inplace=False)
-        if calibrated_values.ndim == 1:
-            calibrated_values = calibrated_values[:, np.newaxis]
-
-        # only one output column is supported
-        calibrated_dim = calibrated_values[:, [0]]
-
-        # deviation is the difference between the (observed) target value and the uncalibrated input value
-        observed_deviation = target_values - uncalibrated_values
-        if input_transform is not None:
-            observed_deviation = observed_deviation/uncalibrated_values * float(input_transform)
-
-        # calibrated deviation is the explained difference between the (calibrated) target value and the uncalibrated input value
-        calibrated_deviation = calibrated_dim - uncalibrated_values
-        if input_transform is not None:
-            calibrated_deviation = calibrated_deviation/uncalibrated_values * float(input_transform)
-
-        # residual deviation is the unexplained difference between the (observed) target value and the (calibrated) target value
-        residual_deviation = observed_deviation - calibrated_deviation
-
-        return np.concatenate([observed_deviation, calibrated_deviation, residual_deviation, input_values], axis=1)
-
-    def ci(self, dataframe, ci : float = 0.95):
-        """Calculate the residual deviation at the given confidence interval.
-
-        Parameters
-        ----------
-
-        dataframe : pandas.DataFrame
-            Dataframe containing the input and target columns
-
-        ci : float, default=0.95
-            confidence interval
-
-        Returns
-        -------
-
-        float
-            the confidence interval of the residual deviation after calibration
-        """
-        
-        if not 0 < ci < 1:
-            raise ValueError('Confidence interval must be between 0 and 1')
-        
-        if not self.is_fitted:
-            return 0
-
-        ci_percentile = [100*(1-ci)/2, 100*(1+ci)/2]
-        
-        deviation = self.deviation(dataframe)
-        residual_deviation = deviation[:, 2]
-        return np.mean(np.abs(np.percentile(residual_deviation, ci_percentile)))
-
-    def get_transform_unit(
-            self, 
-            transform_deviation : typing.Union[None, float]
-        ):
-
-        """Get the unit of the deviation based on the transform deviation.
-        
-        Parameters
-        ----------
-        
-        transform_deviation : typing.Union[None, float]
-            If set to a valid float, the deviation is expressed as a fraction of the input value e.g. 1e6 for ppm.
-            
-        Returns
-        -------
-        str
-            The unit of the deviation
-
-        """
-        if transform_deviation is not None:
-            if np.isclose(transform_deviation,1e6):
-                return '(ppm)'
-            elif np.isclose(transform_deviation,1e2):
-                return '(%)'
-            else:
-                return f'({transform_deviation})'
-        else:
-            return '(absolute)'
-
-
-    def plot(
-            self, 
-            dataframe : pd.DataFrame, 
-            figure_path : str = None,
-            #neptune_run : str = None, 
-            #neptune_key :str = None, 
-            **kwargs
-        ):
-
-        """Plot the data and calibration model.
-
-        Parameters
-        ----------
-
-        dataframe : pandas.DataFrame
-            Dataframe containing the input and target columns
-
-        figure_path : str, default=None
-            If set, the figure is saved to the given path.
-
-        neptune_run : str, default=None
-            If set, the figure is logged to the given neptune run.
-
-        neptune_key : str, default=None
-            key under which the figure is logged to the neptune run.
-
-        """
-
-        deviation = self.deviation(dataframe)
-
-        n_input_properties = deviation.shape[1] - 3
-
-        transform_unit = self.get_transform_unit(self.transform_deviation)
-
-        fig, axs = plt.subplots(n_input_properties, 2, figsize=(6.5, 3.5*n_input_properties), squeeze=False)
-
-        for input_property in range(n_input_properties):
-
-            # plot the relative observed deviation
-            density_scatter(
-                deviation[:, 3+input_property], 
-                deviation[:, 0],
-                axis=axs[input_property, 0],  
-                s=1
-            )
-
-            # plot the calibration model 
-            x_values = deviation[:, 3+input_property]
-            y_values = deviation[:, 1]
-            order = np.argsort(x_values)
-            x_values = x_values[order]
-            y_values = y_values[order]
-
-            axs[input_property, 0].plot(x_values, y_values, color='red')
-
-            # plot the calibrated deviation
-
-            density_scatter(
-                deviation[:, 3+input_property],
-                deviation[:, 2],
-                axis=axs[input_property, 1],
-                s=1
-            )
-
-            for ax, dim in zip(axs[input_property, :],[0,2]):
-                ax.set_xlabel(self.input_columns[input_property])
-                ax.set_ylabel(f'observed deviation {transform_unit}')
-                
-                # get absolute y value and set limites to plus minus absolute y
-                y = deviation[:, dim] 
-                y_abs = np.abs(y)
-                ax.set_ylim(-y_abs.max()*1.05, y_abs.max()*1.05)
-
-        fig.tight_layout()
-
-        # log figure to neptune ai
-        #if neptune_run is not None and neptune_key is not None:
-        #    neptune_run[f'calibration/{neptune_key}'].log(fig)
-
-        #if figure_path is not None:
-            
-        #    i = 0
-        #    file_name = os.path.join(figure_path, f'calibration_{neptune_key}_{i}.png')
-        #    while os.path.exists(file_name):
-        #        file_name = os.path.join(figure_path, f'calibration_{neptune_key}_{i}.png')
-        #        i += 1
-
-        #    fig.savefig(file_name)
-            
-        plt.show()  
-
-        plt.close()
-        
-class CalibrationManager():
-
-    def __init__(
-            self,
-            config : typing.Union[None, dict] = None,
-            path : typing.Union[None, str] = None,
-            load_calibration : bool = True):
-
-        """Contains, updates and applies all calibrations for a single run.
-
-        Calibrations are grouped into calibration groups. Each calibration group is applied to a single data structure (precursor dataframe, fragment fataframe, etc.). Each calibration group contains multiple estimators which each calibrate a single property (mz, rt, etc.). Each estimator is a `Calibration` object which contains the estimator function.
-        
-        Parameters
-        ----------
-
-        config : typing.Union[None, dict], default=None
-            Calibration config dict. If None, the default config is used.
-
-        path : str, default=None
-            Path where the current parameter set is saved to and loaded from.
-
-        load_calibration : bool, default=True
-            If True, the calibration manager is loaded from the given path.
-        
-        """
-        self._is_loaded_from_file = False
-        self.estimator_groups = []
-        self.path = path
-
-        logging.info('========= Initializing Calibration Manager =========')
-
-        self.load_config(config)
-        if load_calibration:
-            self.load()
-
-        logging.info('====================================================')
-
-    @property
-    def is_loaded_from_file(self):
-        """Check if the calibration manager was loaded from file.
-        """
-        return self._is_loaded_from_file
-    
-    @property
-    def is_fitted(self):
-        """Check if all estimators in all calibration groups are fitted.
-        """
-
-        is_fitted = True
-        for group in self.estimator_groups:
-            for estimator in group['estimators']:
-                if not estimator.is_fitted:
-                    is_fitted = False
-                    break
-        
-        return is_fitted and len(self.estimator_groups) > 0
-
-    def load_config(self, config : dict):
-        """Load calibration config from config Dict.
-
-        each calibration config is a list of calibration groups which consist of multiple estimators.
-        For each estimator the `model` and `model_args` are used to request a model from the calibration_model_provider and to initialize it.
-        The estimator is then initialized with the `Calibration` class and added to the group.
-
-        Parameters
-        ----------
-
-        config : dict
-            Calibration config dict
-
-        Example
-        -------
-
-        Create a calibration manager with a single group and a single estimator:
-
-        .. code-block:: python
-
-            calibration_manager = calibration.CalibrationManager()
-            calibration_manager.load_config([{
-                'name': 'mz_calibration',
-                'estimators': [
-                    {
-                        'name': 'mz',
-                        'model': 'LOESSRegression',
-                        'model_args': {
-                            'n_kernels': 2
-                        },
-                        'input_columns': ['mz_library'],
-                        'target_columns': ['mz_observed'],
-                        'output_columns': ['mz_calibrated'],
-                        'transform_deviation': 1e6
-                    },
-                    
-                ]
-            }])
-        
-        """
-        
-        logging.info('loading calibration config')
-        logging.info(f'found {len(config)} calibration groups')
-        for group in config:
-            logging.info(f'Calibration group :{group["name"]}, found {len(group["estimators"])} estimator(s)')
-            for estimator in group['estimators']:
-                try:
-                    template = calibration_model_provider.get_model(estimator['model'])
-                    model_args = estimator['model_args'] if 'model_args' in estimator else {}
-                    estimator['function'] = template(**model_args)
-                except Exception as e:
-                    logging.error(f'Could not load estimator {estimator["name"]}: {e}')
-
-            group_copy = {'name': group['name']} 
-            group_copy['estimators'] = [Calibration(**x) for x in group['estimators']]
-            self.estimator_groups.append(group_copy)
-
-    def save(self):
-        """Save the calibration manager state to pickle file.
-        """
-        if self.path is not None:
-            with open(self.path, 'wb') as f:
-                pickle.dump(self, f)
-
-    def load(self):
-        """Load the calibration manager from pickle file.
-        """
-        if self.path is not None and os.path.exists(self.path):
-            try:
-                with open(self.path, 'rb') as f:
-                    loaded_state = pickle.load(f)
-                    self.__dict__.update(loaded_state.__dict__)
-                    self._is_loaded_from_file = True
-            except:
-                logging.warning(f'Could not load calibration manager from {self.path}')
-            else:
-                logging.info(f'Loaded calibration manager from {self.path}')
-        else:
-            logging.warning(f'Calibration manager path {self.path} does not exist')
-
-    def get_group_names(self):
-        """Get the names of all calibration groups.
-
-        Returns
-        -------
-        list of str
-            List of calibration group names
-        """
-
-        return [x['name'] for x in self.estimator_groups]
-
-    def get_group(self, group_name : str):
-        """Get the calibration group by name.
-
-        Parameters
-        ----------
-
-        group_name : str
-            Name of the calibration group
-
-        Returns
-        -------
-        dict
-            Calibration group dict with `name` and `estimators` keys\
-        
-        """
-        for group in self.estimator_groups:
-            if group['name'] == group_name:
-                return group
-
-        logging.error(f'could not get_group: {group_name}')
-        return None
-    
-    def get_estimator_names(self, group_name : str):
-        """Get the names of all estimators in a calibration group.
-
-        Parameters
-        ----------
-
-        group_name : str
-            Name of the calibration group
-
-        Returns
-        -------
-        list of str
-            List of estimator names
-        """
-
-        group = self.get_group(group_name)
-        if group is not None:
-            return [x.name for x in group['estimators']]
-        logging.error(f'could not get_estimator_names: {group_name}')
-        return None
-
-    def get_estimator(self, group_name : str, estimator_name : str):
-
-        """Get an estimator from a calibration group.
-
-        Parameters
-        ----------
-
-        group_name : str
-            Name of the calibration group
-
-        estimator_name : str
-            Name of the estimator
-
-        Returns
-        -------
-        Calibration
-            The estimator object
-
-        """
-        group = self.get_group(group_name)
-        if group is not None:
-            for estimator in group['estimators']:
-                if estimator.name == estimator_name:
-                    return estimator
-        logging.error(f'could not get_estimator: {group_name}, {estimator_name}')
-        return None
-
-    def fit(
-        self, 
-        df : pd.DataFrame, 
-        group_name : str, 
-        *args,
-        **kwargs
-        ):
-        """Fit all estimators in a calibration group.
-
-        Parameters
-        ----------
-
-        df : pandas.DataFrame
-            Dataframe containing the input and target columns
-
-        group_name : str
-            Name of the calibration group
-
-        """ 
-
-        if len(self.estimator_groups) == 0:
-            raise ValueError('No estimators defined')
-
-        group_idx = [i for i, x in enumerate(self.estimator_groups) if x['name'] == group_name]
-        if len(group_idx) == 0:
-            raise ValueError(f'No group named {group_name} found')
-        for group in group_idx:
-            for estimator in self.estimator_groups[group]['estimators']:
-                logging.info(f'calibration group: {group_name}, fitting {estimator.name} estimator ')
-                estimator.fit(df, *args, neptune_key=f'{group_name}_{estimator.name}', **kwargs)
-
-    def predict(
-            self, 
-            df : pd.DataFrame, 
-            group_name : str, 
-            *args, 
-            **kwargs):
-        
-        """Predict all estimators in a calibration group.
-
-        Parameters
-        ----------
-
-        df : pandas.DataFrame
-            Dataframe containing the input and target columns
-
-        group_name : str
-            Name of the calibration group
-
-        """
-
-        if len(self.estimator_groups) == 0:
-            raise ValueError('No estimators defined')
-
-        group_idx = [i for i, x in enumerate(self.estimator_groups) if x['name'] == group_name]
-        if len(group_idx) == 0:
-            raise ValueError(f'No group named {group_name} found')
-        for group in group_idx:
-            for estimator in self.estimator_groups[group]['estimators']:
-                logging.info(f'calibration group: {group_name}, predicting {estimator.name}')
-                estimator.predict(df, inplace=True, *args, **kwargs)
-
-    def fit_predict(
-            self,
-            df : pd.DataFrame,
-            group_name : str,
-            plot : bool = True,
-        ):
-        """Fit and predict all estimators in a calibration group.
-
-        Parameters
-        ----------
-
-        df : pandas.DataFrame
-            Dataframe containing the input and target columns
-
-        group_name : str
-            Name of the calibration group
-
-        plot : bool, default=True
-            If True, a plot of the calibration is generated.
-
-        """
-        self.fit(df, group_name, plot=plot)
-        self.predict(df, group_name)
-
-class CalibrationModelProvider:
-    def __init__(self):
-
-        """Provides a collection of scikit-learn compatible models for calibration.       
-        """
-        self.model_dict = {}
-
-    def __repr__(self) -> str:
-        string = '<CalibrationModelProvider, \n[\n'
-        for key, value in self.model_dict.items():
-            string += f' \t {key}: {value}\n'
-        string += ']>'
-        return string
-
-    def register_model(
-            self, 
-            model_name : str, 
-            model_template : sklearn.base.BaseEstimator
-        ):
-        """Register a model template with a given name.
-
-        Parameters
-        ----------
-        model_name : str
-            Name of the model
-
-        model_template : sklearn.base.BaseEstimator
-            The model template which must have a fit and predict method.
-
-        """
-        self.model_dict[model_name] = model_template
-
-    def get_model(self, model_name : str):
-        """Get a model template by name.
-
-        Parameters
-        ----------
-
-        model_name : str
-            Name of the model
-
-        Returns
-        -------
-        sklearn.base.BaseEstimator
-            The model template which must have a fit and predict method.
-
-        """
-
-        if model_name not in self.model_dict:
-            raise ValueError(f'Unknown model {model_name}')
-        else:
-            return self.model_dict[model_name]
-
-def PolynomialRegression(degree=2, include_bias=False):
-    return Pipeline([
-        ('poly', PolynomialFeatures(degree=degree, include_bias=include_bias)),
-        ('linear', LinearRegression())
-    ])
-
-calibration_model_provider = CalibrationModelProvider()
-calibration_model_provider.register_model('LinearRegression', LinearRegression)
-calibration_model_provider.register_model('LOESSRegression', LOESSRegression)
-calibration_model_provider.register_model('PolynomialRegression', PolynomialRegression)
\ No newline at end of file
diff --git a/alphadia/extraction/fdr.py b/alphadia/fdr.py
similarity index 98%
rename from alphadia/extraction/fdr.py
rename to alphadia/fdr.py
index 22ff63d0..cfdf8a28 100644
--- a/alphadia/extraction/fdr.py
+++ b/alphadia/fdr.py
@@ -1,13 +1,18 @@
+# native imports
+import os
+import logging
+logger = logging.getLogger()
+
+# alphadia imports
+
+# alpha family imports
 
+# third party imports
 import pandas as pd
 import numpy as np
 import numba as nb
-import logging
-logger = logging.getLogger()
-
 import matplotlib.pyplot as plt
-import matplotlib
-import os
+import matplotlib as mpl
 import sklearn
 
 from typing import Union, Optional, Tuple, List
@@ -329,7 +334,7 @@ def plot_fdr(
         axs.spines['top'].set_visible(False)
         axs.spines['right'].set_visible(False)
         axs.get_yaxis().set_major_formatter(
-        matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
+        mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
 
     fig.tight_layout()
     plt.show()
diff --git a/alphadia/extraction/fdrexperimental.py b/alphadia/fdrexperimental.py
similarity index 87%
rename from alphadia/extraction/fdrexperimental.py
rename to alphadia/fdrexperimental.py
index 23b78bdd..cd462842 100644
--- a/alphadia/extraction/fdrexperimental.py
+++ b/alphadia/fdrexperimental.py
@@ -1,15 +1,20 @@
+# native imports
+from abc import ABC, abstractmethod
+import warnings 
+from copy import deepcopy
+import typing
+
+# alphadia imports
+
+# alpha family imports
+
+# third party imports
 import numpy as np
-import numba as nb
 import pandas as pd
 import torch
 import torch.nn as nn
 import torch.optim as optim
 from sklearn import model_selection
-import warnings 
-from copy import deepcopy
-
-from abc import ABC, abstractmethod
-from typing import List, Tuple, Union
 
 class Classifier(ABC):
     """Abstract base class for classifiers.
@@ -120,7 +125,7 @@ def __init__(
             epochs : int = 10,
             learning_rate : float = 0.0002,
             weight_decay : float = 0.00001,
-            layers : List[int] = [100, 50, 20, 5],
+            layers : typing.List[int] = [100, 50, 20, 5],
             dropout : float = 0.001,
             metric_interval : int = 1000,
         ):
@@ -151,8 +156,8 @@ def __init__(
         weight_decay : float, default=0.00001
             Weight decay for training.
 
-        layers : List[int], default=[100, 50, 20, 5]
-            List of hidden layer sizes.
+        layers : typing.List[int], default=[100, 50, 20, 5]
+            List of hidden layer sizes.
 
         dropout : float, default=0.001
             Dropout probability for training.
@@ -397,56 +402,6 @@ def predict_proba(self, x: np.ndarray):
         self.network.eval()
         return self.network(torch.Tensor(x)).detach().numpy()
 
-class FDRDataset(torch.utils.data.Dataset):
-
-    def __init__(self, 
-                df_target, 
-                df_decoy, 
-                available_columns,
-                competetive=True, 
-                group_channels=True):
-        
-        self.available_columns = available_columns
-        self.competetive = competetive
-        self.group_channels = group_channels
-
-        if competetive:
-            group_columns = ['elution_group_idx', 'channel'] if group_channels else ['elution_group_idx']
-        else:
-            group_columns = ['precursor_idx']
-
-        self.df = pd.concat([df_target.copy(), df_decoy.copy()]).sort_values(group_columns)
-        self.df['fdr_group'] = self.df.groupby(group_columns).ngroup()
-        
-        self.n_items = self.df['fdr_group'].nunique()
-
-    def __len__(self):
-        return self.n_items
-                 
-    def __getitem__(self, idx):
-        df = self.df[self.df['fdr_group'] == idx]
-
-        decoy_np = df['decoy'].values.astype(np.float32)
-        decoy_np = np.stack([decoy_np, 1-decoy_np], axis=1)
-
-        y_true = torch.tensor(decoy_np)
-
-        return (
-            torch.tensor(df[self.available_columns].values.astype(np.float32)), 
-            y_true,
-            torch.tensor(df['fdr_group'].values.astype(np.int64)),
-        )
-
-def batching_collate_fn(batch_list):
-
-    # get first elements form list of tuples
-    features, labels, groups = zip(*batch_list)
-    features = torch.concat(features)
-    labels = torch.concat(labels)
-    groups = torch.concat(groups)
-
-    return features, labels, groups
-
 class FeedForwardNN(nn.Module):
 
     def __init__(
diff --git a/alphadia/extraction/features.py b/alphadia/features.py
similarity index 99%
rename from alphadia/extraction/features.py
rename to alphadia/features.py
index cbc03701..a3eac7e7 100644
--- a/alphadia/extraction/features.py
+++ b/alphadia/features.py
@@ -1,8 +1,9 @@
 # native imports
+import time
 
 # alphadia imports
-from alphadia.extraction import utils
-from alphadia.extraction.numba import numeric
+from alphadia import utils
+from alphadia.numba import numeric
 
 # alpha family imports
 
@@ -10,8 +11,6 @@
 import numpy as np
 import numba as nb 
 
-import time
-
 @nb.njit
 def center_of_mass(
     single_dense_representation,
@@ -218,9 +217,6 @@ def center_sum_2d(
             intensity[i, j], fraction_nonzero[i, j] = s, f
     return intensity, fraction_nonzero
 
-
-
-
 @nb.njit
 def masked_mean_a0(array, mask):
     """
diff --git a/alphadia/gui.py b/alphadia/gui.py
index 231335d0..807a20ed 100644
--- a/alphadia/gui.py
+++ b/alphadia/gui.py
@@ -1,5 +1,4 @@
 #!python
 
-
 def run():
     raise NotImplementedError
diff --git a/alphadia/extraction/hybridselection.py b/alphadia/hybridselection.py
similarity index 99%
rename from alphadia/extraction/hybridselection.py
rename to alphadia/hybridselection.py
index 3a54d961..9f963049 100644
--- a/alphadia/extraction/hybridselection.py
+++ b/alphadia/hybridselection.py
@@ -1,24 +1,23 @@
-from alphadia.extraction import utils
-from alphadia.extraction.numba import fragments, numeric, config
-from alphadia.extraction import validate, utils
-from alphadia.extraction.data import bruker, thermo
-import numba as nb
-import numpy as np
-import pandas as pd
+# native imports
 import logging
-import alphatims
+logger = logging.getLogger()
 import os
 import time
+import typing
 
+# alphadia imports
+from alphadia import utils
+from alphadia.numba import fragments, numeric, config
+from alphadia import validate, utils
+from alphadia.data import bruker, thermo
 
-logger = logging.getLogger()
-if not 'progress' in dir(logger):
-    from alphadia.extraction import processlogger
-    processlogger.init_logging()
-
-#typeas union
-from typing import Union
+# alpha family imports
+import alphatims
 
+# third party imports
+import numba as nb
+import numpy as np
+import pandas as pd
 import matplotlib.pyplot as plt
 from matplotlib import patches
 import matplotlib as mpl
@@ -26,9 +25,10 @@
 class GaussianFilter:
     def __init__(
             self, 
-            dia_data : Union[
+            dia_data : typing.Union[
                 bruker.TimsTOFTransposeJIT,
                 bruker.TimsTOFTranspose,
+                thermo.Thermo
             ],
             fwhm_rt : float = 10.,
             sigma_scale_rt : float = 1.,
@@ -47,7 +47,7 @@ def __init__(
         Parameters
         ----------
 
-        dia_data : Union[bruker.TimsTOFTransposeJIT, bruker.TimsTOFTranspose]
+        dia_data : typing.Union[bruker.TimsTOFTransposeJIT, bruker.TimsTOFTranspose, thermo.Thermo]
             alphatims dia_data object. 
 
         fwhm_rt : float
@@ -666,7 +666,7 @@ def process(
         Parameters
         ----------
 
-        jit_data : alphadia.extraction.bruker.TimsTOFJIT
+        jit_data : alphadia.data.bruker.TimsTOFJIT
             TimsTOFJIT object containing the raw data
 
         kernel : np.ndarray
@@ -999,7 +999,7 @@ def __init__(self,
         Parameters
         ----------
 
-        dia_data : alphadia.extraction.data.bruker.TimsTOFDIA
+        dia_data : alphadia.data.bruker.TimsTOFDIA
             dia data object
 
         precursors_flat : pandas.DataFrame
diff --git a/alphadia/library.py b/alphadia/library.py
deleted file mode 100644
index 41c53486..00000000
--- a/alphadia/library.py
+++ /dev/null
@@ -1,1851 +0,0 @@
-#!python
-
-#import alphapept.fasta
-#import alphapept.constants
-import alphatims.utils
-import alphatims.bruker
-# import alphatims.plotting
-import numpy as np
-import pandas as pd
-# import holoviews as hv
-import logging
-import sklearn.model_selection
-import sklearn.preprocessing
-import sklearn.ensemble
-import sklearn.pipeline
-import sklearn.model_selection
-import functools
-import scipy.ndimage.filters
-
-
-class Library(object):
-
-    def __len__(self):
-        return len(self.peptide_data)
-
-    def __init__(self, alphapept_hdf_file_name, decoy=False, decoy_style=""):
-        self.file_name = alphapept_hdf_file_name
-        ms_library = alphapept.io.MS_Data_File(alphapept_hdf_file_name)
-        self.ion_data = ms_library.read(dataset_name="ions")
-        pep_df = ms_library.read(dataset_name="peptide_fdr")
-        self.decoy = decoy
-        self.peptide_data = pep_df[pep_df.target_precursor]
-        self.peptide_data.reset_index(drop=True, inplace=True)
-        self.ion_data.reset_index(drop=True, inplace=True)
-        self.convert_to_peptide_arrays(decoy_style)
-        self.convert_to_peptide_dict()
-
-    def convert_to_peptide_dict(self):
-        self.peptide_dict = []
-        for i, sequence in enumerate(self.peptide_sequences):
-            offset_start = self.peptide_offsets[i]
-            offset_end = self.peptide_offsets[i + 1]
-            peptide = {
-                "sequence": sequence,
-                "mz": self.peptide_mzs[i],
-                "mobility": self.peptide_mobilities[i],
-                "rt": self.peptide_rt_apex[i],  # seconds
-                "charge": self.peptide_charges[i],
-                "fragment_mzs": self.peptide_fragment_mzs[offset_start: offset_end],
-                "fragment_intensities": self.peptide_fragment_intensities[offset_start: offset_end],
-                "fragment_loss_types": self.peptide_fragment_loss_types[offset_start: offset_end],
-                "fragment_ion_types": self.peptide_fragment_ion_types[offset_start: offset_end],
-            }
-            self.peptide_dict.append(peptide)
-
-    def convert_to_peptide_arrays(self, decoy_style=""):
-        self.peptide_rt_apex = self.peptide_data.rt_apex.values * 60
-        self.peptide_mzs = self.peptide_data.mz.values
-        try:
-            self.peptide_mobilities = self.peptide_data.mobility.values
-        except AttributeError:
-            self.peptide_data["mobility"] = 0
-            self.peptide_mobilities = self.peptide_data.mobility.values
-        self.peptide_charges = self.peptide_data.charge.values
-        self.peptide_sequences = self.peptide_data.sequence.values
-        self.peptide_lengths = self.peptide_data.n_AA.values
-        self.peptide_offsets = np.empty(
-            len(self.peptide_data) + 1,
-            dtype=np.int64
-        )
-        self.ion_count = self.peptide_data.n_ions.values
-        self.peptide_offsets[0] = 0
-        self.peptide_offsets[1:] = np.cumsum(self.ion_count)
-        self.peptide_fragment_mzs = np.empty(
-            self.peptide_offsets[-1],
-            dtype=np.float64
-        )
-        self.peptide_fragment_ion_types = np.empty(
-            self.peptide_offsets[-1],
-            dtype=np.int8
-        )
-        self.peptide_fragment_loss_types = np.empty(
-            self.peptide_offsets[-1],
-            dtype=np.int8
-        )
-        self.peptide_fragment_intensities = np.empty(
-            self.peptide_offsets[-1],
-            dtype=np.float64
-        )
-
-        loss_list = [0, 18.01056468346, 17.03052] #H2O, NH3
-        for i, sequence_string in alphatims.utils.progress_callback(
-            enumerate(self.peptide_sequences),
-            total=len(self.peptide_sequences)
-        ):
-            start = self.peptide_data.ion_idx.values[i]
-            end = start + self.ion_count[i]
-            offset_start = self.peptide_offsets[i]
-            offset_end = self.peptide_offsets[i + 1]
-            intensities = self.ion_data.ion_int.values[start: end]
-            loss_types = self.ion_data.ion_type.values[start: end].astype(np.int64)
-            ion_types = self.ion_data.ion_index.values[start: end].astype(np.int64)
-            if self.decoy:
-                decoy_sequence = alphapept.fasta.parse(sequence_string)
-                if decoy_style == "DIA-NN":
-                    original = "GAVLIFMPWSCTYHKRQEND"
-                    mutated = "LLLVVLLLLTSSSSLLNDQE"
-                    decoy_sequence[1] = alphapept.fasta.parse(
-                        mutated[original.index(decoy_sequence[1][-1])]
-                    )[0]
-                    decoy_sequence[-2] = alphapept.fasta.parse(
-                        mutated[original.index(decoy_sequence[-2][-1])]
-                    )[0]
-                else:
-                    decoy_sequence[:-1] = decoy_sequence[:-1][::-1]
-                self.peptide_sequences[i] = "".join(decoy_sequence)
-                mzs_, types_ = alphapept.fasta.get_fragmass(
-                    decoy_sequence,
-                    alphapept.constants.mass_dict
-                )
-                mz_dict = dict(zip(types_, mzs_))
-                mzs = [
-                    mz_dict[ion_type] - loss_list[loss_type] for (
-                        loss_type,
-                        ion_type,
-                    ) in zip(loss_types, ion_types)
-                ]
-                mzs = np.array(mzs)
-            else:
-                mzs = self.ion_data.db_mass.values[start: end]
-            order = np.argsort(mzs)
-            self.peptide_fragment_mzs[offset_start: offset_end] = mzs[order]
-            self.peptide_fragment_intensities[offset_start: offset_end] = intensities[order]
-            self.peptide_fragment_loss_types[offset_start: offset_end] = loss_types[order]
-            self.peptide_fragment_ion_types[offset_start: offset_end] = ion_types[order]
-
-    @functools.lru_cache(2)
-    def get_tolerance_arrays(
-        self,
-        dia_data,
-        ppm,
-        rt_tolerance,  # seconds
-        mobility_tolerance,  # 1/k0
-    ):
-        precursor_frame_slices = np.stack(
-            [
-                dia_data.convert_to_indices(
-                    self.peptide_rt_apex - rt_tolerance,
-                    return_frame_indices=True
-                ),
-                dia_data.convert_to_indices(
-                    self.peptide_rt_apex + rt_tolerance,
-                    return_frame_indices=True
-                ),
-                np.repeat(1, len(self.peptide_rt_apex))
-            ]
-        ).T.astype(np.int64)
-        precursor_scan_slices = np.stack(
-            [
-                dia_data.convert_to_indices(
-                    self.peptide_mobilities + mobility_tolerance,
-                    return_scan_indices=True
-                ),
-                dia_data.convert_to_indices(
-                    self.peptide_mobilities - mobility_tolerance,
-                    return_scan_indices=True
-                ),
-                np.repeat(1, len(self.peptide_mobilities))
-            ]
-        ).T.astype(np.int64)
-        precursor_tof_slices = np.stack(
-            [
-                dia_data.convert_to_indices(
-                    self.peptide_mzs / (1 + ppm / 10**6),
-                    return_tof_indices=True
-                ),
-                dia_data.convert_to_indices(
-                    self.peptide_mzs * (1 + ppm / 10**6),
-                    return_tof_indices=True
-                ),
-                np.repeat(1, len(self.peptide_mzs))
-            ]
-        ).T.astype(np.int64)
-        precursor_mz_slices = np.stack(
-            [
-                self.peptide_mzs / (1 + ppm / 10**6),
-                self.peptide_mzs * (1 + ppm / 10**6),
-            ]
-        ).T
-        fragment_tof_slices = np.stack(
-            [
-                dia_data.convert_to_indices(
-                    self.peptide_fragment_mzs / (1 + ppm / 10**6),
-                    return_tof_indices=True
-                ),
-                dia_data.convert_to_indices(
-                    self.peptide_fragment_mzs * (1 + ppm / 10**6),
-                    return_tof_indices=True
-                ),
-                np.repeat(1, len(self.peptide_fragment_mzs))
-            ]
-        ).T.astype(np.int64)
-        return (
-            precursor_frame_slices,
-            precursor_scan_slices,
-            precursor_tof_slices,
-            precursor_mz_slices,
-            fragment_tof_slices,
-        )
-
-    def score(
-        self,
-        dia_data,
-        max_scan_difference=3,
-        max_cycle_difference=2,
-        ppm=50,
-        rt_tolerance=30,  # seconds
-        mobility_tolerance=0.05,  # 1/k0
-        selection: np.ndarray = None,
-        return_as_df: bool = True,
-        blur_sigma: int = 3,
-        score_features=(
-            "push_apex",
-            "push_library_cosine_peak_mask_len",
-            "push_library_cosine_peak_mask_apex",
-            "push_library_cosine_peak_mask_push_library_cosine_best",
-            "push_library_cosine_peak_mask_push_library_cosine_worst",
-            "push_library_cosine_peak_mask_smooth_push_library_cosine_best",
-            "push_library_cosine_peak_mask_smooth_push_library_cosine_worst",
-            "push_library_cosine_peak_mask_push_intensities_best",
-            "push_library_cosine_peak_mask_push_intensities_worst",
-            "push_library_cosine_peak_mask_smooth_push_intensities_best",
-            "push_library_cosine_peak_mask_smooth_push_intensities_worst",
-            "push_library_cosine_peak_mask_normalized_push_intensities_best",
-            "push_library_cosine_peak_mask_normalized_push_intensities_worst",
-            "push_library_cosine_peak_mask_normalized_smooth_push_intensities_best",
-            "push_library_cosine_peak_mask_normalized_smooth_push_intensities_worst",
-            "push_library_cosine_peak_mask_push_weights_best",
-            "push_library_cosine_peak_mask_push_weights_worst",
-            "push_library_cosine_peak_mask_smooth_push_weights_best",
-            "push_library_cosine_peak_mask_smooth_push_weights_worst",
-            "smooth_push_library_cosine_peak_mask_len",
-            "smooth_push_library_cosine_peak_mask_apex",
-            "smooth_push_library_cosine_peak_mask_push_library_cosine_best",
-            "smooth_push_library_cosine_peak_mask_push_library_cosine_worst",
-            "smooth_push_library_cosine_peak_mask_smooth_push_library_cosine_best",
-            "smooth_push_library_cosine_peak_mask_smooth_push_library_cosine_worst",
-            "smooth_push_library_cosine_peak_mask_push_intensities_best",
-            "smooth_push_library_cosine_peak_mask_push_intensities_worst",
-            "smooth_push_library_cosine_peak_mask_smooth_push_intensities_best",
-            "smooth_push_library_cosine_peak_mask_smooth_push_intensities_worst",
-            "smooth_push_library_cosine_peak_mask_normalized_push_intensities_best",
-            "smooth_push_library_cosine_peak_mask_normalized_push_intensities_worst",
-            "smooth_push_library_cosine_peak_mask_normalized_smooth_push_intensities_best",
-            "smooth_push_library_cosine_peak_mask_normalized_smooth_push_intensities_worst",
-            "smooth_push_library_cosine_peak_mask_push_weights_best",
-            "smooth_push_library_cosine_peak_mask_push_weights_worst",
-            "smooth_push_library_cosine_peak_mask_smooth_push_weights_best",
-            "smooth_push_library_cosine_peak_mask_smooth_push_weights_worst",
-            "push_intensities_peak_mask_len",
-            "push_intensities_peak_mask_apex",
-            "push_intensities_peak_mask_push_library_cosine_best",
-            "push_intensities_peak_mask_push_library_cosine_worst",
-            "push_intensities_peak_mask_smooth_push_library_cosine_best",
-            "push_intensities_peak_mask_smooth_push_library_cosine_worst",
-            "push_intensities_peak_mask_push_intensities_best",
-            "push_intensities_peak_mask_push_intensities_worst",
-            "push_intensities_peak_mask_smooth_push_intensities_best",
-            "push_intensities_peak_mask_smooth_push_intensities_worst",
-            "push_intensities_peak_mask_normalized_push_intensities_best",
-            "push_intensities_peak_mask_normalized_push_intensities_worst",
-            "push_intensities_peak_mask_normalized_smooth_push_intensities_best",
-            "push_intensities_peak_mask_normalized_smooth_push_intensities_worst",
-            "push_intensities_peak_mask_push_weights_best",
-            "push_intensities_peak_mask_push_weights_worst",
-            "push_intensities_peak_mask_smooth_push_weights_best",
-            "push_intensities_peak_mask_smooth_push_weights_worst",
-            "smooth_push_intensities_peak_mask_len",
-            "smooth_push_intensities_peak_mask_apex",
-            "smooth_push_intensities_peak_mask_push_library_cosine_best",
-            "smooth_push_intensities_peak_mask_push_library_cosine_worst",
-            "smooth_push_intensities_peak_mask_smooth_push_library_cosine_best",
-            "smooth_push_intensities_peak_mask_smooth_push_library_cosine_worst",
-            "smooth_push_intensities_peak_mask_push_intensities_best",
-            "smooth_push_intensities_peak_mask_push_intensities_worst",
-            "smooth_push_intensities_peak_mask_smooth_push_intensities_best",
-            "smooth_push_intensities_peak_mask_smooth_push_intensities_worst",
-            "smooth_push_intensities_peak_mask_normalized_push_intensities_best",
-            "smooth_push_intensities_peak_mask_normalized_push_intensities_worst",
-            "smooth_push_intensities_peak_mask_normalized_smooth_push_intensities_best",
-            "smooth_push_intensities_peak_mask_normalized_smooth_push_intensities_worst",
-            "smooth_push_intensities_peak_mask_push_weights_best",
-            "smooth_push_intensities_peak_mask_push_weights_worst",
-            "smooth_push_intensities_peak_mask_smooth_push_weights_best",
-            "smooth_push_intensities_peak_mask_smooth_push_weights_worst",
-            "push_weights_peak_mask_len",
-            "push_weights_peak_mask_apex",
-            "push_weights_peak_mask_push_library_cosine_best",
-            "push_weights_peak_mask_push_library_cosine_worst",
-            "push_weights_peak_mask_smooth_push_library_cosine_best",
-            "push_weights_peak_mask_smooth_push_library_cosine_worst",
-            "push_weights_peak_mask_push_intensities_best",
-            "push_weights_peak_mask_push_intensities_worst",
-            "push_weights_peak_mask_smooth_push_intensities_best",
-            "push_weights_peak_mask_smooth_push_intensities_worst",
-            "push_weights_peak_mask_normalized_push_intensities_best",
-            "push_weights_peak_mask_normalized_push_intensities_worst",
-            "push_weights_peak_mask_normalized_smooth_push_intensities_best",
-            "push_weights_peak_mask_normalized_smooth_push_intensities_worst",
-            "push_weights_peak_mask_push_weights_best",
-            "push_weights_peak_mask_push_weights_worst",
-            "push_weights_peak_mask_smooth_push_weights_best",
-            "push_weights_peak_mask_smooth_push_weights_worst",
-            "smooth_push_weights_peak_mask_len",
-            "smooth_push_weights_peak_mask_apex",
-            "smooth_push_weights_peak_mask_push_library_cosine_best",
-            "smooth_push_weights_peak_mask_push_library_cosine_worst",
-            "smooth_push_weights_peak_mask_smooth_push_library_cosine_best",
-            "smooth_push_weights_peak_mask_smooth_push_library_cosine_worst",
-            "smooth_push_weights_peak_mask_push_intensities_best",
-            "smooth_push_weights_peak_mask_push_intensities_worst",
-            "smooth_push_weights_peak_mask_smooth_push_intensities_best",
-            "smooth_push_weights_peak_mask_smooth_push_intensities_worst",
-            "smooth_push_weights_peak_mask_normalized_push_intensities_best",
-            "smooth_push_weights_peak_mask_normalized_push_intensities_worst",
-            "smooth_push_weights_peak_mask_normalized_smooth_push_intensities_best",
-            "smooth_push_weights_peak_mask_normalized_smooth_push_intensities_worst",
-            "smooth_push_weights_peak_mask_push_weights_best",
-            "smooth_push_weights_peak_mask_push_weights_worst",
-            "smooth_push_weights_peak_mask_smooth_push_weights_best",
-            "smooth_push_weights_peak_mask_smooth_push_weights_worst",
-            "push_indices_count",
-            "raw_indices_count",
-        ),
-    ):
-        (
-            precursor_frame_slices,
-            precursor_scan_slices,
-            precursor_tof_slices,
-            precursor_mz_slices,
-            fragment_tof_slices,
-        ) = self.get_tolerance_arrays(
-            dia_data,
-            ppm=ppm,
-            rt_tolerance=rt_tolerance,  # seconds
-            mobility_tolerance=mobility_tolerance,  # 1/k0
-        )
-        if selection is None:
-            selection = range(len(self))
-            selection_slice = ...
-            score_features_ = {
-                feature: np.zeros(len(selection)) for feature in score_features
-            }
-            update_score_features = False
-        else:
-            selection_slice = selection
-            try:
-                score_features_ = {
-                    feature: {
-                        s: 0 for s in selection
-                    } for feature in score_features
-                }
-                update_score_features = 2
-            except TypeError:
-                score_features_ = {
-                    feature: {
-                        selection: 0
-                    } for feature in score_features
-                }
-                update_score_features = True
-        result = process_library_peptide(
-            selection,
-            self,
-            score_features_,
-            dia_data,
-            precursor_frame_slices,
-            precursor_scan_slices,
-            precursor_tof_slices,
-            precursor_mz_slices,
-            fragment_tof_slices,
-            max_scan_difference,
-            max_cycle_difference,
-            blur_sigma,
-        )
-        if not return_as_df:
-            if result is not None:
-                return result
-            else:
-                return score_features_
-        if update_score_features:
-            try:
-                score_features__ = {
-                    feature: np.zeros(len(selection)) for feature in score_features_
-                }
-                position_dict = {pos: i for i, pos in enumerate(selection)}
-            except TypeError:
-                score_features__ = {
-                    feature: np.zeros(1) for feature in score_features_
-                }
-                position_dict = {selection: 0}
-            for feature, score_dict in score_features_.items():
-                for position, score in score_dict.items():
-                    score_features__[feature][position_dict[position]] = score
-            score_features_ = score_features__
-        rts = dia_data.rt_values[
-            score_features_["push_apex"].astype(np.int64) // dia_data.scan_max_index
-        ]
-        mobilities = dia_data.mobility_values[
-            score_features_["push_apex"].astype(np.int64) % dia_data.scan_max_index
-        ]
-        score_df = pd.DataFrame(
-            {
-                "library_id": selection,
-                "peptide_sequence": self.peptide_sequences[selection_slice],
-                "peptide_mz": self.peptide_mzs[selection_slice],
-                "peptide_mobility": self.peptide_mobilities[selection_slice],
-                "peptide_rt_min": self.peptide_rt_apex[selection_slice] / 60,
-                "peptide_rt": self.peptide_rt_apex[selection_slice],
-                "peptide_length": self.peptide_lengths[selection_slice],
-                "peptide_charge": self.peptide_charges[selection_slice],
-                "fragment_count": self.ion_count[selection_slice],
-                "mobility_error": self.peptide_mobilities[selection_slice] - mobilities,
-                "rt_error": self.peptide_rt_apex[selection_slice] - rts,
-                "absolute_mobility_error": np.abs(
-                    self.peptide_mobilities[selection_slice] - mobilities
-                ),
-                "absolute_rt_error": np.abs(
-                    self.peptide_rt_apex[selection_slice] - rts
-                ),
-                **score_features_,
-            }
-        )
-        score_df["decoy"] = self.decoy
-        score_df["target"] = not self.decoy
-        score_df = score_df[score_df.push_indices_count > 0]
-        score_df.reset_index(drop=True, inplace=True)
-        return score_df
-
-#
-# @alphatims.utils.threadpool
-# def set_frags(
-#     peptide_index,
-#     peptide_sequences,
-#     peptide_fragment_mzs,
-#     peptide_fragment_types,
-#     peptide_offsets,
-#     decoy=False
-# ):
-#     seq_string = peptide_sequences[peptide_index]
-#     seq = alphapept.fasta.parse(seq_string)
-#     if decoy:
-#         seq[:-1] = seq[:-1][::-1]
-#     # if decoy:
-#     #     #diaNN style
-#     #     original = "GAVLIFMPWSCTYHKRQEND"
-#     #     mutated = "LLLVVLLLLTSSSSLLNDQE"
-#     #     seq[1] = alphapept.fasta.parse(
-#     #         mutated[original.index(seq[1][-1])]
-#     #     )[0]
-#     #     seq[-2] = alphapept.fasta.parse(
-#     #         mutated[original.index(seq[-2][-1])]
-#     #     )[0]
-#     # if decoy:
-#     #     seq[-2], seq[0] = seq[0], seq[-2]
-#     fragment_mzs, fragment_types = alphapept.fasta.get_fragmass(
-#         seq,
-#         alphapept.constants.mass_dict
-#     )
-#     start = peptide_offsets[peptide_index]
-#     end = peptide_offsets[peptide_index + 1]
-#     order = np.argsort(fragment_mzs)
-#     peptide_fragment_mzs[start: end] = fragment_mzs[order]
-#     peptide_fragment_types[start: end] = fragment_types[order]
-
-
-# @alphatims.utils.threadpool
-# def process_library_peptide(
-#     peptide_index,
-#     library,
-#     push_peaks,
-#     scores,
-#     dia_data,
-#     left_frame_borders,
-#     right_frame_borders,
-#     left_scan_borders,
-#     right_scan_borders,
-#     inflex_threshold,
-# ):
-#     precursor_frame_slices = library.precursor_frame_slices
-#     precursor_scan_slices = library.precursor_scan_slices
-#     precursor_tof_slices = library.precursor_tof_slices
-#     precursor_mz_slices = library.precursor_mz_slices
-#     fragment_tof_slices = library.fragment_tof_slices
-#     peptide_offsets = library.peptide_offsets
-#     precursor_indices, fragment_indices = get_peptide_raw_indices(
-#         peptide_index,
-#         precursor_frame_slices,
-#         precursor_scan_slices,
-#         precursor_tof_slices,
-#         precursor_mz_slices,
-#         fragment_tof_slices,
-#         peptide_offsets,
-#         dia_data.frame_max_index,
-#         dia_data.scan_max_index,
-#         dia_data.push_indptr,
-#         dia_data.precursor_indices,
-#         dia_data.quad_mz_values,
-#         dia_data.quad_indptr,
-#         dia_data.tof_indices,
-#         dia_data.intensity_values,
-#         dia_data.precursor_max_index,
-#     )
-# #     TODO: Score peptides
-# #     TODO: Note the GIL is not yet released for code below
-#     fragment_coordinates = dia_data.convert_from_indices(
-#         fragment_indices,
-#         return_raw_indices=True,
-#         return_frame_indices=True,
-#         return_scan_indices=True,
-#         return_quad_indices=True,
-#         return_precursor_indices=True,
-#         return_tof_indices=True,
-#         return_rt_values=True,
-#         return_mobility_values=True,
-#         return_quad_mz_values=True,
-#         return_push_indices=True,
-#         return_mz_values=True,
-#         return_intensity_values=True,
-#         raw_indices_sorted=True,
-#     )
-#     # precursor_coordinates = dia_data.convert_from_indices(
-#     #     precursor_indices,
-#     #     return_raw_indices=True,
-#     #     return_frame_indices=True,
-#     #     return_scan_indices=True,
-#     #     return_quad_indices=True,
-#     #     return_precursor_indices=True,
-#     #     return_tof_indices=True,
-#     #     return_rt_values=True,
-#     #     return_mobility_values=True,
-#     #     return_quad_mz_values=True,
-#     #     return_push_indices=True,
-#     #     return_mz_values=True,
-#     #     return_intensity_values=True,
-#     #     raw_indices_sorted=True,
-#     # )
-#     (
-#         push_peak,
-#         score,
-#         hit_matrix,
-#         unique_push_indices,
-#         bpi,
-#         left_frame_border,
-#         right_frame_border,
-#         left_scan_border,
-#         right_scan_border,
-#     ) = get_apex(
-#         peptide_index,
-#         fragment_tof_slices,
-#         peptide_offsets,
-#         fragment_coordinates["tof_indices"],
-#         fragment_coordinates["push_indices"],
-#         fragment_coordinates["intensity_values"],
-#         dia_data.scan_max_index,
-#         inflex_threshold,
-#     )
-#     left_frame_borders[peptide_index] = left_frame_border
-#     right_frame_borders[peptide_index] = right_frame_border
-#     left_scan_borders[peptide_index] = left_scan_border
-#     right_scan_borders[peptide_index] = right_scan_border
-#     push_peaks[peptide_index] = push_peak
-#     scores[peptide_index] = score
-#     return (
-#         push_peak,
-#         score,
-#         hit_matrix,
-#         unique_push_indices,
-#         bpi,
-#         left_frame_border,
-#         right_frame_border,
-#         left_scan_border,
-#         right_scan_border,
-#         fragment_indices,
-#         precursor_indices,
-#     )
-
-
-# @alphatims.utils.njit(nogil=True)
-# def get_peptide_raw_indices(
-#     peptide_index,
-#     precursor_frame_slices,
-#     precursor_scan_slices,
-#     precursor_tof_slices,
-#     precursor_mz_slices,
-#     fragment_tof_slices,
-#     peptide_offsets,
-#     frame_max_index,
-#     scan_max_index,
-#     push_indptr,
-#     precursor_indices,
-#     quad_mz_values,
-#     quad_indptr,
-#     tof_indices,
-#     intensities,
-#     precursor_max_index,
-# ):
-#     frames = precursor_frame_slices[peptide_index].copy().reshape((1,3))
-#     scans = precursor_scan_slices[peptide_index].copy().reshape((1,3))
-#     tofs = precursor_tof_slices[peptide_index].copy().reshape((1,3))
-#     precursor_indices_ = alphatims.bruker.filter_indices(
-#         frame_slices=frames,
-#         scan_slices=scans,
-#         precursor_slices=np.array([[0, 1, 1]]),
-#         tof_slices=tofs,
-#         quad_slices=np.array([[-np.inf, np.inf]]),
-#         intensity_slices=np.array([[-np.inf, np.inf]]),
-#         frame_max_index=frame_max_index,
-#         scan_max_index=scan_max_index,
-#         push_indptr=push_indptr,
-#         precursor_indices=precursor_indices,
-#         quad_mz_values=quad_mz_values,
-#         quad_indptr=quad_indptr,
-#         tof_indices=tof_indices,
-#         intensities=intensities,
-#     )
-#     start = peptide_offsets[peptide_index]
-#     end = peptide_offsets[peptide_index + 1]
-#     fragment_indices_ = alphatims.bruker.filter_indices(
-#         frame_slices=frames,
-#         scan_slices=scans,
-#         precursor_slices=np.array([[1, precursor_max_index, 1]]),
-#         tof_slices=fragment_tof_slices[start: end],
-#         quad_slices=precursor_mz_slices[peptide_index].copy().reshape((1,2)),
-# #         quad_slices=np.array([[-np.inf, np.inf]]),
-#         intensity_slices=np.array([[-np.inf, np.inf]]),
-#         frame_max_index=frame_max_index,
-#         scan_max_index=scan_max_index,
-#         push_indptr=push_indptr,
-#         precursor_indices=precursor_indices,
-#         quad_mz_values=quad_mz_values,
-#         quad_indptr=quad_indptr,
-#         tof_indices=tof_indices,
-#         intensities=intensities,
-#     )
-#     return precursor_indices_, fragment_indices_
-#
-#
-# @alphatims.utils.njit(nogil=True)
-# def get_apex(
-#     precursor_index,
-#     library_tof_indices,
-#     peptide_offsets,
-#     fragment_tof_indices,
-#     push_indices,
-#     intensity_values,
-#     scan_max_index,
-#     inflex_threshold,
-# ):
-#     unique_push_indices = []
-#     unique_fragment_counts = []
-#     unique_fragment_count = 1
-#     last_push_index = push_indices[0]
-#     for push_index in push_indices:
-#         if push_index != last_push_index:
-#             unique_push_indices.append(last_push_index)
-#             unique_fragment_counts.append(unique_fragment_count)
-#             last_push_index = push_index
-#             unique_fragment_count = 1
-#         else:
-#             unique_fragment_count += 1
-#     unique_push_indices.append(last_push_index)
-#     unique_fragment_counts.append(unique_fragment_count)
-#     unique_push_indices = np.array(unique_push_indices)
-#     start = peptide_offsets[precursor_index]
-#     end = peptide_offsets[precursor_index + 1]
-#     hit_matrix = np.zeros((end - start, len(unique_push_indices)))
-#     last_push_index = -1
-#     push_offset = -1
-#     bpi = np.ones((end - start, 1))
-#     for push_index, fragment_tof_index, intensity in zip(
-#         push_indices,
-#         fragment_tof_indices,
-#         intensity_values,
-#     ):
-#         if push_index != last_push_index:
-#             library_tof_index = 0
-#             last_push_index = push_index
-#             push_offset += 1
-#         while fragment_tof_index not in range(
-#             library_tof_indices[start + library_tof_index][0],
-#             library_tof_indices[start + library_tof_index][1],
-#         ):
-#             library_tof_index += 1
-#         if bpi[library_tof_index] < intensity:
-#             bpi[library_tof_index] = intensity
-#         hit_matrix[library_tof_index, push_offset] += intensity
-#     peak_index = np.argmax((hit_matrix / bpi).sum(axis=0))
-#     push_peak = unique_push_indices[peak_index]
-#
-#     scan = push_peak % scan_max_index
-#     scan_pushes = (unique_push_indices % scan_max_index)==scan
-#     scan_bpi = (
-#         hit_matrix[:, scan_pushes] / bpi
-#     ).sum(axis=0) / len(bpi)
-#     scan_bpi /= np.max(scan_bpi)
-#     left, right = get_borders(scan_bpi, inflex_threshold)
-#     scan_pushes = unique_push_indices[scan_pushes]
-#     left_frame_border = scan_pushes[left] // scan_max_index
-#     right_frame_border = scan_pushes[right] // scan_max_index
-#
-#     frame = push_peak // scan_max_index
-#     frame_pushes = (unique_push_indices // scan_max_index)==frame
-#     frame_bpi = (
-#         hit_matrix[:, frame_pushes] / bpi
-#     ).sum(axis=0) / len(bpi)
-#     frame_bpi /= np.max(frame_bpi)
-#     # TODO start from apex and move down
-#     left, right = get_borders(frame_bpi, inflex_threshold)
-#     frame_pushes = unique_push_indices[frame_pushes]
-#     left_scan_border = frame_pushes[left] % scan_max_index
-#     right_scan_border = frame_pushes[right] % scan_max_index
-#
-#     return (
-#         push_peak,
-#         np.sum(hit_matrix[:,peak_index] / bpi.ravel()),
-#         hit_matrix,
-#         unique_push_indices,
-#         bpi,
-#         left_frame_border,
-#         right_frame_border,
-#         left_scan_border,
-#         right_scan_border,
-#     )
-#
-#
-# @alphatims.utils.njit(nogil=True)
-# def get_borders(bpi, threshold):
-#     apex = np.argmax(bpi)
-#     lower = apex
-#     upper = apex
-#     while lower > 0:
-#         lower -= 1
-#         if bpi[lower] <= threshold:
-#             break
-#     while upper < (len(bpi) - 1):
-#         upper += 1
-#         if bpi[upper] <= threshold:
-#             break
-#     # lower = 0
-#     # upper = 0
-#     # last = 0
-#     # for i, current_bpi in enumerate(bpi):
-#     #     if current_bpi > threshold:
-#     #         size = i - last
-#     #         if size > (upper - lower):
-#     #             lower, upper = last, i + 1
-#     #     else:
-#     #         last = i
-#     return lower, upper
-
-
-def visualize_peptide(
-    dia_data,
-    peptide,
-    ppm=50,
-    rt_tolerance=30, #seconds
-    mobility_tolerance=0.05, #1/k0
-    heatmap=False,
-):
-    precursor_mz = peptide["mz"]
-    precursor_mobility = peptide["mobility"]
-    precursor_rt = peptide["rt"]
-    fragment_mzs = peptide["fragment_mzs"]
-    fragment_ion_types = peptide["fragment_ion_types"]
-    fragment_loss_types = peptide["fragment_loss_types"]
-    rt_slice = slice(
-        precursor_rt - rt_tolerance,
-        precursor_rt + rt_tolerance
-    )
-    im_slice = slice(
-        precursor_mobility - mobility_tolerance,
-        precursor_mobility + mobility_tolerance
-    )
-    precursor_mz_slice = slice(
-        precursor_mz / (1 + ppm / 10**6),
-        precursor_mz * (1 + ppm / 10**6)
-    )
-    precursor_indices = dia_data[
-        rt_slice,
-        im_slice,
-        0, #index 0 means that the quadrupole is not used
-        precursor_mz_slice,
-        "raw"
-    ]
-    if heatmap:
-        precursor_heatmap = alphatims.plotting.heatmap(
-            dia_data.as_dataframe(precursor_indices),
-            x_axis_label="rt",
-            y_axis_label="mobility",
-            title="precursor",
-            width=250,
-            height=250
-        )
-        overlay = precursor_heatmap
-    else:
-        precursor_xic = alphatims.plotting.line_plot(
-            dia_data,
-            precursor_indices,
-            x_axis_label="rt",
-            width=900,
-            remove_zeros=True,
-            label="precursor"
-        )
-        overlay = precursor_xic
-    for fragment_ion_type, mz, fragment_loss_type in zip(
-        fragment_ion_types,
-        fragment_mzs,
-        fragment_loss_types
-    ):
-        ion_type = f"{'y' if (np.sign(fragment_ion_type) > 0) else 'b'}"
-        ion_number = str(abs(fragment_ion_type))
-        loss_types = ["", "-H2O", "-NH3"]
-
-        fragment_name = (
-            f"{ion_type}{ion_number}{loss_types[fragment_loss_type]}"
-        )
-        fragment_mz_slice = slice(
-            mz / (1 + ppm / 10**6),
-            mz * (1 + ppm / 10**6)
-        )
-        fragment_indices = dia_data[
-            rt_slice,
-            im_slice,
-            precursor_mz_slice,
-            fragment_mz_slice,
-            "raw"
-        ]
-        if len(fragment_indices) > 0:
-            if heatmap:
-                fragment_heatmap = alphatims.plotting.heatmap(
-                    dia_data.as_dataframe(fragment_indices),
-                    x_axis_label="rt",
-                    y_axis_label="mobility",
-                    title=f"{fragment_name}: {mz:.3f}",
-                    width=250,
-                    height=250,
-                )
-                overlay += fragment_heatmap
-            else:
-                fragment_xic = alphatims.plotting.line_plot(
-                    dia_data,
-                    fragment_indices,
-                    x_axis_label="rt",
-                    width=900,
-                    remove_zeros=True,
-                    label=fragment_name,
-                )
-                overlay *= fragment_xic.opts(muted=True)
-    if not heatmap:
-        overlay.opts(hv.opts.Overlay(legend_position='bottom'))
-        overlay.opts(hv.opts.Overlay(click_policy='mute'))
-        overlay = overlay.opts(show_legend=True)
-    return overlay.opts(
-        title=f"{peptide['sequence']}_{peptide['charge']}"
-    )
-
-
-# @alphatims.utils.njit(nogil=True)
-# def get_feature(
-#     fragment_count: int,
-#     scan_max_index: int,
-#     push_indices: np.ndarray,
-#     indptr: np.ndarray,
-#     values: np.ndarray,
-#     columns: np.ndarray,
-#     intensity_values: np.ndarray,
-# ):
-#     indptr_T = np.bincount(columns, minlength=fragment_count + 1)
-#     indptr_T[1:] = np.cumsum(indptr_T[:-1])
-#     indptr_T[0] = 0
-#     indptr_T_tmp = indptr_T.copy()
-#     values_T = np.empty(len(values))
-#     max_intensities = np.ones(fragment_count)
-#     columns_T = np.empty_like(columns)
-#     for i, push_index in enumerate(push_indices):
-#         for index in range(indptr[i], indptr[i + 1]):
-#             column = columns[index]
-#             value = values[index]
-#             intensity = intensity_values[value]
-#             offset = indptr_T_tmp[column]
-#             columns_T[offset] = push_index
-#             values_T[offset] = intensity
-#             indptr_T_tmp[column] += 1
-#             if intensity > max_intensities[column]:
-#                 max_intensities[column] = intensity
-#     for i, max_intensity in enumerate(max_intensities):
-#         start = indptr_T[i]
-#         end = indptr_T[i + 1]
-#         values_T[start: end] /= max_intensity
-#     apex_value = -1
-#     relative_intensities_list = []
-#     for i, push_index in enumerate(push_indices):
-#         ions = values[indptr[i]: indptr[i + 1]]
-#         fragments = columns[indptr[i]: indptr[i + 1]]
-#         summed_value = np.sum(intensity_values[ions] / max_intensities[fragments])
-#         relative_intensities_list.append(summed_value)
-#         if summed_value > apex_value:
-#             apex_index = i
-#             apex_value = summed_value
-#     relative_intensities = np.array(relative_intensities_list)
-#     # from matplotlib import pyplot as plt
-#     # rt_selection = push_indices // scan_max_index == push_indices[apex_index] // scan_max_index
-#     # plt.plot(
-#     #     push_indices[rt_selection],
-#     #     relative_intensities[rt_selection]
-#     # )
-#     # plt.scatter([push_indices[apex_index]], [apex_value])
-#     # for fragment in range(fragment_count):
-#     #     start = indptr_T[fragment]
-#     #     end = indptr_T[fragment + 1]
-#     #     selection = columns_T[start: end] // scan_max_index == push_indices[apex_index] // scan_max_index
-#     #     plt.plot(
-#     #         columns_T[start: end][selection],
-#     #         values_T[start: end][selection]
-#     #     )
-#     return (
-#         indptr_T,
-#         values_T,
-#         columns_T,
-#         max_intensities,
-#         push_indices[apex_index],
-#         apex_value,
-#         relative_intensities,
-#     )
-
-
-@alphatims.utils.njit(nogil=True)
-def define_connections(
-    push_indices,
-    scan_max_index,
-    max_rt,
-    max_im,
-):
-    mat = _create_push_matrix(
-        push_indices,
-        scan_max_index,
-    )
-    indptr, indices = _get_matrix_connections(
-        mat,
-        max_rt,
-        max_im,
-    )
-    return indptr, indices
-
-
-@alphatims.utils.njit(nogil=True)
-def _create_push_matrix(
-    push_indices,
-    scan_max_index,
-):
-    im = push_indices % scan_max_index
-    min_im = np.min(im)
-    max_im = np.max(im)
-    rt = push_indices // scan_max_index
-    min_rt = np.min(rt)
-    max_rt = np.max(rt)
-    shape = ((max_rt - min_rt + 1), (max_im - min_im + 1))
-    mat = np.repeat(-1, shape[0] * shape[1]).reshape(shape)
-    for i, push_index in enumerate(push_indices):
-        mat[
-            push_index // scan_max_index - min_rt,
-            push_index % scan_max_index - min_im,
-        ] = i
-    return mat
-
-
-@alphatims.utils.njit(nogil=True)
-def _get_matrix_connections(
-    mat,
-    max_rt,
-    max_im,
-):
-    indptr = [0]
-    indices = []
-    count = 0
-    for rt in range(mat.shape[0]):
-        for im in range(mat.shape[1]):
-            push_index = mat[rt, im]
-            if push_index != -1:
-                low_rt = max(0, rt - max_rt)
-                high_rt = min(mat.shape[0], rt + max_rt + 1)
-                low_im = max(0, im - max_im)
-                high_im = min(mat.shape[1], im + max_im + 1)
-                for other_rt in range(low_rt, high_rt):
-                    for other_im in range(low_im, high_im):
-                        other_push_index = mat[other_rt, other_im]
-                        if other_push_index != -1:
-                            if other_push_index != push_index:
-                                count += 1
-                                indices.append(other_push_index)
-                indptr.append(count)
-    return np.array(indptr), np.array(indices)
-
-#
-# @alphatims.utils.njit(nogil=True)
-# def transpose_pushes(
-#     fragment_count: int,
-#     push_indptr: np.ndarray,
-#     fragment_indices: np.ndarray,
-# ):
-#     fragment_indptr = np.bincount(
-#         fragment_indices,
-#         minlength=fragment_count + 1
-#     )
-#     fragment_indptr[1:] = np.cumsum(fragment_indptr[:-1])
-#     fragment_offsets = fragment_indptr[:-1].copy()
-#     fragment_indptr[0] = 0
-#     raw_pointers = np.empty_like(fragment_indices)
-#     push_pointers = np.empty_like(fragment_indices)
-#     for push_index, start in enumerate(push_indptr[:-1]):
-#         end = push_indptr[push_index + 1]
-#         for index in range(start, end):
-#             fragment_index = fragment_indices[index]
-#             offset = fragment_offsets[fragment_index]
-#             raw_pointers[offset] = index
-#             push_pointers[offset] = push_index
-#             fragment_offsets[fragment_index] += 1
-#     return fragment_indptr, raw_pointers, push_pointers
-
-
-@alphatims.utils.njit(nogil=True)
-def make_dense_matrix(
-    fragment_count: int,
-    push_indptr: np.ndarray,
-    fragment_indices: np.ndarray,
-    raw_indices: np.ndarray,
-):
-    shape = (push_indptr.shape[0] - 1, fragment_count)
-    mat = np.repeat(-1, shape[0] * shape[1]).reshape(shape)
-    offset = 0
-    for push_index, start in enumerate(push_indptr[:-1]):
-        end = push_indptr[push_index + 1]
-        for fragment_index in fragment_indices[start: end]:
-            mat[push_index, fragment_index] = raw_indices[offset]
-            offset += 1
-    return mat
-
-
-@alphatims.utils.njit(nogil=True)
-def get_intensity_matrix(
-    matrix: np.ndarray,
-    intensity_values: np.ndarray,
-    impute_value: float = 0,
-):
-    intensity_matrix = np.full(matrix.shape, impute_value, dtype=np.float64)
-    for push_index in range(matrix.shape[0]):
-        for fragment_index in range(matrix.shape[1]):
-            raw_index = matrix[push_index, fragment_index]
-            if raw_index != -1:
-                intensity_matrix[
-                    push_index,
-                    fragment_index
-                ] = intensity_values[raw_index]
-    return intensity_matrix
-
-
-@alphatims.utils.njit(nogil=True)
-def smoothen_intensity_matrix(
-    intensity_matrix: np.ndarray,
-    push_connection_indptr: np.ndarray,
-    push_connection_indices: np.ndarray,
-    multiplier: float = 2,
-):
-    smooth_intensity_matrix = np.empty_like(intensity_matrix)
-    for push_index in range(intensity_matrix.shape[0]):
-        start = push_connection_indptr[push_index]
-        end = push_connection_indptr[push_index + 1]
-        for fragment_index in range(intensity_matrix.shape[1]):
-            intensity = multiplier * intensity_matrix[push_index, fragment_index]
-            for connection in push_connection_indices[start: end]:
-                intensity += intensity_matrix[connection, fragment_index]
-            smooth_intensity_matrix[
-                push_index,
-                fragment_index
-            ] = intensity / (multiplier + end - start)
-    return smooth_intensity_matrix
-
-
-@alphatims.utils.njit(nogil=True)
-def normalize_intensity_matrix(
-    intensity_matrix: np.ndarray,
-):
-    normalized_intensity_matrix = intensity_matrix.copy()
-    for fragment_index in range(intensity_matrix.shape[1]):
-        max_intensity = np.max(intensity_matrix[:, fragment_index])
-        if max_intensity > 0:
-            normalized_intensity_matrix[:, fragment_index] /= max_intensity
-    return normalized_intensity_matrix
-
-
-# peak_descend(
-#     best_push,
-#     peak_mask,
-#     smoothed_push_intensities,
-#     push_indices,
-#     rt_lim=dia_data.precursor_max_index*1,
-#     im_lim=1,
-#     im_cycle=dia_data.scan_max_index,
-# )
-# @alphatims.utils.njit(nogil=True)
-# def peak_descend(
-#     index,
-#     peak_mask,
-#     intensities,
-#     push_indices,
-#     rt_lim,
-#     im_lim,
-#     im_cycle,
-# ):
-#     if peak_mask[index]:
-#         return
-#     peak_mask[index] = True
-#     push_index = push_indices[index]
-#     rt = push_index // im_cycle
-#     im = push_index % im_cycle
-#     intensity = intensities[index]
-#     for i, other_index in enumerate(push_indices):
-#         other_rt = other_index // im_cycle
-#         other_im = other_index % im_cycle
-#         if np.abs(other_rt - rt) > rt_lim:
-#             continue
-#         if np.abs(other_im - im) > im_lim:
-#             continue
-#         if intensities[i] < intensity:
-#             peak_descend(
-#                 i,
-#                 peak_mask,
-#                 intensities,
-#                 push_indices,
-#                 rt_lim,
-#                 im_lim,
-#                 im_cycle,
-#             )
-
-
-@alphatims.utils.njit(nogil=True)
-def fdr_to_q_values(fdr_values):
-    q_values = np.zeros_like(fdr_values)
-    min_q_value = np.max(fdr_values)
-    for i in range(len(fdr_values) - 1, -1, -1):
-        fdr = fdr_values[i]
-        if fdr < min_q_value:
-            min_q_value = fdr
-        q_values[i] = min_q_value
-    return q_values
-
-
-def get_q_values(_df, score_column, decoy_column, drop=False):
-    _df = _df.reset_index(drop=drop)
-    _df = _df.sort_values([score_column,score_column], ascending=False)
-    target_values = 1-_df['decoy'].values
-    decoy_cumsum = np.cumsum(_df['decoy'].values)
-    target_cumsum = np.cumsum(target_values)
-    fdr_values = decoy_cumsum/target_cumsum
-    _df['q_value'] = fdr_to_q_values(fdr_values)
-    return _df
-
-
-def create_common_df(
-    dia_data,
-    library,
-    score_features,
-    decoy_library,
-    decoy_score_features,
-):
-    df = {}
-    for name, library_, score_features_ in [
-        ("decoy", decoy_library, decoy_score_features),
-        ("target", library, score_features),
-    ]:
-        df[name] = pd.DataFrame(
-            {
-                "library_id": np.arange(len(library_)),
-                "peptide": library_.peptide_sequences,
-                "mz": library_.peptide_mzs,
-                "mobility": library_.peptide_mobilities,
-                "rt_min": library_.peptide_rt_apex / 60,
-                "rt": library_.peptide_rt_apex,
-                "decoy": library_.decoy,
-                "target": not library_.decoy,
-                "length": library_.peptide_lengths,
-                **score_features_,
-            }
-        )
-    df = pd.concat(df.values())
-    df = df[df.ion_count > 0]
-#     df = df[np.isfinite(df.correlation_25)]
-#     df = df[np.isfinite(df["apex_fragment_enrichment"])]
-    df.reset_index(drop=True, inplace=True)
-    return df
-
-
-def calculate_q_values(df, features, model):
-    model.fit(df[features].values, 1-df['decoy'].values)
-    df['ML_score'] = model.predict_proba(df[features].values)[:,1]
-    new_df = get_q_values(df, 'ML_score', 'decoy')
-    return new_df
-
-
-def train_RF(
-    df: pd.DataFrame,
-    features: list,
-    train_fdr_level:  float = 0.1,
-    ini_score: str = None,
-    min_train: int = 1000,
-    test_size: float = 0.8,
-    max_depth: list = [5,25,50],
-    max_leaf_nodes: list = [150,200,250],
-    n_jobs: int = -1,
-    scoring: str = 'accuracy',
-    plot: bool = False,
-    random_state: int = 42,
-) -> (sklearn.model_selection.GridSearchCV, list):
-    # Setup ML pipeline
-    scaler = sklearn.preprocessing.StandardScaler()
-    rfc = sklearn.ensemble.RandomForestClassifier(random_state=random_state) # class_weight={False:1,True:5},
-    ## Initiate scaling + classification pipeline
-    pipeline = sklearn.pipeline .Pipeline([('scaler', scaler), ('clf', rfc)])
-    parameters = {
-        'clf__max_depth': (max_depth),
-        'clf__max_leaf_nodes': (max_leaf_nodes)
-    }
-    ## Setup grid search framework for parameter selection and internal cross validation
-    cv = sklearn.model_selection.GridSearchCV(
-        pipeline,
-        param_grid=parameters,
-        cv=5,
-        scoring=scoring,
-        verbose=0,
-        return_train_score=True,
-        n_jobs=n_jobs
-    )
-    # Prepare target and decoy df
-    dfD = df[df.decoy.values]
-    # Select high scoring targets (<= train_fdr_level)
-    # df_prescore = filter_score(df)
-    # df_prescore = filter_precursor(df_prescore)
-    # scored = cut_fdr(df_prescore, fdr_level = train_fdr_level, plot=False)[1]
-    # highT = scored[scored.decoy==False]
-    # dfT_high = dfT[dfT['query_idx'].isin(highT.query_idx)]
-    # dfT_high = dfT_high[dfT_high['db_idx'].isin(highT.db_idx)]
-    if ini_score is None:
-        selection = None
-        best_hit_count = 0
-        best_feature = ""
-        for feature in features:
-            new_df = get_q_values(df, feature, 'decoy')
-            hits = (new_df['q_value'] <= train_fdr_level) & (new_df['decoy'] == 0)
-            hit_count = np.sum(hits)
-            if hit_count > best_hit_count:
-                best_hit_count = hit_count
-                selection = hits
-                best_feature = feature
-        logging.info(f'Using optimal "{best_feature}" as initial_feature')
-        dfT_high = df[selection]
-    else:
-        logging.info(f'Using selected "{ini_score}" as initial_feature')
-        new_df = get_q_values(df, ini_score, 'decoy')
-        dfT_high = df[
-            (new_df['q_value'] <= train_fdr_level) & (new_df['decoy'] == 0)
-        ]
-
-
-    # Determine the number of psms for semi-supervised learning
-    n_train = int(dfT_high.shape[0])
-    if dfD.shape[0] < n_train:
-        n_train = int(dfD.shape[0])
-        logging.info(
-            "The total number of available decoys is lower than "
-            "the initial set of high scoring targets."
-        )
-    if n_train < min_train:
-        raise ValueError(
-            "There are fewer high scoring targets or decoys than "
-            "required by 'min_train'."
-        )
-
-    # Subset the targets and decoys datasets to result in a balanced dataset
-    df_training = dfT_high.append(dfD.sample(n=n_train, random_state=random_state))
-    # df_training = dfT_high.append(dfD)
-
-    # Select training and test sets
-    X = df_training[features]
-    y = df_training['target'].astype(int)
-    (
-        X_train,
-        X_test,
-        y_train,
-        y_test
-    ) = sklearn.model_selection.train_test_split(
-        X.values,
-        y.values,
-        test_size=test_size,
-        random_state=random_state,
-        stratify=y.values
-    )
-
-    # Train the classifier on the training set via 5-fold cross-validation and subsequently test on the test set
-    logging.info(
-        'Training & cross-validation on {} targets and {} decoys'.format(
-            # np.sum(y_train), X_train.shape[0] - np.sum(y_train)
-            *np.bincount(y_train)[::-1]
-        )
-    )
-    cv.fit(X_train, y_train)
-
-    logging.info(
-        'The best parameters selected by 5-fold cross-validation were {}'.format(
-            cv.best_params_
-        )
-    )
-    logging.info(
-        'The train {} was {}'.format(scoring, cv.score(X_train, y_train))
-    )
-    logging.info(
-        'Testing on {} targets and {} decoys'.format(
-            np.sum(y_test),
-            X_test.shape[0] - np.sum(y_test)
-        )
-    )
-    logging.info(
-        'The test {} was {}'.format(scoring, cv.score(X_test, y_test))
-    )
-
-    feature_importances = cv.best_estimator_.named_steps['clf'].feature_importances_
-    indices = np.argsort(feature_importances)[::-1][:40]
-
-    top_features = X.columns[indices][:40]
-    top_score = feature_importances[indices][:40]
-
-    feature_dict = dict(zip(top_features, top_score))
-    logging.info(f"Top features {feature_dict}")
-
-    # Inspect feature importances
-    if plot:
-        import matplotlib.pyplot as plt
-        import seaborn as sns
-        g = sns.barplot(
-            y=X.columns[indices][:40],
-            x=feature_importances[indices][:40],
-            orient='h',
-            palette='RdBu'
-        )
-        g.set_xlabel("Relative importance", fontsize=12)
-        g.set_ylabel("Features", fontsize=12)
-        g.tick_params(labelsize=9)
-        g.set_title("Feature importance")
-        plt.show()
-
-    return cv
-
-
-@alphatims.utils.njit(nogil=True)
-def cosine_similarity(v1, v2s):
-    scores = []
-    for v2 in v2s:
-        sumxx, sumxy, sumyy = 0, 0, 0
-        for i in range(len(v1)):
-            x = v1[i]
-            y = v2[i]
-            sumxx += x * x
-            sumyy += y * y
-            sumxy += x * y
-        if sumyy == 0:
-            score = 0
-        else:
-            score = sumxy / np.sqrt(sumxx * sumyy)
-        scores.append(score)
-    return np.array(scores)
-
-
-# @alphatims.utils.njit(nogil=True)
-# def find_correlations(
-#     smooth_intensity_matrix,
-# ):
-#     smoothed_push_intensities = np.sum(smooth_intensity_matrix, axis=1)
-#     fwhm_pushes = np.flatnonzero(
-#         (smoothed_push_intensities > np.max(smoothed_push_intensities) / 2) #& peak_mask
-#     )
-#     corrs = np.corrcoef(smooth_intensity_matrix[fwhm_pushes])
-#     return (
-#         smoothed_push_intensities,
-#         fwhm_pushes,
-#         corrs,
-#     )
-
-
-@alphatims.utils.njit(nogil=True)
-def sum_push_intensities(
-    intensity_matrix,
-):
-    return np.sum(intensity_matrix, axis=1)
-
-
-@alphatims.utils.njit(nogil=True)
-def is_reachable_push(
-    push_value,
-    push_connection_indices,
-    push_connection_indptr,
-):
-    order = np.argsort(push_value)[::-1]
-    reachable = set([order[0]])
-    result = []
-    for push_index in order:
-        if push_index not in reachable:
-            break
-        result.append(push_index)
-        start = push_connection_indptr[push_index]
-        end = push_connection_indptr[push_index + 1]
-        reachable |= set(push_connection_indices[start: end])
-    return np.array(result)
-
-
-def train_and_score(
-    scores_df,
-    decoy_scores_df,
-    features=None,
-    exclude_features=[
-        "decoy",
-        "target",
-        "library_id",
-        "peptide_sequence",
-        # "peptide_mz",
-        # "peptide_mobility",
-        # "peptide_rt_min",
-        # "peptide_rt",
-        "max_intensity_push",
-    ],
-    train_fdr_level: float = 0.1,
-    ini_score: str = None,
-    min_train: int = 1000,
-    test_size: float = 0.8,
-    max_depth: list = [5, 25, 50],
-    max_leaf_nodes: list = [150, 200, 250],
-    n_jobs: int = -1,
-    scoring: str = 'accuracy',
-    plot: bool = False,
-    random_state: int = 42,
-):
-    df = pd.concat([decoy_scores_df, scores_df])
-    # df = df[df.ion_count > 0]
-    #     df = df[np.isfinite(df.correlation_25)]
-    #     df = df[np.isfinite(df["apex_fragment_enrichment"])]
-    df.reset_index(drop=True, inplace=True)
-    if features is None:
-        features = [
-            feature for feature in df if feature not in exclude_features
-        ]
-    cv = train_RF(
-        df,
-        features,
-        train_fdr_level=train_fdr_level,
-        ini_score=ini_score,
-        min_train=min_train,
-        test_size=test_size,
-        max_depth=max_depth,
-        max_leaf_nodes=max_leaf_nodes,
-        n_jobs=n_jobs,
-        scoring=scoring,
-        plot=plot,
-        random_state=random_state,
-    )
-    new_df = df.copy()
-    new_df['score'] = cv.predict_proba(new_df[features])[:, 1]
-    return get_q_values(new_df, "score", 'decoy')
-
-
-# def force_square_push_indptr(
-#     dia_data,
-#     push_indices,
-#     push_indptr,
-#     size=128,
-# ):
-#     push_indptr_ = np.zeros(size * size + 1, dtype=np.int64)
-#     scans = push_indices % dia_data.scan_max_index
-#     cycles = push_indices // len(dia_data.dia_mz_cycle)
-#     min_scan = scans[0]
-#     min_cycle = cycles[0]
-#     # min_frame = push_indices[0] // dia_data.scan_max_index
-#     inds = scans - min_scan + size * (cycles - min_cycle)
-#     push_indptr_[inds] = np.diff(push_indptr)
-#     push_indptr_[1:] = np.cumsum(push_indptr_[:-1])
-#     push_indptr_[0] = 0
-#     return push_indptr_
-
-
-@alphatims.utils.njit(nogil=True)
-def peak_percentile(
-    peak_mask,
-    peak_values,
-    percentiles=[0, 50, 100]
-):
-    return np.percentile(
-        peak_values[peak_mask],
-        percentiles,
-    )
-
-
-@alphatims.utils.njit(nogil=True, cache=False)
-def make_dense_cycle_matrix(
-    push_indices,
-    zeroth_frame,
-    scan_max_index,
-    dia_mz_cycle_length,
-    values,
-):
-    mobility_indices = push_indices % scan_max_index
-    cycle_indices = (
-        push_indices - zeroth_frame * scan_max_index
-    ) // dia_mz_cycle_length
-    mobility_indices -= np.min(mobility_indices)
-    cycle_indices -= np.min(cycle_indices)
-    matrix = np.zeros((np.max(mobility_indices) + 1, np.max(cycle_indices) + 1))
-    for intensity, mobility, cycle in zip(
-        values,
-        mobility_indices,
-        cycle_indices,
-    ):
-        matrix[mobility, cycle] += intensity
-    return matrix, (mobility_indices, cycle_indices)
-
-
-@alphatims.utils.njit(nogil=True, cache=False)
-def partition_dense_cycle_matrix(matrix):
-    order = np.argsort(matrix.flatten())[::-1]
-    assignments = np.zeros(matrix.shape, dtype=np.int64)
-    borders = []
-    new_assignment = 1
-#     im_indices, rt_indices = np.unravel_index(order, matrix.shape)
-    im_indices = order // matrix.shape[1]
-    rt_indices = order % matrix.shape[1]
-    im_max, rt_max = matrix.shape
-    peaks = []
-    for im_index, rt_index in zip(im_indices, rt_indices):
-#         print(im_index, rt_index)
-        assignment = assignments[im_index, rt_index]
-        neighbors = []
-        if im_index > 0:
-            neighbors.append((im_index - 1, rt_index))
-        if im_index + 1 < im_max:
-            neighbors.append((im_index + 1, rt_index))
-        if rt_index > 0:
-            neighbors.append((im_index, rt_index - 1))
-        if rt_index + 1 < rt_max:
-            neighbors.append((im_index, rt_index + 1))
-        if assignment == 0:
-            min_connected_assignment = np.inf
-            for neighbor_im_index, neighbor_rt_index in neighbors:
-                connected_assignment = assignments[neighbor_im_index, neighbor_rt_index]
-                if connected_assignment != 0:
-                    if connected_assignment < min_connected_assignment:
-                        min_connected_assignment = connected_assignment
-            if min_connected_assignment != np.inf:
-                assignment = min_connected_assignment
-            else:
-                assignment = new_assignment
-                new_assignment += 1
-                peaks.append((rt_index, im_index))
-            assignments[im_index, rt_index] = assignment
-        for neighbor_im_index, neighbor_rt_index in neighbors:
-            connected_assignment = assignments[neighbor_im_index, neighbor_rt_index]
-            if connected_assignment == 0:
-                assignments[neighbor_im_index, neighbor_rt_index] = assignment
-            elif connected_assignment != assignment:
-                borders.append((neighbor_rt_index, neighbor_im_index))
-                borders.append((rt_index, im_index))
-    return assignments, peaks, borders
-
-
-def visualize_segmented_peaks(
-    matrix,
-    blurred_matrix,
-    peaks,
-    assignments,
-):
-    from matplotlib import pyplot as plt
-    fig, axs = plt.subplots(2, 3, sharex=True, sharey=True)
-
-    axs[0, 0].imshow(matrix, cmap="Greys")
-    axs[0, 0].set_title("raw")
-    axs[0, 1].imshow(blurred_matrix, cmap="Greys")
-    axs[0, 1].set_title("blurred")
-    axs[0, 2].imshow(np.log10(blurred_matrix), cmap="Greys")
-    axs[0, 2].set_title("log10 blurred")
-
-    axs[1, 0].imshow(matrix, cmap="Greys")
-    axs[1, 0].scatter(*list(zip(*peaks)), c="r", marker=".")
-    # axs[1, 0].scatter(*list(zip(*borders)))
-    axs[1, 0].imshow(assignments, cmap="tab20", alpha=0.33)
-
-    axs[1, 1].imshow(blurred_matrix, cmap="Greys")
-    axs[1, 1].scatter(*list(zip(*peaks)), c="r", marker=".")
-    # axs[1, 1].scatter(*list(zip(*borders)))
-    axs[1, 1].imshow(assignments, cmap="tab20", alpha=0.33)
-
-    axs[1, 2].imshow(np.log10(blurred_matrix), cmap="Greys")
-    axs[1, 2].scatter(*list(zip(*peaks)), c="r", marker=".")
-    # axs[1, 2].scatter(*list(zip(*borders)))
-    axs[1, 2].imshow(assignments, cmap="tab20", alpha=0.33)
-
-
-@alphatims.utils.threadpool
-def process_library_peptide(
-    peptide_index,
-    library,
-    score_features,
-    dia_data,
-    precursor_frame_slices,
-    precursor_scan_slices,
-    precursor_tof_slices,  # unused
-    precursor_mz_slices,
-    fragment_tof_slices,
-    max_scan_difference,
-    max_cycle_difference,
-    blur_sigma,
-):
-    push_indices = alphatims.bruker.get_dia_push_indices(
-        precursor_frame_slices[peptide_index: peptide_index + 1],
-        precursor_scan_slices[peptide_index: peptide_index + 1],
-        precursor_mz_slices[peptide_index: peptide_index + 1],
-        dia_data.scan_max_index,
-        dia_data.dia_mz_cycle,
-        zeroth_frame=dia_data.zeroth_frame,
-    )
-    if len(push_indices) == 0:
-        return  # Outside quad region?
-    fragment_start = library.peptide_offsets[peptide_index]
-    fragment_end = library.peptide_offsets[peptide_index + 1]
-    (
-        push_indptr,
-        raw_indices,
-        fragment_indices
-    ) = alphatims.bruker.filter_tof_to_csr(
-        fragment_tof_slices[fragment_start: fragment_end],
-        push_indices,
-        dia_data.tof_indices,
-        dia_data.push_indptr,
-    )
-    if len(raw_indices) == 0:
-        return
-    push_fragment_matrix = make_dense_matrix(
-        fragment_end - fragment_start,
-        push_indptr,
-        fragment_indices,
-        raw_indices,
-    )
-    intensity_matrix = get_intensity_matrix(
-        push_fragment_matrix,
-        dia_data.intensity_values,
-    )
-    push_connection_indptr, push_connection_indices = define_connections(
-        push_indices,
-        dia_data.scan_max_index,
-        max_cycle_difference * dia_data.precursor_max_index,
-        max_scan_difference,
-    )
-    smooth_intensity_matrix = smoothen_intensity_matrix(
-        intensity_matrix,
-        push_connection_indptr,
-        push_connection_indices
-    )
-    normalized_intensity_matrix = normalize_intensity_matrix(
-        intensity_matrix,
-    )
-    normalized_smooth_intensity_matrix = normalize_intensity_matrix(
-        smooth_intensity_matrix
-    )
-    library_intensities = library.peptide_fragment_intensities[
-        fragment_start: fragment_end
-    ]
-
-    push_library_cosine = cosine_similarity(
-        library_intensities,
-        intensity_matrix,
-    )
-    smooth_push_library_cosine = cosine_similarity(
-        library_intensities,
-        smooth_intensity_matrix,
-    )
-
-    push_intensities = sum_push_intensities(
-        intensity_matrix
-    )
-    smooth_push_intensities = sum_push_intensities(
-        smooth_intensity_matrix
-    )
-    normalized_push_intensities = sum_push_intensities(
-        normalized_intensity_matrix
-    )
-    normalized_smooth_push_intensities = sum_push_intensities(
-        normalized_smooth_intensity_matrix
-    )
-    # (
-    #     push_intensities,
-    #     fwhm_pushes,
-    #     corrs,
-    # ) = find_correlations(intensity_matrix)
-    # (
-    #     smooth_push_intensities,
-    #     smooth_fwhm_pushes,
-    #     smooth_corrs,
-    # ) = find_correlations(smooth_intensity_matrix)
-
-    push_library_cosine_peak_mask = is_reachable_push(
-        push_library_cosine,
-        push_connection_indices,
-        push_connection_indptr,
-    )
-    smooth_push_library_cosine_peak_mask = is_reachable_push(
-        smooth_push_library_cosine,
-        push_connection_indices,
-        push_connection_indptr,
-    )
-
-    push_intensities_peak_mask = is_reachable_push(
-        push_intensities,
-        push_connection_indices,
-        push_connection_indptr,
-    )
-    smooth_push_intensities_peak_mask = is_reachable_push(
-        smooth_push_intensities,
-        push_connection_indices,
-        push_connection_indptr,
-    )
-
-    push_weights = push_intensities * push_library_cosine
-    push_weights_peak_mask = is_reachable_push(
-        push_weights,
-        push_connection_indices,
-        push_connection_indptr,
-    )
-    smooth_push_weights = smooth_push_intensities * smooth_push_library_cosine
-    smooth_push_weights_peak_mask = is_reachable_push(
-        smooth_push_weights,
-        push_connection_indices,
-        push_connection_indptr,
-    )
-
-    matrix, reverse_push_indices = make_dense_cycle_matrix(
-        push_indices=push_indices,
-        zeroth_frame=dia_data.zeroth_frame,
-        scan_max_index=dia_data.scan_max_index,
-        dia_mz_cycle_length=len(dia_data.dia_mz_cycle),
-        values=push_intensities,
-    )
-    blurred_matrix = scipy.ndimage.filters.gaussian_filter(
-        matrix,
-        sigma=blur_sigma
-    )
-    assignments, peaks, borders = partition_dense_cycle_matrix(blurred_matrix)
-    push_assignments = assignments[reverse_push_indices]
-
-    percentiles = [0, 50, 100]
-    # import collections
-    # score_features = collections.defaultdict(lambda:{}) #TODELETE!!!!!!!!!!!!!!!!!!
-    for peak_mask_name, peak_mask in [
-        ("push_library_cosine_peak_mask", push_library_cosine_peak_mask),
-        ("smooth_push_library_cosine_peak_mask", smooth_push_library_cosine_peak_mask),
-        ("push_intensities_peak_mask", push_intensities_peak_mask),
-        ("smooth_push_intensities_peak_mask", smooth_push_intensities_peak_mask),
-        ("push_weights_peak_mask", push_weights_peak_mask),
-        ("smooth_push_weights_peak_mask", smooth_push_weights_peak_mask),
-    ]:
-        feature = f"{peak_mask_name}_len"
-        score_features[feature][peptide_index] = len(peak_mask)
-        feature = f"{peak_mask_name}_apex"
-        score_features[feature][peptide_index] = peak_mask[0]
-        for peak_values_name, peak_values in [
-            ("push_library_cosine", push_library_cosine),
-            ("smooth_push_library_cosine", smooth_push_library_cosine),
-            ("push_intensities", push_intensities),
-            ("smooth_push_intensities", smooth_push_intensities),
-            ("normalized_push_intensities", normalized_push_intensities),
-            ("normalized_smooth_push_intensities", normalized_smooth_push_intensities),
-            ("push_weights", push_weights),
-            ("smooth_push_weights", smooth_push_weights),
-        ]:
-            # peak_percentiles = peak_percentile(
-            #     peak_mask,
-            #     peak_values,
-            #     percentiles,
-            # )
-            # for i, percentile in enumerate(percentiles):
-            #     feature = f"{peak_mask_name}_{peak_values}_{percentile}"
-            #     score_features[feature][peptide_index] = percentile[i]
-            feature = f"{peak_mask_name}_{peak_values_name}_best"
-            score_features[feature][peptide_index] = peak_values[peak_mask[0]]
-            feature = f"{peak_mask_name}_{peak_values_name}_worst"
-            score_features[feature][peptide_index] = peak_values[peak_mask[-1]]
-
-    # cor_percentiles = np.percentile(corrs, [25, 50, 75])
-    # smooth_cor_percentiles = np.percentile(smooth_corrs, [25, 50, 75])
-    #
-    # library_intensity_cos_percentiles = np.percentile(
-    #     cos_sims,
-    #     [90, 95, 100],
-    # )
-    # smooth_library_intensity_cos_percentiles = np.percentile(
-    #     smooth_cos_sims,
-    #     [90, 95, 100],
-    # )
-    #
-    # smooth_relative_intensity_percentiles = np.percentile(
-    #     smooth_push_intensities,
-    #     [50, 75, 100],
-    # )
-    # relative_intensity_percentiles = np.percentile(
-    #     push_intensities,
-    #     [50, 75, 100],
-    # )
-
-    # for feature, score in {
-    #     "push_indices_count": len(push_indices),
-    #     "raw_indices_count": len(raw_indices),
-    #     #
-    #     # "fwhm_corr_25": cor_percentiles[0],
-    #     # "fwhm_corr_50": cor_percentiles[1],
-    #     # "fwhm_corr_75": cor_percentiles[2],
-    #     # "smooth_fwhm_corr_25": smooth_cor_percentiles[0],
-    #     # "smooth_fwhm_corr_50": smooth_cor_percentiles[1],
-    #     # "smooth_fwhm_corr_75": smooth_cor_percentiles[2],
-    #     #
-    #     # # "fwhm_push_count": len(fwhm_pushes),
-    #     # # "smooth_fwhm_push_count": len(smooth_fwhm_pushes),
-    #     #
-    #     # "relative_intensity_50": relative_intensity_percentiles[0],
-    #     # "relative_intensity_75": relative_intensity_percentiles[1],
-    #     # "relative_intensity_100": relative_intensity_percentiles[2],
-    #     "push_apex": push_indices[smooth_push_weights_peak_mask_apex],
-    #     # "library_intensity_cos_90": library_intensity_cos_percentiles[0],
-    #     # "library_intensity_cos_95": library_intensity_cos_percentiles[1],
-    #     # "library_intensity_cos_100": library_intensity_cos_percentiles[2],
-    #     # "smooth_library_intensity_cos_90": smooth_library_intensity_cos_percentiles[0],
-    #     # "smooth_library_intensity_cos_95": smooth_library_intensity_cos_percentiles[1],
-    #     # "smooth_library_intensity_cos_100": smooth_library_intensity_cos_percentiles[2],
-    # }.items():
-    #     score_features[feature][peptide_index] = score
-    score_features["push_indices_count"][peptide_index] = len(push_indices)
-    score_features["raw_indices_count"][peptide_index] = len(raw_indices)
-    score_features["push_apex"][peptide_index] = push_indices[smooth_push_weights_peak_mask[0]]
-    return locals()
diff --git a/alphadia/extraction/libtransform.py b/alphadia/libtransform.py
similarity index 94%
rename from alphadia/extraction/libtransform.py
rename to alphadia/libtransform.py
index 9d1bbba9..ccab28e5 100644
--- a/alphadia/extraction/libtransform.py
+++ b/alphadia/libtransform.py
@@ -1,27 +1,54 @@
-from typing import Any
+# native imports
 from pathlib import Path
 import logging
+logger = logging.getLogger()
 import os
-from typing import List
+import typing
 
-import numpy as np
-import pandas as pd
+# alphadia imports
+from alphadia import utils
 
+# alpha family imports
 from alphabase.peptide import fragment
 from alphabase.protein import fasta
 from alphabase.spectral_library.flat import SpecLibFlat
 from alphabase.spectral_library.base import SpecLibBase
 from alphabase.spectral_library.reader import LibraryReaderBase
-
-from alphadia.extraction import utils
-from alphadia.extraction.workflow import reporting
 from alphabase.spectral_library.decoy import decoy_lib_provider
 
-logger = logging.getLogger()
+# third party imports
+import numpy as np
+import pandas as pd
+
+
+
+class ProcessingStep():
+
+    def __init__(self) -> None:
+        """Base class for processing steps. Each implementation must implement the `validate` and `forward` methods.
+        Processing steps can be chained together in a ProcessingPipeline."""
+        pass
 
+    def __call__(self, input: typing.Any) -> typing.Any:
+        """Run the processing step on the input object."""
+        logger.info(f'Running {self.__class__.__name__}')
+        if self.validate(input):
+            return self.forward(input)
+        else:
+            logger.critical(f'Input {input} failed validation for {self.__class__.__name__}')
+            raise ValueError(f'Input {input} failed validation for {self.__class__.__name__}')
+
+    def validate(self, input: typing.Any) -> bool:
+        """Validate the input object."""
+        raise NotImplementedError('Subclasses must implement this method')
+
+    def forward(self, input: typing.Any) -> typing.Any:
+        """Run the processing step on the input object."""
+        raise NotImplementedError('Subclasses must implement this method')
+    
 class ProcessingPipeline():
     
-        def __init__(self, steps: list) -> None:
+        def __init__(self, steps: typing.List[ProcessingStep]) -> None:
             """Processing pipeline for loading and transforming spectral libraries.
             The pipeline is a list of ProcessingStep objects. Each step is called in order and the output of the previous step is passed to the next step.
 
@@ -41,38 +68,13 @@ def __init__(self, steps: list) -> None:
             """
             self.steps = steps
     
-        def __call__(self, input: Any) -> Any:
+        def __call__(self, input: typing.Any) -> typing.Any:
             """Run the pipeline on the input object.            
             """
             for step in self.steps:
                 input = step(input)
             return input
-
-class ProcessingStep():
-
-    def __init__(self) -> None:
-        """Base class for processing steps. Each implementation must implement the `validate` and `forward` method.
-        Processing steps can be chained together in a ProcessingPipeline."""
-        pass
-
-    def __call__(self, input: Any) -> Any:
-        """Run the processing step on the input object."""
-        logger.info(f'Running {self.__class__.__name__}')
-        if self.validate(input):
-            return self.forward(input)
-        else:
-            logger.critical(f'Input {input} failed validation for {self.__class__.__name__}')
-            raise ValueError(f'Input {input} failed validation for {self.__class__.__name__}')
-
-    def validate(self, input: Any) -> bool:
-        """Validate the input object."""
-        raise NotImplementedError('Subclasses must implement this method')
-
-    def forward(self, input: Any) -> Any:
-        """Run the processing step on the input object."""
-        raise NotImplementedError('Subclasses must implement this method')
-    
-
+        
 class DynamicLoader(ProcessingStep):
 
     def __init__(self) -> None:
@@ -87,13 +89,11 @@ def __init__(self) -> None:
         **Long format csv files**
         The classical spectral library format as returned by MSFragger.
         It will be imported and converted to a `SpecLibBase` format. This might require additional parsing information.
-
         """
         pass
 
-    def validate(self, input: Any) -> bool:
-        """Validate the input object. It is expected that the input is a path to a file which exists.
-        """
+    def validate(self, input: str) -> bool:
+        """Validate the input object. It is expected that the input is a path to a file which exists."""
         valid = True
         valid &= isinstance(input, str) or isinstance(input, Path)
 
@@ -103,7 +103,7 @@ def validate(self, input: Any) -> bool:
 
         return valid
 
-    def forward(self, input_path: str) -> Any:
+    def forward(self, input_path: str) -> SpecLibBase:
         """Load the spectral library from the input path. The file type is dynamically inferred from the file ending."""
         # get ending of file
         file_type = Path(input_path).suffix
@@ -132,7 +132,7 @@ def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
 
-    def validate(self, input: Any) -> bool:
+    def validate(self, input: SpecLibBase) -> bool:
         """Validate the input object. It is expected that the input is a `SpecLibBase` object."""
 
         valid = isinstance(input, SpecLibBase)
@@ -163,7 +163,7 @@ def forward(self, input: SpecLibBase) -> SpecLibBase:
 class AnnotateFasta(ProcessingStep):
 
     def __init__(self, 
-                fasta_path_list: List[str],
+                fasta_path_list: typing.List[str],
                 drop_unannotated: bool = True,
                 ) -> None:
         """Annotate the precursor dataframe with protein information from a FASTA file.
@@ -184,7 +184,7 @@ def __init__(self,
         self.fasta_path_list = fasta_path_list
         self.drop_unannotated = drop_unannotated
 
-    def validate(self, input: Any) -> bool:
+    def validate(self, input: SpecLibBase) -> bool:
         """Validate the input object. It is expected that the input is a `SpecLibBase` object and that all FASTA files exist."""
         valid = isinstance(input, SpecLibBase)
 
@@ -225,7 +225,7 @@ def __init__(self, decoy_type : str = 'diann') -> None:
         super().__init__()
         self.decoy_type = decoy_type
 
-    def validate(self, input: Any) -> bool:
+    def validate(self, input: SpecLibBase) -> bool:
         """Validate the input object. It is expected that the input is a `SpecLibBase` object."""
         return isinstance(input, SpecLibBase)
     
@@ -271,7 +271,7 @@ def __init__(self, n_isotopes : int = 4) -> None:
         super().__init__()
         self.n_isotopes = n_isotopes
 
-    def validate(self, input: Any) -> bool:
+    def validate(self, input: SpecLibBase) -> bool:
         """Validate the input object. It is expected that the input is a `SpecLibBase` object."""
         return isinstance(input, SpecLibBase)
     
@@ -294,7 +294,7 @@ def __init__(self) -> None:
         """
         super().__init__()
 
-    def validate(self, input: Any) -> bool:
+    def validate(self, input: SpecLibBase) -> bool:
         """Validate the input object. It is expected that the input is a `SpecLibBase` object."""
         return isinstance(input, SpecLibBase)
     
diff --git a/alphadia/extraction/numba/config.py b/alphadia/numba/config.py
similarity index 97%
rename from alphadia/extraction/numba/config.py
rename to alphadia/numba/config.py
index 93b30b3a..5b3ba01a 100644
--- a/alphadia/extraction/numba/config.py
+++ b/alphadia/numba/config.py
@@ -1,8 +1,14 @@
-import numba as nb
+# native imports
+
+# alphadia imports
+from alphadia.workflow import reporting
+
+# alpha family imports
+
+# third party imports
 import numpy as np
-import logging
 
-from alphadia.extraction.workflow import reporting
+
 
 class JITConfig():
     """
diff --git a/alphadia/extraction/numba/fragments.py b/alphadia/numba/fragments.py
similarity index 98%
rename from alphadia/extraction/numba/fragments.py
rename to alphadia/numba/fragments.py
index 5354cbf4..743293d3 100644
--- a/alphadia/extraction/numba/fragments.py
+++ b/alphadia/numba/fragments.py
@@ -1,5 +1,12 @@
+# native imports
+
+# alphadia imports
+
+# alpha family imports
+
+# third party imports
 import numba as nb
-from numba.extending import overload_method, overload
+from numba.extending import overload_method
 import numpy as np
 
 @nb.experimental.jitclass()
diff --git a/alphadia/extraction/numba/numeric.py b/alphadia/numba/numeric.py
similarity index 99%
rename from alphadia/extraction/numba/numeric.py
rename to alphadia/numba/numeric.py
index db78c6f0..0fec535f 100644
--- a/alphadia/extraction/numba/numeric.py
+++ b/alphadia/numba/numeric.py
@@ -1,3 +1,10 @@
+# native imports
+
+# alphadia imports
+
+# alpha family imports
+
+# third party imports
 import numpy as np
 import numba as nb
 
diff --git a/alphadia/extraction/planning.py b/alphadia/planning.py
similarity index 91%
rename from alphadia/extraction/planning.py
rename to alphadia/planning.py
index afffdf68..35a8ad30 100644
--- a/alphadia/extraction/planning.py
+++ b/alphadia/planning.py
@@ -1,25 +1,20 @@
 # native imports
-from typing_extensions import Self
 import logging
+logger = logging.getLogger()
 import socket
 from pathlib import Path
 import yaml
 import os 
 from datetime import datetime
-import hashlib
-from typing import Union, List, Dict, Tuple, Optional
+import typing
 
 # alphadia imports
-from alphadia.extraction import data, validate, utils, libtransform
-from alphadia.extraction.workflow import peptidecentric, base, reporting
+from alphadia import utils, libtransform
+from alphadia.workflow import peptidecentric, base, reporting
 import alphadia
 
-logger = logging.getLogger()
-
-from alphabase.peptide import fragment
+# alpha family imports
 from alphabase.spectral_library.flat import SpecLibFlat
-from alphabase.spectral_library.base import SpecLibBase
-from alphabase.spectral_library.reader import LibraryReaderBase
 
 # third party imports
 import numpy as np
@@ -30,11 +25,11 @@ class Plan:
 
     def __init__(self, 
             output_folder : str,
-            raw_file_list: List,
-            spec_lib_path : Union[str, None] = None,
-            config_path : Union[str, None] = None,
-            config_update_path : Union[str, None] = None,
-            config_update : Union[Dict, None] = None
+            raw_file_list: typing.List,
+            spec_lib_path : typing.Union[str, None] = None,
+            config_path : typing.Union[str, None] = None,
+            config_update_path : typing.Union[str, None] = None,
+            config_update : typing.Union[typing.Dict, None] = None
         ) -> None:
         """Highest level class to plan a DIA Search. 
         Owns the input file list, speclib and the config.
@@ -70,7 +65,7 @@ def __init__(self,
         # default config path is not defined in the function definition to account for for different path separators on different OS
         if config_path is None:
             # default yaml config location under /misc/config/config.yaml
-            config_path = os.path.join(os.path.dirname(__file__), '..','..','misc','config','default.yaml')
+            config_path = os.path.join(os.path.dirname(__file__), '..','misc','config','default.yaml')
 
         # 1. load default config
         with open(config_path, 'r') as f:
@@ -103,7 +98,7 @@ def __init__(self,
     @property
     def raw_file_list(
             self
-        ) -> List[str]:
+        ) -> typing.List[str]:
         """List of input files locations.
         """
         return self._raw_file_list
@@ -111,14 +106,14 @@ def raw_file_list(
     @raw_file_list.setter
     def raw_file_list(
             self, 
-            raw_file_list : List[str]
+            raw_file_list : typing.List[str]
         ):
         self._raw_file_list = raw_file_list
 
     @property
     def config(
             self
-        ) -> dict:
+        ) -> typing.Dict:
         """Dict with all configuration parameters for the extraction.
         """
         return self._config
@@ -126,7 +121,7 @@ def config(
     @config.setter
     def config(
             self, 
-            config : dict
+            config : typing.Dict
         ) -> None:
         self._config = config
 
diff --git a/alphadia/extraction/plexscoring.py b/alphadia/plexscoring.py
similarity index 98%
rename from alphadia/extraction/plexscoring.py
rename to alphadia/plexscoring.py
index 5a5b1fd9..c455d9ee 100644
--- a/alphadia/extraction/plexscoring.py
+++ b/alphadia/plexscoring.py
@@ -1,22 +1,28 @@
-from alphadia.extraction import validate, utils, features, quadrupole
-from alphadia.extraction.numba import fragments
-from alphadia.extraction.data import bruker, thermo
-from alphadia.extraction.plotting.cycle import plot_cycle
-from alphadia.extraction.plotting.debug import (
+# native imports
+import logging
+logger = logging.getLogger()
+import typing
+
+# alphadia imports
+from alphadia import validate, utils, features, quadrupole
+from alphadia.numba import fragments
+from alphadia.data import bruker, thermo
+from alphadia.plotting.cycle import plot_cycle
+from alphadia.plotting.debug import (
     plot_fragment_profile,
     plot_precursor,
     plot_fragments,
     plot_template
 )
 
+# alpha family imports
 import alphatims.utils
 
+# third party imports
 import pandas as pd
 import numpy as np
 import numba as nb
-import logging
 
-import typing
 
 def candidate_features_to_candidates(
     candidate_features_df : pd.DataFrame,
@@ -135,7 +141,7 @@ def multiplex_candidates(
     return multiplexed_candidates_df
 
     
-from alphadia.extraction.numba import config
+from alphadia.numba import config
 
 
 
@@ -166,7 +172,7 @@ def __init__(self,
         """Numba JIT compatible config object for CandidateScoring.
         Will be emitted when `CandidateConfig.jitclass()` is called.
 
-        Please refer to :class:`.alphadia.extraction.plexscoring.CandidateConfig` for documentation.
+        Please refer to :class:`.alphadia.plexscoring.CandidateConfig` for documentation.
         """
 
         self.score_grouped = score_grouped
@@ -1172,11 +1178,11 @@ def __init__(self,
 
         precursors_flat : pd.DataFrame
             A DataFrame containing precursor information. 
-            The DataFrame will be validated by using the `alphadia.extraction.validate.precursors_flat` schema.
+            The DataFrame will be validated by using the `alphadia.validate.precursors_flat` schema.
 
         fragments_flat : pd.DataFrame
             A DataFrame containing fragment information.
-            The DataFrame will be validated by using the `alphadia.extraction.validate.fragments_flat` schema.
+            The DataFrame will be validated by using the `alphadia.validate.fragments_flat` schema.
 
         quadrupole_calibration : quadrupole.SimpleQuadrupole, default=None
             An object containing the quadrupole calibration information.
@@ -1388,7 +1394,7 @@ def assemble_fragments(self) -> fragments.FragmentContainer:
             pass
         
         else:
-            logging.warning('Fragment cardinality column not found in fragment dataframe. Setting cardinality to 1.')
+            logger.warning('Fragment cardinality column not found in fragment dataframe. Setting cardinality to 1.')
             self.fragments_flat['cardinality'] = np.ones(len(self.fragments_flat), dtype=np.uint8)
         
         # validate dataframe schema and prepare jitclass compatible dtypes
@@ -1563,7 +1569,7 @@ def __call__(
             A DataFrame containing the features for each fragment.
 
         """
-        logging.info('Starting candidate scoring')
+        logger.info('Starting candidate scoring')
 
         score_group_container = self.assemble_score_group_container(candidates_df)
         fragment_container = self.assemble_fragments()
@@ -1588,6 +1594,6 @@ def __call__(
         fragment_features_df = self.collect_fragments(candidates_df, score_group_container)
         validate.fragment_features_df(fragment_features_df)
 
-        logging.info('Finished candidate scoring')
+        logger.info('Finished candidate scoring')
 
         return candidate_features_df, fragment_features_df
\ No newline at end of file
diff --git a/alphadia/extraction/plotting/cycle.py b/alphadia/plotting/cycle.py
similarity index 94%
rename from alphadia/extraction/plotting/cycle.py
rename to alphadia/plotting/cycle.py
index 07081937..2b98bd61 100644
--- a/alphadia/extraction/plotting/cycle.py
+++ b/alphadia/plotting/cycle.py
@@ -1,6 +1,12 @@
+# native imports
+import typing
+
+# alphadia imports
+
+# alpha family imports
+
+# third party imports
 from matplotlib import cm, patches
-from alphadia.extraction import plotting
-from typing import List
 import numpy as np
 import matplotlib.pyplot as plt
 
@@ -9,7 +15,7 @@ def _generate_patch_collection_nomobility(
     cmap_name : str, 
     start_val : float = 0.4,
     stop_val : float = 1.0
-    ) -> List[dict]:
+    ) -> typing.List[dict]:
     """Generate a collection of patches for a DIA cycle for an experiment without ion mobility separation.
 
     Parameters
@@ -29,8 +35,8 @@ def _generate_patch_collection_nomobility(
 
     Returns
     -------
-    List[dict]
-        List of dicts for building the patches. Can be plotted by feeding them into plotting._plot_patch_collection
+    typing.List[dict]
+        List of dicts for building the patches. Can be plotted by feeding them into plotting._plot_patch_collection
     
     """
 
@@ -62,7 +68,7 @@ def _generate_patch_collection_nomobility(
     return slice_collection
 
 def _plot_patch_collection(
-    patch_collection : List[dict],
+    patch_collection : typing.List[dict],
     ax : plt.Axes = None, 
     alpha : float = 0.5
     ):
@@ -71,8 +77,8 @@ def _plot_patch_collection(
     Parameters
     ----------
 
-    patch_collection : List[dict]
-        List of dicts for building the patches.
+    patch_collection : typing.List[dict]
+        List of dicts for building the patches.
 
     ax : plt.Axes, optional
         Axes to plot on. If None, the current axes will be used.
@@ -174,8 +180,8 @@ def _generate_patch_collection_mobility(
 
     Returns
     -------
-    List[dict]
-        List of dicts for building the patches. Can be plotted by feeding them into plotting._plot_patch_collection
+    typing.List[dict]
+        List of dicts for building the patches. Can be plotted by feeding them into plotting._plot_patch_collection
 
     """
 
diff --git a/alphadia/extraction/plotting/debug.py b/alphadia/plotting/debug.py
similarity index 98%
rename from alphadia/extraction/plotting/debug.py
rename to alphadia/plotting/debug.py
index 65648316..0f6638b4 100644
--- a/alphadia/extraction/plotting/debug.py
+++ b/alphadia/plotting/debug.py
@@ -1,9 +1,15 @@
+# native imports
+
+# alphadia imports
+from alphadia.plotting import utils
+from alphadia import quadrupole
+
+# alpha family imports
+
+# third party imports
 import matplotlib.pyplot as plt
 import numpy as np
 
-from alphadia.extraction.plotting import utils
-from alphadia.extraction import quadrupole
-
 def plot_fragment_profile(
     template,
     fragment_scan_profile,
diff --git a/alphadia/extraction/plotting/utils.py b/alphadia/plotting/utils.py
similarity index 97%
rename from alphadia/extraction/plotting/utils.py
rename to alphadia/plotting/utils.py
index 6391dbbd..50efabc4 100644
--- a/alphadia/extraction/plotting/utils.py
+++ b/alphadia/plotting/utils.py
@@ -1,9 +1,15 @@
+# native imports
 import typing
 
+# alphadia imports
+
+# alpha family imports
+
+# third party imports
+from scipy.stats import gaussian_kde
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from scipy.stats import gaussian_kde
 
 def lighten_color(color, amount=0.5):
     """
diff --git a/alphadia/prefilter.py b/alphadia/prefilter.py
deleted file mode 100644
index 098cd27f..00000000
--- a/alphadia/prefilter.py
+++ /dev/null
@@ -1,1265 +0,0 @@
-"""A module to perform prefilterinf of peptides."""
-
-import alphatims.utils
-import numpy as np
-import sklearn.neighbors
-
-import alphadia.library
-import alpharaw.smoothing
-
-
-# @alphatims.utils.njit(nogil=True)
-@alphatims.utils.pjit
-def find_maximal_pushes(
-    cycle_index,
-    connection_indptr,
-    connection_indices,
-    cycle_tolerance,
-    zeroth_frame,
-    scan_max_index,
-    intensity_values,
-    selected_push_indices,
-    maximum_push_indices,
-):
-    cycle_length = len(connection_indptr) - 1
-    for self_connection_index, connection_start in enumerate(
-        connection_indptr[:-1]
-    ):
-        self_push_index = self_connection_index
-        self_push_index += cycle_index * cycle_length
-        self_push_index += zeroth_frame * scan_max_index
-        if not (0 <= self_push_index < len(selected_push_indices)):
-            continue
-        if not maximum_push_indices[self_push_index]:
-            continue
-        self_intensity_index = selected_push_indices[self_push_index]
-        if self_intensity_index == -1:
-            maximum_push_indices[self_push_index] = False
-            continue
-        self_intensity = intensity_values[self_intensity_index]
-        for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-            if cycle_offset == 0:
-                continue
-            other_push_index = self_push_index + cycle_offset * cycle_length
-            if not (0 <= other_push_index < len(selected_push_indices)):
-                continue
-            other_intensity_index = selected_push_indices[other_push_index]
-            if other_intensity_index == -1:
-                continue
-            other_intensity = intensity_values[other_intensity_index]
-            if self_intensity < other_intensity:
-                maximum_push_indices[self_push_index] = False
-                break
-        if not maximum_push_indices[self_push_index]:
-            continue
-        connection_end = connection_indptr[self_connection_index + 1]
-        other_connection_indices = connection_indices[connection_start: connection_end]
-        for other_push_index in self_push_index + other_connection_indices:
-            if not (0 <= other_push_index < len(selected_push_indices)):
-                continue
-            other_intensity_index = selected_push_indices[other_push_index]
-            if other_intensity_index == -1:
-                continue
-            other_intensity = intensity_values[other_intensity_index]
-            if self_intensity < other_intensity:
-                maximum_push_indices[self_push_index] = False
-                break
-
-
-@alphatims.utils.pjit
-def find_max_intensity_per_push(
-    cycle_index,
-    dia_mz_cycle,
-    zeroth_frame,
-    scan_max_index,
-    push_indptr,
-    intensity_values,
-    tof_indices,
-    mz_values,
-    selected_push_indices,
-    min_peaks,
-    min_intensity_value,
-    use_ms1,
-    use_ms2,
-    use_precursor_range,
-):
-    cycle_length = len(dia_mz_cycle)
-    for self_connection_index, (lower_mz, upper_mz) in enumerate(
-        dia_mz_cycle
-    ):
-        push_index = self_connection_index
-        push_index += cycle_index * cycle_length
-        push_index += zeroth_frame * scan_max_index
-        if not (0 <= push_index < len(selected_push_indices)):
-            continue
-        selected_push_indices[push_index] = -1
-        if (not use_ms1) and (lower_mz == -1):
-            continue
-        if (not use_ms2) and (lower_mz != -1):
-            continue
-        start_index = push_indptr[push_index]
-        end_index = push_indptr[push_index + 1]
-        if (end_index - start_index) >= min_peaks:
-            current_min_intensity = min_intensity_value
-            max_index = -1
-            for index, intensity in enumerate(
-                intensity_values[start_index:end_index],
-                start_index
-            ):
-                mz_value = mz_values[tof_indices[index]]
-                if (not use_precursor_range) and (lower_mz < mz_value < upper_mz):
-                    continue
-                if intensity >= current_min_intensity:
-                    max_index = index
-            selected_push_indices[push_index] = max_index
-
-
-def push_index_to_cycle_index(
-    push_index,
-    dia_mz_cycle,
-    zeroth_frame,
-    scan_max_index,
-):
-    cycle_length = len(dia_mz_cycle)
-    push_index_ = push_index - zeroth_frame * scan_max_index
-    cycle_index = push_index_ // cycle_length
-    cycle_offset = push_index_ % cycle_length
-    return cycle_index, cycle_offset
-
-
-@alphatims.utils.njit(nogil=True)
-def push_precursor_borders(
-    push_indices,
-    push_indptr,
-    tof_indices,
-    mz_values,
-    dia_mz_cycle,
-    zeroth_frame,
-    scan_max_index,
-    precursor_frame,
-):
-    potential_precursors = np.zeros(
-        (len(push_indices), 2),
-        dtype=np.int64
-    )
-    cycle_length = len(dia_mz_cycle)
-    for index, push_index in enumerate(push_indices):
-        if push_index < zeroth_frame * scan_max_index:
-            continue
-        push_index_ = push_index - zeroth_frame * scan_max_index
-        cycle_index = push_index_ // cycle_length
-        cycle_offset = push_index_ % cycle_length
-        precursor_index = cycle_offset % scan_max_index + precursor_frame * scan_max_index
-        precursor_index += cycle_index * cycle_length
-        precursor_index += zeroth_frame * scan_max_index
-        mz_borders = dia_mz_cycle[cycle_offset]
-        precursor_start = push_indptr[precursor_index]
-        precursor_end = push_indptr[precursor_index + 1]
-        offsets = mz_values[tof_indices[precursor_start: precursor_end]]
-        potential_precursors[index] = precursor_start + np.searchsorted(offsets, mz_borders)
-    return potential_precursors
-
-
-@alphatims.utils.pjit
-def find_best_peptide(
-    selected_index,
-    potential_precursors,
-    spectra_of_interest,
-    precursor_mzs,
-    y_mzs,
-    b_mzs,
-    frag_start_idxs,
-    frag_end_idxs,
-    mz_values,
-    tof_indices,
-    push_indptr,
-    intensity_values,
-    max_indices,
-    max_counts,
-    max_precursor_mzs,
-    selected_ms1_ions,
-    fragment_ppm=50,
-    precursor_ppm=50,
-):
-    selected_precursors = potential_precursors[selected_index]
-    selected_fragments = (
-        push_indptr[spectra_of_interest[selected_index]],
-        push_indptr[spectra_of_interest[selected_index] + 1]
-    )
-    fragment_mzs = mz_values[
-        tof_indices[
-            selected_fragments[0]: selected_fragments[1]
-        ]
-    ]
-    if selected_precursors[0] == selected_precursors[1]:
-        max_indices[selected_index] = -1
-        max_counts[selected_index] = 0
-        return
-    precursor_intensities = intensity_values[
-        selected_precursors[0]: selected_precursors[1]
-    ]
-    selected_ms1_ion = np.argmax(
-        precursor_intensities
-    ) + selected_precursors[0]
-    precursor_mz = mz_values[
-        tof_indices[
-            selected_ms1_ion
-        ]
-    ]
-    lower_bound = np.searchsorted(
-        precursor_mzs,
-        precursor_mz / (1 + 10**-6 * precursor_ppm),
-    )
-    upper_bound = np.searchsorted(
-        precursor_mzs,
-        precursor_mz * (1 + 10**-6 * precursor_ppm),
-    )
-    max_hit_count = 0
-    max_index = -1
-    for index in range(lower_bound, upper_bound):
-        frag_start_idx = frag_start_idxs[index]
-        frag_end_idx = frag_end_idxs[index]
-        if frag_start_idx == frag_end_idx:
-            continue
-        y_hit_count = rough_match(
-            fragment_mzs,
-            y_mzs[frag_start_idx: frag_end_idx][::-1],
-            fragment_ppm,
-        )
-        b_hit_count = rough_match(
-            fragment_mzs,
-            b_mzs[frag_start_idx: frag_end_idx],
-            fragment_ppm,
-        )
-        hit_count = y_hit_count + b_hit_count
-        if hit_count > max_hit_count:
-            max_hit_count = hit_count
-            max_index = index
-    max_indices[selected_index] = max_index
-    max_counts[selected_index] = max_hit_count
-    max_precursor_mzs[selected_index] = precursor_mz
-    selected_ms1_ions[selected_index] = selected_ms1_ion
-    # return max_index, max_hit_count
-
-
-@alphatims.utils.njit(nogil=True)
-def rough_match(
-    fragment_mzs,
-    database_mzs,
-    fragment_ppm,
-):
-    fragment_index = 0
-    database_index = 0
-    hit_count = 0
-    while (fragment_index < len(fragment_mzs)) and (database_index < len(database_mzs)):
-        fragment_mz = fragment_mzs[fragment_index]
-        database_mz = database_mzs[database_index]
-        if fragment_mz < (database_mz / (1 + 10**-6 * fragment_ppm)):
-            fragment_index += 1
-        elif database_mz < (fragment_mz / (1 + 10**-6 * fragment_ppm)):
-            database_index += 1
-        else:
-            hit_count += 1
-            fragment_index += 1
-            database_index += 1
-    return hit_count
-
-
-def first_search(
-    dia_data,
-    y_ions,
-    b_ions,
-    predicted_library_df,
-    scan_tolerance=6,
-    multiple_frames_per_cycle=True,
-    ms1=True,
-    ms2=True,
-    cycle_tolerance=3,
-    precursor_ppm=50,
-    fragment_ppm=50,
-    precursor_frame=0,
-    train_fdr_level=0.5,
-    min_peaks=10,
-    min_intensity_value=1000,
-    use_ms1=False,
-    use_ms2=True,
-    use_precursor_range=False,
-):
-    cycle_count = len(dia_data.push_indptr) // len(dia_data.dia_mz_cycle)
-    selected_push_indices = np.empty_like(dia_data.push_indptr)[:-1]
-    selected_push_indices[:dia_data.zeroth_frame * dia_data.scan_max_index] = -1
-    maximum_push_indices = np.ones_like(dia_data.push_indptr, dtype=np.bool)[:-1]
-    maximum_push_indices[:dia_data.zeroth_frame * dia_data.scan_max_index] = False
-    find_max_intensity_per_push(
-        range(cycle_count + 1),
-        dia_data.dia_mz_cycle,
-        dia_data.zeroth_frame,
-        dia_data.scan_max_index,
-        dia_data.push_indptr,
-        dia_data.intensity_values,
-        dia_data.tof_indices,
-        dia_data.mz_values,
-        selected_push_indices,
-        min_peaks,
-        min_intensity_value,
-        use_ms1,
-        use_ms2,
-        use_precursor_range,
-    )
-    connection_indptr, connection_indices = alpharaw.smoothing.get_connections_within_cycle(
-        scan_tolerance=scan_tolerance,
-        scan_max_index=dia_data.scan_max_index,
-        dia_mz_cycle=dia_data.dia_mz_cycle,
-        multiple_frames=multiple_frames_per_cycle,
-        ms1=ms1,
-        ms2=ms2,
-    )
-    find_maximal_pushes(
-        range(cycle_count + 1),
-        connection_indptr,
-        connection_indices,
-        cycle_tolerance,
-        dia_data.zeroth_frame,
-        dia_data.scan_max_index,
-        dia_data.intensity_values,
-        selected_push_indices,
-        maximum_push_indices,
-    )
-    spectra_of_interest = np.flatnonzero(maximum_push_indices)
-    potential_precursors = push_precursor_borders(
-        spectra_of_interest,
-        dia_data.push_indptr,
-        dia_data.tof_indices,
-        dia_data.mz_values,
-        dia_data.dia_mz_cycle,
-        dia_data.zeroth_frame,
-        dia_data.scan_max_index,
-        precursor_frame,
-    )
-    max_indices = np.zeros_like(spectra_of_interest)
-    max_counts = np.zeros_like(spectra_of_interest)
-    max_precursor_mzs = np.zeros_like(spectra_of_interest, dtype=np.float64)
-    selected_ms1_ions = np.zeros_like(spectra_of_interest)
-    find_best_peptide(
-        range(len(spectra_of_interest)),
-        potential_precursors,
-        spectra_of_interest,
-        predicted_library_df.precursor_mz.values,
-        y_ions,
-        b_ions,
-        predicted_library_df.frag_start_idx.values,
-        predicted_library_df.frag_end_idx.values,
-        dia_data.mz_values,
-        dia_data.tof_indices,
-        dia_data.push_indptr,
-        dia_data.intensity_values,
-        max_indices,
-        max_counts,
-        max_precursor_mzs,
-        selected_ms1_ions,
-        precursor_ppm,
-        fragment_ppm,
-    )
-    selection = max_counts >= 1
-    selected = max_indices[selection]
-    data_df = dia_data.as_dataframe(dia_data.push_indptr[spectra_of_interest[selection]])
-    lib_df = predicted_library_df.iloc[selected]
-    lib_df["rt_experimental"] = np.copy(data_df.rt_values_min.values)
-    lib_df["mobility_experimental"] = np.copy(data_df.mobility_values.values)
-    lib_df["mz_experimental"] = max_precursor_mzs[selection]
-    lib_df["raw_ms1_index"] = selected_ms1_ions[selection]
-    lib_df["count"] = max_counts[selection]
-    lib_df["ppm"] = (lib_df.mz_experimental - lib_df.precursor_mz) / lib_df.precursor_mz * 10**6
-    lib_df["delta_im"] = lib_df.mobility_experimental - lib_df.mobility_pred
-    lib_df.reset_index(level=0, inplace=True)
-    lib_df.rename(
-        columns={
-            "index": "mz_sorted_index",
-        },
-        inplace=True,
-    )
-    lib_df["decoy"] = lib_df["decoy"].astype(np.bool)
-    lib_df["target"] = ~lib_df["decoy"]
-    lib_df["mz_pred"] = lib_df.precursor_mz
-    lib_df["im_pred"] = lib_df.mobility_pred
-    lib_df["im_experimental"] = lib_df.mobility_experimental
-    return train_and_score(
-        lib_df,
-        ["count", "ppm", "delta_im"],
-        train_fdr_level=train_fdr_level,
-    )
-
-def train_and_score(
-    scores_df,
-    features,
-    train_fdr_level: float = 0.1,
-    ini_score: str = "count",
-    min_train: int = 1000,
-    test_size: float = 0.8,
-    max_depth: list = [5, 25, 50],
-    max_leaf_nodes: list = [150, 200, 250],
-    n_jobs: int = -1,
-    scoring: str = 'accuracy',
-    plot: bool = False,
-    random_state: int = 42,
-):
-    df = scores_df.copy()
-    cv = alphadia.library.train_RF(
-        df,
-        features,
-        train_fdr_level=train_fdr_level,
-        ini_score=ini_score,
-        min_train=min_train,
-        test_size=test_size,
-        max_depth=max_depth,
-        max_leaf_nodes=max_leaf_nodes,
-        n_jobs=n_jobs,
-        scoring=scoring,
-        plot=plot,
-        random_state=random_state,
-    )
-    df['score'] = cv.predict_proba(df[features])[:, 1]
-    return alphadia.library.get_q_values(df, "score", 'decoy', drop=True)
-
-
-@alphatims.utils.pjit
-def filter_precursor_candidates(
-    selected_index,
-    potential_precursors,
-    spectra_of_interest,
-    precursor_mzs,
-    y_mzs,
-    b_mzs,
-    frag_start_idxs,
-    frag_end_idxs,
-    mz_values,
-    tof_indices,
-    push_indptr,
-    intensity_values,
-    max_indices,
-    max_counts,
-    max_precursor_mzs,
-    fragment_ppm=50,
-    precursor_ppm=50,
-):
-    selected_precursors = potential_precursors[selected_index]
-    selected_fragments = (
-        push_indptr[spectra_of_interest[selected_index]],
-        push_indptr[spectra_of_interest[selected_index] + 1]
-    )
-    fragment_mzs = mz_values[
-        tof_indices[
-            selected_fragments[0]: selected_fragments[1]
-        ]
-    ]
-    if selected_precursors[0] == selected_precursors[1]:
-        max_indices[selected_index] = -1
-        max_counts[selected_index] = 0
-        return
-    precursor_intensities = intensity_values[
-        selected_precursors[0]: selected_precursors[1]
-    ]
-    precursor_mz = mz_values[
-        tof_indices[
-            np.argmax(precursor_intensities) + selected_precursors[0]
-        ]
-    ]
-    lower_bound = np.searchsorted(
-        precursor_mzs,
-        precursor_mz / (1 + 10**-6 * precursor_ppm),
-    )
-    upper_bound = np.searchsorted(
-        precursor_mzs,
-        precursor_mz * (1 + 10**-6 * precursor_ppm),
-    )
-    max_hit_count = 0
-    max_index = -1
-    for index in range(lower_bound, upper_bound):
-        frag_start_idx = frag_start_idxs[index]
-        frag_end_idx = frag_end_idxs[index]
-        if frag_start_idx == frag_end_idx:
-            continue
-        y_hit_count = rough_match(
-            fragment_mzs,
-            y_mzs[frag_start_idx: frag_end_idx][::-1],
-            fragment_ppm,
-        )
-        b_hit_count = rough_match(
-            fragment_mzs,
-            b_mzs[frag_start_idx: frag_end_idx],
-            fragment_ppm,
-        )
-        hit_count = y_hit_count + b_hit_count
-        if hit_count > max_hit_count:
-            max_hit_count = hit_count
-            max_index = index
-    max_indices[selected_index] = max_index
-    max_counts[selected_index] = max_hit_count
-    max_precursor_mzs[selected_index] = precursor_mz
-    # return max_index, max_hit_count
-
-
-def calibrate_hits(
-    first_hits,
-    n_neighbors=4,
-    test_size=0.8,
-    fdr=0.01,
-):
-    lib_df = first_hits[first_hits.q_value < fdr]
-    for dimension in ["rt", "im"]:
-        X = lib_df[f"{dimension}_pred"].values.reshape(-1, 1)
-        y = lib_df[f"{dimension}_experimental"].values
-        (
-            X_train,
-            X_test,
-            y_train,
-            y_test
-        ) = sklearn.model_selection.train_test_split(
-            X,
-            y,
-            test_size=test_size,
-            random_state=0,
-        )
-        neigh = sklearn.neighbors.KNeighborsRegressor(
-            n_neighbors=n_neighbors,
-            weights="distance",
-            n_jobs=alphatims.utils.set_threads(alphatims.utils.MAX_THREADS)
-        )
-        neigh.fit(
-            X_train,
-            y_train,
-        )
-        first_hits[f"{dimension}_calibrated"] = neigh.predict(
-            first_hits[f"{dimension}_pred"].values.reshape(-1, 1)
-        )
-        first_hits[f"{dimension}_diff"] = first_hits[f"{dimension}_experimental"] - first_hits[f"{dimension}_calibrated"]
-    return first_hits
-
-
-@alphatims.utils.pjit
-def best_transitions(
-    index,
-    start_indices,
-    end_indices,
-    fragment_intensities,
-    best_transitions,
-    remove_outer_ions,
-):
-    start = start_indices[index] + remove_outer_ions
-    end = end_indices[index] - remove_outer_ions
-    if (start + remove_outer_ions) < (end - remove_outer_ions):
-        start += remove_outer_ions
-        end -= remove_outer_ions
-    max_index = start + np.argmax(fragment_intensities[start: end])
-    best_transitions[index] = max_index
-
-
-@alphatims.utils.njit
-def merge_best_transitions(
-    b_intensities,
-    b_transitions,
-    b_mzs,
-    y_intensities,
-    y_transitions,
-    y_mzs,
-):
-    # best_transation_indices = np.empty_like(y_transitions)
-    best_transation_mzs = np.empty_like(y_transitions, dtype=np.float64)
-    for index, y_index in enumerate(y_transitions):
-        b_index = b_transitions[index]
-        if b_intensities[b_index] > y_intensities[y_index]:
-            # best_transation_indices[index] = b_index
-            best_transation_mzs[index] = b_mzs[b_index]
-        else:
-            # best_transation_indices[index] = y_index
-            best_transation_mzs[index] = y_mzs[y_index]
-    # return best_transation_indices, best_transation_mzs
-    return best_transation_mzs
-
-
-def find_seeds(
-    dia_data,
-    predicted_library_df,
-    cycle_index,
-    offsets,
-    idxs,
-    library_precursor_rt_values,
-    library_precursor_im_values,
-    rt_diff_std,
-    im_diff_std,
-    precursor_cycle,
-    precursor_frame,
-    min_fragments,
-    potential_peaks,
-):
-    import multiprocessing
-
-    def starfunc(cycle_index):
-        return find_candidates(
-            cycle_index,
-            offsets,
-            idxs,
-            library_precursor_rt_values,
-            library_precursor_im_values,
-            predicted_library_df.precursor_index_lower.values,
-            predicted_library_df.precursor_index_upper.values,
-            rt_diff_std,
-            im_diff_std,
-            dia_data.push_indptr,
-            dia_data.tof_indices,
-            dia_data.mz_values,
-            dia_data.rt_values / 60,
-            dia_data.mobility_values,
-            precursor_cycle,
-            dia_data.zeroth_frame,
-            dia_data.scan_max_index,
-            precursor_frame,
-            # final_candidates,
-            min_fragments,
-            potential_peaks,
-        )
-
-    iterable = range(len(dia_data.push_indptr) // len(dia_data.dia_mz_cycle) + 1)
-    # iterable = range(500, 520)
-    seeds = []
-    counts = [[0] * (1 + dia_data.zeroth_frame * dia_data.scan_max_index)]
-    with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-        for cycle_index, (
-            push_counts,
-            seed_candidates,
-        ) in alphatims.utils.progress_callback(
-            enumerate(pool.imap(starfunc, iterable)),
-            total=len(iterable),
-            include_progress_callback=True
-        ):
-            counts.append(push_counts)
-            seeds.append(seed_candidates)
-    # return seeds, counts
-    return (
-        np.cumsum(np.concatenate(counts))[:len(dia_data.push_indptr)],
-        np.concatenate(seeds)
-    )
-
-
-@alphatims.utils.njit(nogil=True)
-def find_candidates(
-    cycle_index,
-    library_precursor_offsets,
-    library_precursor_indices,
-    library_precursor_rt_values,
-    library_precursor_im_values,
-    library_lower_indices,
-    library_upper_indices,
-    rt_tolerance,
-    im_tolerance,
-    push_indptr,
-    tof_indices,
-    mz_values,
-    rt_values,
-    im_values,
-    precursor_cycle,
-    zeroth_frame,
-    scan_max_index,
-    precursor_frame,
-    # final_candidates,
-    min_fragments,
-    potential_peaks,
-):
-    cycle_length = len(precursor_cycle)
-    push_offset = cycle_length * cycle_index + zeroth_frame * scan_max_index
-    frame = push_offset // scan_max_index
-    rt = rt_values[frame]
-    start_offsets, end_offsets = filter_offsets_by_rt(
-        library_precursor_offsets,
-        library_precursor_rt_values,
-        rt,
-        rt_tolerance,
-    )
-    seed_candidates = []
-    push_counts = np.zeros(len(precursor_cycle), dtype=np.int64)
-    # peptide_buffer = np.zeros_like(library_lower_indices, dtype=np.bool_)
-    for cycle_offset, (low_precursor, high_precursor) in enumerate(
-        precursor_cycle
-    ):
-        if low_precursor == high_precursor:
-            continue
-        scan_index = cycle_offset % scan_max_index
-        im = im_values[scan_index]
-        precursor_push_index = (
-            push_offset + precursor_frame * scan_max_index + scan_index
-        )
-        if precursor_push_index > len(push_indptr):
-            continue
-        push_index = push_offset + cycle_offset
-        if push_index > len(push_indptr):
-            continue
-        precursor_push_index_start = push_indptr[precursor_push_index]
-        precursor_push_index_end = push_indptr[precursor_push_index + 1]
-        if precursor_push_index_start == precursor_push_index_end:
-            continue
-        push_index_start = push_indptr[push_index]
-        push_index_end = push_indptr[push_index + 1]
-        if (push_index_end - push_index_start) < min_fragments:
-            continue
-        candidate_precursors = []
-        for index, tof_index in enumerate(
-            tof_indices[push_index_start: push_index_end],
-            push_index_start,
-        ):
-            if not potential_peaks[index]:
-                continue
-            low_index = start_offsets[tof_index]
-            high_index = end_offsets[tof_index]
-            candidate_ims = library_precursor_im_values[
-                low_index: high_index
-            ]
-            # final_candidates[cycle_index] += high_index - low_index
-            for candidate_index, candidate_im in enumerate(
-                candidate_ims,
-                low_index
-            ):
-                candidate_precursor = library_precursor_indices[candidate_index]
-                if library_upper_indices[candidate_precursor] < low_precursor:
-                    continue
-                if library_lower_indices[candidate_precursor] > high_precursor:
-                    continue
-                # if peptide_buffer[candidate_precursor]:
-                #     continue
-                # if final_candidates[candidate_precursor]:
-                #     continue
-                if abs(candidate_im - im) < im_tolerance:
-                    candidate_precursors.append(candidate_precursor)
-                    # final_candidates[candidate_precursor] = True
-        if len(candidate_precursors) == 0:
-            continue
-        # return locals()
-        candidate_precursors = sorted(candidate_precursors)
-        candidate_index = 0
-        for precursor_tof in tof_indices[
-            precursor_push_index_start: precursor_push_index_end
-        ]:
-            while candidate_index < len(candidate_precursors):
-                candidate_precursor = candidate_precursors[candidate_index]
-                lower_tof = library_lower_indices[candidate_precursor]
-                if precursor_tof < lower_tof:
-                    break
-                upper_tof = library_upper_indices[candidate_precursor]
-                if precursor_tof < upper_tof:
-                    # check other fragments?
-                    # peptide_buffer[candidate_precursor] = True
-                    # final_candidates[candidate_precursor] = True
-                    seed_candidates.append(candidate_precursor)
-                    push_counts[cycle_offset] += 1
-                    # final_candidates[cycle_index] += 1
-                candidate_index += 1
-                # else:
-                #     final_candidates[candidate_precursor] = True
-                #     candidate_index += 1
-    # for candidate_index, candidate_precursor in enumerate(peptide_buffer):
-    #     if candidate_precursor:
-    #         final_candidates[candidate_index] = True
-    return push_counts, np.array(seed_candidates)
-
-
-@alphatims.utils.njit(nogil=True)
-def filter_offsets_by_rt(
-    offsets,
-    library_precursor_rt_values,
-    rt,
-    rt_tolerance,
-):
-    start_offsets = np.empty(len(offsets) - 1, dtype=offsets.dtype)
-    end_offsets = np.empty(len(offsets) - 1, dtype=offsets.dtype)
-    for index, start in enumerate(offsets[:-1]):
-        end = offsets[index + 1]
-        start_offsets[index], end_offsets[index] = start + np.searchsorted(
-            library_precursor_rt_values[start: end],
-            [rt - rt_tolerance, rt + rt_tolerance]
-        )
-    return start_offsets, end_offsets
-
-
-def remove_unreachable_precursors(
-    predicted_library_df,
-    dia_data,
-):
-    lower_mz_index, upper_mz_index = np.searchsorted(
-        predicted_library_df.precursor_mz.values,
-        [dia_data.quad_mz_min_value, dia_data.quad_mz_max_value],
-    )
-    return predicted_library_df[lower_mz_index: upper_mz_index].reset_index(
-        drop=True,
-    )
-
-
-
-@alphatims.utils.pjit
-def annotate_seeds(
-    push_index,
-    y_mzs,
-    b_mzs,
-    frag_start_idxs,
-    frag_end_idxs,
-    mz_values,
-    tof_indices,
-    push_indptr,
-    # max_indices,
-    # max_counts,
-    # max_precursor_mzs,
-    push_counts,
-    seed_candidates,
-    fragment_ppm,
-    # ppm_offset,
-    hit_counts,
-    intensity_values,
-    dia_mz_cycle,
-    tof_tolerance,
-    scan_max_index,
-    tof_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    is_signal,
-):
-    seed_start = push_counts[push_index]
-    seed_end = push_counts[push_index + 1]
-    if seed_start == seed_end:
-        return
-    push_start = push_indptr[push_index]
-    push_end = push_indptr[push_index + 1]
-    fragment_tofs = tof_indices[push_start: push_end]
-    fragment_tofs, fragment_intensities = alpharaw.smoothing.merge_pushes2(
-        push_index,
-        push_indptr,
-        tof_indices,
-        intensity_values,
-        dia_mz_cycle,
-        tof_tolerance,
-        scan_max_index,
-        tof_max_index,
-        zeroth_frame,
-        connection_counts,
-        connections,
-        cycle_tolerance,
-        is_signal,
-        # mz_values,
-    )
-    fragment_mzs = mz_values[fragment_tofs]
-    # smooth from connections?
-    for seed_index, seed in enumerate(
-        seed_candidates[seed_start: seed_end],
-        seed_start
-    ):
-        frag_start_idx = frag_start_idxs[seed]
-        frag_end_idx = frag_end_idxs[seed]
-        if frag_start_idx == frag_end_idx:
-            continue
-        y_hit_count = rough_match(
-            fragment_mzs,
-            y_mzs[frag_start_idx: frag_end_idx][::-1],
-            fragment_ppm,
-        )
-        b_hit_count = rough_match(
-            fragment_mzs,
-            b_mzs[frag_start_idx: frag_end_idx],
-            fragment_ppm,
-        )
-        hit_count = y_hit_count + b_hit_count
-        hit_counts[seed_index] = hit_count
-
-
-@alphatims.utils.pjit
-# @alphatims.utils.njit
-def annotate_seeds2(
-    push_index,
-    y_mzs,
-    b_mzs,
-    frag_start_idxs,
-    frag_end_idxs,
-    mz_values,
-    tof_indices,
-    push_indptr,
-    # max_indices,
-    # max_counts,
-    # max_precursor_mzs,
-    push_counts,
-    seed_candidates,
-    fragment_ppm,
-    # ppm_offset,
-    b_hit_counts,
-    y_hit_counts,
-    summed_b_hit_ints,
-    summed_y_hit_ints,
-    mean_b_hit_ints,
-    mean_y_hit_ints,
-    std_b_hit_ints,
-    std_y_hit_ints,
-    intensity_values,
-    dia_mz_cycle,
-    tof_tolerance,
-    scan_max_index,
-    tof_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    is_signal,
-):
-    seed_start = push_counts[push_index]
-    seed_end = push_counts[push_index + 1]
-    if seed_start == seed_end:
-        return
-    # push_start = push_indptr[push_index]
-    # push_end = push_indptr[push_index + 1]
-    # fragment_tofs = tof_indices[push_start: push_end]
-    fragment_tofs, fragment_intensities = alpharaw.smoothing.merge_pushes2(
-        push_index,
-        push_indptr,
-        tof_indices,
-        intensity_values,
-        dia_mz_cycle,
-        tof_tolerance,
-        scan_max_index,
-        tof_max_index,
-        zeroth_frame,
-        connection_counts,
-        connections,
-        cycle_tolerance,
-        is_signal,
-        # mz_values,
-    )
-    fragment_mzs = mz_values[fragment_tofs]
-    # smooth from connections?
-    for seed_index, seed in enumerate(
-        seed_candidates[seed_start: seed_end],
-        seed_start
-    ):
-        frag_start_idx = frag_start_idxs[seed]
-        frag_end_idx = frag_end_idxs[seed]
-        if frag_start_idx == frag_end_idx:
-            continue
-        b_hits = rough_match2(
-            fragment_mzs,
-            y_mzs[frag_start_idx: frag_end_idx][::-1],
-            fragment_ppm,
-        )
-        y_hits = rough_match2(
-            fragment_mzs,
-            b_mzs[frag_start_idx: frag_end_idx],
-            fragment_ppm,
-        )
-        b_hit_counts[seed_index] = len(b_hits)
-        y_hit_counts[seed_index] = len(y_hits)
-        if len(b_hits) > 0:
-            mean_b_hit_ints[seed_index] = sum(b_hits) / len(b_hits)
-            for i in b_hits:
-                std_b_hit_ints[seed_index] += (i - mean_b_hit_ints[seed_index])**2
-            std_b_hit_ints[seed_index] /= len(b_hits)
-        if len(y_hits) > 0:
-            mean_y_hit_ints[seed_index] = sum(y_hits) / len(y_hits)
-            for i in y_hits:
-                std_y_hit_ints[seed_index] += (i - mean_y_hit_ints[seed_index])**2
-            std_y_hit_ints[seed_index] /= len(y_hits)
-
-
-@alphatims.utils.njit(nogil=True)
-def rough_match2(
-    fragment_mzs,
-    database_mzs,
-    fragment_ppm,
-):
-    fragment_index = 0
-    database_index = 0
-    hits = []
-    while (fragment_index < len(fragment_mzs)) and (database_index < len(database_mzs)):
-        fragment_mz = fragment_mzs[fragment_index]
-        database_mz = database_mzs[database_index]
-        if fragment_mz < (database_mz / (1 + 10**-6 * fragment_ppm)):
-            fragment_index += 1
-        elif database_mz < (fragment_mz / (1 + 10**-6 * fragment_ppm)):
-            database_index += 1
-        else:
-            ppm = (fragment_mz - database_mz) / fragment_mz * 10**6
-            hits.append(ppm)
-            fragment_index += 1
-            database_index += 1
-    return hits
-
-
-@alphatims.utils.njit(nogil=True)
-def rough_match2_count_only(
-    fragment_mzs,
-    database_mzs,
-    fragment_ppm,
-):
-    fragment_index = 0
-    database_index = 0
-    hits = 0
-    while (fragment_index < len(fragment_mzs)) and (database_index < len(database_mzs)):
-        fragment_mz = fragment_mzs[fragment_index]
-        database_mz = database_mzs[database_index]
-        if fragment_mz < (database_mz / (1 + 10**-6 * fragment_ppm)):
-            fragment_index += 1
-        elif database_mz < (fragment_mz / (1 + 10**-6 * fragment_ppm)):
-            database_index += 1
-        else:
-            hits += 1
-            fragment_index += 1
-            database_index += 1
-    return hits
-
-
-@alphatims.utils.pjit
-def annotate(
-    index,
-    frag_start_idx,
-    frag_end_idx,
-    frag_indices,
-    indptr,
-    precursor_indices,
-    mz_values,
-    tof_indices,
-    fragment_ppm,
-    lower,
-    upper,
-    y_mzs,
-    b_mzs,
-    max_hit_counts,
-    max_db_indices,
-    min_size,
-):
-    start = indptr[index]
-    end = indptr[index + 1]
-    if (end - start) < min_size:
-        return
-    frags = frag_indices[start: end]
-    fragment_tofs = np.sort(tof_indices[frags])
-    fragment_mzs = mz_values[fragment_tofs]
-    for db_index in range(lower[index], upper[index]):
-        frag_start = frag_start_idx[db_index]
-        frag_end = frag_end_idx[db_index]
-        # b_hits = rough_match2(
-        y_hits = rough_match2_count_only(
-            fragment_mzs,
-            y_mzs[frag_start: frag_end][::-1],
-            fragment_ppm,
-        )
-        # y_hits = rough_match2(
-        b_hits = rough_match2_count_only(
-            fragment_mzs,
-            b_mzs[frag_start: frag_end],
-            fragment_ppm,
-        )
-        # hit_count = len(b_hits) + len(y_hits)
-        hit_count = b_hits + y_hits
-        # if hit_count == max_hit_counts[index]:
-        #     max_db_indices[index] = db_index
-        if hit_count > max_hit_counts[index]:
-            max_db_indices[index] = db_index
-            max_hit_counts[index] = hit_count
-
-
-@alphatims.utils.njit(nogil=True)
-def annotate_pool(
-    index,
-    frag_start_idx,
-    frag_end_idx,
-    frag_indices,
-    frag_frequencies,
-    indptr,
-    mz_values,
-    tof_indices,
-    fragment_ppm,
-    lower,
-    upper,
-    y_mzs,
-    b_mzs,
-    min_size,
-    min_hit_count,
-    top_n_hits,
-):
-    start = indptr[index]
-    end = indptr[index + 1]
-    results = [0][1:] # this defines the type
-    hit_counts = [0][1:] # this defines the type
-    frequency_counts = [0.0][1:] # this defines the type
-    if (end - start) < min_size:
-        return index, hit_counts, frequency_counts, results
-    if (end - start) < min_hit_count:
-        return index, hit_counts, frequency_counts, results
-    frags = frag_indices[start: end]
-    fragment_tofs = tof_indices[frags]
-    order = np.argsort(fragment_tofs)
-    frequencies = frag_frequencies[start: end][order]
-    fragment_tofs = fragment_tofs[order]
-    fragment_mzs = mz_values[fragment_tofs]
-    max_hit_count = min_hit_count
-    for db_index in range(lower[index], upper[index]):
-        frag_start = frag_start_idx[db_index]
-        frag_end = frag_end_idx[db_index]
-        y_hits, y_frequency = hit_and_frequency_count(
-            fragment_mzs,
-            frequencies,
-            y_mzs[frag_start: frag_end][::-1],
-            fragment_ppm,
-        )
-        b_hits, b_frequency = hit_and_frequency_count(
-            fragment_mzs,
-            frequencies,
-            b_mzs[frag_start: frag_end],
-            fragment_ppm,
-        )
-        hit_count = b_hits + y_hits
-        frequency_count = b_frequency + y_frequency
-        if top_n_hits == 1:
-            if frequency_count == max_hit_count:
-                results.append(db_index)
-                hit_counts.append(hit_count)
-                frequency_counts.append(frequency_count)
-            elif frequency_count > max_hit_count:
-                results = [db_index]
-                hit_counts = [hit_count]
-                frequency_counts = [frequency_count]
-                max_hit_count = hit_count
-        elif frequency_count >= min_hit_count:
-            if len(results) >= top_n_hits:
-                for min_index, freq_count in enumerate(frequency_counts):
-                    if freq_count == min_hit_count:
-                        results[min_index] = db_index
-                        hit_counts[min_index] = hit_count
-                        frequency_counts[min_index] = frequency_count
-                        break
-                min_hit_count = min(frequency_counts)
-            else:
-                results.append(db_index)
-                hit_counts.append(hit_count)
-                frequency_counts.append(frequency_count)
-    # return index, max_hit_count, results
-    return index, hit_counts, frequency_counts, results
-
-
-@alphatims.utils.njit(nogil=True)
-def hit_and_frequency_count(
-    fragment_mzs,
-    frequencies,
-    database_mzs,
-    fragment_ppm,
-):
-    fragment_index = 0
-    database_index = 0
-    hits = 0
-    summed_frequency = 0
-    while (fragment_index < len(fragment_mzs)) and (database_index < len(database_mzs)):
-        fragment_mz = fragment_mzs[fragment_index]
-        database_mz = database_mzs[database_index]
-        frequency = frequencies[fragment_index]
-        if fragment_mz < (database_mz / (1 + 10**-6 * fragment_ppm)):
-            fragment_index += 1
-        elif database_mz < (fragment_mz / (1 + 10**-6 * fragment_ppm)):
-            database_index += 1
-        else:
-            hits += 1
-            summed_frequency += frequency
-            fragment_index += 1
-            database_index += 1
-    return hits, summed_frequency
-
-
-@alphatims.utils.njit
-def trim_seeds(
-    push_counts,
-    seed_candidates,
-    hit_counts,
-    threshold=3,
-):
-    new_push_counts = np.empty_like(push_counts)
-    new_push_counts[0] = 0
-    total_sum = 0
-    for index, start in enumerate(push_counts[:-1]):
-        end = push_counts[index + 1]
-        total_sum += np.sum(hit_counts[start: end] >= threshold)
-        new_push_counts[index + 1] = total_sum
-    new_seed_candidates = seed_candidates[hit_counts >= threshold]
-    return new_push_counts, new_seed_candidates
-
-
-
-
-
-@alphatims.utils.njit(nogil=True)
-def annotate_pool2(
-    index,
-    frag_start_idx,
-    frag_end_idx,
-    frag_indices,
-    frag_frequencies,
-    indptr,
-    mz_values,
-    tof_indices,
-    fragment_ppm,
-    lower,
-    upper,
-    y_mzs,
-    b_mzs,
-    min_size,
-    min_hit_count,
-    top_n_hits,
-):
-    start = indptr[index]
-    end = indptr[index + 1]
-    results = [0][1:] # this defines the type
-    hit_counts = [0][1:] # this defines the type
-    frequency_counts = [0.0][1:] # this defines the type
-    if (end - start) < min_size:
-        return index, hit_counts, frequency_counts, results
-    if (end - start) < min_hit_count:
-        return index, hit_counts, frequency_counts, results
-    fragment_tofs = frag_indices[start: end]
-    order = np.argsort(fragment_tofs)
-    frequencies = frag_frequencies[start: end][order]
-    fragment_tofs = fragment_tofs[order]
-    fragment_mzs = mz_values[fragment_tofs]
-    max_hit_count = min_hit_count
-    for db_index in range(lower[index], upper[index]):
-        frag_start = frag_start_idx[db_index]
-        frag_end = frag_end_idx[db_index]
-        y_hits, y_frequency = hit_and_frequency_count(
-            fragment_mzs,
-            frequencies,
-            y_mzs[frag_start: frag_end][::-1],
-            fragment_ppm,
-        )
-        b_hits, b_frequency = hit_and_frequency_count(
-            fragment_mzs,
-            frequencies,
-            b_mzs[frag_start: frag_end],
-            fragment_ppm,
-        )
-        hit_count = b_hits + y_hits
-        frequency_count = b_frequency + y_frequency
-        if top_n_hits == 1:
-            if frequency_count == max_hit_count:
-                results.append(db_index)
-                hit_counts.append(hit_count)
-                frequency_counts.append(frequency_count)
-            elif frequency_count > max_hit_count:
-                results = [db_index]
-                hit_counts = [hit_count]
-                frequency_counts = [frequency_count]
-                max_hit_count = hit_count
-        elif frequency_count >= min_hit_count:
-            if len(results) >= top_n_hits:
-                for min_index, freq_count in enumerate(frequency_counts):
-                    if freq_count == min_hit_count:
-                        results[min_index] = db_index
-                        hit_counts[min_index] = hit_count
-                        frequency_counts[min_index] = frequency_count
-                        break
-                min_hit_count = min(frequency_counts)
-            else:
-                results.append(db_index)
-                hit_counts.append(hit_count)
-                frequency_counts.append(frequency_count)
-    # return index, max_hit_count, results
-    return index, hit_counts, frequency_counts, results
diff --git a/alphadia/preprocessing/__init__.py b/alphadia/preprocessing/__init__.py
deleted file mode 100644
index fe1f672b..00000000
--- a/alphadia/preprocessing/__init__.py
+++ /dev/null
@@ -1,186 +0,0 @@
-"""Preprocess dia data."""
-
-import logging
-
-import alphabase.io
-
-from . import connecting
-from . import smoothing
-from . import peakfinding
-from . import deisotoping
-from . import peakstats
-from . import msmsgeneration
-from . import calibration
-
-
-class Workflow:
-
-    def run_default(
-        self,
-    ):
-        self.set_connector()
-        self.set_smoother()
-        self.set_peak_collection()
-        self.set_peak_stats_calculator()
-        self.set_deisotoper()
-        self.set_msms_generator()
-
-    def set_dia_data(self, dia_data):
-        self.dia_data = dia_data
-
-    def set_connector(self):
-        connector = connecting.PushConnector(
-            self.dia_data,
-            # subcycle_tolerance=3,
-            # scan_tolerance=6,
-        )
-        self.connector = connector
-
-    def set_smoother(self):
-        self.smoother = smoothing.Smoother()
-        self.smoother.set_dia_data(self.dia_data)
-        self.smoother.set_connector(self.connector)
-        self.smoother.smooth()
-
-    def set_peak_collection(self):
-        self.peakfinder = peakfinding.PeakFinder()
-        self.peakfinder.set_dia_data(self.dia_data)
-        self.peakfinder.set_connector(self.connector)
-        self.peakfinder.set_smoother(self.smoother)
-        self.peakfinder.find_peaks()
-
-    def set_deisotoper(self):
-        self.deisotoper = deisotoping.Deisotoper()
-        self.deisotoper.set_dia_data(self.dia_data)
-        self.deisotoper.set_connector(self.connector)
-        self.deisotoper.set_peak_collection(self.peakfinder.peak_collection)
-        self.deisotoper.set_peak_stats_calculator(
-            self.peak_stats_calculator
-        )
-        self.deisotoper.deisotope()
-
-    def set_peak_stats_calculator(self):
-        self.peak_stats_calculator = peakstats.PeakStatsCalculator()
-        self.peak_stats_calculator.set_dia_data(self.dia_data)
-        self.peak_stats_calculator.set_peakfinder(self.peakfinder)
-        self.peak_stats_calculator.calculate_stats()
-
-    def set_msms_generator(self):
-        self.msms_generator = msmsgeneration.MSMSGenerator()
-        self.msms_generator.set_dia_data(self.dia_data)
-        self.msms_generator.set_connector(self.connector)
-        self.msms_generator.set_peak_collection(
-            self.peakfinder.peak_collection
-        )
-        self.msms_generator.set_deisotoper(self.deisotoper)
-        self.msms_generator.set_peak_stats_calculator(
-            self.peak_stats_calculator
-        )
-        self.msms_generator.create_msms_spectra()
-
-    def save_to_hdf(self, file_name=None):
-        if file_name is None:
-            file_name = f"{self.dia_data.bruker_d_folder_name[:-2]}_preprocess_workflow.hdf"
-        logging.info(f"Saving preprocessing workflow results to {file_name}.")
-        hdf = alphabase.io.hdf.HDF_File(
-            file_name,
-            read_only=False,
-            truncate=True,
-        )
-        hdf.connector = self._get_step_as_dict(
-            self.connector
-        )
-        hdf.smoother = self._get_step_as_dict(
-            self.smoother
-        )
-        hdf.peakfinder = self._get_step_as_dict(
-            self.peakfinder
-        )
-        hdf.peakfinder.peak_collection = self._get_step_as_dict(
-            self.peakfinder.peak_collection
-        )
-        hdf.deisotoper = self._get_step_as_dict(
-            self.deisotoper
-        )
-        hdf.peak_stats_calculator = self._get_step_as_dict(
-            self.peak_stats_calculator
-        )
-        hdf.msms_generator = self._get_step_as_dict(
-            self.msms_generator
-        )
-
-    def _get_step_as_dict(self, step):
-        skip_vals = [
-            "dia_data",
-            "connector",
-            "smoother",
-            "peak_collection",
-            "peakfinder",
-            "deisotoper",
-            "peak_stats_calculator",
-        ]
-        return {
-            key: val for (
-                key,
-                val
-            ) in step.__dict__.items() if key not in skip_vals
-        }
-
-    def load_from_hdf(self):
-        hdf = alphabase.io.hdf.HDF_File(
-            f"{self.dia_data.bruker_hdf_file_name[:-4]}_preprocess_workflow.hdf",
-            read_only=False,
-        )
-        self.connector = connecting.PushConnector(self.dia_data)
-        self.smoother = smoothing.Smoother()
-        self.peakfinder = peakfinding.PeakFinder()
-        self.deisotoper = deisotoping.Deisotoper()
-        self.peak_stats_calculator = peakstats.PeakStatsCalculator()
-        self.msms_generator = msmsgeneration.MSMSGenerator()
-        self.connector.__dict__ = self._load_from_hdf_dict(
-            hdf.connector
-        )
-        self.smoother.__dict__ = self._load_from_hdf_dict(
-            hdf.smoother
-        )
-        self.peakfinder.__dict__ = self._load_from_hdf_dict(
-            hdf.peakfinder
-        )
-        self.deisotoper.__dict__ = self._load_from_hdf_dict(
-            hdf.deisotoper
-        )
-        self.peak_stats_calculator.__dict__ = self._load_from_hdf_dict(
-            hdf.peak_stats_calculator
-        )
-        self.msms_generator.__dict__ = self._load_from_hdf_dict(
-            hdf.msms_generator
-        )
-        # self.connector.set_dia_data(self.dia_data)
-        self.smoother.set_dia_data(self.dia_data)
-        self.smoother.set_connector(self.connector)
-        self.peakfinder.set_dia_data(self.dia_data)
-        self.peakfinder.set_connector(self.connector)
-        self.peakfinder.set_smoother(self.smoother)
-        self.peakfinder.peak_collection = peakfinding.PeakCollection()
-        self.peakfinder.peak_collection.__dict__ = self._load_from_hdf_dict(
-            hdf.peakfinder.peak_collection
-        )
-        self.deisotoper.set_dia_data(self.dia_data)
-        self.deisotoper.set_connector(self.connector)
-        self.deisotoper.set_peak_collection(self.peakfinder.peak_collection)
-        self.peak_stats_calculator.set_dia_data(self.dia_data)
-        self.peak_stats_calculator.set_peakfinder(self.peakfinder)
-        self.msms_generator.set_dia_data(self.dia_data)
-        self.msms_generator.set_peak_collection(self.peakfinder.peak_collection)
-        self.msms_generator.set_deisotoper(self.deisotoper)
-        self.msms_generator.set_peak_stats_calculator(
-            self.peak_stats_calculator
-        )
-
-    def _load_from_hdf_dict(self, element):
-        select_dict = {}
-        for key, val in element.__dict__.items():
-            if isinstance(val, alphabase.io.hdf.HDF_Dataset):
-                val = val.mmap
-            select_dict[key] = val
-        return select_dict
diff --git a/alphadia/preprocessing/calibration.py b/alphadia/preprocessing/calibration.py
deleted file mode 100644
index 3428fc50..00000000
--- a/alphadia/preprocessing/calibration.py
+++ /dev/null
@@ -1,191 +0,0 @@
-"""Calibrating quadrupole settings"""
-
-
-import logging
-
-import numpy as np
-import alphatims.utils
-import numpy as np
-
-class QuadCalibrator:
-
-    def __init__(
-        self,
-        dia_data,
-    ):
-        self.dia_data = dia_data
-
-    def calculate_calibrated_cycle(self,):
-        logging.info("Calibrating quadrupole")
-        cycle = np.copy(self.dia_data.cycle).reshape(-1, 2)
-        summed_intensity = np.zeros(len(cycle))
-        estimate_isolation_window(
-            range(len(cycle)),
-            self.dia_data.intensity_values,
-            self.dia_data.tof_indices,
-            self.dia_data.push_indptr,
-            self.dia_data.zeroth_frame,
-            len(cycle),
-            self.dia_data.tof_max_index,
-            self.dia_data.scan_max_index,
-            self.dia_data.mz_values,
-            cycle,
-            summed_intensity,
-        )
-        cycle = cycle.reshape(self.dia_data.cycle.shape)
-        summed_intensity = summed_intensity.reshape(self.dia_data.cycle.shape[:-1])
-        self.summed_intensity = summed_intensity
-        self.cycle = cycle
-        self.predict_cycle()
-
-    def predict_cycle(self):
-        import sklearn.linear_model
-        predicted_cycle = np.copy(self.cycle)
-        frame_length = self.cycle.shape[2]
-        for subcycle_i, subcycle in enumerate(self.cycle):
-            for frame_i, frame in enumerate(subcycle):
-                model_lower = sklearn.linear_model.LinearRegression().fit(
-                    np.arange(frame_length).reshape(-1, 1),
-                    frame[:, 0].reshape(-1, 1),
-                    self.summed_intensity[subcycle_i, frame_i] + 1
-                )
-                model_upper = sklearn.linear_model.LinearRegression().fit(
-                    np.arange(frame_length).reshape(-1, 1),
-                    frame[:, 1].reshape(-1, 1),
-                    self.summed_intensity[subcycle_i, frame_i] + 1
-                )
-                predicted_frame_lower = model_lower.predict(
-                    np.arange(frame_length).reshape(-1, 1)
-                )
-                predicted_frame_upper = model_upper.predict(
-                    np.arange(frame_length).reshape(-1, 1)
-                )
-                predicted_cycle[subcycle_i, frame_i, :, 0] = predicted_frame_lower.ravel()
-                predicted_cycle[subcycle_i, frame_i, :, 1] = predicted_frame_upper.ravel()
-        self.predicted_cycle = predicted_cycle
-
-
-@alphatims.utils.pjit
-def estimate_isolation_window(
-    cyclic_push_index,
-    intensity_values,
-    tof_indices,
-    push_indptr,
-    zeroth_frame,
-    cycle_length,
-    tof_max_index,
-    scan_max_index,
-    mz_values,
-    cycle,
-    summed_intensity
-):
-    if cycle[cyclic_push_index, 0] <= 0:
-        return
-    intensity_buffer = merge_cyclic_pushes(
-        cyclic_push_index,
-        intensity_values,
-        tof_indices,
-        push_indptr,
-        zeroth_frame,
-        cycle_length,
-        tof_max_index,
-        scan_max_index
-    )
-    vals = np.cumsum(intensity_buffer)
-    total_intensity = vals[-1]
-    vals /= total_intensity
-    low_tof = np.searchsorted(vals, 0.25, "left")
-    high_tof = np.searchsorted(vals, 0.75, "right")
-    low_mz = mz_values[low_tof]
-    high_mz = mz_values[high_tof]
-    mz_width = high_mz - low_mz
-    mz_mean = (high_mz + low_mz) / 2
-    cycle[cyclic_push_index] = (mz_mean - mz_width, mz_mean + mz_width)
-    summed_intensity[cyclic_push_index] = total_intensity
-
-
-@alphatims.utils.njit
-def merge_cyclic_pushes(
-    cyclic_push_index,
-    intensity_values,
-    tof_indices,
-    push_indptr,
-    zeroth_frame,
-    cycle_length,
-    tof_max_index,
-    scan_max_index,
-):
-    offset = scan_max_index * zeroth_frame + cyclic_push_index
-    intensity_buffer = np.zeros(tof_max_index)
-    for push_index in range(offset, len(push_indptr) - 1, cycle_length):
-        start = push_indptr[push_index]
-        end = push_indptr[push_index + 1]
-        for index in range(start, end):
-            tof = tof_indices[index]
-            intensity = intensity_values[index]
-            intensity_buffer[tof] += intensity
-    return intensity_buffer
-
-
-class QuadWindowShape:
-
-    def __init__(
-        self,
-        file_name,#="/mnt/a54a8df1-78df-4788-bd29-6fca4115f5c0/software_development_data/synchroPASEF/quadCalibration.db",
-        quad_lines,#=slice(26, 30, 2),
-    ):
-        import sqlite3
-        import pandas as pd
-        with sqlite3.connect(
-            file_name
-        ) as sql_database_connection:
-            quad_cal_data = pd.read_sql_query(
-                "SELECT * from QuadCalibrationTable",
-                sql_database_connection
-            )
-            quad = quad_cal_data[quad_lines]
-        self.sp_mass = quad.SetpointMass.values
-        self.c1_params = get_slope_and_intercept_of_linear_function(
-            *self.sp_mass,
-            *quad.Center1.values
-        )
-        self.s1_params = get_slope_and_intercept_of_linear_function(
-            *self.sp_mass,
-            *quad.Sigma1.values
-        )
-        self.c2_params = get_slope_and_intercept_of_linear_function(
-            *self.sp_mass,
-            *quad.Center2.values
-        )
-        self.s2_params = get_slope_and_intercept_of_linear_function(
-            *self.sp_mass,
-            *quad.Sigma2.values
-        )
-
-
-    def get_efficiency_matrix(self, dia_data):
-        import math
-        cycle_center = np.mean(dia_data.cycle, axis=-1)
-        c1_params = self.c1_params
-        s1_params = self.s1_params
-        c2_params = self.c2_params
-        s2_params = self.s2_params
-        def get_transmission_matrix(mz):
-            center1 = mz * c1_params[0] + c1_params[1]
-            sigma1 = mz * s1_params[0] + s1_params[1]
-            center2 = mz * c2_params[0] + c2_params[1]
-            sigma2 = mz * s2_params[0] + s2_params[1]
-            out = (mz - cycle_center).copy().ravel()
-            for i, xi in enumerate(out):
-                arg1 = (xi - center1) / sigma1
-                arg2 = (center2  - xi) / sigma2
-                out[i] = 0.5 * (math.erf(arg1) + math.erf(arg2))
-            return out.reshape(cycle_center.shape)
-        return alphatims.utils.njit(get_transmission_matrix)
-
-
-@alphatims.utils.njit
-def get_slope_and_intercept_of_linear_function(x0, x1, y0, y1):
-    slope = (y1 - y0) / (x1 - x0)
-    intercept = y0 - x0 * slope
-    return slope, intercept
diff --git a/alphadia/preprocessing/connecting.py b/alphadia/preprocessing/connecting.py
deleted file mode 100644
index c4dd3b97..00000000
--- a/alphadia/preprocessing/connecting.py
+++ /dev/null
@@ -1,142 +0,0 @@
-"""Connect push indices from dia data."""
-
-import logging
-
-import numpy as np
-
-import alphatims.utils
-import alphatims.tempmmap as tm
-
-
-class PushConnector:
-
-    def __init__(
-        self,
-        dia_data,
-        subcycle_tolerance=3,
-        scan_tolerance=6,
-    ):
-        logging.info("Setting connections")
-        cycle = get_cycle(dia_data)
-        indptr, indices = get_connections(
-            cycle,
-            scan_tolerance=scan_tolerance,
-            subcycle_tolerance=subcycle_tolerance,
-        )
-        self.dia_data = dia_data
-        self.scan_tolerance = scan_tolerance
-        self.subcycle_tolerance = subcycle_tolerance
-        self.cycle = cycle
-        self.indptr = indptr
-        self.indices = indices
-        self.connection_counts = self.indptr
-        self.connections = self.indices
-
-
-def get_cycle(dia_data):
-    if hasattr(dia_data, "cycle"):
-        return dia_data.cycle
-    last_window_group = -1
-    for max_index, (frame, window_group) in enumerate(
-        zip(
-            dia_data.fragment_frames.Frame,
-            dia_data.fragment_frames.Precursor
-        )
-    ):
-        if window_group < last_window_group:
-            break
-        else:
-            last_window_group = window_group
-    frames = dia_data.fragment_frames.Frame[max_index-1]
-    frames += dia_data.fragment_frames.Frame[0] == int(dia_data.zeroth_frame)
-    sub_cycles = frames - len(np.unique(dia_data.fragment_frames.Frame[:max_index]))
-    cycle = np.zeros(
-        (
-            frames,
-            dia_data.scan_max_index,
-            2,
-        )
-    )
-    # cycle[:] = -1
-    precursor_frames = np.ones(frames, dtype=np.bool_)
-    for index, row in dia_data.fragment_frames[:max_index].iterrows():
-        frame = int(row.Frame - dia_data.zeroth_frame)
-        scan_begin = int(row.ScanNumBegin)
-        scan_end = int(row.ScanNumEnd)
-        low_mz = row.IsolationMz - row.IsolationWidth / 2
-        high_mz = row.IsolationMz + row.IsolationWidth / 2
-    #     print(low_mz, high_mz)
-        cycle[
-            frame,
-            scan_begin: scan_end,
-        ] = (low_mz, high_mz)
-        precursor_frames[frame] = False
-    cycle[precursor_frames] = (-1, -1)
-    cycle = cycle.reshape(
-        (
-            sub_cycles,
-            frames // sub_cycles,
-            *cycle.shape[1:]
-        )
-    )
-    return cycle
-
-
-def get_connections(
-    cycle,
-    scan_tolerance,
-    subcycle_tolerance,
-):
-    cycle_size = np.prod(cycle.shape[:-1])
-    max_subcycle_count = cycle.shape[0]
-    cycle_tolerance = int(np.ceil(subcycle_tolerance / max_subcycle_count))
-    pointer_cycle = np.arange(
-        cycle_size
-    ).reshape(cycle.shape[:-1])
-    indices = []
-    indptr = np.empty(np.prod(cycle.shape[:-1]) + 1, dtype=np.int64)
-    indptr[0] = 0
-    push_index = 0
-    hit_count = 0
-    for subcycle_index, subcycle in enumerate(cycle):
-        for frame_index, frame in enumerate(subcycle):
-            for scan_index, (low_mz, high_mz) in enumerate(frame):
-                low_scan_index = max(0, scan_index - scan_tolerance)
-                high_scan_index = scan_index + scan_tolerance + 1
-                if low_mz == -1:
-                    sub_selection = cycle[
-                        :, :, low_scan_index: high_scan_index, 0
-                    ] == -1
-                else:
-                    sub_selection = cycle[
-                        :, :, low_scan_index: high_scan_index, 0
-                    ] < high_mz
-                    sub_selection &= cycle[
-                        :, :, low_scan_index: high_scan_index, 1
-                    ] > low_mz
-                selected_pointers = pointer_cycle[
-                    :, :, low_scan_index: high_scan_index
-                ][sub_selection]
-                elements = []
-                for i in range(-cycle_tolerance, cycle_tolerance + 1):
-                    elements.append(selected_pointers + i * cycle_size)
-                elements = np.concatenate(elements)
-                subcycle_offsets = elements // np.prod(cycle.shape[1:-1]) - subcycle_index
-                left = np.searchsorted(
-                    subcycle_offsets,
-                    -subcycle_tolerance,
-                    side="left",
-                )
-                right = np.searchsorted(
-                    subcycle_offsets,
-                    subcycle_tolerance,
-                    side="right",
-                )
-                selected_elements = elements[left: right]
-                selected_offsets = selected_elements - push_index
-                indices.append(selected_offsets)
-                push_index += 1
-                hit_count += len(selected_offsets)
-                indptr[push_index] = hit_count
-    indices = np.concatenate(indices)
-    return indptr, indices
diff --git a/alphadia/preprocessing/deisotoping.py b/alphadia/preprocessing/deisotoping.py
deleted file mode 100644
index 02ebf1e4..00000000
--- a/alphadia/preprocessing/deisotoping.py
+++ /dev/null
@@ -1,212 +0,0 @@
-"""Deisotope peaks"""
-
-import logging
-
-import numpy as np
-
-import alphatims.utils
-import alphatims.tempmmap as tm
-import alphadia.preprocessing.peakstats
-
-
-class Deisotoper:
-
-    def __init__(
-        self,
-        isotope_mz_tolerance=0.01,
-        cycle_tolerance=3,
-        min_correlation=0.5,
-        proton_mass=1.007277,
-    ):
-        self.isotope_mz_tolerance = isotope_mz_tolerance
-        self.cycle_tolerance = cycle_tolerance
-        self.min_correlation = min_correlation
-        self.proton_mass = proton_mass
-
-    def set_dia_data(self, dia_data):
-        self.dia_data = dia_data
-
-    def set_connector(self, connector):
-        self.connector = connector
-
-    def set_peak_collection(self, peak_collection):
-        self.peak_collection = peak_collection
-
-    def set_peak_stats_calculator(self, peak_stats_calculator):
-        self.peak_stats_calculator = peak_stats_calculator
-
-    def deisotope(self):
-        logging.info("Determining mono isotopes")
-        logging.info("Charge 2")
-        self.mono_isotopes_charge2 = create_isotopic_pairs(
-            self,
-            difference=self.proton_mass/2,
-            mz_tolerance=self.isotope_mz_tolerance,
-            min_correlation=self.min_correlation
-        )
-        logging.info("Charge 3")
-        self.mono_isotopes_charge3 = create_isotopic_pairs(
-            self,
-            difference=self.proton_mass/3,
-            mz_tolerance=self.isotope_mz_tolerance,
-            min_correlation=self.min_correlation
-        )
-        self.mono_isotopes = np.unique(
-            np.concatenate(
-                [
-                    self.mono_isotopes_charge2,
-                    self.mono_isotopes_charge3,
-                ]
-            )
-        )
-
-
-def create_isotopic_pairs(
-    self,
-    difference,
-    mz_tolerance,
-    min_correlation,
-):
-    import multiprocessing
-
-    def starfunc(cycle_index):
-        return get_isotopic_pairs(
-            cycle_index,
-            self.peak_collection.indptr,
-            self.dia_data.mz_values[
-                self.dia_data.tof_indices[
-                    self.peak_collection.indices
-                ]
-            ],
-            mz_tolerance,
-            self.dia_data.scan_max_index,
-            self.dia_data.zeroth_frame,
-            self.connector.connection_counts,
-            self.connector.connections,
-            self.cycle_tolerance,
-            difference,
-        )
-
-    iterable = range(
-        len(self.dia_data.push_indptr) // np.prod(
-            self.connector.cycle.shape[:-1]
-        ) + 1
-    )
-    # iterable = range(500, 520)
-    self_connections = []
-    other_connections = []
-    with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-        for cycle_index, (
-            self_connection,
-            other_connection,
-        ) in alphatims.utils.progress_callback(
-            enumerate(pool.imap(starfunc, iterable)),
-            total=len(iterable),
-            include_progress_callback=True
-        ):
-            self_connections.append(self_connection)
-            other_connections.append(other_connection)
-    left_connection = np.concatenate(self_connections)
-    right_connection = np.concatenate(other_connections)
-    xic_correlations = np.empty(len(left_connection))
-    alphadia.preprocessing.peakstats.set_profile_correlations(
-        range(len(left_connection)),
-        # range(10),
-        # 0,
-        left_connection,
-        right_connection,
-        np.arange(len(left_connection) + 1),
-        xic_correlations,
-        self.peak_stats_calculator.xic_offset,
-        self.peak_stats_calculator.xics,
-        self.peak_stats_calculator.xic_indptr,
-    )
-    mobilogram_correlations = np.empty(len(left_connection))
-    alphadia.preprocessing.peakstats.set_profile_correlations(
-        range(len(left_connection)),
-        # range(10),
-        # 0,
-        left_connection,
-        right_connection,
-        np.arange(len(left_connection) + 1),
-        mobilogram_correlations,
-        self.peak_stats_calculator.mobilogram_offset,
-        self.peak_stats_calculator.mobilograms,
-        self.peak_stats_calculator.mobilogram_indptr,
-    )
-    correlation = xic_correlations * mobilogram_correlations
-    mono_isotopes_charge = tm.clone(
-        self.peak_collection.indices[
-            left_connection[correlation > min_correlation][
-                ~np.isin(
-                    left_connection[correlation > min_correlation], right_connection[correlation > min_correlation]
-                # ) & np.isin(
-                #     right_connection, left_connection
-                )
-            ]
-        ]
-    )
-    return mono_isotopes_charge
-
-
-# @alphatims.utils.pjit
-@alphatims.utils.njit(nogil=True)
-def get_isotopic_pairs(
-    cycle_index,
-    indptr,
-    mz_values,
-    mz_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    difference,
-    # peaks,
-):
-    len_cycle = len(connection_counts) - 1
-    push_offset = len_cycle * cycle_index + zeroth_frame * scan_max_index
-    self_connections = []
-    other_connections = []
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        if True:
-            for other_connection_offset in connections[connection_start: connection_end]:
-                other_push_index = self_push_index + other_connection_offset
-                # if other_push_index == self_push_index:
-                #     continue
-                if not (0 <= other_push_index < len(indptr)):
-                    continue
-        # for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-        #     for other_connection_index in connections[connection_start: connection_end]:
-        #         other_push_index = push_offset + other_connection_index + len_cycle * cycle_offset
-        #         if other_push_index >= len(indptr):
-        #             continue
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                while (self_index < self_end) and (other_index < other_end):
-                    # if self_index == other_index:
-                    #     self_index += 1
-                    self_tof = mz_values[self_index]
-                    other_tof = mz_values[other_index]
-                    if (self_tof - mz_tolerance) <= (other_tof - difference) <= (self_tof + mz_tolerance):
-                        self_connections.append(self_index)
-                        other_connections.append(other_index)
-                    if self_tof < (other_tof - difference - mz_tolerance):
-                        self_index += 1
-                    else:
-                        other_index += 1
-    return np.array(self_connections), np.array(other_connections)
diff --git a/alphadia/preprocessing/msmsgeneration.py b/alphadia/preprocessing/msmsgeneration.py
deleted file mode 100644
index ed24c40f..00000000
--- a/alphadia/preprocessing/msmsgeneration.py
+++ /dev/null
@@ -1,394 +0,0 @@
-"""Create MSMS spectra."""
-
-import logging
-
-import numpy as np
-import pandas as pd
-
-import alphatims.utils
-import alphatims.tempmmap as tm
-import alphabase.io.hdf
-import alphadia.preprocessing.peakstats
-
-
-class MSMSGenerator:
-
-    def __init__(
-        self,
-        scan_tolerance=6,
-        subcycle_tolerance=3,
-        cycle_sigma=3,
-        scan_sigma=6,
-    ):
-        self.scan_tolerance = scan_tolerance
-        self.subcycle_tolerance = subcycle_tolerance
-        self.cycle_sigma = cycle_sigma
-        self.scan_sigma = scan_sigma
-
-    def set_dia_data(self, dia_data):
-        self.dia_data = dia_data
-
-    def set_peak_collection(self, peak_collection):
-        self.peak_collection = peak_collection
-
-    def set_connector(self, connector):
-        self.connector = connector
-
-    def set_deisotoper(self, deisotoper):
-        self.deisotoper = deisotoper
-
-    def set_peak_stats_calculator(self, peak_stats_calculator):
-        self.peak_stats_calculator = peak_stats_calculator
-
-    def create_msms_spectra(self):
-        import multiprocessing
-        logging.info("Creating MSMS spectra")
-
-        def starfunc(cycle_index):
-            return create_precursor_centric_ion_network(
-                cycle_index,
-                self.peak_collection.indices,
-                self.peak_collection.indptr,
-                self.dia_data.zeroth_frame,
-                self.dia_data.scan_max_index,
-                self.scan_tolerance,
-                self.subcycle_tolerance,
-                self.connector.cycle,
-                self.dia_data.mz_values,
-                self.dia_data.tof_indices,
-                np.isin(self.peak_collection.indices, self.deisotoper.mono_isotopes)
-            )
-
-        precursor_indices = []
-        precursor_counts = [[0]]
-        fragment_indices = []
-
-        iterable = range(
-            len(self.dia_data.push_indptr) // np.prod(self.dia_data.cycle.shape[:-1]) + 1
-        )
-
-        with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-            for (
-                precursor_indices_,
-                precursor_counts_,
-                fragment_indices_,
-            ) in alphatims.utils.progress_callback(
-                pool.imap(starfunc, iterable),
-                total=len(iterable),
-                include_progress_callback=True
-            ):
-                precursor_indices.append(precursor_indices_)
-                precursor_counts.append(precursor_counts_)
-                fragment_indices.append(fragment_indices_)
-
-        precursor_indices = np.concatenate(precursor_indices)
-        precursor_counts = np.cumsum(np.concatenate(precursor_counts))
-        fragment_indices = np.concatenate(fragment_indices)
-        self.precursor_indices = tm.clone(precursor_indices)
-        self.precursor_indptr = tm.clone(precursor_counts)
-        self.fragment_indices = tm.clone(fragment_indices)
-        self.set_fragment_apex_distances()
-        self.set_fragment_profile_distances()
-        self.set_fragment_quad_overlaps()
-        self.fragment_frequencies = self.mobilogram_correlations * self.xic_correlations * self.quad_overlaps
-
-    def set_fragment_apex_distances(self):
-        logging.info("Setting fragment-precursor apex distances")
-        dia_data = self.dia_data
-        fdf = pd.DataFrame(
-            dia_data.convert_from_indices(
-                self.fragment_indices,
-                return_scan_indices=True,
-                return_push_indices=True,
-            )
-        )
-        pdf = pd.DataFrame(
-            dia_data.convert_from_indices(
-                np.repeat(
-                    self.precursor_indices,
-                    np.diff(self.precursor_indptr)
-                ),
-                return_scan_indices=True,
-                return_push_indices=True,
-            )
-        )
-        pdf["cycle"] = (pdf.push_indices - dia_data.zeroth_frame * dia_data.scan_max_index) // np.prod(dia_data.cycle.shape[:-1])
-        fdf["cycle"] = (fdf.push_indices - dia_data.zeroth_frame * dia_data.scan_max_index) // np.prod(dia_data.cycle.shape[:-1])
-        self.apex_distances = (
-            np.exp(
-                -((pdf.scan_indices - fdf.scan_indices) / self.scan_sigma)**2 / 2
-            ) * np.exp(
-                -((pdf.cycle - fdf.cycle) / self.cycle_sigma)**2 / 2
-            )
-        ).values
-        self.fragment_frequencies = self.apex_distances
-
-    def set_fragment_profile_distances(self):
-        logging.info("Setting fragment-precursor correlations")
-        self.xic_correlations = np.empty(len(self.fragment_indices))
-        self.mobilogram_correlations = np.empty(len(self.fragment_indices))
-        fragment_peak_indices = np.searchsorted(
-            self.peak_collection.indices,
-            self.fragment_indices,
-        )
-        precursor_peak_indices = np.searchsorted(
-            self.peak_collection.indices,
-            self.precursor_indices,
-        )
-        alphadia.preprocessing.peakstats.set_profile_correlations(
-            range(len(precursor_peak_indices)),
-            # range(10),
-            # 0,
-            fragment_peak_indices,
-            precursor_peak_indices,
-            self.precursor_indptr,
-            self.xic_correlations,
-            self.peak_stats_calculator.xic_offset,
-            self.peak_stats_calculator.xics,
-            self.peak_stats_calculator.xic_indptr,
-        )
-        alphadia.preprocessing.peakstats.set_profile_correlations(
-            range(len(precursor_peak_indices)),
-            # range(10),
-            # 0,
-            fragment_peak_indices,
-            precursor_peak_indices,
-            self.precursor_indptr,
-            self.mobilogram_correlations,
-            self.peak_stats_calculator.mobilogram_offset,
-            self.peak_stats_calculator.mobilograms,
-            self.peak_stats_calculator.mobilogram_indptr,
-        )
-
-    def set_fragment_quad_overlaps(self):
-        logging.info("Setting fragment-precursor quad overlaps")
-        self.quad_overlaps = np.empty_like(
-            self.apex_distances
-        )
-        calculate_quad_overlap(
-            range(len(self.precursor_indices)),
-            self.precursor_indices,
-            self.precursor_indptr,
-            self.fragment_indices,
-            self.dia_data.mz_values,
-            self.dia_data.tof_indices,
-            self.dia_data.push_indptr,
-            self.dia_data.zeroth_frame,
-            self.dia_data.cycle,
-            self.peak_stats_calculator.peakfinder.cluster_assemblies,
-            self.dia_data.intensity_values.astype(np.float32),
-            self.quad_overlaps,
-        )
-
-    def get_ms1_df(self):
-        logging.info("Creating MS1 dataframe")
-        precursor_index_mask = np.isin(
-            self.peak_collection.indices,
-            self.precursor_indices
-        )
-        ms1_df = self.peak_stats_calculator.as_dataframe(
-            precursor_index_mask,
-            append_apices=True
-        )
-        ms1_df['charge'] = np.array([2, 3])[
-            np.isin(
-                self.precursor_indices,
-                self.deisotoper.mono_isotopes_charge3
-            ).astype(np.int)
-        ]
-        ms1_df['fragment_start'] = self.precursor_indptr[:-1]
-        ms1_df['fragment_end'] = self.precursor_indptr[1:]
-        return ms1_df
-
-    def get_ms2_df(self):
-        logging.info("Creating MS2 dataframe")
-        fragment_order = np.arange(len(self.fragment_indices))
-        sort_ms2_fragments_by_tof_indices(
-            range(len(self.precursor_indices)),
-            self.fragment_indices,
-            self.precursor_indptr,
-            self.dia_data.tof_indices,
-            fragment_order,
-        )
-        fragment_indices = np.searchsorted(
-            self.peak_collection.indices,
-            self.fragment_indices[fragment_order],
-        )
-        ms2_df = self.peak_stats_calculator.as_dataframe(
-            fragment_indices,
-            append_apices=True
-        )
-        # temp = np.empty(len(ms2_df))
-        ms2_df["apex_correlation"] = self.apex_distances[
-            fragment_order
-        ]
-        ms2_df["xic_correlations"] = self.xic_correlations[
-            fragment_order
-        ]
-        ms2_df["mobilogram_correlations"] = self.mobilogram_correlations[
-            fragment_order
-        ]
-        ms2_df["quad_correlation"] = self.quad_overlaps[
-            fragment_order
-        ]
-        return ms2_df
-
-    def write_to_hdf_file(self, file_name=None):
-        if file_name is None:
-            import os
-            file_name = os.path.join(
-                self.dia_data.bruker_d_folder_name,
-                "pseudo_spectra.hdf"
-            )
-        ms1_df = self.get_ms1_df()
-        ms2_df = self.get_ms2_df()
-        hdf = alphabase.io.hdf.HDF_File(
-            file_name,
-            read_only=False,
-            truncate=True,
-        )
-        hdf.precursors = ms1_df
-        hdf.fragments = ms2_df
-        return hdf
-
-
-@alphatims.utils.njit(nogil=True)
-def create_precursor_centric_ion_network(
-    cycle_index,
-    indices,
-    indptr,
-    zeroth_frame,
-    scan_max_index,
-    scan_tolerance,
-    subcycle_tolerance,
-    mz_windows,
-    mz_values,
-    tof_indices,
-    is_mono,
-):
-    subcycle_count = mz_windows.shape[0]
-    frame_count = mz_windows.shape[1]
-    cycle_length = subcycle_count * frame_count * scan_max_index
-    push_offset = cycle_length * cycle_index + zeroth_frame * scan_max_index
-    precursor_indices = []
-    precursor_count = []
-    fragment_indices = []
-    for self_push_offset in np.flatnonzero(mz_windows[..., 0] == -1):
-        self_push_index = push_offset + self_push_offset
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        self_scan = self_push_offset % scan_max_index
-        self_frame = (self_push_offset // scan_max_index) % subcycle_count
-        for precursor_index_ in range(self_start, self_end):
-            if not is_mono[precursor_index_]:
-                continue
-            precursor_index = indices[precursor_index_]
-            precursor_mz = mz_values[tof_indices[precursor_index]]
-            hits = 0
-            for sub_cycle_offset in range(-subcycle_tolerance, subcycle_tolerance + 1):
-                for frame_offset in range(-self_frame, frame_count - self_frame):
-                    for scan_offset in range(-scan_tolerance, scan_tolerance + 1):
-                        other_scan = self_scan + scan_offset
-                        if not (0 <= other_scan < scan_max_index):
-                            continue
-
-                        other_push_index = self_push_index
-                        other_push_index += scan_offset
-                        other_push_index += frame_offset * scan_max_index
-                        other_push_index += sub_cycle_offset * frame_count * scan_max_index
-
-                        if not (0 <= other_push_index < len(indptr)):
-                            continue
-                        low_mz, high_mz = mz_windows.reshape((-1, 2))[
-                            (other_push_index - zeroth_frame * scan_max_index) % cycle_length
-                        ]
-                        if not (low_mz <= precursor_mz < high_mz):
-                            continue
-                        other_start = indptr[other_push_index]
-                        other_end = indptr[other_push_index + 1]
-                        for fragment_index_ in range(other_start, other_end):
-                            fragment_index = indices[fragment_index_]
-                            fragment_indices.append(fragment_index)
-                            hits += 1
-            if hits > 0:
-                precursor_indices.append(precursor_index)
-                precursor_count.append(hits)
-    return (
-        np.array(precursor_indices),
-        np.array(precursor_count),
-        np.array(fragment_indices),
-    )
-
-
-@alphatims.utils.pjit
-def sort_ms2_fragments_by_tof_indices(
-    index,
-    fragment_indices,
-    indptr,
-    tof_indices,
-    fragment_order
-):
-    start = indptr[index]
-    end = indptr[index + 1]
-    selected_fragment_indices = fragment_indices[start:end]
-    selected_tof_indices = tof_indices[selected_fragment_indices]
-    order = np.argsort(selected_tof_indices)
-    fragment_order[start:end] = fragment_order[start:end][order]
-
-
-@alphatims.utils.pjit
-def calculate_quad_overlap(
-    index,
-    precursor_indices,
-    precursor_indptr,
-    fragment_indices,
-    mz_values,
-    tof_indices,
-    push_indptr,
-    zeroth_frame,
-    cycle,
-    cluster_assemblies,
-    intensity_values,
-    quad_overlaps,
-):
-    precursor_index = precursor_indices[index]
-    precursor_mz = mz_values[tof_indices[precursor_index]]
-    precursor_start = precursor_indptr[index]
-    precursor_end = precursor_indptr[index + 1]
-    for f_index in range(precursor_start, precursor_end):
-        fragment_index = fragment_indices[f_index]
-        raw_indices = alphadia.preprocessing.peakstats.get_ions(
-            fragment_index,
-            cluster_assemblies,
-        )
-        push_indices = np.searchsorted(
-            push_indptr,
-            raw_indices,
-            "right"
-        ) - 1
-        cycle_offsets = (push_indices - zeroth_frame * cycle.shape[-2]) % (cycle.size // 2)
-        quads = cycle.reshape(-1,2)[cycle_offsets]
-        # selected = quads[:,0] <= precursor_mz
-        # selected &= precursor_mz <= quads[:,1]
-        # result = np.sum(selected) / len(selected)
-        # quad_overlaps[f_index] = result
-        mzs = np.empty(len(quads) * 2)
-        mzs[:len(quads)] = quads[:,0]
-        mzs[len(quads):] = quads[:,1]
-        ints = np.empty(len(quads) * 2)
-        ints[:len(quads)] = intensity_values[raw_indices]
-        ints[len(quads):] = -intensity_values[raw_indices]
-        order = np.argsort(mzs)
-        selected = np.zeros(len(mzs), dtype=np.bool_)
-        mzs = mzs[order]
-        if len(selected) > 0:
-            selected[0] = True
-            selected[np.flatnonzero(mzs[:-1] != mzs[1:]) + 1] = True
-        mzs = mzs[selected]
-        ints = np.cumsum(ints[order])[selected]
-        ints /= np.max(ints)
-        location = np.searchsorted(mzs, precursor_mz)
-        probability = ints[location]
-        quad_overlaps[f_index] = probability
diff --git a/alphadia/preprocessing/peakfinding.py b/alphadia/preprocessing/peakfinding.py
deleted file mode 100644
index 5fbf4a1d..00000000
--- a/alphadia/preprocessing/peakfinding.py
+++ /dev/null
@@ -1,363 +0,0 @@
-"""Find peaks in dia data."""
-
-import logging
-
-import numpy as np
-
-import alphatims.utils
-import alphatims.tempmmap as tm
-
-
-class PeakFinder:
-
-    def __init__(
-        self,
-        tof_tolerance=3,
-        cycle_tolerance=3,
-    ):
-        self.tof_tolerance = tof_tolerance
-        self.cycle_tolerance = cycle_tolerance
-
-    def set_dia_data(self, dia_data):
-        self.dia_data = dia_data
-
-    def set_connector(self, connector):
-        self.connector = connector
-
-    def set_smoother(self, smoother):
-        self.smoother = smoother
-
-    def find_peaks(self):
-        self.assign_internal_points()
-        self.find_cluster_paths()
-        self.cluster_from_paths()
-        self.find_ambiguous_cluster_overlaps()
-        self.assemble_clusters()
-        self.assign_quantifiable_clusters()
-        self.peak_collection = PeakCollection()
-        self.peak_collection.set_peak_indptr(
-            self.dia_data.push_indptr,
-            self.peaks,
-        )
-
-    def assign_internal_points(self):
-        logging.info("Assigning internal points")
-        self.internal_points = tm.empty(
-            shape=self.smoother.smooth_intensity_values.shape,
-            dtype=np.bool_
-        )
-        self.valid_neighborhood = np.ones(2**8, dtype=np.bool_)
-        for index in range(2**8):
-            bin_repr = '{:08b}'.format(index)
-            if "00" in bin_repr:
-                self.valid_neighborhood[index] = False
-            if (index < 2**7) and (index % 2 == 0):
-                self.valid_neighborhood[index] = False
-        self.internal_points[:] = self.valid_neighborhood[self.smoother.neighbor_types]
-
-    def find_cluster_paths(self):
-        logging.info("Finding cluster paths")
-        self.cluster_path_pointers = tm.clone(np.arange(len(self.dia_data)))
-        cluster_to_max_peaks_(
-            range(
-                len(self.dia_data.push_indptr) // np.prod(
-                    self.connector.cycle.shape[:-1]
-                ) + 1
-            ),
-            self.dia_data.push_indptr,
-            self.dia_data.tof_indices,
-            self.smoother.smooth_intensity_values,
-            self.tof_tolerance,
-            self.dia_data.scan_max_index,
-            self.dia_data.zeroth_frame,
-            self.connector.connection_counts,
-            self.connector.connections,
-            self.cycle_tolerance,
-            self.cluster_path_pointers,
-        )
-
-    def cluster_from_paths(self):
-        logging.info("Clustering from paths")
-        self.cluster_pointers = tm.clone(self.cluster_path_pointers)
-        walk_cluster_path(np.arange(10))
-        walk_cluster_path(self.cluster_pointers)
-
-    def find_ambiguous_cluster_overlaps(self):
-        logging.info("Detecting cluster ambiguities")
-        self.nonambiguous_ions = tm.ones(len(self.dia_data), dtype=np.bool_)
-        find_unique_peaks_(
-            range(
-                len(self.dia_data.push_indptr) // np.prod(
-                    self.connector.cycle.shape[:-1]
-                ) + 1
-            ),
-            self.dia_data.push_indptr,
-            self.dia_data.tof_indices,
-            self.smoother.smooth_intensity_values,
-            self.tof_tolerance,
-            self.dia_data.scan_max_index,
-            self.dia_data.zeroth_frame,
-            self.connector.connection_counts,
-            self.connector.connections,
-            self.cycle_tolerance,
-            self.cluster_pointers,
-            self.nonambiguous_ions,
-        )
-        logging.info("Removing cluster ambiguities")
-        walk_unique_cluster_path(
-            np.arange(10),
-            np.zeros(10, dtype=np.bool_),
-            np.ones(10, dtype=np.bool_),
-        )
-        to_visit = np.ones_like(self.nonambiguous_ions)
-        walk_unique_cluster_path(
-            self.cluster_path_pointers,
-            self.nonambiguous_ions,
-            to_visit,
-        )
-
-    def assemble_clusters(self):
-        logging.info("Assembling clusters")
-        self.cluster_assemblies = tm.clone(self.cluster_pointers)
-        assemble_clusters(
-            self.cluster_pointers,
-            self.nonambiguous_ions,
-            self.cluster_assemblies,
-        )
-
-    def assign_quantifiable_clusters(self):
-        logging.info("Assigning quantifiable clusters")
-        unique_peaks = np.unique(self.cluster_pointers)
-        self.peaks = unique_peaks[
-            (self.nonambiguous_ions & self.internal_points)[unique_peaks]
-        ]
-
-
-class PeakCollection(object):
-
-    def set_peak_indptr(
-        self,
-        indptr: np.ndarray = None,
-        peaks: np.ndarray = None,
-    ):
-        if peaks is None:
-            return
-        if peaks.dtype == np.bool_:
-            self.indices = tm.clone(np.flatnonzero(peaks))
-        else:
-            self.indices = tm.clone(peaks)
-        self.indptr = tm.empty(indptr.shape, indptr.dtype)
-        set_peak_indptr(indptr, self.indptr, self.indices)
-
-
-@alphatims.utils.njit
-def set_peak_indptr(old_indptr, new_indptr, indices):
-    count = 0
-    offset = 0
-    for index in indices:
-        while index >= old_indptr[offset]:
-            new_indptr[offset] = count
-            offset += 1
-        count += 1
-    while index >= old_indptr[offset]:
-        new_indptr[offset] = count
-        offset += 1
-    new_indptr[offset:] = count
-
-
-
-
-@alphatims.utils.pjit
-def cluster_to_max_peaks_(
-    cycle_index,
-    indptr,
-    tof_indices,
-    smooth_intensity_values,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    clusters,
-):
-    len_cycle = len(connection_counts) - 1
-    push_offset = len_cycle * cycle_index + zeroth_frame * scan_max_index
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        if True:
-            for other_connection_offset in connections[connection_start: connection_end]:
-                other_push_index = self_push_index + other_connection_offset
-                if other_push_index == self_push_index:
-                    continue
-                if not (0 <= other_push_index < len(indptr)):
-                    continue
-        # for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-        #     for other_connection_index in connections[connection_start: connection_end]:
-        #         other_push_index = push_offset + other_connection_index + len_cycle * cycle_offset
-        #         if other_push_index == self_push_index:
-        #             continue
-        #         if other_push_index >= len(indptr):
-        #             continue
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                while (self_index < self_end) and (other_index < other_end):
-                    self_tof = tof_indices[self_index]
-                    other_tof = tof_indices[other_index]
-                    if (self_tof - tof_tolerance) <= other_tof <= (self_tof + tof_tolerance):
-                        self_ref = clusters[self_index]
-                        max_intensity = smooth_intensity_values[self_ref]
-                        other_intensity = smooth_intensity_values[other_index]
-                        if max_intensity < other_intensity:
-                            clusters[self_index] = other_index
-                        elif max_intensity == other_intensity:
-                            if self_index <= other_index:
-                                clusters[self_index] = other_index
-                    if self_tof < other_tof:
-                        self_index += 1
-                    else:
-                        other_index += 1
-
-
-@alphatims.utils.njit
-def walk_cluster_path(
-    clusters
-):
-    for index, pointer in enumerate(clusters):
-        initial_index = index
-        path_length = 1
-        while (pointer >= 0) and (index != pointer):
-            index = pointer
-            pointer = clusters[index]
-            path_length += 1
-        if pointer >= 0:
-            final_pointer = -(pointer + 1)
-        else:
-            final_pointer = pointer
-        index = initial_index
-        for i in range(path_length):
-            pointer = clusters[index]
-            clusters[index] = final_pointer
-            index = pointer
-    for index, pointer in enumerate(clusters):
-        clusters[index] = -(pointer + 1)
-
-
-@alphatims.utils.pjit
-def find_unique_peaks_(
-    cycle_index,
-    indptr,
-    tof_indices,
-    smooth_intensity_values,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    clusters,
-    unique_peaks,
-):
-    len_cycle = len(connection_counts) - 1
-    push_offset = len_cycle * cycle_index + zeroth_frame * scan_max_index
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        if True:
-            for other_connection_offset in connections[connection_start: connection_end]:
-                other_push_index = self_push_index + other_connection_offset
-                if other_push_index == self_push_index:
-                    continue
-                if not (0 <= other_push_index < len(indptr)):
-                    continue
-        # for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-        #     for other_connection_index in connections[connection_start: connection_end]:
-        #         other_push_index = push_offset + other_connection_index + len_cycle * cycle_offset
-        #         if other_push_index <= self_push_index:
-        #             continue
-        #         if other_push_index >= len(indptr):
-        #             continue
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                while (self_index < self_end) and (other_index < other_end):
-                    self_tof = tof_indices[self_index]
-                    other_tof = tof_indices[other_index]
-                    if (self_tof - tof_tolerance) <= other_tof <= (self_tof + tof_tolerance):
-                        self_intensity = smooth_intensity_values[self_index]
-                        other_intensity = smooth_intensity_values[other_index]
-                        if self_intensity <= other_intensity:
-                            if clusters[self_index] != clusters[other_index]:
-                                unique_peaks[self_index] = False
-                        if self_intensity >= other_intensity:
-                            if clusters[self_index] != clusters[other_index]:
-                                unique_peaks[other_index] = False
-                    if self_tof < other_tof:
-                        self_index += 1
-                    else:
-                        other_index += 1
-
-
-@alphatims.utils.njit
-def walk_unique_cluster_path(
-    cluster_pointers,
-    nonambiguous_elements,
-    to_visit
-):
-    for index, nonambiguous in enumerate(nonambiguous_elements):
-        initial_index = index
-        path_length = 0
-        while nonambiguous:
-            path_length += 1
-            if not to_visit[index]:
-                break
-            else:
-                to_visit[index] = False
-            pointer = cluster_pointers[index]
-            if index == pointer:
-                break
-            index = pointer
-            nonambiguous = nonambiguous_elements[index]
-        if not nonambiguous:
-            index = initial_index
-            for i in range(path_length):
-                nonambiguous_elements[index] = False
-                index = cluster_pointers[index]
-
-
-@alphatims.utils.njit
-def assemble_clusters(
-    cluster_pointers,
-    nonambiguous_ions,
-    cluster_assemblies,
-):
-    for index, pointer in enumerate(cluster_pointers):
-        if nonambiguous_ions[index]:
-            if index != pointer:
-                secondary_pointer = cluster_assemblies[pointer]
-                cluster_assemblies[index] = secondary_pointer
-                cluster_assemblies[pointer] = index
diff --git a/alphadia/preprocessing/peakstats.py b/alphadia/preprocessing/peakstats.py
deleted file mode 100644
index 3e1d12c2..00000000
--- a/alphadia/preprocessing/peakstats.py
+++ /dev/null
@@ -1,636 +0,0 @@
-"""Calculate peak stats."""
-
-import logging
-
-import numpy as np
-import pandas as pd
-
-import alphatims.utils
-import alphatims.tempmmap as tm
-
-
-class PeakStatsCalculator:
-
-    def set_dia_data(self, dia_data):
-        self.dia_data = dia_data
-
-    def set_peakfinder(self, peakfinder):
-        self.peakfinder = peakfinder
-
-    def calculate_stats(self):
-        logging.info("Calculating peak stats")
-        import multiprocessing
-        import multiprocessing.pool
-
-        iterable = range(len(self.peakfinder.peak_collection.indices))
-        # iterable = range(len(self.peakfinder.peak_collection.indices) // 10)
-        # iterable = range(4585132, 4585132+1)
-
-        self.cycle_rt_values = self.dia_data.rt_values[
-            int(self.dia_data.zeroth_frame)::np.prod(self.dia_data.cycle.shape[:-1])//self.dia_data.scan_max_index
-        ]
-
-        self.number_of_ions = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=np.int32
-        )
-
-        self.summed_intensity_values = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=np.float64
-        )
-
-        xic_indptr = tm.empty(
-            len(self.peakfinder.peak_collection.indices) + 1,
-            dtype=np.int64
-        )
-        xic_indptr[0] = 0
-        self.xic_offset = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=np.int32
-        )
-        self.rt_average = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=self.dia_data.rt_values.dtype
-        )
-        self.rt_start = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=self.dia_data.rt_values.dtype
-        )
-        self.rt_end = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=self.dia_data.rt_values.dtype
-        )
-
-        mobilogram_indptr = tm.empty(
-            len(self.peakfinder.peak_collection.indices) + 1,
-            dtype=np.int64
-        )
-        mobilogram_indptr[0] = 0
-        self.mobilogram_offset = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=np.int32
-        )
-        self.mobility_average = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=self.dia_data.mobility_values.dtype
-        )
-        self.mobility_start = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=self.dia_data.mobility_values.dtype
-        )
-        self.mobility_end = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=self.dia_data.mobility_values.dtype
-        )
-
-        mz_profile_indptr = tm.empty(
-            len(self.peakfinder.peak_collection.indices) + 1,
-            dtype=np.int64
-        )
-        mz_profile_indptr[0] = 0
-        self.mz_profile_offset = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=np.int32
-        )
-        self.mz_average = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=self.dia_data.mz_values.dtype
-        )
-        self.mz_start = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=self.dia_data.mz_values.dtype
-        )
-        self.mz_end = tm.empty(
-            self.peakfinder.peak_collection.indices.shape,
-            dtype=self.dia_data.mz_values.dtype
-        )
-
-        count = np.prod(self.dia_data.cycle.shape[:-1])
-
-        # @alphatims.utils.njit(nogil=True)
-        def starfunc(index):
-            return calculate_stats(
-                index,
-                self.peakfinder.peak_collection.indices,
-                self.dia_data.push_indptr,
-                self.dia_data.tof_indices,
-                self.dia_data.scan_max_index,
-                self.dia_data.zeroth_frame,
-                self.dia_data.intensity_values,
-                self.dia_data.mz_values,
-                self.dia_data.rt_values,
-                self.cycle_rt_values,
-                self.dia_data.mobility_values,
-                self.peakfinder.cluster_assemblies,
-                count,
-                self.number_of_ions,
-                self.summed_intensity_values,
-                xic_indptr,
-                self.xic_offset,
-                self.rt_average,
-                self.rt_start,
-                self.rt_end,
-                mobilogram_indptr,
-                self.mobilogram_offset,
-                self.mobility_average,
-                self.mobility_start,
-                self.mobility_end,
-                mz_profile_indptr,
-                self.mz_profile_offset,
-                self.mz_average,
-                self.mz_start,
-                self.mz_end,
-            )
-
-        xics = []
-        mobilograms = []
-        mz_profiles = []
-        with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-            for index in alphatims.utils.progress_callback(iterable):
-                (
-                    xic,
-                    mobilogram,
-                    mz_profile,
-                ) = starfunc(index)
-            # chunksize=len(iterable) // 10000
-            # for (
-            #     xic,
-            #     mobilogram,
-            #     mz_profile,
-            # ) in alphatims.utils.progress_callback(
-            #     pool.imap(starfunc, iterable, chunksize=chunksize),
-            #     total=len(iterable),
-            #     include_progress_callback=True
-            #     # include_progress_callback=False
-            # ):
-                xics.append(xic)
-                mobilograms.append(mobilogram)
-                mz_profiles.append(mz_profile)
-        self.xics = tm.clone(np.concatenate(xics))
-        self.mobilograms = tm.clone(np.concatenate(mobilograms))
-        self.mz_profiles = tm.clone(np.concatenate(mz_profiles))
-        self.xic_indptr = tm.clone(np.cumsum(xic_indptr))
-        self.mobilogram_indptr = tm.clone(np.cumsum(mobilogram_indptr))
-        self.mz_profile_indptr = tm.clone(np.cumsum(mz_profile_indptr))
-
-    def as_dataframe(self, selected_indices=Ellipsis, *, append_apices=False):
-        raw_indices = self.peakfinder.peak_collection.indices[
-            selected_indices
-        ]
-        df = pd.DataFrame(
-            {
-                "number_of_ions": self.number_of_ions[
-                    selected_indices
-                ],
-                "summed_intensity_values": self.summed_intensity_values[
-                    selected_indices
-                ],
-                "xic_offset": self.xic_offset[
-                    selected_indices
-                ],
-                "rt_average": self.rt_average[
-                    selected_indices
-                ],
-                "rt_start": self.rt_start[
-                    selected_indices
-                ],
-                "rt_end": self.rt_end[
-                    selected_indices
-                ],
-                "mobilogram_offset": self.mobilogram_offset[
-                    selected_indices
-                ],
-                "mobility_average": self.mobility_average[
-                    selected_indices
-                ],
-                "mobility_start": self.mobility_start[
-                    selected_indices
-                ],
-                "mobility_end": self.mobility_end[
-                    selected_indices
-                ],
-                "mz_profile_offset": self.mz_profile_offset[
-                    selected_indices
-                ],
-                "mz_average": self.mz_average[
-                    selected_indices
-                ],
-                "mz_start": self.mz_start[
-                    selected_indices
-                ],
-                "mz_end": self.mz_end[
-                    selected_indices
-                ],
-            }
-        )
-        if append_apices:
-            apex_df = self.dia_data.as_dataframe(raw_indices)
-            df = df.join(apex_df, how="left")
-        else:
-            df["raw_indices"] = raw_indices
-        return df
-
-    def get_xic_from_index(
-        self,
-        peak_index,
-    ):
-        xics = self.xics
-        xic_indptr = self.xic_indptr
-
-        def _get_xic_from_index(
-            peak_index,
-        ):
-            xic_start = xic_indptr[peak_index]
-            xic_end = xic_indptr[peak_index + 1]
-            return xics[xic_start: xic_end]
-        self.get_xic_from_index = alphatims.utils.njit(
-            _get_xic_from_index,
-            cache=False,
-        )
-        return self.get_xic_from_index(peak_index)
-
-    def get_mobilogram_from_index(
-        self,
-        peak_index,
-    ):
-        mobilograms = self.mobilograms
-        mobilogram_indptr = self.mobilogram_indptr
-
-        def _get_mobilogram_from_index(
-            peak_index,
-        ):
-            mobilogram_start = mobilogram_indptr[peak_index]
-            mobilogram_end = mobilogram_indptr[peak_index + 1]
-            return mobilograms[mobilogram_start: mobilogram_end]
-        self.get_mobilogram_from_index = alphatims.utils.njit(
-            _get_mobilogram_from_index,
-            cache=False,
-        )
-        return self.get_mobilogram_from_index(peak_index)
-
-    def get_ions_from_index(
-        self,
-        peak_index,
-    ):
-        cluster_assemblies = self.peakfinder.cluster_assemblies
-
-        def _get_ions_from_index(
-            peak_index,
-        ):
-            return get_ions(peak_index, cluster_assemblies)
-        self.get_ions_from_index = alphatims.utils.njit(
-            _get_ions_from_index,
-            cache=False,
-        )
-        return self.get_ions_from_index(peak_index)
-
-
-@alphatims.utils.njit(nogil=True)
-def calculate_stats(
-    index,
-    peak_indices,
-    push_indptr,
-    tof_indices,
-    scan_max_index,
-    zeroth_frame,
-    intensity_values,
-    mz_values,
-    rt_values,
-    cycle_rt_values,
-    mobility_values,
-    cluster_assemblies,
-    cycle_length,
-    number_of_ions,
-    summed_intensity_values,
-    xic_indptr,
-    xic_offset,
-    rt_average,
-    rt_start,
-    rt_end,
-    mobilogram_indptr,
-    mobilogram_offset,
-    mobility_average,
-    mobility_start,
-    mobility_end,
-    mz_profile_indptr,
-    mz_profile_offset,
-    mz_average,
-    mz_start,
-    mz_end,
-):
-    # return (
-    #     np.empty(0),
-    #     np.empty(0),
-    #     np.empty(0),
-    # )
-    # if index < 100:
-    #     print(index)
-    peak_index = peak_indices[index]
-    raw_ion_indices = get_ions(peak_index, cluster_assemblies)
-    number_of_ions[index] = len(raw_ion_indices)
-    if len(raw_ion_indices) == 1:
-        return (
-            np.empty(0),
-            np.empty(0),
-            np.empty(0),
-        )
-    raw_intensities = intensity_values[raw_ion_indices]
-    summed_intensity = np.sum(raw_intensities)
-    summed_intensity_values[index] = summed_intensity
-    push_indices = np.searchsorted(
-        push_indptr,
-        raw_ion_indices,
-        "right"
-    ) - 1 - zeroth_frame * scan_max_index
-    scan_intensities = calculate_rt_stats(
-        index,
-        push_indices,
-        raw_intensities,
-        cycle_length,
-        xic_indptr,
-        xic_offset,
-        rt_average,
-        rt_start,
-        rt_end,
-        rt_values,
-        cycle_rt_values,
-        summed_intensity,
-    )
-    cycle_intensities = calculate_mobility_stats(
-        index,
-        push_indices,
-        raw_intensities,
-        scan_max_index,
-        mobilogram_indptr,
-        mobilogram_offset,
-        mobility_average,
-        mobility_start,
-        mobility_end,
-        mobility_values,
-        summed_intensity,
-    )
-    mz_intensities = calculate_mz_stats(
-        index,
-        tof_indices[raw_ion_indices],
-        raw_intensities,
-        mz_profile_indptr,
-        mz_profile_offset,
-        mz_average,
-        mz_start,
-        mz_end,
-        mz_values,
-        summed_intensity,
-    )
-    return (
-        scan_intensities,
-        cycle_intensities,
-        mz_intensities,
-    )
-
-
-@alphatims.utils.njit(nogil=True)
-def get_ions(peak_index, cluster_assemblies):
-    size = 1
-    pointer = cluster_assemblies[peak_index]
-    while pointer != peak_index:
-        size += 1
-        pointer = cluster_assemblies[pointer]
-    pointer = cluster_assemblies[peak_index]
-    raw_ions = np.empty(size, dtype=cluster_assemblies.dtype)
-    for index in range(size):
-        raw_ions[index] = pointer
-        pointer = cluster_assemblies[pointer]
-    return raw_ions
-
-
-@alphatims.utils.njit(nogil=True)
-def calculate_rt_stats(
-    index,
-    push_indices,
-    raw_intensities,
-    cycle_length,
-    xic_indptr,
-    xic_offset,
-    rt_average,
-    rt_start,
-    rt_end,
-    rt_values,
-    cycle_rt_values,
-    summed_intensity,
-):
-    scan_indices = push_indices // cycle_length
-    lowest_scan, scan_intensities = extract_profile(
-        raw_intensities,
-        scan_indices
-    )
-    xic_indptr[index + 1] = len(scan_intensities)
-    xic_offset[index] = lowest_scan
-    rt_average[index] = np.sum(
-        scan_intensities * cycle_rt_values[
-            lowest_scan: lowest_scan + len(scan_intensities)
-        ]
-    ) / summed_intensity
-    rt_start[index] = cycle_rt_values[lowest_scan]
-    rt_end[index] = cycle_rt_values[lowest_scan + len(scan_intensities) - 1]
-    return scan_intensities
-
-
-@alphatims.utils.njit(nogil=True)
-def calculate_mobility_stats(
-    index,
-    push_indices,
-    raw_intensities,
-    scan_max_index,
-    mobilogram_indptr,
-    mobilogram_offset,
-    mobility_average,
-    mobility_start,
-    mobility_end,
-    mobility_values,
-    summed_intensity,
-):
-    cycle_indices = push_indices % scan_max_index
-    lowest_cycle, cycle_intensities = extract_profile(
-        raw_intensities,
-        cycle_indices
-    )
-    mobilogram_indptr[index + 1] = len(cycle_intensities)
-    mobilogram_offset[index] = lowest_cycle
-    mobility_average[index] = np.sum(
-        cycle_intensities * mobility_values[
-            lowest_cycle: lowest_cycle + len(cycle_intensities)
-        ]
-    ) / summed_intensity
-    mobility_end[index] = mobility_values[lowest_cycle]
-    mobility_start[index] = mobility_values[lowest_cycle + len(cycle_intensities) - 1]
-    return cycle_intensities
-
-
-@alphatims.utils.njit(nogil=True)
-def calculate_mz_stats(
-    index,
-    tof_indices,
-    raw_intensities,
-    mz_profile_indptr,
-    mz_profile_offset,
-    mz_average,
-    mz_start,
-    mz_end,
-    mz_values,
-    summed_intensity,
-):
-    lowest_mz_index, mz_intensities = extract_profile(
-        raw_intensities,
-        tof_indices
-    )
-    mz_profile_indptr[index + 1] = len(mz_intensities)
-    mz_profile_offset[index] = lowest_mz_index
-    mz_average[index] = np.sum(
-        mz_intensities * mz_values[
-            lowest_mz_index: lowest_mz_index + len(mz_intensities)
-        ]
-    ) / summed_intensity
-    mz_start[index] = mz_values[lowest_mz_index]
-    mz_end[index] = mz_values[lowest_mz_index + len(mz_intensities) - 1]
-    return mz_intensities
-
-
-@alphatims.utils.njit(nogil=True)
-def extract_profile(
-    raw_intensities,
-    indices
-):
-    minimum_index = np.min(indices)
-    maximum_index = np.max(indices)
-    cumulative_intensities = np.zeros(maximum_index - minimum_index + 1)
-    for index, intensity in zip(indices, raw_intensities):
-        cumulative_intensities[index - minimum_index] += intensity
-    return minimum_index, cumulative_intensities
-
-
-@alphatims.utils.pjit
-def set_profile_correlations(
-    precursor_index,
-    fragment_peak_indices,
-    precursor_peak_indices,
-    precursor_indptr,
-    profile_correlations,
-    profile_offset,
-    profiles,
-    profile_indptr,
-):
-    convolution_mask = np.array([.5,.5,.75,1,.75,.5,.25])
-    precursor_peak_index = precursor_peak_indices[precursor_index]
-    precursor_profile_offset = profile_offset[precursor_peak_index]
-    precursor_profile_start = profile_indptr[precursor_peak_index]
-    precursor_profile_end = profile_indptr[precursor_peak_index + 1]
-    precursor_profile = profiles[precursor_profile_start: precursor_profile_end]
-    fragment_start = precursor_indptr[precursor_index]
-    fragment_end = precursor_indptr[precursor_index + 1]
-    for fragment_index, fragment_peak_index in enumerate(
-        fragment_peak_indices[fragment_start: fragment_end],
-        fragment_start,
-    ):
-        fragment_profile_offset = profile_offset[fragment_peak_index]
-        fragment_profile_start = profile_indptr[fragment_peak_index]
-        fragment_profile_end = profile_indptr[fragment_peak_index + 1]
-        fragment_profile = profiles[fragment_profile_start: fragment_profile_end]
-        fragment_overlap_profile = fragment_profile
-        precursor_overlap_profile = precursor_profile
-        if fragment_profile_offset < precursor_profile_offset:
-            fragment_overlap_profile = fragment_profile[
-                precursor_profile_offset - fragment_profile_offset:
-            ]
-        else:
-            precursor_overlap_profile = precursor_profile[
-                fragment_profile_offset - precursor_profile_offset:
-            ]
-        if len(precursor_overlap_profile) <= 1:
-            correlation = 0
-        elif len(fragment_overlap_profile) <= 1:
-            correlation = 0
-        else:
-            # correlation = np.corrcoef(
-            #     fragment_overlap_profile[:len(precursor_overlap_profile)],
-            #     precursor_overlap_profile[:len(fragment_overlap_profile)],
-            # )[0, 1]
-            start = min(fragment_profile_offset, precursor_profile_offset)
-            end = max(
-                fragment_profile_offset + len(fragment_profile),
-                precursor_profile_offset + len(precursor_profile)
-            )
-            # precursor_profile_cumulative = np.zeros(end - start)
-            # precursor_profile_cumulative[
-            #     precursor_profile_offset - start: precursor_profile_offset + len(precursor_profile) - start
-            # ] = precursor_profile
-            # precursor_profile_cumulative[
-            #     precursor_profile_offset + len(precursor_profile) - start:
-            # ]
-            # precursor_profile_cumulative = np.cumsum(
-            #     precursor_profile_cumulative
-            # )
-            # fragment_profile_cumulative = np.zeros(end - start)
-            # fragment_profile_cumulative[
-            #     fragment_profile_offset - start: fragment_profile_offset + len(fragment_profile) - start
-            # ] = fragment_profile
-            # fragment_profile_cumulative[
-            #     fragment_profile_offset + len(fragment_profile) - start:
-            # ]
-            # fragment_profile_cumulative = np.cumsum(
-            #     fragment_profile_cumulative
-            # )
-            # # correlation = 1 - np.sum(np.abs(diff_profile)) / (end - start)
-            # correlation = 1 - np.max(
-            #     np.abs(
-            #         precursor_profile_cumulative - fragment_profile_cumulative
-            #     )
-            # )
-            start = min(fragment_profile_offset, precursor_profile_offset)
-            end = max(
-                fragment_profile_offset + len(fragment_profile),
-                precursor_profile_offset + len(precursor_profile)
-            )
-            precursor_profile_cumulative = np.zeros(end - start)
-            precursor_profile_cumulative[
-                precursor_profile_offset - start: precursor_profile_offset + len(precursor_profile) - start
-            ] = precursor_profile
-            # precursor_profile_cumulative[
-            #     precursor_profile_offset + len(precursor_profile) - start:
-            # ]
-            # precursor_profile_cumulative = np.cumsum(
-            #     precursor_profile_cumulative
-            # )
-            fragment_profile_cumulative = np.zeros(end - start)
-            fragment_profile_cumulative[
-                fragment_profile_offset - start: fragment_profile_offset + len(fragment_profile) - start
-            ] = fragment_profile
-            # fragment_profile_cumulative[
-            #     fragment_profile_offset + len(fragment_profile) - start:
-            # ]
-            # fragment_profile_cumulative = np.cumsum(
-            #     fragment_profile_cumulative
-            # )
-
-            # correlation = 1 - np.sum(np.abs(diff_profile)) / (end - start)
-
-            precursor_profile_cumulative = np.convolve(
-                precursor_profile_cumulative,
-                convolution_mask,
-            )
-            fragment_profile_cumulative = np.convolve(
-                fragment_profile_cumulative,
-                convolution_mask,
-            )
-            # correlation = 1 - np.max(
-            #     np.abs(
-            #         np.cumsum(precursor_profile_cumulative)/np.sum(precursor_profile_cumulative) - np.cumsum(fragment_profile_cumulative)/np.sum(fragment_profile_cumulative)
-            #     )
-            # )
-            summed_profile = precursor_profile_cumulative + fragment_profile_cumulative
-            correlation = 1 - np.sum(
-                np.abs(
-                    np.cumsum(precursor_profile_cumulative)/np.sum(precursor_profile_cumulative) - np.cumsum(fragment_profile_cumulative)/np.sum(fragment_profile_cumulative)
-                )*summed_profile/np.sum(summed_profile)
-            )
-        profile_correlations[fragment_index] = correlation
diff --git a/alphadia/preprocessing/smoothing.py b/alphadia/preprocessing/smoothing.py
deleted file mode 100644
index bc18bd48..00000000
--- a/alphadia/preprocessing/smoothing.py
+++ /dev/null
@@ -1,210 +0,0 @@
-"""Smooth dia data intensity values."""
-
-import logging
-
-import numpy as np
-
-import alphatims.utils
-import alphatims.tempmmap as tm
-
-
-class Smoother:
-
-    def __init__(
-        self,
-        tof_tolerance=3,
-        cycle_tolerance=3,
-        scan_tolerance=45,
-        tof_sigma=0,
-        cycle_sigma=3,
-        scan_sigma=15,
-    ):
-        self.tof_tolerance = tof_tolerance
-        self.cycle_tolerance = cycle_tolerance
-        self.scan_tolerance = scan_tolerance
-        self.cycle_sigma = cycle_sigma
-        self.scan_sigma = scan_sigma
-        self.tof_sigma = tof_sigma
-
-    def set_dia_data(self, dia_data):
-        self.dia_data = dia_data
-
-    def set_connector(self, connector):
-        self.connector = connector
-
-    def smooth(self):
-        logging.info("Smoothing peaks")
-        self.smooth_intensity_values = tm.zeros(
-            self.dia_data.intensity_values.shape,
-            dtype=np.float32
-        )
-        self.density_values = tm.zeros(
-            self.dia_data.intensity_values.shape,
-            dtype=np.float32
-        )
-        self.neighbor_types = tm.zeros(
-            self.dia_data.intensity_values.shape,
-            dtype=np.uint8
-        )
-        smooth(
-            # 0,
-            range(
-                len(self.dia_data.push_indptr) // np.prod(
-                    self.connector.cycle.shape[:-1]
-                ) + 1
-            ),
-            self.dia_data.push_indptr,
-            self.dia_data.tof_indices,
-            self.dia_data.intensity_values,
-            self.tof_tolerance,
-            self.dia_data.scan_max_index,
-            self.dia_data.zeroth_frame,
-            self.connector.connection_counts,
-            self.connector.connections,
-            self.cycle_tolerance,
-            self.smooth_intensity_values,
-            self.neighbor_types,
-            self.density_values,
-            self.cycle_sigma,
-            self.scan_sigma,
-            self.tof_sigma,
-            np.prod(
-                self.connector.cycle.shape[1:-1]
-            ),
-        )
-        self.smooth_intensity_values += self.dia_data.intensity_values
-
-
-@alphatims.utils.pjit
-def smooth(
-    cycle_index,
-    indptr,
-    tof_indices,
-    intensity_values,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    smooth_intensity_values,
-    neighbor_types,
-    density_values,
-    cycle_sigma,
-    scan_sigma,
-    tof_sigma,
-    subcycle_len,
-):
-    len_cycle = len(connection_counts) - 1
-    push_offset = len_cycle * cycle_index + zeroth_frame * scan_max_index
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        if connection_end == connection_start:
-            continue
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        self_scan = self_connection_index % scan_max_index
-        max_neighbor_count = 0
-        if True:
-            self_cycle = (self_push_index - zeroth_frame * scan_max_index) // subcycle_len
-            for other_connection_offset in connections[connection_start: connection_end]:
-                other_push_index = self_push_index + other_connection_offset
-                if other_push_index == self_push_index:
-                    continue
-                if not (0 <= other_push_index < len(indptr)):
-                    continue
-                other_scan = other_push_index % scan_max_index
-                connection_blur = gauss_correction(
-                    self_scan - other_scan,
-                    scan_sigma,
-                )
-                other_cycle = (other_push_index - zeroth_frame * scan_max_index) // subcycle_len
-                cycle_offset = self_cycle - other_cycle
-                cycle_blur = gauss_correction(cycle_offset, cycle_sigma)
-        # for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-        #     cycle_blur = gauss_correction(cycle_offset, cycle_sigma)
-        #     for other_connection_index in connections[connection_start: connection_end]:
-        #         other_scan = other_connection_index % scan_max_index
-        #         connection_blur = gauss_correction(
-        #             self_connection_index % scan_max_index - other_connection_index % scan_max_index,
-        #             scan_sigma,
-        #         )
-        #         other_push_index = push_offset + other_connection_index + len_cycle * cycle_offset
-        #         if other_push_index == self_push_index:
-        #             continue
-        #         if other_push_index >= len(indptr):
-        #             continue
-                max_neighbor_count += 1
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                neighbor_type = determine_neighbor_type(
-                    cycle_offset,
-                    self_scan,
-                    other_scan,
-                )
-                while (self_index < self_end) and (other_index < other_end):
-                    self_tof = tof_indices[self_index]
-                    other_tof = tof_indices[other_index]
-                    if (self_tof - tof_tolerance) <= other_tof <= (self_tof + tof_tolerance):
-                        other_intensity = intensity_values[other_index]
-                        tof_blur = gauss_correction(
-                            int(self_tof) - int(other_tof),
-                            tof_sigma,
-                        )
-                        smooth_intensity_values[self_index] += other_intensity * cycle_blur * connection_blur * tof_blur
-                        neighbor_types[self_index] |= neighbor_type
-                        density_values[self_index] += 1
-                    if self_tof < other_tof:
-                        self_index += 1
-                    else:
-                        other_index += 1
-        for self_index in range(self_start, self_end):
-            density_values[self_index] /= max_neighbor_count
-
-
-@alphatims.utils.njit(nogil=True)
-def gauss_correction(x=0, sigma=1):
-    if sigma == 0:
-        return 1
-    else:
-        return np.exp(-(x / sigma)**2 / 2)
-
-
-@alphatims.utils.njit(nogil=True)
-def determine_neighbor_type(
-    cycle_offset,
-    self_scan,
-    other_scan,
-):
-    if cycle_offset < 0:
-        if self_scan < other_scan:
-            return 2**0
-        elif self_scan == other_scan:
-            return 2**1
-        else:
-            return 2**2
-    elif cycle_offset == 0:
-        if self_scan < other_scan:
-            return 2**7
-        elif self_scan == other_scan:
-            return 0
-        else:
-            return 2**3
-    else:
-        if self_scan < other_scan:
-            return 2**6
-        elif self_scan == other_scan:
-            return 2**5
-        else:
-            return 2**4
diff --git a/alphadia/extraction/quadrupole.py b/alphadia/quadrupole.py
similarity index 98%
rename from alphadia/extraction/quadrupole.py
rename to alphadia/quadrupole.py
index 8606478e..6f7b73ca 100644
--- a/alphadia/extraction/quadrupole.py
+++ b/alphadia/quadrupole.py
@@ -2,8 +2,7 @@
 import math
 
 # alphadia imports
-from alphadia.extraction import utils
-
+from alphadia import utils
 
 # alpha family imports
 import alphatims.utils
@@ -11,10 +10,7 @@
 # third party imports
 import numba as nb
 from numba.experimental import jitclass
-
 import numpy as np
-from sklearn.base import BaseEstimator, RegressorMixin
-
 from scipy.optimize import curve_fit
 
 @alphatims.utils.njit
@@ -297,7 +293,7 @@ def quadrupole_transfer_function_single(
 
     Parameters
     ----------
-    quadrupole_calibration_jit : alphadia.extraction.quadrupole.SimpleQuadrupoleJit
+    quadrupole_calibration_jit : alphadia.quadrupole.SimpleQuadrupoleJit
         Quadrupole calibration jit object
 
     observation_indices : np.ndarray
diff --git a/alphadia/smoothing.py b/alphadia/smoothing.py
deleted file mode 100644
index 51e887c3..00000000
--- a/alphadia/smoothing.py
+++ /dev/null
@@ -1,2858 +0,0 @@
-"""A module to perform smoothing of TOF data."""
-
-import logging
-
-import numpy as np
-import pandas as pd
-import sklearn
-import sklearn.decomposition
-import sklearn.preprocessing
-
-
-import alphatims.utils
-import alphatims.tempmmap as tm
-import alphabase.io
-import alphadia.prefilter
-
-
-@alphatims.utils.njit(nogil=True)
-def get_connections_within_cycle(
-    scan_tolerance: int,
-    scan_max_index: int,
-    dia_mz_cycle: np.ndarray,
-    exclude_self: bool = False,
-    multiple_frames: bool = False,
-    ms1: bool = True,
-    ms2: bool = False,
-) -> tuple:
-    """Determine how individual pushes in a cycle are connected.
-
-    Parameters
-    ----------
-    scan_tolerance : int
-        Maximum scan distance for two pushes to be connected
-    scan_max_index : int
-        The maximum scan index (dia_data.scan_max_index).
-    dia_mz_cycle : np.ndarray
-        An np.float64[:, 2] array with upper and lower quadrupole boundaries
-        per push of a cycle.
-    exclude_self : bool
-        Excluded connections between equal push indices
-        (the default is False).
-    multiple_frames : bool
-        Connect scans between different frames a cycle
-        (the default is False).
-    ms1 : bool
-        Allow connections between MS1 pushes
-        (the default is True).
-    ms2 : bool
-        OAllow connections between MS2 pushes
-        (the default is False).
-
-    Returns
-    -------
-    tuple
-        A tuple with indptr and indices defining the (sparse) connections.
-    """
-    connections = []
-    connection_count = 0
-    connection_counts = [connection_count]
-    shape = (
-        scan_max_index,
-        len(dia_mz_cycle) // scan_max_index
-    )
-    if multiple_frames:
-        frame_iterator = range(shape[1])
-    for self_frame in range(shape[1]):
-        if not multiple_frames:
-            frame_iterator = range(self_frame, self_frame + 1)
-        for self_scan in range(shape[0]):
-            index = self_scan + self_frame * shape[0]
-            low_quad, high_quad = dia_mz_cycle[index]
-            if (not ms1) and (low_quad == -1):
-                connection_counts.append(connection_count)
-                continue
-            if (not ms2) and (low_quad != -1):
-                connection_counts.append(connection_count)
-                continue
-            for other_frame in frame_iterator:
-                for other_scan in range(
-                    self_scan - scan_tolerance,
-                    self_scan + scan_tolerance + 1
-                ):
-                    if not (0 <= other_scan < scan_max_index):
-                        continue
-                    other_index = other_scan + other_frame * shape[0]
-                    if exclude_self and (index == other_index):
-                        continue
-                    other_low_quad, other_high_quad = dia_mz_cycle[other_index]
-                    if low_quad > other_high_quad:
-                        continue
-                    if high_quad < other_low_quad:
-                        continue
-                    connection_count += 1
-                    connections.append(other_index)
-            connection_counts.append(connection_count)
-    return np.array(connection_counts), np.array(connections)
-
-
-@alphatims.utils.njit(nogil=True)
-def calculate_cyclic_scan_blur(
-    connection_indices: np.ndarray,
-    connection_indptr: np.ndarray,
-    scan_max_index: int,
-    sigma: float = 1,
-) -> np.ndarray:
-    """Short summary.
-
-    Parameters
-    ----------
-    connection_indices : np.ndarray
-        Connections indices from .get_connections_within_cycle.
-    connection_indptr : np.ndarray
-        Connections indptr from .get_connections_within_cycle.
-    scan_max_index : int
-        The maximum scan index (dia_data.scan_max_index).
-    sigma : float
-        The sigma for the Gaussian blur (default is 1).
-        To make sure there are no large dropoffs, this sigma should be at most
-        scan_max_index / 3 (see get_connections_within_cycle).
-
-    Returns
-    -------
-    np.ndarray
-        The blurred weight for all the connection_indices.
-
-    """
-    scan_blur = np.repeat(
-        np.arange(len(connection_indptr) - 1),
-        np.diff(connection_indptr),
-    ) % scan_max_index - connection_indices % scan_max_index
-    scan_blur = np.exp(-(scan_blur / sigma)**2 / 2)
-    for i, start in enumerate(connection_indptr[:-1]):
-        end = connection_indptr[i + 1]
-        scan_blur[start: end] /= np.sum(scan_blur[start: end])
-    return scan_blur
-
-
-@alphatims.utils.pjit
-def smooth(
-    cycle_index,
-    indptr,
-    tof_indices,
-    intensity_values,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    smooth_intensity_values,
-    neighbor_types,
-    density_values,
-    cycle_sigma,
-    scan_sigma,
-    tof_sigma,
-):
-    len_dia_mz_cycle = len(connection_counts) - 1
-    push_offset = len_dia_mz_cycle * cycle_index + zeroth_frame * scan_max_index
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        if connection_end == connection_start:
-            continue
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        self_scan = self_connection_index % scan_max_index
-        max_neighbor_count = 0
-        for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-            cycle_blur = gauss_correction(cycle_offset, cycle_sigma)
-            for other_connection_index in connections[connection_start: connection_end]:
-                other_scan = other_connection_index % scan_max_index
-                connection_blur = gauss_correction(
-                    self_connection_index % scan_max_index - other_connection_index % scan_max_index,
-                    scan_sigma,
-                )
-                other_push_index = push_offset + other_connection_index + len_dia_mz_cycle * cycle_offset
-                if other_push_index == self_push_index:
-                    continue
-                if other_push_index >= len(indptr):
-                    continue
-                max_neighbor_count += 1
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                neighbor_type = determine_neighbor_type(
-                    cycle_offset,
-                    self_scan,
-                    other_scan,
-                )
-                while (self_index < self_end) and (other_index < other_end):
-                    self_tof = tof_indices[self_index]
-                    other_tof = tof_indices[other_index]
-                    if (self_tof - tof_tolerance) <= other_tof <= (self_tof + tof_tolerance):
-                        other_intensity = intensity_values[other_index]
-                        tof_blur = gauss_correction(
-                            int(self_tof) - int(other_tof),
-                            tof_sigma,
-                        )
-                        smooth_intensity_values[self_index] += other_intensity * cycle_blur * connection_blur * tof_blur
-                        neighbor_types[self_index] |= neighbor_type
-                        density_values[self_index] += 1
-                    if self_tof < other_tof:
-                        self_index += 1
-                    else:
-                        other_index += 1
-        for self_index in range(self_start, self_end):
-            density_values[self_index] /= max_neighbor_count
-
-
-@alphatims.utils.njit(nogil=True)
-def gauss_correction(x=0, sigma=1):
-    return np.exp(-(x / sigma)**2 / 2)
-
-
-@alphatims.utils.njit(nogil=True)
-def determine_neighbor_type(
-    cycle_offset,
-    self_scan,
-    other_scan,
-):
-    if cycle_offset < 0:
-        if self_scan < other_scan:
-            return 2**0
-        elif self_scan == other_scan:
-            return 2**1
-        else:
-            return 2**2
-    elif cycle_offset == 0:
-        if self_scan < other_scan:
-            return 2**7
-        elif self_scan == other_scan:
-            pass  # cannot happen because scan is fully equal?
-        else:
-            return 2**3
-    else:
-        if self_scan < other_scan:
-            return 2**6
-        elif self_scan == other_scan:
-            return 2**5
-        else:
-            return 2**4
-
-
-@alphatims.utils.pjit
-def find_seeds(
-    cycle_index,
-    indptr,
-    tof_indices,
-    smooth_intensity_values,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    peaks,
-):
-    len_dia_mz_cycle = len(connection_counts) - 1
-    push_offset = len_dia_mz_cycle * cycle_index + zeroth_frame * scan_max_index
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-            for other_connection_index in connections[connection_start: connection_end]:
-                other_push_index = push_offset + other_connection_index + len_dia_mz_cycle * cycle_offset
-                if other_push_index <= self_push_index:
-                    continue
-                if other_push_index >= len(indptr):
-                    continue
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                while (self_index < self_end) and (other_index < other_end):
-                    self_tof = tof_indices[self_index]
-                    other_tof = tof_indices[other_index]
-                    if (self_tof - tof_tolerance) <= other_tof <= (self_tof + tof_tolerance):
-                        self_intensity = smooth_intensity_values[self_index]
-                        other_intensity = smooth_intensity_values[other_index]
-                        if self_intensity < other_intensity:
-                            peaks[self_index] = False
-                        if self_intensity > other_intensity:
-                            peaks[other_index] = False
-                    if self_tof < other_tof:
-                        self_index += 1
-                    else:
-                        other_index += 1
-
-#
-#
-# # @alphatims.utils.pjit
-# @alphatims.utils.njit
-# def create_inet(
-#     self_push_index,
-#     indptr,
-#     tof_indices,
-#     intensity_values,
-#     dia_mz_cycle,
-#     tof_tolerance,
-#     scan_max_index,
-#     tof_max_index,
-#     zeroth_frame,
-#     connection_counts,
-#     connections,
-#     cycle_tolerance,
-#     is_signal,
-#     # mz_values,
-# ):
-#     intensity_buffer = np.zeros(tof_max_index, dtype=np.float32)
-#     new_tof_indices = []
-#     new_intensity_values = []
-#     index_offset = (self_push_index - zeroth_frame * scan_max_index) % len(dia_mz_cycle)
-#     cycle_index = (self_push_index - zeroth_frame * scan_max_index) // len(dia_mz_cycle)
-#     push_offset = len(dia_mz_cycle) * cycle_index + zeroth_frame * scan_max_index
-#     current_tof_indices = []
-#     connection_start = connection_counts[index_offset]
-#     connection_end = connection_counts[index_offset + 1]
-#     for connection_index in connections[connection_start: connection_end]:
-#         for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-#             other_push_index = push_offset + connection_index + len(dia_mz_cycle) * cycle_offset
-#             if other_push_index < 0:
-#                 continue
-#                 # Check mz
-#             if other_push_index >= len(indptr):
-#                 continue
-#             for index in range(
-#                 indptr[other_push_index],
-#                 indptr[other_push_index + 1]
-#             ):
-#                 if not is_signal[index]:
-#                     continue
-#                 tof_index = tof_indices[index]
-#                 if intensity_buffer[tof_index] == 0:
-#                     current_tof_indices.append(tof_index)
-#                 intensity_buffer[tof_index] += intensity_values[index]
-#     if len(tof_indices) == 0:
-#         return
-#     current_tof_indices = sorted(current_tof_indices)
-#     last_tof_index = tof_indices[0]
-#     last_tof_index = -(1 + tof_tolerance)
-#     summed_intensity = intensity_buffer[last_tof_index]
-#     summed_tof = last_tof_index
-#     count = 1
-#     intensity_buffer[last_tof_index] = 0
-#     for tof_index in current_tof_indices[1:]:
-#         intensity = intensity_buffer[tof_index]
-#         if (tof_index - last_tof_index) >= tof_tolerance:
-#             if last_tof_index >= 0:
-#                 new_tof_indices.append(summed_tof // count)
-#                 new_intensity_values.append(summed_intensity)
-#             summed_intensity = intensity
-#             summed_tof = tof_index
-#             count = 1
-#         else:
-#             summed_intensity += intensity
-#             summed_tof += last_tof_index
-#             count += 1
-#         intensity_buffer[tof_index] = 0
-#         last_tof_index = tof_index
-#     return (
-#         np.array(new_tof_indices),
-#         np.array(new_intensity_values),
-#     )
-#
-
-
-@alphatims.utils.pjit
-def inet_counts(
-    cycle_index,
-    indptr,
-    tof_indices,
-    intensity_values,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    inet_indptr,
-    peaks,
-):
-    len_dia_mz_cycle = len(connection_counts) - 1
-    push_offset = len_dia_mz_cycle * cycle_index + zeroth_frame * scan_max_index
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-            for other_connection_index in connections[connection_start: connection_end]:
-                other_push_index = push_offset + other_connection_index + len_dia_mz_cycle * cycle_offset
-                if other_push_index >= len(indptr):
-                    continue
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                # self_index = self_start
-                # other_index = other_start
-                # while (self_index < self_end) and (other_index < other_end):
-                #     # if self_index == other_index:
-                #     #     self_index += 1
-                #     self_tof = tof_indices[self_index]
-                #     other_tof = tof_indices[other_index]
-                #     if peaks[self_index] and peaks[other_index]:
-                #         inet_indptr[self_index] += 1
-                #     if self_tof < other_tof:
-                #         self_index += 1
-                #     else:
-                #         other_index += 1
-                count = np.sum(peaks[other_start: other_end])
-                # for self_index in range(self_start, self_end):
-                #     inet_indptr[self_index] += count
-                inet_indptr[self_start] += count
-
-
-def create_inet(
-    dia_data,
-    tof_tolerance,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    potential_peaks,
-):
-    import multiprocessing
-
-    def starfunc(cycle_index):
-        return get_inet(
-            cycle_index,
-            dia_data.push_indptr,
-            dia_data.tof_indices,
-            tof_tolerance,
-            dia_data.scan_max_index,
-            dia_data.zeroth_frame,
-            connection_counts,
-            connections,
-            cycle_tolerance,
-            potential_peaks,
-        )
-
-    iterable = range(len(dia_data.push_indptr) // len(dia_data.dia_mz_cycle) + 1)
-    # self.inet_indptr = tm.zeros(
-    #     self.dia_data.intensity_values.shape,
-    #     dtype=np.int64
-    # )
-    # iterable = range(500, 520)
-    self_connections = []
-    other_connections = []
-    with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-        for cycle_index, (
-            self_connection,
-            other_connection,
-        ) in alphatims.utils.progress_callback(
-            enumerate(pool.imap(starfunc, iterable)),
-            total=len(iterable),
-            include_progress_callback=True
-        ):
-            self_connections.append(np.concatenate(self_connection))
-            other_connections.append(np.concatenate(other_connection))
-    return self_connections, other_connections
-
-# @alphatims.utils.pjit
-@alphatims.utils.njit(nogil=True)
-def get_inet(
-    cycle_index,
-    indptr,
-    tof_indices,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    peaks,
-):
-    len_dia_mz_cycle = len(connection_counts) - 1
-    push_offset = len_dia_mz_cycle * cycle_index + zeroth_frame * scan_max_index
-    self_connections = []
-    other_connections = []
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        self_connection = []
-        other_connection = []
-        for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-            for other_connection_index in connections[connection_start: connection_end]:
-                other_push_index = push_offset + other_connection_index + len_dia_mz_cycle * cycle_offset
-                if other_push_index >= len(indptr):
-                    continue
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                while (self_index < self_end) and (other_index < other_end):
-                    # if self_index == other_index:
-                    #     self_index += 1
-                    self_tof = tof_indices[self_index]
-                    other_tof = tof_indices[other_index]
-                    if peaks[self_index] and peaks[other_index]:
-                        self_connection.append(self_index)
-                        other_connection.append(other_index)
-                    if self_tof < other_tof:
-                        self_index += 1
-                    else:
-                        other_index += 1
-        self_connection = np.array(self_connection, dtype=np.int64)
-        other_connection = np.array(other_connection, dtype=np.int64)
-        order = np.argsort(self_connection)
-        self_connections.append(self_connection[order])
-        other_connections.append(other_connection[order])
-    return self_connections, other_connections
-
-
-def create_isotopic_pairs(
-    analysis,
-    difference,
-    mz_tolerance,
-    # tof_tolerance,
-    # connection_counts,
-    # connections,
-    # cycle_tolerance,
-    # potential_peaks,
-):
-    import multiprocessing
-
-    def starfunc(cycle_index):
-        return get_isotopic_pairs(
-            cycle_index,
-            analysis.peak_collection.indptr,
-            analysis.dia_data.mz_values[
-                analysis.dia_data.tof_indices[
-                    analysis.peak_collection.indices
-                ]
-            ],
-            mz_tolerance,
-            analysis.dia_data.scan_max_index,
-            analysis.dia_data.zeroth_frame,
-            analysis.connection_counts,
-            analysis.connections,
-            analysis.cycle_tolerance,
-            difference,
-        )
-
-    iterable = analysis.cycle_range
-    # iterable = range(500, 520)
-    self_connections = []
-    other_connections = []
-    with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-        for cycle_index, (
-            self_connection,
-            other_connection,
-        ) in alphatims.utils.progress_callback(
-            enumerate(pool.imap(starfunc, iterable)),
-            total=len(iterable),
-            include_progress_callback=True
-        ):
-            self_connections.append(self_connection)
-            other_connections.append(other_connection)
-    return np.concatenate(self_connections), np.concatenate(other_connections)
-
-
-
-# @alphatims.utils.pjit
-@alphatims.utils.njit(nogil=True)
-def get_isotopic_pairs(
-    cycle_index,
-    indptr,
-    mz_values,
-    mz_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    difference,
-    # peaks,
-):
-    len_dia_mz_cycle = len(connection_counts) - 1
-    push_offset = len_dia_mz_cycle * cycle_index + zeroth_frame * scan_max_index
-    self_connections = []
-    other_connections = []
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-            for other_connection_index in connections[connection_start: connection_end]:
-                other_push_index = push_offset + other_connection_index + len_dia_mz_cycle * cycle_offset
-                if other_push_index >= len(indptr):
-                    continue
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                while (self_index < self_end) and (other_index < other_end):
-                    # if self_index == other_index:
-                    #     self_index += 1
-                    self_tof = mz_values[self_index]
-                    other_tof = mz_values[other_index]
-                    if (self_tof - mz_tolerance) <= (other_tof - difference) <= (self_tof + mz_tolerance):
-                        self_connections.append(self_index)
-                        other_connections.append(other_index)
-                    if self_tof < (other_tof - difference - mz_tolerance):
-                        self_index += 1
-                    else:
-                        other_index += 1
-    return np.array(self_connections), np.array(other_connections)
-
-
-@alphatims.utils.njit(nogil=True)
-def create_precursor_centric_ion_network(
-    cycle_index,
-    indices,
-    indptr,
-    zeroth_frame,
-    scan_max_index,
-    scan_tolerance,
-    cycle_tolerance,
-    mz_windows,
-    mz_values,
-    tof_indices,
-    is_mono,
-):
-    cycle_length = len(mz_windows)
-    frame_count = cycle_length // scan_max_index
-    push_offset = cycle_length * cycle_index + zeroth_frame * scan_max_index
-    precursor_indices = []
-    precursor_count = []
-    fragment_indices = []
-    for self_push_offset in np.flatnonzero(mz_windows[:, 0] == -1):
-        self_push_index = push_offset + self_push_offset
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        self_scan = self_push_offset % scan_max_index
-        for precursor_index_ in range(self_start, self_end):
-            if not is_mono[precursor_index_]:
-                continue
-            precursor_index = indices[precursor_index_]
-            precursor_mz = mz_values[tof_indices[precursor_index]]
-            hits = 0
-            for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-                for frame_offset in range(frame_count):
-                    for scan_offset in range(-scan_tolerance, scan_tolerance + 1):
-                        other_scan = self_scan + scan_offset
-                        if not (0 <= other_scan < scan_max_index):
-                            continue
-                        other_push_offset = frame_offset * scan_max_index + other_scan
-                        low_mz, high_mz = mz_windows[other_push_offset]
-                        if not (low_mz <= precursor_mz < high_mz):
-                            continue
-                        other_push_index = push_offset + other_push_offset + cycle_length * cycle_offset
-                        if not (0 <= other_push_index < len(indptr)):
-                            continue
-                        other_start = indptr[other_push_index]
-                        other_end = indptr[other_push_index + 1]
-                        for fragment_index_ in range(other_start, other_end):
-                            fragment_index = indices[fragment_index_]
-                            fragment_indices.append(fragment_index)
-                            hits += 1
-            if hits > 0:
-                precursor_indices.append(precursor_index)
-                precursor_count.append(hits)
-    return (
-        np.array(precursor_indices),
-        np.array(precursor_count),
-        np.array(fragment_indices),
-    )
-
-
-@alphatims.utils.njit(nogil=True)
-def find_unfragmented_precursors(
-    cycle_index,
-    indices,
-    indptr,
-    zeroth_frame,
-    scan_max_index,
-    scan_tolerance,
-    cycle_tolerance,
-    mz_windows,
-    tof_indices,
-    tof_tolerance,
-):
-    cycle_length = len(mz_windows)
-    frame_count = cycle_length // scan_max_index
-    push_offset = cycle_length * cycle_index + zeroth_frame * scan_max_index
-    unfragmented_precursor_indices = []
-    for self_push_offset in np.flatnonzero(mz_windows[:, 0] == -1):
-        self_push_index = push_offset + self_push_offset
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        self_scan = self_push_offset % scan_max_index
-        for precursor_index_ in range(self_start, self_end):
-            precursor_index = indices[precursor_index_]
-            precursor_tof = tof_indices[precursor_index]
-            for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-                for frame_offset in range(frame_count):
-                    for scan_offset in range(-scan_tolerance, scan_tolerance + 1):
-                        other_scan = self_scan + scan_offset
-                        if not (0 <= other_scan < scan_max_index):
-                            continue
-                        other_push_offset = frame_offset * scan_max_index + other_scan
-                        low_mz, high_mz = mz_windows[other_push_offset]
-                        if low_mz == -1:
-                            continue
-                        other_push_index = push_offset + other_push_offset + cycle_length * cycle_offset
-                        if not (0 <= other_push_index < len(indptr)):
-                            continue
-                        other_start = indptr[other_push_index]
-                        other_end = indptr[other_push_index + 1]
-                        for fragment_index_ in range(other_start, other_end):
-                            fragment_index = indices[fragment_index_]
-                            fragment_tof = tof_indices[fragment_index]
-                            if np.abs(fragment_tof - precursor_tof) < tof_tolerance:
-                                unfragmented_precursor_indices.append(fragment_index)
-    return np.array(unfragmented_precursor_indices)
-
-
-def annotate(
-    iterable,
-    frag_start_idx,
-    frag_end_idx,
-    frag_indices,
-    frag_frequencies,
-    indptr,
-    mz_values,
-    tof_indices,
-    fragment_ppm,
-    lower,
-    upper,
-    y_mzs,
-    b_mzs,
-    min_size,
-    min_hit_count,
-    top_n_hits,
-):
-    import multiprocessing
-
-    def starfunc(index):
-        # return alphadia.prefilter.annotate_pool(
-        return alphadia.prefilter.annotate_pool2(
-            index,
-            frag_start_idx,
-            frag_end_idx,
-            frag_indices,
-            frag_frequencies,
-            indptr,
-            mz_values,
-            tof_indices,
-            fragment_ppm,
-            lower,
-            upper,
-            y_mzs,
-            b_mzs,
-            min_size,
-            min_hit_count,
-            top_n_hits,
-        )
-    precursor_indices = []
-    max_hit_counts = []
-    max_frequency_counts = []
-    db_indices = []
-    precursor_indptr = []
-    with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-        for (
-            precursor_index,
-            hit_count,
-            frequency_count,
-            db_indices_,
-        ) in alphatims.utils.progress_callback(
-            pool.imap(starfunc, iterable),
-            total=len(iterable),
-            include_progress_callback=True
-        ):
-            # if hit_count >= min_hit_count:
-            if True:
-                precursor_indices.append(precursor_index)
-                precursor_indptr.append(len(db_indices_))
-                max_hit_counts.append(hit_count)
-                max_frequency_counts.append(frequency_count)
-                db_indices.append(db_indices_)
-    return (
-        np.array(precursor_indices),
-        np.array(precursor_indptr),
-        # np.array(max_hit_counts),
-        np.concatenate(max_hit_counts),
-        np.concatenate(max_frequency_counts),
-        np.concatenate(db_indices),
-    )
-
-
-@alphatims.utils.pjit
-# @alphatims.utils.njit(nogil=True)
-def update_annotation(
-    index,
-    database_indices,
-    database_frag_starts,
-    database_frag_ends,
-    database_y_mzs,
-    database_b_mzs,
-    database_y_ints,
-    database_b_ints,
-    inet_indices,
-    precursor_indptr,
-    fragment_indices,
-    tof_indices,
-    intensity_values,
-    mz_values,
-    fragment_ppm,
-    b_hit_counts,
-    y_hit_counts,
-    b_mean_ppm,
-    y_mean_ppm,
-    relative_found_b_int,
-    relative_missed_b_int,
-    relative_found_y_int,
-    relative_missed_y_int,
-    relative_found_int,
-    relative_missed_int,
-    pearsons,
-    pearsons_log,
-    pseudo_int,
-):
-    if index >= len(database_indices):
-        return
-    database_index = database_indices[index]
-    db_frag_start_idx = database_frag_starts[database_index]
-    db_frag_end_idx = database_frag_ends[database_index]
-    db_y_mzs = database_y_mzs[db_frag_start_idx: db_frag_end_idx][::-1]
-    db_b_mzs = database_b_mzs[db_frag_start_idx: db_frag_end_idx]
-    db_y_ints = database_y_ints[db_frag_start_idx: db_frag_end_idx][::-1]
-    db_b_ints = database_b_ints[db_frag_start_idx: db_frag_end_idx]
-    if pseudo_int > 0:
-        db_y_ints = db_y_ints + pseudo_int
-        db_b_ints = db_b_ints + pseudo_int
-    precursor_index = inet_indices[index]
-    frag_start_idx = precursor_indptr[precursor_index]
-    frag_end_idx = precursor_indptr[precursor_index + 1]
-    frags = fragment_indices[frag_start_idx: frag_end_idx]
-    fragment_tofs = tof_indices[frags]
-    order = np.argsort(fragment_tofs)
-    fragment_mzs = mz_values[fragment_tofs][order]
-    fragment_ints = intensity_values[frags][order]
-    fragment_b_hits, db_b_hits = find_hits(
-        fragment_mzs,
-        db_b_mzs,
-        fragment_ppm,
-    )
-    total_b_int = np.sum(db_b_ints)
-    if total_b_int == 0:
-        total_b_int = 1
-    if len(db_b_hits) > 0:
-        b_ppm = np.mean(
-            (db_b_mzs[db_b_hits] - fragment_mzs[fragment_b_hits]) / db_b_mzs[db_b_hits] * 10**6
-        )
-        found_b_int = np.sum(db_b_ints[db_b_hits])
-        min_b_int = np.min(db_b_ints[db_b_hits])
-    else: # TODO defaults are not reflective of good/bad scores
-        b_ppm = fragment_ppm
-        found_b_int = 0
-        min_b_int = -1
-    fragment_y_hits, db_y_hits = find_hits(
-        fragment_mzs,
-        db_y_mzs,
-        fragment_ppm,
-    )
-    total_y_int = np.sum(db_y_ints)
-    if total_y_int == 0:
-        total_y_int = 1
-    if len(db_y_hits) > 0:
-        y_ppm = np.mean(
-            (db_y_mzs[db_y_hits] - fragment_mzs[fragment_y_hits]) / db_y_mzs[db_y_hits] * 10**6
-        )
-        found_y_int = np.sum(db_y_ints[db_y_hits])
-        min_y_int = np.min(db_y_ints[db_y_hits])
-    else: # TODO defaults are not reflective of good/bad scores
-        y_ppm = fragment_ppm
-        found_y_int = 0
-        min_y_int = -1
-    missed_b_int = np.sum(
-        np.array([intsy for i, intsy in enumerate(db_b_ints) if (i not in db_b_hits) and (intsy > min_b_int)])
-    )
-    missed_y_int = np.sum(
-        np.array([intsy for i, intsy in enumerate(db_y_ints) if (i not in db_y_hits) and (intsy > min_y_int)])
-    )
-    # all_frags = fragment_ints
-    b_hit_counts[index] = len(db_b_hits)
-    y_hit_counts[index] = len(db_y_hits)
-    b_mean_ppm[index] = b_ppm
-    y_mean_ppm[index] = y_ppm
-    relative_found_b_int[index] = found_b_int / total_b_int
-    relative_missed_b_int[index] = missed_b_int / total_b_int
-    relative_found_y_int[index] = found_y_int / total_y_int
-    relative_missed_y_int[index] = missed_y_int / total_y_int
-    relative_found_int[index] = (found_b_int + found_y_int) / (total_b_int + total_y_int)
-    relative_missed_int[index] = (missed_b_int + missed_y_int) / (total_b_int + total_y_int)
-    all_db_ints = []
-    all_frag_ints = []
-    for b_int in db_b_ints[db_b_hits]:
-        all_db_ints.append(b_int)
-    for y_int in db_y_ints[db_y_hits]:
-        all_db_ints.append(y_int)
-    for frag_int in fragment_ints[fragment_b_hits]:
-        all_frag_ints.append(frag_int)
-    for frag_int in fragment_ints[fragment_y_hits]:
-        all_frag_ints.append(frag_int)
-    pearsons[index] = np.corrcoef(all_db_ints, all_frag_ints)[0, 1]
-    pearsons_log[index] = np.corrcoef(
-        np.log(np.array(all_db_ints)),
-        np.log(np.array(all_frag_ints)),
-    )[0, 1]
-
-    # return (
-    #     len(db_b_hits),
-    #     len(db_y_hits),
-    #     b_ppm,
-    #     y_ppm,
-    #     found_b_int / total_b_int,
-    #     missed_b_int / total_b_int,
-    #     found_y_int / total_y_int,
-    #     missed_y_int / total_y_int,
-    #     (found_b_int + found_y_int) / (total_b_int + total_y_int),
-    #     (missed_b_int + missed_y_int) / (total_b_int + total_y_int),
-    #     # pearson,
-    # )
-
-
-@alphatims.utils.njit(nogil=True)
-def find_hits(
-    fragment_mzs,
-    database_mzs,
-    fragment_ppm,
-):
-    fragment_index = 0
-    database_index = 0
-    fragment_hits = []
-    db_hits = []
-    while (fragment_index < len(fragment_mzs)) and (database_index < len(database_mzs)):
-        fragment_mz = fragment_mzs[fragment_index]
-        database_mz = database_mzs[database_index]
-        if fragment_mz < (database_mz / (1 + 10**-6 * fragment_ppm)):
-            fragment_index += 1
-        elif database_mz < (fragment_mz / (1 + 10**-6 * fragment_ppm)):
-            database_index += 1
-        else:
-            fragment_hits.append(fragment_index)
-            db_hits.append(database_index)
-            fragment_index += 1
-            database_index += 1
-    return np.array(fragment_hits), np.array(db_hits)
-
-
-def quick_annotation_stats(analysis1, pseudo_int=10**-6):
-    logging.info("Appending stats to quick annotation")
-    b_hit_counts = np.zeros(len(analysis1.quick_annotation))
-    y_hit_counts = np.zeros(len(analysis1.quick_annotation))
-    b_mean_ppm = np.zeros(len(analysis1.quick_annotation))
-    y_mean_ppm = np.zeros(len(analysis1.quick_annotation))
-    relative_found_b_int = np.zeros(len(analysis1.quick_annotation))
-    relative_missed_b_int = np.zeros(len(analysis1.quick_annotation))
-    relative_found_y_int = np.zeros(len(analysis1.quick_annotation))
-    relative_missed_y_int = np.zeros(len(analysis1.quick_annotation))
-    relative_found_int = np.zeros(len(analysis1.quick_annotation))
-    relative_missed_int = np.zeros(len(analysis1.quick_annotation))
-    pearsons = np.zeros(len(analysis1.quick_annotation))
-    pearsons_log = np.zeros(len(analysis1.quick_annotation))
-    update_annotation(
-        range(len(analysis1.quick_annotation)),
-        analysis1.quick_annotation.db_index.values,
-        analysis1.predicted_library_df.frag_start_idx.values,
-        analysis1.predicted_library_df.frag_end_idx.values,
-        analysis1.y_mzs,
-        analysis1.b_mzs,
-        analysis1.y_ions_intensities,
-        analysis1.b_ions_intensities,
-        analysis1.quick_annotation.inet_index.values,
-        analysis1.precursor_indptr,
-        analysis1.fragment_indices,
-        analysis1.dia_data.tof_indices,
-        # analysis1.dia_data.intensity_values,#.astype(np.float64),
-        analysis1.smooth_intensity_values,#.astype(np.float64),
-        analysis1.dia_data.mz_values * (1 + analysis1.ppm_mean * 10**-6),
-        analysis1.ppm_width,
-        b_hit_counts,
-        y_hit_counts,
-        b_mean_ppm,
-        y_mean_ppm,
-        relative_found_b_int,
-        relative_missed_b_int,
-        relative_found_y_int,
-        relative_missed_y_int,
-        relative_found_int,
-        relative_missed_int,
-        pearsons,
-        pearsons_log,
-        np.float32(pseudo_int),
-    )
-    analysis1.quick_annotation["b_hit_counts"] = b_hit_counts
-    analysis1.quick_annotation["y_hit_counts"] = y_hit_counts
-    analysis1.quick_annotation["b_mean_ppm"] = b_mean_ppm
-    analysis1.quick_annotation["y_mean_ppm"] = y_mean_ppm
-    analysis1.quick_annotation["relative_found_b_int"] = relative_found_b_int
-    analysis1.quick_annotation["relative_missed_b_int"] = relative_missed_b_int
-    analysis1.quick_annotation["relative_found_y_int"] = relative_found_y_int
-    analysis1.quick_annotation["relative_missed_y_int"] = relative_missed_y_int
-    analysis1.quick_annotation["relative_found_int"] = relative_found_int
-    analysis1.quick_annotation["relative_missed_int"] = relative_missed_int
-    pearsons[~np.isfinite(pearsons)] = 0
-    analysis1.quick_annotation["pearsons"] = pearsons
-    pearsons_log[~np.isfinite(pearsons_log)] = 0
-    analysis1.quick_annotation["pearsons_log"] = pearsons_log
-
-
-@alphatims.utils.pjit
-# @alphatims.utils.njit(nogil=True)
-def cluster_peaks(
-    cycle_index,
-    indptr,
-    tof_indices,
-    intensity_values,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    connected_ions,
-    is_internal,
-):
-    len_dia_mz_cycle = len(connection_counts) - 1
-    push_offset = len_dia_mz_cycle * cycle_index + zeroth_frame * scan_max_index
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        if connection_end == connection_start:
-            continue
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        for other_connection_index in connections[connection_start: connection_end]:
-            other_push_index = push_offset + other_connection_index
-            if other_push_index <= self_push_index:
-                continue
-            if other_push_index >= len(indptr):
-                continue
-            other_start = indptr[other_push_index]
-            other_end = indptr[other_push_index + 1]
-            if other_start == other_end:
-                continue
-            self_index = self_start
-            other_index = other_start
-            while (self_index < self_end) and (other_index < other_end):
-                self_tof = tof_indices[self_index]
-                other_tof = tof_indices[other_index]
-                if is_internal[self_index] & is_internal[other_index]:
-                    if (self_tof - tof_tolerance) <= other_tof <= (self_tof + tof_tolerance):
-                        pointer = connected_ions[self_index]
-                        to_merge = True
-                        # print(self_index, other_index)
-                        while pointer != self_index:
-                            if pointer == other_index:
-                                to_merge = False
-                                break
-                            pointer = connected_ions[pointer]
-                        if to_merge:
-                            connected_ions[self_index], connected_ions[other_index] = connected_ions[other_index], connected_ions[self_index]
-                if self_tof < other_tof:
-                    self_index += 1
-                else:
-                    other_index += 1
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        for self_index in range(self_start, self_end):
-            pointer = connected_ions[self_index]
-            if pointer == self_index:
-                connected_ions[self_index] = 0
-                # TODO
-            else:
-                while pointer > 0:
-                    new_pointer = connected_ions[pointer]
-                    connected_ions[pointer] = -self_index
-                    pointer = new_pointer
-
-
-class Analysis(object):
-
-    def __init__(
-        self,
-        dia_data,
-        tof_tolerance=3,
-        cycle_tolerance=3,
-        scan_tolerance=6,
-        multiple_frames_per_cycle=False,
-        ms1=True,
-        ms2=True,
-        tof_sigma=3,
-        cycle_sigma=3,
-        scan_sigma=6,
-    ):
-        if isinstance(dia_data, str):
-            dia_data = alphatims.bruker.TimsTOF(
-                dia_data,
-            )
-        self.dia_data = dia_data
-        self.tof_tolerance = tof_tolerance
-        self.cycle_tolerance = cycle_tolerance
-        self.scan_tolerance = scan_tolerance
-        self.multiple_frames_per_cycle = multiple_frames_per_cycle
-        self.ms1 = ms1
-        self.ms2 = ms2
-        self.cycle_sigma = cycle_sigma
-        self.scan_sigma = scan_sigma
-        self.tof_sigma = tof_sigma
-        self.cycle_range = range(len(dia_data.push_indptr) // len(dia_data.dia_mz_cycle) + 1)
-        self.cycle_length = len(dia_data.dia_mz_cycle) // dia_data.scan_max_index
-        logging.info("Setting connections")
-        self.connect()
-
-    def connect(self):
-        self.connection_counts, self.connections = get_connections_within_cycle(
-            scan_tolerance=self.scan_tolerance,
-            scan_max_index=self.dia_data.scan_max_index,
-            dia_mz_cycle=self.dia_data.dia_mz_cycle,
-            multiple_frames=self.multiple_frames_per_cycle,
-            ms1=self.ms1,
-            ms2=self.ms2,
-        )
-
-    def smooth(self):
-        logging.info("Smoothing peaks")
-        self.smooth_intensity_values = tm.zeros(
-            self.dia_data.intensity_values.shape,
-            dtype=np.float32
-        )
-        self.density_values = tm.zeros(
-            self.dia_data.intensity_values.shape,
-            dtype=np.float32
-        )
-        self.neighbor_types = tm.zeros(
-            self.dia_data.intensity_values.shape,
-            dtype=np.uint8
-        )
-        smooth(
-            range(len(self.dia_data.push_indptr) // len(self.dia_data.dia_mz_cycle) + 1),
-            self.dia_data.push_indptr,
-            self.dia_data.tof_indices,
-            self.dia_data.intensity_values,
-            self.tof_tolerance,
-            self.dia_data.scan_max_index,
-            self.dia_data.zeroth_frame,
-            self.connection_counts,
-            self.connections,
-            self.cycle_tolerance,
-            self.smooth_intensity_values,
-            self.neighbor_types,
-            self.density_values,
-            self.cycle_sigma,
-            self.scan_sigma,
-            self.tof_sigma,
-        )
-        self.smooth_intensity_values += self.dia_data.intensity_values
-
-    def find_peaks(self):
-        logging.info("Finding peaks")
-        self.potential_peaks = tm.empty(
-            shape=self.smooth_intensity_values.shape,
-            dtype=np.bool_
-        )
-        self.valid_neighborhood = np.ones(2**8, dtype=np.bool_)
-        for index in range(2**8):
-            bin_repr = '{:08b}'.format(index)
-            if "00" in bin_repr:
-                self.valid_neighborhood[index] = False
-            if (index < 2**7) and (index % 2 == 0):
-                self.valid_neighborhood[index] = False
-        self.potential_peaks[:] = self.valid_neighborhood[self.neighbor_types]
-        find_seeds(
-            range(len(self.dia_data.push_indptr) // len(self.dia_data.dia_mz_cycle) + 1),
-            self.dia_data.push_indptr,
-            self.dia_data.tof_indices,
-            self.smooth_intensity_values,
-            self.tof_tolerance,
-            self.dia_data.scan_max_index,
-            self.dia_data.zeroth_frame,
-            self.connection_counts,
-            self.connections,
-            self.cycle_tolerance,
-            self.potential_peaks,
-        )
-        self.peak_collection = PeakCollection(
-            self.dia_data.push_indptr,
-            self.potential_peaks,
-        )
-
-    def cluster(self):
-        self.find_cluster_paths()
-        self.assign_internal_points()
-        self.cluster_from_paths()
-        self.find_ambiguous_cluster_overlaps()
-        self.assemble_clusters()
-        self.assign_quantifiable_clusters()
-        self.peak_collection = alphadia.smoothing.PeakCollection(
-            self.dia_data.push_indptr,
-            self.peaks,
-        )
-
-    def find_cluster_paths(self):
-        logging.info("Finding cluster paths")
-        self.cluster_path_pointers = tm.clone(np.arange(len(self.dia_data)))
-        cluster_to_max_peaks_(
-            range(len(self.dia_data.push_indptr) // len(self.dia_data.dia_mz_cycle) + 1),
-            self.dia_data.push_indptr,
-            self.dia_data.tof_indices,
-            self.dia_data.intensity_values + self.smooth_intensity_values,
-            self.tof_tolerance,
-            self.dia_data.scan_max_index,
-            self.dia_data.zeroth_frame,
-            self.connection_counts,
-            self.connections,
-            self.cycle_tolerance,
-            self.cluster_path_pointers,
-        )
-
-    def cluster_from_paths(self):
-        logging.info("Clustering from paths")
-        self.cluster_pointers = tm.clone(self.cluster_path_pointers)
-        walk_cluster_path(np.arange(10))
-        walk_cluster_path(self.cluster_pointers)
-
-    def find_ambiguous_cluster_overlaps(self):
-        logging.info("Detecting cluster ambiguities")
-        self.nonambiguous_ions = tm.ones(len(self.dia_data), dtype=np.bool_)
-        find_unique_peaks_(
-            range(len(self.dia_data.push_indptr) // len(self.dia_data.dia_mz_cycle) + 1),
-            self.dia_data.push_indptr,
-            self.dia_data.tof_indices,
-            self.dia_data.intensity_values + self.smooth_intensity_values,
-            self.tof_tolerance,
-            self.dia_data.scan_max_index,
-            self.dia_data.zeroth_frame,
-            self.connection_counts,
-            self.connections,
-            self.cycle_tolerance,
-            self.cluster_pointers,
-            self.nonambiguous_ions,
-        )
-        logging.info("Removing cluster ambiguities")
-        walk_unique_cluster_path(
-            np.arange(10),
-            np.zeros(10, dtype=np.bool_),
-            np.ones(10, dtype=np.bool_),
-        )
-        to_visit = np.ones_like(self.nonambiguous_ions)
-        walk_unique_cluster_path(
-            # range(len(to_visit)),
-            self.cluster_path_pointers,
-            self.nonambiguous_ions,
-            to_visit,
-        )
-
-    def assemble_clusters(self):
-        logging.info("Assembling clusters")
-        self.cluster_assemblies = tm.clone(self.cluster_pointers)
-        assemble_clusters(
-            self.cluster_pointers,
-            self.nonambiguous_ions,
-            self.cluster_assemblies,
-        )
-
-    def assign_internal_points(self):
-        logging.info("Assigning internal points")
-        self.internal_points = tm.empty(
-            shape=self.smooth_intensity_values.shape,
-            dtype=np.bool_
-        )
-        self.valid_neighborhood = np.ones(2**8, dtype=np.bool_)
-        for index in range(2**8):
-            bin_repr = '{:08b}'.format(index)
-            if "00" in bin_repr:
-                self.valid_neighborhood[index] = False
-            if (index < 2**7) and (index % 2 == 0):
-                self.valid_neighborhood[index] = False
-        self.internal_points[:] = self.valid_neighborhood[self.neighbor_types]
-
-    def assign_quantifiable_clusters(self):
-        logging.info("Assigning quantifiable clusters")
-        unique_peaks = np.unique(self.cluster_pointers)
-        self.peaks = unique_peaks[
-            (self.nonambiguous_ions & self.internal_points)[unique_peaks]
-        ]
-
-    def create_precursor_centric_ion_network(self):
-        import multiprocessing
-        logging.info("Creating net")
-
-        def starfunc(cycle_index):
-            return create_precursor_centric_ion_network(
-                cycle_index,
-                self.peak_collection.indices,
-                self.peak_collection.indptr,
-                self.dia_data.zeroth_frame,
-                self.dia_data.scan_max_index,
-                self.scan_tolerance,
-                self.cycle_tolerance,
-                self.dia_data.dia_mz_cycle,
-                self.dia_data.mz_values,
-                self.dia_data.tof_indices,
-                np.isin(self.peak_collection.indices, self.mono_isotopes)
-            )
-
-        precursor_indices = []
-        precursor_counts = [[0]]
-        fragment_indices = []
-
-        with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-            for (
-                precursor_indices_,
-                precursor_counts_,
-                fragment_indices_,
-            ) in alphatims.utils.progress_callback(
-                pool.imap(starfunc, self.cycle_range),
-                total=len(self.cycle_range),
-                include_progress_callback=True
-            ):
-                precursor_indices.append(precursor_indices_)
-                precursor_counts.append(precursor_counts_)
-                fragment_indices.append(fragment_indices_)
-
-        # for cycle_index in alphatims.utils.progress_callback(self.cycle_range):
-        #     (
-        #         precursor_indices_,
-        #         precursor_counts_,
-        #         fragment_indices_,
-        #     ) = create_precursor_centric_ion_network(
-        #         cycle_index,
-        #         self.peak_collection.indices,
-        #         self.peak_collection.indptr,
-        #         self.dia_data.zeroth_frame,
-        #         self.dia_data.scan_max_index,
-        #         self.scan_tolerance,
-        #         self.cycle_tolerance,
-        #         self.dia_data.dia_mz_cycle,
-        #         self.dia_data.mz_values,
-        #         self.dia_data.tof_indices,
-        #         np.isin(self.peak_collection.indices, self.mono_isotopes)
-        #     )
-        #     precursor_indices.append(precursor_indices_)
-        #     precursor_counts.append(precursor_counts_)
-        #     fragment_indices.append(fragment_indices_)
-
-        precursor_indices = np.concatenate(precursor_indices)
-        precursor_counts = np.cumsum(np.concatenate(precursor_counts))
-        fragment_indices = np.concatenate(fragment_indices)
-        self.precursor_indices = tm.clone(precursor_indices)
-        self.precursor_indptr = tm.clone(precursor_counts)
-        self.fragment_indices = tm.clone(fragment_indices)
-        self.set_fragment_weights()
-
-    def find_unfragmented_precursors(self):
-        unfragmented_precursors = []
-
-        for cycle_index in alphatims.utils.progress_callback(self.cycle_range):
-            unfragmented_precursors_ = find_unfragmented_precursors(
-                cycle_index,
-                self.peak_collection.indices,
-                self.peak_collection.indptr,
-                self.dia_data.zeroth_frame,
-                self.dia_data.scan_max_index,
-                self.scan_tolerance,
-                self.cycle_tolerance,
-                self.dia_data.dia_mz_cycle,
-                self.dia_data.tof_indices,
-                self.tof_tolerance,
-            )
-            unfragmented_precursors.append(unfragmented_precursors_)
-
-        unfragmented_precursors = np.concatenate(unfragmented_precursors)
-        self.unfragmented_precursors = tm.clone(unfragmented_precursors)
-
-    def get_inet_counts(self):
-        self.inet_indptr = tm.zeros(
-            self.dia_data.intensity_values.shape,
-            dtype=np.int64
-        )
-        inet_counts(
-            range(len(self.dia_data.push_indptr) // len(self.dia_data.dia_mz_cycle) + 1),
-            self.dia_data.push_indptr,
-            self.dia_data.tof_indices,
-            self.dia_data.intensity_values,
-            self.tof_tolerance,
-            self.dia_data.scan_max_index,
-            self.dia_data.zeroth_frame,
-            self.connection_counts,
-            self.connections,
-            self.cycle_tolerance,
-            self.inet_indptr,
-            self.potential_peaks,
-        )
-
-    def determine_mono_isotopes(self, isotope_mz_tolerance=0.01):
-        logging.info("Determining mono isotopes")
-        self.isotope_mz_tolerance = isotope_mz_tolerance
-        logging.info("Charge 2")
-        left_connection, right_connection = create_isotopic_pairs(
-            self,
-            difference=1/2,
-            mz_tolerance=isotope_mz_tolerance,
-        )
-        self.mono_isotopes_charge2 = tm.clone(
-            self.peak_collection.indices[
-                left_connection[
-                    ~np.isin(
-                        left_connection, right_connection
-                    ) & np.isin(
-                        right_connection, left_connection
-                    )
-                ]
-            ]
-        )
-        logging.info("Charge 3")
-        left_connection, right_connection = create_isotopic_pairs(
-            self,
-            difference=1/3,
-            mz_tolerance=isotope_mz_tolerance,
-        )
-        self.mono_isotopes_charge3 = tm.clone(
-            self.peak_collection.indices[
-                left_connection[
-                    ~np.isin(
-                        left_connection, right_connection
-                    ) & np.isin(
-                        right_connection, left_connection
-                    )
-                ]
-            ]
-        )
-        self.mono_isotopes = np.unique(
-            np.concatenate(
-                [
-                    self.mono_isotopes_charge2,
-                    self.mono_isotopes_charge3,
-                ]
-            )
-        )
-
-    def set_fragment_weights(self):
-        logging.info("Setting fragment weights")
-        dia_data = self.dia_data
-        fdf = pd.DataFrame(
-            dia_data.convert_from_indices(
-                self.fragment_indices,
-                return_scan_indices=True,
-                return_push_indices=True,
-            )
-        )
-        pdf = pd.DataFrame(
-            dia_data.convert_from_indices(
-                np.repeat(
-                    self.precursor_indices,
-                    np.diff(self.precursor_indptr)
-                ),
-                return_scan_indices=True,
-                return_push_indices=True,
-            )
-        )
-        pdf["cycle"] = (pdf.push_indices - dia_data.zeroth_frame * dia_data.scan_max_index) // dia_data.dia_mz_cycle.shape[0]
-        fdf["cycle"] = (fdf.push_indices - dia_data.zeroth_frame * dia_data.scan_max_index) // dia_data.dia_mz_cycle.shape[0]
-        self.fragment_frequencies = (
-            np.exp(
-                -((pdf.scan_indices - fdf.scan_indices) / self.scan_sigma)**2 / 2
-            ) * np.exp(
-                -((pdf.cycle - fdf.cycle) / self.cycle_sigma)**2 / 2
-            )
-        ).values
-
-    def add_library(self, library_file_name):
-        logging.info("Loading library")
-        self.library_file_name = library_file_name
-        self.lib = alphabase.io.hdf.HDF_File(
-            self.library_file_name
-        #     read_only=False
-        )
-
-        predicted_library_df = self.lib.library.precursor_df[...]
-        # predicted_library_df.sort_values(by=["rt_pred", "mobility_pred"], inplace=True)
-        predicted_library_df.sort_values(by="precursor_mz", inplace=True)
-        predicted_library_df.reset_index(level=0, inplace=True)
-        predicted_library_df.rename(columns={"index": "original_index"}, inplace=True)
-        predicted_library_df.decoy = predicted_library_df.decoy.astype(np.bool_)
-
-        self.y_mzs = self.lib.library.fragment_mz_df.y_z1.mmap
-        self.b_mzs = self.lib.library.fragment_mz_df.b_z1.mmap
-        self.y_ions_intensities = self.lib.library.fragment_intensity_df.y_z1.mmap
-        self.b_ions_intensities = self.lib.library.fragment_intensity_df.b_z1.mmap
-
-        self.predicted_library_df = predicted_library_df
-
-    def quick_annotate(
-        self,
-        precursor_ppm=50,
-        fragment_ppm=50,
-        min_size=10,
-        ppm_mean=0,
-        min_hit_count=1,
-        append_stats=True,
-        top_n_hits=1,
-    ):
-        logging.info(f"Quick library annotation of mono isotopes with {ppm_mean=} and {precursor_ppm=}")
-        o = np.argsort(self.dia_data.tof_indices[self.precursor_indices])
-        mz_values = self.dia_data.mz_values * (1 + ppm_mean * 10**-6)
-        p_mzs = mz_values[
-            self.dia_data.tof_indices[self.precursor_indices][o]
-        ]
-        lower = np.empty(len(self.precursor_indices), dtype=np.int64)
-        upper = np.empty(len(self.precursor_indices), dtype=np.int64)
-        lower[o] = np.searchsorted(
-            self.predicted_library_df.precursor_mz.values,
-            p_mzs / (1 + precursor_ppm * 10**-6)
-        )
-        upper[o] = np.searchsorted(
-            self.predicted_library_df.precursor_mz.values,
-            p_mzs * (1 + precursor_ppm * 10**-6)
-        )
-        logging.info(
-            f"PSMs to test: {np.sum(((upper - lower) * (np.diff(self.precursor_indptr) >= min_size)))}"
-        )
-        (
-            precursor_indices,
-            precursor_indptr,
-            hit_counts,
-            frequency_counts,
-            db_indices,
-        ) = annotate(
-            range(len(lower)),
-            self.predicted_library_df.frag_start_idx.values,
-            self.predicted_library_df.frag_end_idx.values,
-            self.fragment_indices,
-            self.fragment_frequencies,
-            self.precursor_indptr,
-            mz_values,
-            self.dia_data.tof_indices,
-            fragment_ppm,
-            lower,
-            upper,
-            self.y_mzs,
-            self.b_mzs,
-            min_size,
-            min_hit_count,
-            top_n_hits,
-        )
-
-        precursor_selection = np.repeat(precursor_indices, precursor_indptr)
-        hits = self.dia_data.as_dataframe(self.precursor_indices[precursor_selection])
-        hits["inet_index"] = precursor_selection
-        hits["candidates"] = (upper - lower)[precursor_selection]
-        hits["total_peaks"] = np.diff(self.precursor_indptr)[precursor_selection]
-        hits["db_index"] = db_indices.astype(np.int64)
-        # hits["counts"] = np.repeat(hit_counts, precursor_indptr)
-        hits["counts"] = hit_counts
-        hits["frequency_counts"] = frequency_counts
-        self.quick_annotation = hits
-        self.quick_annotation["smooth_intensity"] = self.smooth_intensity_values[
-            self.quick_annotation.raw_indices
-        ]
-        self.quick_annotation = self.quick_annotation.join(self.predicted_library_df, on="db_index")
-        self.quick_annotation["im_diff"] = self.quick_annotation.mobility_pred - self.quick_annotation.mobility_values
-        self.quick_annotation["mz_diff"] = self.quick_annotation.precursor_mz - self.quick_annotation.mz_values
-        self.quick_annotation["ppm_diff"] = self.quick_annotation.mz_diff / self.quick_annotation.precursor_mz * 10**6
-        self.quick_annotation["target"] = ~self.quick_annotation.decoy
-        self.quick_annotation.reset_index(drop=True, inplace=True)
-        if append_stats:
-            quick_annotation_stats(self)
-
-    def estimate_mz_tolerance(self):
-        ppm_diffs = self.quick_annotation.ppm_diff
-        order = np.argsort(ppm_diffs.values)
-
-        decoys, targets = np.bincount(self.quick_annotation.decoy.values)
-        distribution = np.cumsum(
-            [
-                1 / targets if i else -1 / decoys for i in self.quick_annotation.decoy.values[order]
-            ]
-        )
-        low = ppm_diffs[order[np.argmin(distribution)]]
-        high = ppm_diffs[order[np.argmax(distribution)]]
-        self.ppm_mean = (low + high) / 2
-        self.ppm_width = abs(high - low)
-        # plt.plot(
-        #     ppm_diffs[order],
-        #     distribution,
-        # )
-        # sns.histplot(
-        #     data=self.quick_annotation,
-        #     x="ppm_diff",
-        #     hue="decoy",
-        # )
-
-    def quick_calibration(
-        self,
-        fdr=0.01,
-        train_fdr_level_pre_calibration=0.1,
-        train_fdr_level_post_calibration=0.1,
-        n_neighbors=4,
-        test_size=0.8,
-        random_state=0,
-    ):
-        val_names = [
-            "counts",
-            "frequency_counts",
-            "ppm_diff",
-            "im_diff",
-            "charge",
-            "total_peaks",
-            "nAA",
-            "b_hit_counts",
-            "y_hit_counts",
-            "b_mean_ppm",
-            "y_mean_ppm",
-            "relative_found_b_int",
-            "relative_missed_b_int",
-            "relative_found_y_int",
-            "relative_missed_y_int",
-            "relative_found_int",
-            "relative_missed_int",
-            "pearsons",
-            "pearsons_log",
-            "candidates",
-        ]
-        logging.info("Calculating quick log odds")
-        score_df = self.quick_annotation.copy()
-        log_odds = calculate_log_odds_product(
-            score_df,
-            val_names,
-        )
-        score_df["log_odds"] = log_odds
-        # score_df = alphadia.prefilter.train_and_score(
-        #     score_df,
-        #     val_names,
-        #     ini_score="log_odds",
-        #     train_fdr_level=train_fdr_level_pre_calibration,
-        # ).reset_index(drop=True)
-        score_df = alphadia.library.get_q_values(score_df, "log_odds", 'decoy', drop=True)
-        score_df_above_fdr = score_df[
-            (score_df.q_value < fdr) & (score_df.target)
-        ].reset_index(drop=True)
-        logging.info(f"Found {len(score_df_above_fdr)} targets for calibration")
-        score_df_above_fdr["im_pred"] = score_df_above_fdr.mobility_pred
-        score_df_above_fdr["im_values"] = score_df_above_fdr.mobility_values
-        self.predictors = {}
-        for dimension in ["rt", "im"]:
-            X = score_df_above_fdr[f"{dimension}_pred"].values.reshape(-1, 1)
-            y = score_df_above_fdr[f"{dimension}_values"].values
-            (
-                X_train,
-                X_test,
-                y_train,
-                y_test
-            ) = sklearn.model_selection.train_test_split(
-                X,
-                y,
-                test_size=test_size,
-                random_state=random_state,
-            )
-            self.predictors[dimension] = sklearn.neighbors.KNeighborsRegressor(
-                n_neighbors=n_neighbors,
-                # weights="distance",
-                n_jobs=alphatims.utils.set_threads(alphatims.utils.MAX_THREADS)
-            )
-            self.predictors[dimension].fit(X_train, y_train)
-            score_df_above_fdr[f"{dimension}_calibrated"] = self.predictors[dimension].predict(
-                score_df_above_fdr[f"{dimension}_pred"].values.reshape(-1, 1)
-            )
-            score_df_above_fdr[f"{dimension}_diff"] = score_df_above_fdr[f"{dimension}_values"] - score_df_above_fdr[f"{dimension}_calibrated"]
-        score_df["rt_calibrated"] = self.predictors["rt"].predict(
-            score_df.rt_pred.values.reshape(-1, 1)
-        )
-        score_df["im_calibrated"] = self.predictors["im"].predict(
-            score_df.mobility_pred.values.reshape(-1, 1)
-        )
-        ppm_mean = np.mean(score_df_above_fdr.ppm_diff.values)
-        score_df["mz_calibrated"] = score_df.precursor_mz * (
-            1 - ppm_mean * 10**-6
-        )
-
-        score_df["ppm_diff_calibrated"] = (score_df.mz_calibrated - score_df.mz_values) / score_df.mz_calibrated * 10**6
-        score_df["rt_diff_calibrated"] = score_df.rt_calibrated - score_df.rt_values
-        score_df["im_diff_calibrated"] = score_df.im_calibrated - score_df.mobility_values
-        self.score_df = alphadia.prefilter.train_and_score(
-            # score_df[np.abs(score_df.rt_diff_calibrated) < 250].reset_index(drop=True),
-            score_df,
-            [
-                "counts",
-                "frequency_counts",
-                "ppm_diff_calibrated",
-                "im_diff_calibrated",
-                "rt_diff_calibrated",
-                "charge",
-                "total_peaks",
-                "nAA",
-                "b_hit_counts",
-                "y_hit_counts",
-                "b_mean_ppm",
-                "y_mean_ppm",
-                "relative_found_b_int",
-                "relative_missed_b_int",
-                "relative_found_y_int",
-                "relative_missed_y_int",
-                "relative_found_int",
-                "relative_missed_int",
-                "pearsons",
-                "pearsons_log",
-                "candidates",
-                # "log_odds",
-            ],
-            ini_score="log_odds",
-            train_fdr_level=train_fdr_level_post_calibration,
-        ).reset_index(drop=True)
-
-        self.score_df["target_type"] = np.array([-1, 0])[
-            self.score_df.target.astype(np.int)
-        ]
-        self.score_df["target_type"][
-            (self.score_df.q_value < fdr) & (self.score_df.target)
-        ] = 1
-
-    def preprocess(
-        self
-    ):
-        self.smooth()
-        self.find_peaks()
-        self.determine_mono_isotopes()
-        self.create_precursor_centric_ion_network()
-        # self.save()
-
-    def save(self):
-        hdf = alphabase.io.hdf.HDF_File(
-            f"sandbox_{self.dia_data.sample_name}_analysis.hdf",
-            read_only=False,
-            truncate=True,
-        )
-        hdf.preprocessing = {
-            "smooth_intensity_values": self.smooth_intensity_values,
-            "neighbor_types": self.neighbor_types,
-            "density_values": self.density_values,
-            "potential_peaks": self.potential_peaks,
-            "peak_collection": {
-                "indptr": self.peak_collection.indptr,
-                "indices": self.peak_collection.indices,
-            },
-            "isotopes": {
-                "mono_isotopes_charge2": self.mono_isotopes_charge2,
-                "mono_isotopes_charge3": self.mono_isotopes_charge3,
-                "mono_isotopes": self.mono_isotopes,
-            },
-            "pseudo_msms_spectra": {
-                "precursor_indices": self.precursor_indices,
-                "precursor_indptr": self.precursor_indptr,
-                "fragment_indices": self.fragment_indices,
-            },
-            "connections": {
-                "connection_counts": self.connection_counts,
-                "connections": self.connections,
-            },
-            "tof_tolerance": self.tof_tolerance,
-            "cycle_tolerance": self.cycle_tolerance,
-            "scan_tolerance": self.scan_tolerance,
-            "multiple_frames_per_cycle": self.multiple_frames_per_cycle,
-            "ms1": self.ms1,
-            "ms2": self.ms2,
-            "cycle_sigma": self.cycle_sigma,
-            "scan_sigma": self.scan_sigma,
-            "tof_sigma": self.tof_sigma,
-            # "cycle_range": self.cycle_range,
-            "cycle_length": self.cycle_length,
-        }
-        hdf.annotation = {
-            "fragment_frequencies": self.fragment_frequencies,
-            "quick_annotation": self.quick_annotation,
-            "score_df": self.score_df,
-            "ppm_width": self.ppm_width,
-            "ppm_mean": self.ppm_mean,
-        }
-
-
-class PeakCollection(object):
-
-    def __init__(
-        self,
-        indptr: np.ndarray,
-        peaks: np.ndarray,
-    ):
-        if peaks.dtype == np.bool_:
-            self.indices = tm.clone(np.flatnonzero(peaks))
-        else:
-            self.indices = tm.clone(peaks)
-        self.indptr = tm.empty(indptr.shape, indptr.dtype)
-        set_peak_indptr(indptr, self.indptr, self.indices)
-
-
-@alphatims.utils.njit
-def set_peak_indptr(old_indptr, new_indptr, indices):
-    count = 0
-    offset = 0
-    for index in indices:
-        while index >= old_indptr[offset]:
-            new_indptr[offset] = count
-            offset += 1
-        count += 1
-    while index >= old_indptr[offset]:
-        new_indptr[offset] = count
-        offset += 1
-    new_indptr[offset:] = count
-
-
-@alphatims.utils.pjit
-# @alphatims.utils.njit(nogil=True)
-def match(
-    index,
-    indices1,
-    indices2,
-    indptr1,
-    indptr2,
-    fragments1,
-    fragments2,
-    tof_indices1,
-    tof_indices2,
-    mz_values1,
-    mz_values2,
-    fragment_ppm,
-    overlaps,
-    fragment_hits1,
-    fragment_hits2,
-):
-    precursor1 = indices1[index]
-    start1 = indptr1[precursor1]
-    end1 = indptr1[precursor1 + 1]
-    frags1 = fragments1[start1: end1]
-    tofs1 = tof_indices1[frags1]
-    mzs1 = mz_values1[tofs1]
-    order1 = np.argsort(mzs1)
-    precursor2 = indices2[index]
-    start2 = indptr2[precursor2]
-    end2 = indptr2[precursor2 + 1]
-    frags2 = fragments2[start2: end2]
-    tofs2 = tof_indices2[frags2]
-    mzs2 = mz_values2[tofs2]
-    order2 = np.argsort(mzs2)
-    index1 = 0
-    index2 = 0
-    hits = 0
-    while (index1 < len(mzs1)) and (index2 < len(mzs2)):
-        fragment_mz = mzs1[order1[index1]]
-        database_mz = mzs2[order2[index2]]
-        if fragment_mz < (database_mz / (1 + 10**-6 * fragment_ppm)):
-            index1 += 1
-        elif database_mz < (fragment_mz / (1 + 10**-6 * fragment_ppm)):
-            index2 += 1
-        else:
-            hits += 1
-            fragment_hits1[start1 + order1[index1]] = True
-            fragment_hits2[start2 + order2[index2]] = True
-            index1 += 1
-            index2 += 1
-    overlaps[index] = hits
-
-
-
-@alphatims.utils.njit(nogil=True)
-def rough_match2_count_only(
-    fragment_mzs,
-    database_mzs,
-    fragment_ppm,
-):
-    fragment_index = 0
-    database_index = 0
-    hits = 0
-    while (fragment_index < len(fragment_mzs)) and (database_index < len(database_mzs)):
-        fragment_mz = fragment_mzs[fragment_index]
-        database_mz = database_mzs[database_index]
-        if fragment_mz < (database_mz / (1 + 10**-6 * fragment_ppm)):
-            fragment_index += 1
-        elif database_mz < (fragment_mz / (1 + 10**-6 * fragment_ppm)):
-            database_index += 1
-        else:
-            hits += 1
-            fragment_index += 1
-            database_index += 1
-    return hits
-
-
-def align(
-    analysis1,
-    analysis2,
-    ppm=30,
-    fragment_ppm=30
-):
-    logging.info("Aligning samples")
-    df1 = analysis1.dia_data.as_dataframe(
-        analysis1.precursor_indices
-    )
-    df1.sort_values(by="mz_values", inplace=True)
-    df1.reset_index(inplace=True)
-    df2 = analysis2.dia_data.as_dataframe(
-        analysis2.precursor_indices
-    )
-    df2.sort_values(by="mz_values", inplace=True)
-    df2.reset_index(inplace=True)
-    mz1 = df1.mz_values.values
-    mz2 = df2.mz_values.values
-    lower = np.searchsorted(mz1, mz2 / (1 + ppm*10**-6))
-    upper = np.searchsorted(mz1, mz2 * (1 + ppm*10**-6))
-    indices2 = np.repeat(df2["index"].values, upper - lower)
-    indices1 = np.concatenate(
-        [
-            df1["index"].values[l:h] for l, h in zip(lower, upper)
-        ]
-    )
-    overlaps = tm.empty(len(indices1), dtype=np.int16)
-    fragment_hits1 = tm.zeros(len(analysis1.fragment_indices), dtype=np.bool_)
-    fragment_hits2 = tm.zeros(len(analysis2.fragment_indices), dtype=np.bool_)
-    match(
-        range(len(overlaps)),
-        indices1,
-        indices2,
-        analysis1.precursor_indptr,
-        analysis2.precursor_indptr,
-        analysis1.fragment_indices,
-        analysis2.fragment_indices,
-        analysis1.dia_data.tof_indices,
-        analysis2.dia_data.tof_indices,
-        analysis1.dia_data.mz_values,
-        analysis2.dia_data.mz_values,
-        fragment_ppm,
-        overlaps,
-        fragment_hits1,
-        fragment_hits2,
-    )
-    alignment_indptr = np.empty(len(upper) + 1, dtype=np.int64)
-    alignment_indptr[1:] = np.cumsum(upper - lower)
-    alignment_indptr[0] = 0
-    best = np.array(
-        [
-            start + np.argmax(overlaps[start:end]) for start, end in zip(
-                alignment_indptr[:-1],
-                alignment_indptr[1:]
-            ) if end > start
-        ]
-    )
-    overlaps = tm.empty(len(indices1), dtype=np.int16)
-    fragment_hits1 = tm.zeros(len(analysis1.fragment_indices), dtype=np.bool_)
-    fragment_hits2 = tm.zeros(len(analysis2.fragment_indices), dtype=np.bool_)
-    match(
-        range(len(best)),
-        indices1[best],
-        indices2[best],
-        analysis1.precursor_indptr,
-        analysis2.precursor_indptr,
-        analysis1.fragment_indices,
-        analysis2.fragment_indices,
-        analysis1.dia_data.tof_indices,
-        analysis2.dia_data.tof_indices,
-        analysis1.dia_data.mz_values,
-        analysis2.dia_data.mz_values,
-        fragment_ppm,
-        overlaps,
-        fragment_hits1,
-        fragment_hits2,
-    )
-    return fragment_hits1, fragment_hits2
-
-
-def run_flow(file_name):
-    analysis1 = alphadia.smoothing.Analysis(
-        file_name,
-        tof_tolerance=3,
-        cycle_tolerance=3,
-        scan_tolerance=6,
-        multiple_frames_per_cycle=False,
-        ms1=True,
-        ms2=True,
-        tof_sigma=3,
-        cycle_sigma=3,
-        scan_sigma=6,
-    )
-    analysis1.preprocess()
-    analysis1.fragment_frequencies = np.ones(len(analysis1.fragment_indices))
-    analysis1.add_library(
-        "/Users/swillems/Data/peptide_centric/FZW_predicted_spec_libs/human_reviewed_fasta_regular_w_decoy.speclib.hdf"
-    )
-    analysis1.quick_annotate(
-        precursor_ppm=50,
-        fragment_ppm=50,
-        min_size=5,
-        min_hit_count=3,
-        append_stats=False,
-    )
-    analysis1.estimate_mz_tolerance()
-    analysis1.quick_annotate(
-        precursor_ppm=analysis1.ppm_width,
-        fragment_ppm=analysis1.ppm_width,
-        ppm_mean=analysis1.ppm_mean,
-        min_size=5,
-        min_hit_count=3,
-    )
-    fdr = 0.01
-    analysis1.quick_calibration(
-        fdr=fdr,
-        train_fdr_level_pre_calibration=1,
-        train_fdr_level_post_calibration=0.1,
-        n_neighbors=4,
-        test_size=0.8,
-        random_state=0,
-    )
-    new_lib = analysis1.score_df[
-        (analysis1.score_df.q_value < fdr) & (analysis1.score_df.target)
-    ]
-    return new_lib
-
-
-def calculate_odds(df, column_name, *, target_name="target", smooth=1, plot=False):
-    order = np.argsort(df[column_name].values)
-    negatives, positives = np.bincount(df.target.values)
-    tp_count = positives - negatives
-    n = int(tp_count * smooth)
-    forward = np.cumsum(df[target_name].values[order])
-    odds = np.zeros_like(forward, dtype=np.float)
-    odds[n:-n] = forward[2*n:] - forward[:-2*n]
-    odds[:n] = forward[n:2*n]
-    odds[-n:] = forward[-1] - forward[-2*n:-n]
-    odds[n:-n] /= 2*n
-    odds[:n] /= np.arange(n, 2*n)
-    odds[-n:] /= np.arange(n, 2*n)[::-1]
-    odds /= (1 - odds)
-    odds = odds[np.argsort(order)]
-    if plot:
-        import matplotlib.pyplot as plt
-        plt.scatter(df[column_name], odds, marker=".")
-    return odds
-
-
-def calculate_log_odds_product(
-    df_,
-    val_names
-):
-    df = df_[val_names]
-    df = sklearn.preprocessing.StandardScaler().fit_transform(df)
-    pca = sklearn.decomposition.PCA(n_components=df.shape[1])
-    pca.fit(df)
-    df = pd.DataFrame(pca.transform(df))
-    df["target"] = df_.target
-    negative, positive = np.bincount(df.target)
-    log_odds = np.zeros(len(df))
-    for val_name in range(df.shape[1] - 1):
-        odds = alphadia.smoothing.calculate_odds(df, val_name, smooth=1)
-        log_odds += np.log2(odds) * pca.explained_variance_[val_name]
-    return log_odds
-    # new_df = analysis1.score_df[["decoy", "target"]]
-    # new_df['odds'] = log_odds
-    # new_df = alphadia.library.get_q_values(new_df, "odds", 'decoy', drop=True)
-    # new_df.reset_index(drop=True, inplace=True)
-
-
-def deconvolute_frame_groups(
-    analysis1,
-    ppm=20,
-    tolerance=1,
-):
-    import multiprocessing
-
-    def starfunc(index):
-        return deconvolute_frame_groups_(
-            index,
-            analysis1,
-            ppm,
-            tolerance,
-        )
-    fragment_indices = []
-    precursor_counts = 0
-    precursor_indptr = [precursor_counts]
-    # iterable = range(100)
-    iterable = range(len(analysis1.precursor_indices))
-    with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-        for reproducible_fragments in alphatims.utils.progress_callback(
-            pool.imap(starfunc, iterable),
-            total=len(iterable),
-            include_progress_callback=True
-        ):
-            fragment_indices.append(reproducible_fragments)
-            precursor_counts += len(reproducible_fragments)
-            precursor_indptr.append(precursor_counts)
-    return (
-        tm.clone(np.array(precursor_indptr)),
-        np.concatenate(fragment_indices),
-    )
-
-
-def deconvolute_frame_groups_(
-    prec_index,
-    analysis1,
-    ppm,
-    tolerance
-):
-    start = analysis1.precursor_indptr[prec_index]
-    end = analysis1.precursor_indptr[prec_index + 1]
-    frags = analysis1.fragment_indices[start:end]
-    # df = dia_data.as_dataframe(frags)
-    # df.sort_values(by="mz_values", inplace=True)
-    # df.reset_index(inplace=True, drop=True)
-    # # df["cycle"] = (df.push_indices - dia_data.zeroth_frame * dia_data.scan_max_index) // dia_data.dia_mz_cycle.shape[0]
-    # # df["frame_group"] = df.precursor_indices
-    # to_keep = match_frame_groups(
-    #     df.mz_values.values,
-    #     df.precursor_indices.values,
-    #     df.intensity_values.values,
-    #     df.raw_indices.values,
-    #     ppm,
-    # )
-    coordinates = analysis1.dia_data.convert_from_indices(
-         frags,
-         return_raw_indices=True,
-         return_precursor_indices=True,
-         return_mz_values=True,
-         return_intensity_values=True,
-         raw_indices_sorted=True,
-    )
-    order = np.argsort(coordinates["mz_values"])
-    to_keep = match_frame_groups(
-        coordinates["mz_values"][order],
-        coordinates["precursor_indices"][order],
-        coordinates["intensity_values"][order],
-        coordinates["raw_indices"][order],
-        ppm,
-        tolerance,
-    )
-    return to_keep
-
-
-@alphatims.utils.njit(nogil=True)
-def match_frame_groups(
-    # df,
-    mz_values,
-    frame_groups,
-    intensity_values,
-    raw_indices,
-    ppm,
-    tolerance
-):
-    unique_frame_groups = len(np.unique(frame_groups))
-    if tolerance < 1:
-        tolerance *= unique_frame_groups
-    index2 = 0
-    to_keep = []
-    for index1, mz1 in enumerate(mz_values[:-1]):
-        if index1 < index2:
-            continue
-        prev_mz = mz1
-        for index2, mz2 in enumerate(mz_values[index1 + 1:], index1 + 1):
-            if ((mz2 - prev_mz) / prev_mz) * 10**6 > ppm:
-                break
-            prev_mz = mz2
-        detected_frame_groups = np.unique(frame_groups[index1: index2])
-        if (unique_frame_groups - len(detected_frame_groups)) <= tolerance:
-            max_intensity = np.argmax(intensity_values[index1: index2])
-            raw_index = raw_indices[index1: index2][max_intensity]
-            to_keep.append(raw_index)
-    to_keep = np.sort(np.array(to_keep))
-    return to_keep
-
-
-
-@alphatims.utils.njit(nogil=True)
-def match_frame_groups_frequency(
-    # df,
-    mz_values,
-    frame_groups,
-    intensity_values,
-    raw_indices,
-    ppm,
-    tolerance
-):
-    to_keep = []
-    index2 = 0
-    for index1, mz1 in enumerate(mz_values[:-1]):
-        if index1 < index2:
-            continue
-        prev_mz = mz1
-        for index2, mz2 in enumerate(mz_values[index1 + 1:], index1 + 1):
-            if ((mz2 - prev_mz) / prev_mz) * 10**6 > ppm:
-                break
-            prev_mz = mz2
-        detected_frame_groups = len(np.unique(frame_groups[index1: index2]))
-        for index in range(index1, index2):
-            to_keep.append(detected_frame_groups)
-    if index2 != len(mz_values):
-        to_keep.append(1)
-    return np.array(to_keep)
-
-
-def deconvolute_frame_groups_frequencies_(
-    prec_index,
-    analysis1,
-    ppm,
-    tolerance
-):
-    start = analysis1.precursor_indptr[prec_index]
-    end = analysis1.precursor_indptr[prec_index + 1]
-    frags = analysis1.fragment_indices[start:end]
-    coordinates = analysis1.dia_data.convert_from_indices(
-         frags,
-         return_raw_indices=True,
-         return_precursor_indices=True,
-         return_mz_values=True,
-         return_intensity_values=True,
-         raw_indices_sorted=True,
-    )
-    frequencies = np.empty(len(frags), dtype=np.int64)
-    order = np.argsort(coordinates["mz_values"])
-    frequencies[order] = match_frame_groups_frequency(
-        coordinates["mz_values"][order],
-        coordinates["precursor_indices"][order],
-        coordinates["intensity_values"][order],
-        coordinates["raw_indices"][order],
-        ppm,
-        tolerance,
-    )
-    return frequencies
-
-
-def deconvolute_frame_groups_frequencies(
-    analysis1,
-    ppm=20,
-    tolerance=1,
-):
-    import multiprocessing
-
-    def starfunc(index):
-        return deconvolute_frame_groups_frequencies_(
-            index,
-            analysis1,
-            ppm,
-            tolerance,
-        )
-    fragment_indices = []
-    # iterable = range(100)
-    iterable = range(len(analysis1.precursor_indices))
-    with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-        for reproducible_fragments in alphatims.utils.progress_callback(
-            pool.imap(starfunc, iterable),
-            total=len(iterable),
-            include_progress_callback=True
-        ):
-            fragment_indices.append(reproducible_fragments)
-    return np.concatenate(fragment_indices)
-
-
-@alphatims.utils.njit(nogil=True)
-def match_ms1_to_ms2_(
-    cycle_index,
-    indices,
-    indptr,
-    zeroth_frame,
-    scan_max_index,
-    ppm_tolerance,
-    scan_tolerance,
-    cycle_tolerance,
-    mz_windows,
-    mz_values,
-    tof_indices,
-    is_mono,
-):
-    cycle_length = len(mz_windows)
-    frame_count = cycle_length // scan_max_index
-    push_offset = cycle_length * cycle_index + zeroth_frame * scan_max_index
-    precursor_indices = []
-    precursor_count = []
-    fragment_indices = []
-    for self_push_offset in np.flatnonzero(mz_windows[:, 0] == -1):
-        self_push_index = push_offset + self_push_offset
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        self_scan = self_push_offset % scan_max_index
-        for precursor_index_ in range(self_start, self_end):
-            if not is_mono[precursor_index_]:
-                continue
-            precursor_index = indices[precursor_index_]
-            precursor_mz = mz_values[tof_indices[precursor_index]]
-            hits = 0
-            for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-                for frame_offset in range(frame_count):
-                    for scan_offset in range(-scan_tolerance, scan_tolerance + 1):
-                        other_scan = self_scan + scan_offset
-                        if not (0 <= other_scan < scan_max_index):
-                            continue
-                    # for other_scan in range(scan_max_index):
-                        other_push_offset = frame_offset * scan_max_index + other_scan
-                        low_mz, high_mz = mz_windows[other_push_offset]
-                        if low_mz == -1:
-                            continue
-                        other_push_index = push_offset + other_push_offset + cycle_length * cycle_offset
-                        if not (0 <= other_push_index < len(indptr)):
-                            continue
-                        other_start = indptr[other_push_index]
-                        other_end = indptr[other_push_index + 1]
-                        for fragment_index_ in range(other_start, other_end):
-                            fragment_index = indices[fragment_index_]
-                            fragment_mz = mz_values[tof_indices[fragment_index]]
-                            # if np.abs(fragment_mz - precursor_mz) / precursor_mz * 10**6 < ppm_tolerance:
-                            if is_within_ppm_tolerance(fragment_mz, precursor_mz, ppm_tolerance):
-                                fragment_indices.append(fragment_index)
-                                hits += 1
-            if hits > 0:
-                precursor_indices.append(precursor_index)
-                precursor_count.append(hits)
-    return (
-        np.array(precursor_indices),
-        np.array(precursor_count),
-        np.array(fragment_indices),
-    )
-
-
-@alphatims.utils.njit(nogil=True)
-def is_within_ppm_tolerance(mz1, mz2, ppm):
-    return np.abs(mz1 - mz2) / mz2 * 10**6 < ppm
-
-
-
-def match_ms1_to_ms2(self, ppm):
-    import multiprocessing
-    logging.info("Matching precursors")
-
-    def starfunc(cycle_index):
-        return match_ms1_to_ms2_(
-            cycle_index,
-            self.peak_collection.indices,
-            self.peak_collection.indptr,
-            self.dia_data.zeroth_frame,
-            self.dia_data.scan_max_index,
-            ppm,
-            self.scan_tolerance,
-            self.cycle_tolerance,
-            self.dia_data.dia_mz_cycle,
-            self.dia_data.mz_values,
-            self.dia_data.tof_indices,
-            np.isin(self.peak_collection.indices, self.mono_isotopes)
-        )
-
-    precursor_indices = []
-    precursor_counts = [[0]]
-    fragment_indices = []
-
-    with multiprocessing.pool.ThreadPool(alphatims.utils.MAX_THREADS) as pool:
-        for (
-            precursor_indices_,
-            precursor_counts_,
-            fragment_indices_,
-        ) in alphatims.utils.progress_callback(
-            pool.imap(starfunc, self.cycle_range),
-            total=len(self.cycle_range),
-            include_progress_callback=True
-        ):
-            precursor_indices.append(precursor_indices_)
-            precursor_counts.append(precursor_counts_)
-            fragment_indices.append(fragment_indices_)
-
-    precursor_indices = np.concatenate(precursor_indices)
-    precursor_counts = np.cumsum(np.concatenate(precursor_counts))
-    fragment_indices = np.concatenate(fragment_indices)
-    return (
-        tm.clone(precursor_indices),
-        tm.clone(precursor_counts),
-        tm.clone(fragment_indices),
-    )
-    # self.precursor_indices = tm.clone(precursor_indices)
-    # self.precursor_indptr = tm.clone(precursor_counts)
-    # self.fragment_indices = tm.clone(fragment_indices)
-
-
-@alphatims.utils.njit
-def sort_query_data_fragments_by_mz(indptr, mz_values, intensities):
-    for index, start in enumerate(indptr[:-1]):
-        end = indptr[index + 1]
-        mzs = mz_values[start: end]
-        order = np.argsort(mzs)
-        mz_values[start:end] = mz_values[start:end][order]
-        intensities[start:end] = intensities[start:end][order]
-
-
-def create_ap_like_query_data(analysis1):
-    M_PROTON = 1
-    ms1_coordinates = analysis1.dia_data.convert_from_indices(
-        analysis1.precursor_indices,
-        return_mobility_values=True,
-        return_rt_values_min=True,
-        return_mz_values=True,
-        return_push_indices=True,
-    )
-
-    query_data = {}
-    query_data['prec_id2'] = analysis1.precursor_indices
-    query_data['mono_mzs2'] = ms1_coordinates["mz_values"]
-    query_data['rt_list_ms2'] = ms1_coordinates["rt_values_min"]
-    query_data['scan_list_ms2'] = ms1_coordinates["push_indices"]
-    query_data['mobility2'] = ms1_coordinates["mobility_values"]
-    query_data['charge2'] = np.array([2, 3])[np.isin(analysis1.precursor_indices, analysis1.mono_isotopes_charge3).astype(np.int)]
-    query_data['prec_mass_list2'] = (query_data['mono_mzs2'] - M_PROTON) * query_data['charge2']
-    query_data["indices_ms2"] = analysis1.precursor_indptr
-    query_data["mass_list_ms2"] = analysis1.dia_data.mz_values[analysis1.dia_data.tof_indices[analysis1.fragment_indices]]
-    query_data["int_list_ms2"] = analysis1.smooth_intensity_values[analysis1.fragment_indices]
-    sort_query_data_fragments_by_mz(
-        query_data["indices_ms2"],
-        query_data["mass_list_ms2"],
-        query_data["int_list_ms2"],
-    )
-    return query_data
-
-
-def create_ap_like_hdf_file(query_data, file_name):
-    import alphabase.io.hdf
-    hdf = alphabase.io.hdf.HDF_File(
-        file_name,
-        read_only=False,
-        truncate=True,
-    )
-    hdf.Raw = {"MS2_scans": query_data}
-
-
-
-
-@alphatims.utils.pjit
-def cluster_to_max_peaks_(
-    cycle_index,
-    indptr,
-    tof_indices,
-    smooth_intensity_values,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    clusters,
-):
-    len_dia_mz_cycle = len(connection_counts) - 1
-    push_offset = len_dia_mz_cycle * cycle_index + zeroth_frame * scan_max_index
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-            for other_connection_index in connections[connection_start: connection_end]:
-                other_push_index = push_offset + other_connection_index + len_dia_mz_cycle * cycle_offset
-                if other_push_index == self_push_index:
-                    continue
-                if other_push_index >= len(indptr):
-                    continue
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                while (self_index < self_end) and (other_index < other_end):
-                    self_tof = tof_indices[self_index]
-                    other_tof = tof_indices[other_index]
-                    if (self_tof - tof_tolerance) <= other_tof <= (self_tof + tof_tolerance):
-                        self_ref = clusters[self_index]
-                        max_intensity = smooth_intensity_values[self_ref]
-                        other_intensity = smooth_intensity_values[other_index]
-                        if max_intensity < other_intensity:
-                            clusters[self_index] = other_index
-                        elif max_intensity == other_intensity:
-                            if self_index <= other_index:
-                                clusters[self_index] = other_index
-                    if self_tof < other_tof:
-                        self_index += 1
-                    else:
-                        other_index += 1
-
-
-@alphatims.utils.njit
-def walk_cluster_path(
-    clusters
-):
-    for index, pointer in enumerate(clusters):
-        initial_index = index
-        path_length = 1
-        while (pointer >= 0) and (index != pointer):
-            index = pointer
-            pointer = clusters[index]
-            path_length += 1
-        if pointer >= 0:
-            final_pointer = -(pointer + 1)
-        else:
-            final_pointer = pointer
-        index = initial_index
-        for i in range(path_length):
-            pointer = clusters[index]
-            clusters[index] = final_pointer
-            index = pointer
-    for index, pointer in enumerate(clusters):
-        clusters[index] = -(pointer + 1)
-
-
-@alphatims.utils.njit
-def walk_cluster_path_backup(
-    clusters
-):
-    for index, pointer in enumerate(clusters):
-        elements_on_path = []
-        while pointer >= 0:
-            elements_on_path.append(index)
-            if index == pointer:
-                pointer = -(pointer + 1)
-                break
-            index = pointer
-            pointer = clusters[index]
-        for index in elements_on_path:
-            clusters[index] = pointer
-    for index, pointer in enumerate(clusters):
-        clusters[index] = -(pointer + 1)
-
-
-@alphatims.utils.pjit
-def find_unique_peaks_(
-    cycle_index,
-    indptr,
-    tof_indices,
-    smooth_intensity_values,
-    tof_tolerance,
-    scan_max_index,
-    zeroth_frame,
-    connection_counts,
-    connections,
-    cycle_tolerance,
-    clusters,
-    unique_peaks,
-):
-    len_dia_mz_cycle = len(connection_counts) - 1
-    push_offset = len_dia_mz_cycle * cycle_index + zeroth_frame * scan_max_index
-    for self_connection_index, connection_start in enumerate(
-        connection_counts[:-1]
-    ):
-        connection_end = connection_counts[self_connection_index + 1]
-        self_push_index = push_offset + self_connection_index
-        if self_push_index > len(indptr):
-            break
-        self_start = indptr[self_push_index]
-        self_end = indptr[self_push_index + 1]
-        if self_start == self_end:
-            continue
-        for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-            for other_connection_index in connections[connection_start: connection_end]:
-                other_push_index = push_offset + other_connection_index + len_dia_mz_cycle * cycle_offset
-                if other_push_index <= self_push_index:
-                    continue
-                if other_push_index >= len(indptr):
-                    continue
-                other_start = indptr[other_push_index]
-                other_end = indptr[other_push_index + 1]
-                if other_start == other_end:
-                    continue
-                self_index = self_start
-                other_index = other_start
-                while (self_index < self_end) and (other_index < other_end):
-                    self_tof = tof_indices[self_index]
-                    other_tof = tof_indices[other_index]
-                    if (self_tof - tof_tolerance) <= other_tof <= (self_tof + tof_tolerance):
-                        self_intensity = smooth_intensity_values[self_index]
-                        other_intensity = smooth_intensity_values[other_index]
-                        if self_intensity <= other_intensity:
-                            if clusters[self_index] != clusters[other_index]:
-                                unique_peaks[self_index] = False
-                        if self_intensity >= other_intensity:
-                            if clusters[self_index] != clusters[other_index]:
-                                unique_peaks[other_index] = False
-                    if self_tof < other_tof:
-                        self_index += 1
-                    else:
-                        other_index += 1
-
-
-@alphatims.utils.njit
-def walk_unique_cluster_path(
-    cluster_pointers,
-    nonambiguous_elements,
-    to_visit
-):
-    for index, nonambiguous in enumerate(nonambiguous_elements):
-        initial_index = index
-        path_length = 0
-        while nonambiguous:
-            path_length += 1
-            if not to_visit[index]:
-                break
-            else:
-                to_visit[index] = False
-            pointer = cluster_pointers[index]
-            if index == pointer:
-                break
-            index = pointer
-            nonambiguous = nonambiguous_elements[index]
-        if not nonambiguous:
-            index = initial_index
-            for i in range(path_length):
-                nonambiguous_elements[index] = False
-                index = cluster_pointers[index]
-
-
-@alphatims.utils.njit
-def walk_unique_cluster_path_backup(
-    clusters,
-    uniques,
-    to_visit,
-):
-    for index, pointer in enumerate(clusters):
-        elements_on_path = []
-        while uniques[index] and to_visit[index]:
-            elements_on_path.append(index)
-            if index == pointer:
-                break
-            index = pointer
-            pointer = clusters[index]
-        unique = uniques[index]
-        for index in elements_on_path[::-1]:
-            unique &= uniques[index]
-            uniques[index] = unique
-            to_visit[index] = False
-
-
-@alphatims.utils.njit
-def assemble_clusters(
-    cluster_pointers,
-    nonambiguous_ions,
-    cluster_assemblies,
-):
-    for index, pointer in enumerate(cluster_pointers):
-        if nonambiguous_ions[index]:
-            if index != pointer:
-                secondary_pointer = cluster_assemblies[pointer]
-                cluster_assemblies[index] = secondary_pointer
-                cluster_assemblies[pointer] = index
-
-
-
-
-@alphatims.utils.njit(nogil=True)
-def create_pseudo_msms_spectra_for_monos(
-    precursor_indices,
-    precursor_mz_values,
-    precursor_push_indices,
-    zeroth_frame,
-    scan_max_index,
-    scan_tolerance,
-    cycle_tolerance,
-    dia_mz_cycle,
-    push_indptr,
-    tof_indices,
-    intensity_values,
-    tof_max_index,
-    cycle_sigma,
-    scan_sigma,
-    spectrum_intensities,
-    spectrum_frequencies,
-    spectrum_mzs,
-    spectrum_indptr,
-    tof_sigma,
-    tof_tolerance,
-    max_peaks_per_spectrum,
-):
-    cycle_length = len(dia_mz_cycle)
-    frame_count = cycle_length // scan_max_index
-    hits = np.zeros((2, tof_max_index))
-    to_clear = np.zeros(tof_max_index, dtype=np.int32)
-    for precursor_index, precursor_mz in enumerate(
-        precursor_mz_values[precursor_indices]
-    ):
-        elements_to_clear = 0
-        self_push_index = precursor_push_indices[precursor_index]
-        self_scan_index = self_push_index % scan_max_index
-        self_cycle_index = (
-            self_push_index - zeroth_frame * scan_max_index
-        ) // cycle_length
-        max_positive_element_count = 0
-        for cycle_offset in range(-cycle_tolerance, cycle_tolerance + 1):
-            other_cycle_index = self_cycle_index + cycle_offset
-            if other_cycle_index < 0:
-                continue
-            cycle_blur = gauss_correction(cycle_offset, cycle_sigma)
-            for frame_index in range(frame_count):
-                for scan_offset in range(-scan_tolerance, scan_tolerance + 1):
-                    other_scan_index = self_scan_index + scan_offset
-                    if not (0 <= other_scan_index < scan_max_index):
-                        continue
-                    other_push_offset = frame_index * scan_max_index + other_scan_index
-                    low_mz, high_mz = dia_mz_cycle[other_push_offset]
-                    if not (low_mz <= precursor_mz < high_mz):
-                        continue
-                    other_push_index = zeroth_frame * scan_max_index
-                    other_push_index += other_cycle_index * cycle_length
-                    other_push_index += other_push_offset
-                    if not (0 <= other_push_index < len(push_indptr)):
-                        continue
-                    other_start = push_indptr[other_push_index]
-                    other_end = push_indptr[other_push_index + 1]
-                    scan_blur = gauss_correction(scan_offset, scan_sigma)
-                    intensity_weight = cycle_blur * scan_blur
-                    max_positive_element_count += intensity_weight
-                    for index in range(other_start, other_end):
-                        tof_index = tof_indices[index]
-                        intensity = intensity_values[index]
-                        if hits[0, tof_index] == 0:
-                            to_clear[elements_to_clear] = tof_index
-                            elements_to_clear += 1
-                        hits[0, tof_index] += intensity_weight * intensity
-                        hits[1, tof_index] += intensity_weight
-        if elements_to_clear == 0:
-            spectrum_indptr[precursor_index + 1] = spectrum_indptr[precursor_index]
-        centroid_deconvoluted_peak(
-            precursor_index,
-            hits,
-            to_clear,
-            elements_to_clear,
-            max_positive_element_count,
-            spectrum_intensities,
-            spectrum_frequencies,
-            spectrum_mzs,
-            spectrum_indptr,
-            tof_tolerance,
-            tof_sigma,
-            tof_max_index,
-            max_peaks_per_spectrum,
-        )
-        for index, tof_index in enumerate(to_clear[:elements_to_clear]):
-            hits[0, tof_index] = 0
-            hits[1, tof_index] = 0
-            to_clear[index] = 0
-
-
-@alphatims.utils.njit(nogil=True)
-def centroid_deconvoluted_peak(
-    precursor_index,
-    hits,
-    to_clear,
-    elements_to_clear,
-    max_positive_element_count,
-    spectrum_intensities,
-    spectrum_frequencies,
-    spectrum_mzs,
-    spectrum_indptr,
-    tof_tolerance,
-    tof_sigma,
-    tof_max_index,
-    max_peaks_per_spectrum,
-):
-    for tof_index in to_clear[:elements_to_clear]:
-        hits[1, tof_index] /= max_positive_element_count
-    for tof_index in to_clear[:elements_to_clear]:
-        for tof_offset in range(-tof_tolerance, tof_tolerance + 1):
-            other_tof = tof_index + tof_offset
-            if not (0 <= other_tof < tof_max_index):
-                continue
-            if tof_offset == 0:
-                continue
-            tof_blur = gauss_correction(tof_offset, tof_sigma)
-            other_intensity = hits[0, other_tof]
-            other_frequency = hits[1, other_tof]
-            hits[0, tof_index] += tof_blur * other_intensity
-            hits[1, tof_index] += tof_blur * other_frequency
-    for tof_index in to_clear[:elements_to_clear]:
-        for tof_offset in range(-tof_tolerance, tof_tolerance + 1):
-            other_tof = tof_index + tof_offset
-            if not (0 <= other_tof < tof_max_index):
-                continue
-            if tof_offset == 0:
-                continue
-            if hits[1, tof_index] <= hits[1, other_tof]:
-                hits[1, tof_index] = 0
-                break
-    elems0 = to_clear[:elements_to_clear]
-    elems = hits[1][elems0]
-    order = np.argsort(elems)[::-1]
-    hit_offset = spectrum_indptr[precursor_index]
-    for element in order[:max_peaks_per_spectrum]:
-        tof_index = to_clear[element]
-        spectrum_intensities[hit_offset] = hits[0, tof_index]
-        spectrum_frequencies[hit_offset] = hits[1, tof_index]
-        spectrum_mzs[hit_offset] = tof_index
-        hit_offset += 1
-    spectrum_indptr[precursor_index + 1] = hit_offset
diff --git a/alphadia/extraction/testing.py b/alphadia/testing.py
similarity index 96%
rename from alphadia/extraction/testing.py
rename to alphadia/testing.py
index fb7d3c30..2885e324 100644
--- a/alphadia/extraction/testing.py
+++ b/alphadia/testing.py
@@ -1,26 +1,19 @@
 # native imports
-
-# alphadia imports
-
-# alpha family imports
-
-# third party imports
-
 import os
-import numpy as np
-
 import base64
-import urllib.request
 from urllib.request import urlopen
 from urllib.request import urlretrieve
 import cgi
-
-import logging
 import zipfile
 import progressbar
-from typing import Union
+import logging
+import typing
+
+# alphadia imports
+
+# alpha family imports
 
-import shutil
+# third party imports
 
 class Progress(): # pragma: no cover
     """Class to report the download progress of a file to the console.
@@ -85,7 +78,7 @@ def filename_onedrive(sharing_url: str) -> str: # pragma: no cover
     value, params = cgi.parse_header(info)
     return params["filename"]
 
-def download_onedrive(sharing_url: str, output_dir: str) -> Union[str, None]: # pragma: no cover
+def download_onedrive(sharing_url: str, output_dir: str) -> typing.Union[str, None]: # pragma: no cover
     """download file from onedrive sharing link
 
     Parameters
@@ -193,7 +186,7 @@ def filename_datashare(sharing_url: str, tar=False) -> str: # pragma: no cover
     filename = params["filename"]
     return filename
 
-def download_datashare(sharing_url: str, output_dir: str) -> Union[str, None]: # pragma: no cover
+def download_datashare(sharing_url: str, output_dir: str) -> typing.Union[str, None]: # pragma: no cover
     """download file from datashare sharing link
 
     Parameters
diff --git a/alphadia/thermo.py b/alphadia/thermo.py
deleted file mode 100644
index d195a9af..00000000
--- a/alphadia/thermo.py
+++ /dev/null
@@ -1,215 +0,0 @@
-
-#import alphatims.bruker
-import logging
-import numpy as np
-import pandas as pd
-import os
-
-
-def load_thermo_raw(
-    raw_file_name: str,
-    dda: bool,
-    profile: bool = False,
-) -> tuple:
-    """Load raw thermo data as a dictionary."""
-    import alphapept.pyrawfilereader
-    import tqdm
-    rawfile = alphapept.pyrawfilereader.RawFileReader(raw_file_name)
-    _push_indices = []
-    mz_values = []
-    intensity_values = []
-    rt_values = []
-    quad_mz_values = []
-    precursor_indices = []
-    precursor = 0
-    for i in tqdm.tqdm(
-        range(
-            rawfile.FirstSpectrumNumber,
-            rawfile.LastSpectrumNumber + 1
-        )
-    ):
-        if profile:
-            masses, intensities = rawfile.GetProfileMassListFromScanNum(i)
-        else:
-            masses, intensities = rawfile.GetCentroidMassListFromScanNum(i)
-        mz_values.append(masses)
-        intensity_values.append(intensities)
-        _push_indices.append(len(masses))
-        rt = rawfile.RTFromScanNum(i)
-        rt_values.append(rt)
-        ms_order = rawfile.GetMSOrderForScanNum(i)
-        if ms_order == 1:
-            quad_mz_values.append((-1, -1))
-            if dda:
-                precursor_indices.append(0)
-            else:
-                precursor = 0
-                precursor_indices.append(precursor)
-        elif ms_order == 2:
-            precursor += 1
-            isolation_center = rawfile.GetPrecursorMassForScanNum(i)
-            DIA_width = rawfile.GetIsolationWidthForScanNum(i)
-            quad_mz_values.append(
-                (
-                    isolation_center - DIA_width / 2,
-                    isolation_center + DIA_width / 2,
-                )
-            )
-            precursor_indices.append(precursor)
-    rawfile.Close()
-    push_indices = np.empty(rawfile.LastSpectrumNumber + 1, np.int64)
-    push_indices[0] = 0
-    push_indices[1:] = np.cumsum(_push_indices)
-    return (
-        push_indices,
-        np.concatenate(mz_values),
-        np.concatenate(intensity_values),
-        np.array(rt_values) * 60,
-        np.array(quad_mz_values),
-        np.array(precursor_indices),
-    )
-
-
-class RawFile(alphatims.bruker.TimsTOF):
-    def __init__(
-        self,
-        thermo_raw_file_name: str,
-        dda: bool,
-        slice_as_dataframe: bool = True
-    ):
-        """Create a Bruker Orbitrap object that contains all data in-memory.
-​
-        Parameters
-        ----------
-        thermo_raw_file_name : str
-            The full file name to a Bruker .d folder.
-            Alternatively, the full file name of an already exported .hdf
-            can be provided as well.
-        dda : bool
-            If DDA, precursor indices will be equal to scan numbers.
-            If not DDA (i.e. DIA), precursor indices will be equal to the
-            scan number within a DIA cycle.
-        slice_as_dataframe : bool
-            If True, slicing returns a pd.DataFrame by default.
-            If False, slicing provides a np.int64[:] with raw indices.
-            This value can also be modified after creation.
-            Default is True.
-        """
-        self._use_calibrated_mz_values_as_default = False
-        self.thermo_raw_file_name = os.path.abspath(thermo_raw_file_name)
-        logging.info(f"Importing data from {thermo_raw_file_name}")
-        if thermo_raw_file_name.endswith(".raw"):
-            self._import_data_from_raw_file(
-                thermo_raw_file_name,
-                dda,
-            )
-        elif thermo_raw_file_name.endswith(".hdf"):
-            self._import_data_from_hdf_file(
-                thermo_raw_file_name,
-            )
-            self.thermo_raw_file_name = os.path.abspath(thermo_raw_file_name)
-        self.bruker_d_folder_name = self.thermo_raw_file_name
-        if not hasattr(self, "version"):
-            self._version = "none"
-        if self.version != alphatims.__version__:
-            logging.info(
-                "WARNING: "
-                f"AlphaTims version {self.version} was used to initialize "
-                f"{thermo_raw_file_name}, while the current version of "
-                f"AlphaTims is {alphatims.__version__}."
-            )
-        logging.info(f"Succesfully imported data from {thermo_raw_file_name}")
-        self.slice_as_dataframe = slice_as_dataframe
-        # Precompile
-        self[0, "raw"]
-
-    def _import_data_from_raw_file(
-        self,
-        thermo_raw_file_name: str,
-        dda: bool,
-    ):
-        self._version = alphatims.__version__
-        (
-            self._push_indptr,
-            mz_values,
-            self._intensity_values,
-            self._rt_values,
-            self._quad_mz_values,
-            self._precursor_indices,
-        ) = load_thermo_raw(thermo_raw_file_name, dda)
-        self.thermo_raw_file_name = thermo_raw_file_name
-        scan_count = len(self._precursor_indices)
-        self._frame_max_index = scan_count
-        self._scan_max_index = 1
-        self._mobility_max_value = 0
-        self._mobility_min_value = 0
-        self._mobility_values = np.array([0])
-        self._quad_indptr = self._push_indptr
-        self._raw_quad_indptr = np.arange(scan_count + 1)
-        self._intensity_min_value = float(np.min(self._intensity_values))
-        self._intensity_max_value = float(np.max(self._intensity_values))
-        self._intensity_corrections = np.ones(self._frame_max_index)
-        self._quad_min_mz_value = float(
-            np.min(
-                self._quad_mz_values[self._quad_mz_values != -1]
-            )
-        )
-        self._quad_max_mz_value = float(np.max(self._quad_mz_values))
-        self._precursor_max_index = int(np.max(self._precursor_indices)) + 1
-        self._acquisition_mode = "ddaPASEF" # TODO
-        self._mz_min_value = int(np.min(mz_values))
-        self._mz_max_value = int(np.max(mz_values)) + 1
-        self._decimals = 4
-        self._mz_values = np.arange(
-            10**self._decimals * self._mz_min_value,
-            10**self._decimals * (self._mz_max_value + 1)
-        ) / 10**self._decimals
-        self._tof_indices = (
-            mz_values * 10**self._decimals
-        ).astype(np.int32) - 10**self._decimals * self._mz_min_value
-        self._tof_max_index = len(self._mz_values)
-        self._meta_data = {
-            "SampleName": thermo_raw_file_name
-        }
-        msmstype = np.array(
-            [0 if s == -1 else 1 for s, e in self._quad_mz_values]
-        )
-        summed_intensities_ = np.cumsum(self._intensity_values)
-        summed_intensities = -summed_intensities_[self._push_indptr[:-1]]
-        summed_intensities[:-1] += summed_intensities_[self._push_indptr[1:-1]]
-        summed_intensities[-1] += summed_intensities_[-1]
-        self._frames = pd.DataFrame(
-            {
-                'MsMsType': msmstype,
-                'Time': self._rt_values,
-                'SummedIntensities': summed_intensities,
-                'Id': np.arange(len(self._rt_values)),
-            }
-        )
-        self._zeroth_frame = False
-        offset = int(self.zeroth_frame)
-        cycle_index = np.searchsorted(
-            self.raw_quad_indptr,
-            (self.scan_max_index) * (self.precursor_max_index + offset),
-            "r"
-        ) + 1
-        repeats = np.diff(self.raw_quad_indptr[: cycle_index])
-        if self.zeroth_frame:
-            repeats[0] -= self.scan_max_index
-        cycle_length = self.scan_max_index * self.precursor_max_index
-        repeat_length = np.sum(repeats)
-        if repeat_length != cycle_length:
-            repeats[-1] -= repeat_length - cycle_length
-        self._dia_mz_cycle = np.empty((cycle_length, 2))
-        self._dia_mz_cycle[:, 0] = np.repeat(
-            self.quad_mz_values[: cycle_index - 1, 0],
-            repeats
-        )
-        self._dia_mz_cycle[:, 1] = np.repeat(
-            self.quad_mz_values[: cycle_index - 1, 1],
-            repeats
-        )
-        self._dia_precursor_cycle = np.repeat(
-            self.precursor_indices[: cycle_index - 1],
-            repeats
-        )
diff --git a/alphadia/extraction/utils.py b/alphadia/utils.py
similarity index 98%
rename from alphadia/extraction/utils.py
rename to alphadia/utils.py
index 3bedc52e..3beac478 100644
--- a/alphadia/extraction/utils.py
+++ b/alphadia/utils.py
@@ -1,23 +1,18 @@
 # native imports
 import logging
 from ctypes import Structure, c_double
-from typing import Tuple, Union, List
 
 # alphadia imports
 
 # alpha family imports
 import alphatims.bruker
 import alphatims.utils
-from alphabase.spectral_library.base import SpecLibBase
 
 # third party imports
 import pandas as pd
 import numpy as np
 import numba as nb
 import matplotlib.patches as patches
-import matplotlib.patheffects as patheffects
-import matplotlib.pyplot as plt
-
 
 ISOTOPE_DIFF = 1.0032999999999674
 
@@ -193,10 +188,6 @@ def astd1(array):
         out[i] = np.std(array[i])
     return out
 
-
-
-
-
 def get_isotope_columns(colnames):
     isotopes = []
     for col in colnames:
diff --git a/alphadia/extraction/validate.py b/alphadia/validate.py
similarity index 99%
rename from alphadia/extraction/validate.py
rename to alphadia/validate.py
index 0631ba34..c281b841 100644
--- a/alphadia/extraction/validate.py
+++ b/alphadia/validate.py
@@ -1,8 +1,17 @@
-import pandas as pd
+# native imports
 import logging
+logger = logging.getLogger()
+
+# alphadia imports
+
+# alpha family imports
+
+# third party imports
+import pandas as pd
 import numpy as np
 
-logger = logging.getLogger()
+
+
 
 class Property():
     """Column property base class"""
diff --git a/alphadia/venn.py b/alphadia/venn.py
deleted file mode 100644
index acf0d956..00000000
--- a/alphadia/venn.py
+++ /dev/null
@@ -1,473 +0,0 @@
-# copied from https://github.com/tctianchi/pyvenn
-
-# coding: utf-8
-from itertools import chain
-try:
-    # since python 3.10
-    from collections.abc import Iterable
-except ImportError:
-    from collections import Iterable
-import matplotlib.pyplot as plt
-import matplotlib.patches as patches
-from matplotlib import colors
-import math
-
-default_colors = [
-    # r, g, b, a
-    [92, 192, 98, 0.5],
-    [90, 155, 212, 0.5],
-    [246, 236, 86, 0.6],
-    [241, 90, 96, 0.4],
-    [255, 117, 0, 0.3],
-    [82, 82, 190, 0.2],
-]
-default_colors = [
-    [i[0] / 255.0, i[1] / 255.0, i[2] / 255.0, i[3]]
-    for i in default_colors
-]
-
-def draw_ellipse(fig, ax, x, y, w, h, a, fillcolor):
-    e = patches.Ellipse(
-        xy=(x, y),
-        width=w,
-        height=h,
-        angle=a,
-        color=fillcolor)
-    ax.add_patch(e)
-
-def draw_triangle(fig, ax, x1, y1, x2, y2, x3, y3, fillcolor):
-    xy = [
-        (x1, y1),
-        (x2, y2),
-        (x3, y3),
-    ]
-    polygon = patches.Polygon(
-        xy=xy,
-        closed=True,
-        color=fillcolor)
-    ax.add_patch(polygon)
-
-def draw_text(fig, ax, x, y, text, color=[0, 0, 0, 1], fontsize=14, ha="center", va="center"):
-    ax.text(
-        x, y, text,
-        horizontalalignment=ha,
-        verticalalignment=va,
-        fontsize=fontsize,
-        color="black")
-
-def draw_annotate(fig, ax, x, y, textx, texty, text, color=[0, 0, 0, 1], arrowcolor=[0, 0, 0, 0.3]):
-    plt.annotate(
-        text,
-        xy=(x, y),
-        xytext=(textx, texty),
-        arrowprops=dict(color=arrowcolor, shrink=0, width=0.5, headwidth=8),
-        fontsize=14,
-        color=color,
-        xycoords="data",
-        textcoords="data",
-        horizontalalignment='center',
-        verticalalignment='center'
-    )
-
-def get_labels(data, fill=["number"]):
-    """
-    get a dict of labels for groups in data
-
-    @type data: list[Iterable]
-    @rtype: dict[str, str]
-
-    input
-      data: data to get label for
-      fill: ["number"|"logic"|"percent"]
-
-    return
-      labels: a dict of labels for different sets
-
-    example:
-    In [12]: get_labels([range(10), range(5,15), range(3,8)], fill=["number"])
-    Out[12]:
-    {'001': '0',
-     '010': '5',
-     '011': '0',
-     '100': '3',
-     '101': '2',
-     '110': '2',
-     '111': '3'}
-    """
-
-    N = len(data)
-
-    sets_data = [set(data[i]) for i in range(N)]  # sets for separate groups
-    s_all = set(chain(*data))                     # union of all sets
-
-    # bin(3) --> '0b11', so bin(3).split('0b')[-1] will remove "0b"
-    set_collections = {}
-    for n in range(1, 2**N):
-        key = bin(n).split('0b')[-1].zfill(N)
-        value = s_all
-        sets_for_intersection = [sets_data[i] for i in range(N) if  key[i] == '1']
-        sets_for_difference = [sets_data[i] for i in range(N) if  key[i] == '0']
-        for s in sets_for_intersection:
-            value = value & s
-        for s in sets_for_difference:
-            value = value - s
-        set_collections[key] = value
-
-    labels = {k: "" for k in set_collections}
-    if "logic" in fill:
-        for k in set_collections:
-            labels[k] = k + ": "
-    if "number" in fill:
-        for k in set_collections:
-            labels[k] += str(len(set_collections[k]))
-    if "percent" in fill:
-        data_size = len(s_all)
-        for k in set_collections:
-            labels[k] += "(%.1f%%)" % (100.0 * len(set_collections[k]) / data_size)
-
-    return labels
-
-def venn2(labels, names=['A', 'B'], **options):
-    """
-    plots a 2-set Venn diagram
-
-    @type labels: dict[str, str]
-    @type names: list[str]
-    @rtype: (Figure, AxesSubplot)
-
-    input
-      labels: a label dict where keys are identified via binary codes ('01', '10', '11'),
-              hence a valid set could look like: {'01': 'text 1', '10': 'text 2', '11': 'text 3'}.
-              unmentioned codes are considered as ''.
-      names:  group names
-      more:   colors, figsize, dpi, fontsize
-
-    return
-      pyplot Figure and AxesSubplot object
-    """
-    colors = options.get('colors', [default_colors[i] for i in range(2)])
-    figsize = options.get('figsize', (9, 7))
-    dpi = options.get('dpi', 96)
-    fontsize = options.get('fontsize', 14)
-
-    fig = plt.figure(0, figsize=figsize, dpi=dpi)
-    ax = fig.add_subplot(111, aspect='equal')
-    ax.set_axis_off()
-    ax.set_ylim(bottom=0.0, top=0.7)
-    ax.set_xlim(left=0.0, right=1.0)
-
-    # body
-    draw_ellipse(fig, ax, 0.375, 0.3, 0.5, 0.5, 0.0, colors[0])
-    draw_ellipse(fig, ax, 0.625, 0.3, 0.5, 0.5, 0.0, colors[1])
-    draw_text(fig, ax, 0.74, 0.30, labels.get('01', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.26, 0.30, labels.get('10', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.50, 0.30, labels.get('11', ''), fontsize=fontsize)
-
-    # legend
-    draw_text(fig, ax, 0.20, 0.56, names[0], colors[0], fontsize=fontsize, ha="right", va="bottom")
-    draw_text(fig, ax, 0.80, 0.56, names[1], colors[1], fontsize=fontsize, ha="left", va="bottom")
-    leg = ax.legend(names, loc='center left', bbox_to_anchor=(1.0, 0.5), fancybox=True)
-    leg.get_frame().set_alpha(0.5)
-
-    return fig, ax
-
-def venn3(labels, names=['A', 'B', 'C'], **options):
-    """
-    plots a 3-set Venn diagram
-
-    @type labels: dict[str, str]
-    @type names: list[str]
-    @rtype: (Figure, AxesSubplot)
-
-    input
-      labels: a label dict where keys are identified via binary codes ('001', '010', '100', ...),
-              hence a valid set could look like: {'001': 'text 1', '010': 'text 2', '100': 'text 3', ...}.
-              unmentioned codes are considered as ''.
-      names:  group names
-      more:   colors, figsize, dpi, fontsize
-
-    return
-      pyplot Figure and AxesSubplot object
-    """
-    colors = options.get('colors', [default_colors[i] for i in range(3)])
-    figsize = options.get('figsize', (9, 9))
-    dpi = options.get('dpi', 96)
-    fontsize = options.get('fontsize', 14)
-
-    fig = plt.figure(0, figsize=figsize, dpi=dpi)
-    ax = fig.add_subplot(111, aspect='equal')
-    ax.set_axis_off()
-    ax.set_ylim(bottom=0.0, top=1.0)
-    ax.set_xlim(left=0.0, right=1.0)
-
-    # body
-    draw_ellipse(fig, ax, 0.333, 0.633, 0.5, 0.5, 0.0, colors[0])
-    draw_ellipse(fig, ax, 0.666, 0.633, 0.5, 0.5, 0.0, colors[1])
-    draw_ellipse(fig, ax, 0.500, 0.310, 0.5, 0.5, 0.0, colors[2])
-    draw_text(fig, ax, 0.50, 0.27, labels.get('001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.73, 0.65, labels.get('010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.61, 0.46, labels.get('011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.27, 0.65, labels.get('100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.39, 0.46, labels.get('101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.50, 0.65, labels.get('110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.50, 0.51, labels.get('111', ''), fontsize=fontsize)
-
-    # legend
-    draw_text(fig, ax, 0.15, 0.87, names[0], colors[0], fontsize=fontsize, ha="right", va="bottom")
-    draw_text(fig, ax, 0.85, 0.87, names[1], colors[1], fontsize=fontsize, ha="left", va="bottom")
-    draw_text(fig, ax, 0.50, 0.02, names[2], colors[2], fontsize=fontsize, va="top")
-    leg = ax.legend(names, loc='center left', bbox_to_anchor=(1.0, 0.5), fancybox=True)
-    leg.get_frame().set_alpha(0.5)
-
-    return fig, ax
-
-def venn4(labels, names=['A', 'B', 'C', 'D'], **options):
-    """
-    plots a 4-set Venn diagram
-
-    @type labels: dict[str, str]
-    @type names: list[str]
-    @rtype: (Figure, AxesSubplot)
-
-    input
-      labels: a label dict where keys are identified via binary codes ('0001', '0010', '0100', ...),
-              hence a valid set could look like: {'0001': 'text 1', '0010': 'text 2', '0100': 'text 3', ...}.
-              unmentioned codes are considered as ''.
-      names:  group names
-      more:   colors, figsize, dpi, fontsize
-
-    return
-      pyplot Figure and AxesSubplot object
-    """
-    colors = options.get('colors', [default_colors[i] for i in range(4)])
-    figsize = options.get('figsize', (12, 12))
-    dpi = options.get('dpi', 96)
-    fontsize = options.get('fontsize', 14)
-
-    fig = plt.figure(0, figsize=figsize, dpi=dpi)
-    ax = fig.add_subplot(111, aspect='equal')
-    ax.set_axis_off()
-    ax.set_ylim(bottom=0.0, top=1.0)
-    ax.set_xlim(left=0.0, right=1.0)
-
-    # body
-    draw_ellipse(fig, ax, 0.350, 0.400, 0.72, 0.45, 140.0, colors[0])
-    draw_ellipse(fig, ax, 0.450, 0.500, 0.72, 0.45, 140.0, colors[1])
-    draw_ellipse(fig, ax, 0.544, 0.500, 0.72, 0.45, 40.0, colors[2])
-    draw_ellipse(fig, ax, 0.644, 0.400, 0.72, 0.45, 40.0, colors[3])
-    draw_text(fig, ax, 0.85, 0.42, labels.get('0001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.68, 0.72, labels.get('0010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.77, 0.59, labels.get('0011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.32, 0.72, labels.get('0100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.71, 0.30, labels.get('0101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.50, 0.66, labels.get('0110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.65, 0.50, labels.get('0111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.14, 0.42, labels.get('1000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.50, 0.17, labels.get('1001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.29, 0.30, labels.get('1010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.39, 0.24, labels.get('1011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.23, 0.59, labels.get('1100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.61, 0.24, labels.get('1101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.35, 0.50, labels.get('1110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.50, 0.38, labels.get('1111', ''), fontsize=fontsize)
-
-    # legend
-    draw_text(fig, ax, 0.13, 0.18, names[0], colors[0], fontsize=fontsize, ha="right")
-    draw_text(fig, ax, 0.18, 0.83, names[1], colors[1], fontsize=fontsize, ha="right", va="bottom")
-    draw_text(fig, ax, 0.82, 0.83, names[2], colors[2], fontsize=fontsize, ha="left", va="bottom")
-    draw_text(fig, ax, 0.87, 0.18, names[3], colors[3], fontsize=fontsize, ha="left", va="top")
-    leg = ax.legend(names, loc='center left', bbox_to_anchor=(1.0, 0.5), fancybox=True)
-    leg.get_frame().set_alpha(0.5)
-
-    return fig, ax
-
-def venn5(labels, names=['A', 'B', 'C', 'D', 'E'], **options):
-    """
-    plots a 5-set Venn diagram
-
-    @type labels: dict[str, str]
-    @type names: list[str]
-    @rtype: (Figure, AxesSubplot)
-
-    input
-      labels: a label dict where keys are identified via binary codes ('00001', '00010', '00100', ...),
-              hence a valid set could look like: {'00001': 'text 1', '00010': 'text 2', '00100': 'text 3', ...}.
-              unmentioned codes are considered as ''.
-      names:  group names
-      more:   colors, figsize, dpi, fontsize
-
-    return
-      pyplot Figure and AxesSubplot object
-    """
-    colors = options.get('colors', [default_colors[i] for i in range(5)])
-    figsize = options.get('figsize', (13, 13))
-    dpi = options.get('dpi', 96)
-    fontsize = options.get('fontsize', 14)
-
-    fig = plt.figure(0, figsize=figsize, dpi=dpi)
-    ax = fig.add_subplot(111, aspect='equal')
-    ax.set_axis_off()
-    ax.set_ylim(bottom=0.0, top=1.0)
-    ax.set_xlim(left=0.0, right=1.0)
-
-    # body
-    draw_ellipse(fig, ax, 0.428, 0.449, 0.87, 0.50, 155.0, colors[0])
-    draw_ellipse(fig, ax, 0.469, 0.543, 0.87, 0.50, 82.0, colors[1])
-    draw_ellipse(fig, ax, 0.558, 0.523, 0.87, 0.50, 10.0, colors[2])
-    draw_ellipse(fig, ax, 0.578, 0.432, 0.87, 0.50, 118.0, colors[3])
-    draw_ellipse(fig, ax, 0.489, 0.383, 0.87, 0.50, 46.0, colors[4])
-    draw_text(fig, ax, 0.27, 0.11, labels.get('00001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.72, 0.11, labels.get('00010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.55, 0.13, labels.get('00011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.91, 0.58, labels.get('00100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.78, 0.64, labels.get('00101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.84, 0.41, labels.get('00110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.76, 0.55, labels.get('00111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.51, 0.90, labels.get('01000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.39, 0.15, labels.get('01001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.42, 0.78, labels.get('01010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.50, 0.15, labels.get('01011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.67, 0.76, labels.get('01100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.70, 0.71, labels.get('01101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.51, 0.74, labels.get('01110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.64, 0.67, labels.get('01111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.10, 0.61, labels.get('10000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.20, 0.31, labels.get('10001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.76, 0.25, labels.get('10010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.65, 0.23, labels.get('10011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.18, 0.50, labels.get('10100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.21, 0.37, labels.get('10101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.81, 0.37, labels.get('10110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.74, 0.40, labels.get('10111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.27, 0.70, labels.get('11000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.34, 0.25, labels.get('11001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.33, 0.72, labels.get('11010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.51, 0.22, labels.get('11011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.25, 0.58, labels.get('11100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.28, 0.39, labels.get('11101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.36, 0.66, labels.get('11110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.51, 0.47, labels.get('11111', ''), fontsize=fontsize)
-
-    # legend
-    draw_text(fig, ax, 0.02, 0.72, names[0], colors[0], fontsize=fontsize, ha="right")
-    draw_text(fig, ax, 0.72, 0.94, names[1], colors[1], fontsize=fontsize, va="bottom")
-    draw_text(fig, ax, 0.97, 0.74, names[2], colors[2], fontsize=fontsize, ha="left")
-    draw_text(fig, ax, 0.88, 0.05, names[3], colors[3], fontsize=fontsize, ha="left")
-    draw_text(fig, ax, 0.12, 0.05, names[4], colors[4], fontsize=fontsize, ha="right")
-    leg = ax.legend(names, loc='center left', bbox_to_anchor=(1.0, 0.5), fancybox=True)
-    leg.get_frame().set_alpha(0.5)
-
-    return fig, ax
-
-def venn6(labels, names=['A', 'B', 'C', 'D', 'E'], **options):
-    """
-    plots a 6-set Venn diagram
-
-    @type labels: dict[str, str]
-    @type names: list[str]
-    @rtype: (Figure, AxesSubplot)
-
-    input
-      labels: a label dict where keys are identified via binary codes ('000001', '000010', '000100', ...),
-              hence a valid set could look like: {'000001': 'text 1', '000010': 'text 2', '000100': 'text 3', ...}.
-              unmentioned codes are considered as ''.
-      names:  group names
-      more:   colors, figsize, dpi, fontsize
-
-    return
-      pyplot Figure and AxesSubplot object
-    """
-    colors = options.get('colors', [default_colors[i] for i in range(6)])
-    figsize = options.get('figsize', (20, 20))
-    dpi = options.get('dpi', 96)
-    fontsize = options.get('fontsize', 14)
-
-    fig = plt.figure(0, figsize=figsize, dpi=dpi)
-    ax = fig.add_subplot(111, aspect='equal')
-    ax.set_axis_off()
-    ax.set_ylim(bottom=0.230, top=0.845)
-    ax.set_xlim(left=0.173, right=0.788)
-
-    # body
-    # See https://web.archive.org/web/20040819232503/http://www.hpl.hp.com/techreports/2000/HPL-2000-73.pdf
-    draw_triangle(fig, ax, 0.637, 0.921, 0.649, 0.274, 0.188, 0.667, colors[0])
-    draw_triangle(fig, ax, 0.981, 0.769, 0.335, 0.191, 0.393, 0.671, colors[1])
-    draw_triangle(fig, ax, 0.941, 0.397, 0.292, 0.475, 0.456, 0.747, colors[2])
-    draw_triangle(fig, ax, 0.662, 0.119, 0.316, 0.548, 0.662, 0.700, colors[3])
-    draw_triangle(fig, ax, 0.309, 0.081, 0.374, 0.718, 0.681, 0.488, colors[4])
-    draw_triangle(fig, ax, 0.016, 0.626, 0.726, 0.687, 0.522, 0.327, colors[5])
-    draw_text(fig, ax, 0.212, 0.562, labels.get('000001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.430, 0.249, labels.get('000010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.356, 0.444, labels.get('000011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.609, 0.255, labels.get('000100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.323, 0.546, labels.get('000101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.513, 0.316, labels.get('000110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.523, 0.348, labels.get('000111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.747, 0.458, labels.get('001000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.325, 0.492, labels.get('001001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.670, 0.481, labels.get('001010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.359, 0.478, labels.get('001011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.653, 0.444, labels.get('001100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.344, 0.526, labels.get('001101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.653, 0.466, labels.get('001110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.363, 0.503, labels.get('001111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.750, 0.616, labels.get('010000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.682, 0.654, labels.get('010001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.402, 0.310, labels.get('010010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.392, 0.421, labels.get('010011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.653, 0.691, labels.get('010100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.651, 0.644, labels.get('010101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.490, 0.340, labels.get('010110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.468, 0.399, labels.get('010111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.692, 0.545, labels.get('011000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.666, 0.592, labels.get('011001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.665, 0.496, labels.get('011010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.374, 0.470, labels.get('011011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.653, 0.537, labels.get('011100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.652, 0.579, labels.get('011101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.653, 0.488, labels.get('011110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.389, 0.486, labels.get('011111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.553, 0.806, labels.get('100000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.313, 0.604, labels.get('100001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.388, 0.694, labels.get('100010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.375, 0.633, labels.get('100011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.605, 0.359, labels.get('100100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.334, 0.555, labels.get('100101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.582, 0.397, labels.get('100110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.542, 0.372, labels.get('100111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.468, 0.708, labels.get('101000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.355, 0.572, labels.get('101001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.420, 0.679, labels.get('101010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.375, 0.597, labels.get('101011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.641, 0.436, labels.get('101100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.348, 0.538, labels.get('101101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.635, 0.453, labels.get('101110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.370, 0.548, labels.get('101111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.594, 0.689, labels.get('110000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.579, 0.670, labels.get('110001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.398, 0.670, labels.get('110010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.395, 0.653, labels.get('110011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.633, 0.682, labels.get('110100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.616, 0.656, labels.get('110101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.587, 0.427, labels.get('110110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.526, 0.415, labels.get('110111', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.495, 0.677, labels.get('111000', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.505, 0.648, labels.get('111001', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.428, 0.663, labels.get('111010', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.430, 0.631, labels.get('111011', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.639, 0.524, labels.get('111100', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.591, 0.604, labels.get('111101', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.622, 0.477, labels.get('111110', ''), fontsize=fontsize)
-    draw_text(fig, ax, 0.501, 0.523, labels.get('111111', ''), fontsize=fontsize)
-
-    # legend
-    draw_text(fig, ax, 0.674, 0.824, names[0], colors[0], fontsize=fontsize)
-    draw_text(fig, ax, 0.747, 0.751, names[1], colors[1], fontsize=fontsize)
-    draw_text(fig, ax, 0.739, 0.396, names[2], colors[2], fontsize=fontsize)
-    draw_text(fig, ax, 0.700, 0.247, names[3], colors[3], fontsize=fontsize)
-    draw_text(fig, ax, 0.291, 0.255, names[4], colors[4], fontsize=fontsize)
-    draw_text(fig, ax, 0.203, 0.484, names[5], colors[5], fontsize=fontsize)
-    leg = ax.legend(names, loc='center left', bbox_to_anchor=(1.0, 0.5), fancybox=True)
-    leg.get_frame().set_alpha(0.5)
-
-    return fig, ax
diff --git a/alphadia/extraction/workflow/base.py b/alphadia/workflow/base.py
similarity index 96%
rename from alphadia/extraction/workflow/base.py
rename to alphadia/workflow/base.py
index 144ef9c7..4000a86c 100644
--- a/alphadia/extraction/workflow/base.py
+++ b/alphadia/workflow/base.py
@@ -1,18 +1,18 @@
+# native imports
 import os
 import logging
 logger = logging.getLogger()
-import tempfile
-import numpy as np
-import pandas as pd
 import typing
-import platform
 
-import alphadia
-from alphadia.extraction.data import bruker, thermo
-from alphadia.extraction.workflow import manager, reporting
+# alphadia imports
+from alphadia.data import bruker, thermo
+from alphadia.workflow import manager, reporting
 
+# alpha family imports
 from alphabase.spectral_library.base import SpecLibBase
 
+# third party imports
+
 TEMP_FOLDER = ".progress"
 
 class WorkflowBase():
diff --git a/alphadia/extraction/workflow/manager.py b/alphadia/workflow/manager.py
similarity index 98%
rename from alphadia/extraction/workflow/manager.py
rename to alphadia/workflow/manager.py
index 765f4401..19996acb 100644
--- a/alphadia/extraction/workflow/manager.py
+++ b/alphadia/workflow/manager.py
@@ -1,22 +1,19 @@
+# native imports
 import os
 import typing
 import pickle
-from typing import Literal
-
-import pandas as pd
-import numpy as np
-import xxhash
-import numba as nb
-
+from copy import deepcopy
 
+# alphadia imports
 import alphadia
-from alphadia.extraction import calibration, fdr
-from alphadia.extraction.workflow import reporting
-import sklearn
-import matplotlib.pyplot as plt
-import matplotlib
-from copy import deepcopy
+from alphadia import calibration, fdr
+from alphadia.workflow import reporting
+
+# alpha family imports
 
+# third party imports
+import pandas as pd
+import xxhash
 
 class BaseManager():
 
@@ -477,7 +474,7 @@ def __init__(
     def fit_predict(
             self,
             features_df : pd.DataFrame,
-            decoy_strategy : Literal['precursor', 'precursor_channel_wise', 'channel'] = 'precursor',
+            decoy_strategy : typing.Literal['precursor', 'precursor_channel_wise', 'channel'] = 'precursor',
             competetive : bool = True,
             decoy_channel : int = -1,
             ):
diff --git a/alphadia/extraction/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py
similarity index 98%
rename from alphadia/extraction/workflow/peptidecentric.py
rename to alphadia/workflow/peptidecentric.py
index 1526eab4..986624e4 100644
--- a/alphadia/extraction/workflow/peptidecentric.py
+++ b/alphadia/workflow/peptidecentric.py
@@ -1,19 +1,20 @@
+# native imports
 import os
 import logging
 logger = logging.getLogger()
-from typing import Union
+import typing
 
-import numpy as np
-import pandas as pd
+# alphadia imports
+from alphadia import plexscoring, hybridselection
+from alphadia import fdrexperimental as fdrx
+from alphadia.workflow import manager, base
 
-from alphadia.extraction import plexscoring, hybridselection
-from alphadia.extraction import fdrexperimental as fdrx
-from alphadia.extraction.workflow import manager, base
+# alpha family imports
 from alphabase.spectral_library.base import SpecLibBase
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
-from sklearn.neural_network import MLPClassifier
 
+# third party imports
+import numpy as np
+import pandas as pd
 
 feature_columns = [
     'reference_intensity_correlation',
@@ -162,8 +163,8 @@ def norm_to_rt(
         self,
         dia_data,
         norm_values : np.ndarray, 
-        active_gradient_start : Union[float,None] = None, 
-        active_gradient_stop : Union[float,None] = None,
+        active_gradient_start : typing.Union[float,None] = None, 
+        active_gradient_stop : typing.Union[float,None] = None,
         mode = None
     ):
         """Convert normalized retention time values to absolute retention time values.
diff --git a/alphadia/extraction/workflow/reporting.py b/alphadia/workflow/reporting.py
similarity index 93%
rename from alphadia/extraction/workflow/reporting.py
rename to alphadia/workflow/reporting.py
index 5fc6f94c..6c4e7b7d 100644
--- a/alphadia/extraction/workflow/reporting.py
+++ b/alphadia/workflow/reporting.py
@@ -1,19 +1,21 @@
-
-import typing
-import os
+# native imports
+import traceback
+import logging, os, time
 from datetime import datetime, timedelta
-from typing import Any, List, Type, Union
-
-import matplotlib
-from matplotlib.figure import Figure
-import numpy as np
+import typing
 import warnings
 import json
 import base64
 from io import BytesIO
 
-import traceback
-import logging, os, time
+# alphadia imports
+
+# alpha family imports
+
+# third party imports
+import matplotlib
+from matplotlib.figure import Figure
+import numpy as np
 
 # global variable which tracks if any logger has been initiated
 # As soon as its instantiated the default logger will be configured with a path to save the log file
@@ -161,7 +163,7 @@ class Backend():
 
     REQUIRES_CONTEXT = False
     
-    def log_figure(self, name : str, figure : Any, *args, **kwargs):
+    def log_figure(self, name : str, figure : typing.Any, *args, **kwargs):
         pass
 
     def log_metric(self, name : str, value : float, *args, **kwargs):
@@ -170,10 +172,10 @@ def log_metric(self, name : str, value : float, *args, **kwargs):
     def log_string(self, value : str, *args, **kwargs):
         pass
 
-    def log_data(self, name : str, value : Any, *args, **kwargs):
+    def log_data(self, name : str, value : typing.Any, *args, **kwargs):
         pass
 
-    def log_event(self, name : str, value : Any, *args, **kwargs):
+    def log_event(self, name : str, value : typing.Any, *args, **kwargs):
         pass
 
 class FigureBackend(Backend):
@@ -215,7 +217,7 @@ def __init__(
     def log_figure(
             self, 
             name : str, 
-            figure : Union[Figure, np.ndarray],
+            figure : typing.Union[Figure, np.ndarray],
             extension : str = 'png'
         ):
         """Log a figure to the figures folder.
@@ -226,7 +228,7 @@ def log_figure(
         name : str
             Name of the figure. Will be used as the filename.
 
-        figure : Union[matplotlib.figure.Figure, np.ndarray]
+        figure : typing.Union[matplotlib.figure.Figure, np.ndarray]
             Figure to log. Can be a matplotlib figure or a numpy array.
 
         extension : str, default 'png'
@@ -326,9 +328,9 @@ def __enter__(self):
 
     def __exit__(
             self,
-            exc_type: Any,
-            exc_value: Any,
-            exc_traceback: Any
+            exc_type: typing.Any,
+            exc_value: typing.Any,
+            exc_traceback: typing.Any
         ):
         """Exit the context of the backend.
         This method will write a `stop` event to the `events.jsonl` file.
@@ -336,13 +338,13 @@ def __exit__(
         Parameters
         ----------
 
-        exc_type : Any
+        exc_type : typing.Any
             Type of the exception raised. If no exception was raised, this will be None.
 
-        exc_value : Any
+        exc_value : typing.Any
             Value of the exception raised. If no exception was raised, this will be None.
 
-        exc_traceback : Any
+        exc_traceback : typing.Any
             Traceback of the exception raised. If no exception was raised, this will be None.
         """
 
@@ -359,7 +361,7 @@ def __exit__(
     def log_event(
             self, 
             name : str, 
-            value : Any
+            value : typing.Any
         ):
         """Log an event to the `events.jsonl` file.
 
@@ -372,7 +374,7 @@ def log_event(
         name : str
             Name of the event.
 
-        value : Any
+        value : typing.Any
             Value of the event. Must be a JSON-serializable object.
         """
 
@@ -464,7 +466,7 @@ def log_string(
     def log_figure(
             self, 
             name: str, 
-            figure: Any
+            figure: typing.Any
         ):
         """Log a base64 image of a figure to the `events.jsonl` file.
 
@@ -477,7 +479,7 @@ def log_figure(
         name : str
             Name of the figure.
 
-        figure : Any
+        figure : typing.Any
             Figure to log. Can be a matplotlib figure or a numpy array.
 
         """
@@ -539,14 +541,14 @@ def log_string(self, value : str, verbosity : str = 'info'):
 
 class Context():
     
-    def __init__(self, parent: Any) -> None:
+    def __init__(self, parent: typing.Any) -> None:
         """Helper class to allow backends to use a context manager.
         This allows the parent class to be instantiated without context and to receive  context later.
 
         Parameters
         ----------
 
-        parent : Any
+        parent : typing.Any
             The metric logger which owns this context
         
         """
@@ -563,15 +565,15 @@ class Pipeline():
 
     def __init__(
             self,
-            backends : List[Type[Backend]] = [],
+            backends : typing.List[typing.Type[Backend]] = [],
     ):
         """Metric logger which allows to log metrics, plots and strings to multiple backends.
 
         Parameters
         ----------
 
-        backends : List[Type[Backend]], default [LogBackend]
-            List of backends to use. Each backend must be a class inheriting from Backend.
+        backends : typing.List[typing.Type[Backend]], default [LogBackend]
+            List of backends to use. Each backend must be a class inheriting from Backend.
         """
         
         # the context will store a Context object
@@ -591,7 +593,7 @@ def __exit__(self, exc_type, exc_value, exc_traceback):
             if backend.REQUIRES_CONTEXT:
                 backend.__exit__(exc_type, exc_value, exc_traceback)
 
-    def log_figure(self, name : str, figure : Any, *args, **kwargs):
+    def log_figure(self, name : str, figure : typing.Any, *args, **kwargs):
         for backend in self.backends:
             backend.log_figure(name, figure, *args, **kwargs)
 
@@ -603,10 +605,10 @@ def log_string(self, value : str, *args, verbosity='info',**kwargs):
         for backend in self.backends:
             backend.log_string(value, *args, verbosity = verbosity, **kwargs)
 
-    def log_data(self, name : str, value : Any, *args, **kwargs):
+    def log_data(self, name : str, value : typing.Any, *args, **kwargs):
         for backend in self.backends:
             backend.log_data(name, value, *args, **kwargs)
 
-    def log_event(self, name : str, value : Any, *args, **kwargs):
+    def log_event(self, name : str, value : typing.Any, *args, **kwargs):
         for backend in self.backends:
             backend.log_event(name, value, *args, **kwargs)
\ No newline at end of file
diff --git a/nbs/search/library_search.ipynb b/nbs/search/library_search.ipynb
index f5094cba..70130761 100644
--- a/nbs/search/library_search.ipynb
+++ b/nbs/search/library_search.ipynb
@@ -9,15 +9,9 @@
     "%reload_ext autoreload\n",
     "%autoreload 2\n",
     "\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
     "import os\n",
     "\n",
-    "from alphabase.spectral_library.base import SpecLibBase\n",
-    "from alphadia.extraction import data, planning\n",
-    "from alphadia.extraction.workflow import manager, peptidecentric"
+    "from alphadia import planning"
    ]
   },
   {
@@ -48,19 +42,26 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "test_lib = SpecLibBase()\n",
-    "test_lib.load_hdf(speclib, load_mod_seq=True)\n",
-    "plan = planning.Plan(output_location, raw_files, test_lib)\n",
-    "\n",
-    "plan.config['general']['reuse_calibration'] = True\n",
-    "plan.config['general']['thread_count'] = 10\n",
-    "plan.config['general']['astral_ms1'] = False\n",
-    "plan.config['calibration']['norm_rt_mode'] = 'linear'\n",
-    "\n",
-    "plan.config['extraction_target']['target_num_candidates'] = 1\n",
-    "plan.config['extraction_target']['target_ms1_tolerance'] = 3 if MODE == 'astral' else 15\n",
-    "plan.config['extraction_target']['target_ms2_tolerance'] = 5 if MODE == 'astral' else 15\n",
-    "plan.config['extraction_target']['target_rt_tolerance'] = 150\n",
+    "config_update = {\n",
+    "    'general': {\n",
+    "        'reuse_calibration': True,\n",
+    "        'thread_count': 10,\n",
+    "        'astral_ms1': False\n",
+    "        },\n",
+    "    'calibration': {\n",
+    "        'norm_rt_mode': 'linear'\n",
+    "    },\n",
+    "    'library': {\n",
+    "        'save_hdf': True\n",
+    "    },\n",
+    "    'extraction_target': {\n",
+    "        'target_num_candidates': 5,\n",
+    "        'target_ms1_tolerance': 3 if MODE == 'astral' else 15,\n",
+    "        'target_ms2_tolerance': 5 if MODE == 'astral' else 15,\n",
+    "        'target_rt_tolerance': 150\n",
+    "    },\n",
+    "}\n",
+    "plan = planning.Plan(output_location, raw_files, speclib, config_update = config_update)\n",
     "\n",
     "plan.run()"
    ]