diff --git a/alphadia/__init__.py b/alphadia/__init__.py index 814bfa1f..cc41db0f 100644 --- a/alphadia/__init__.py +++ b/alphadia/__init__.py @@ -1,3 +1,3 @@ #!python -__version__ = "1.8.2" +__version__ = "1.9.0" diff --git a/alphadia/calibration/property.py b/alphadia/calibration/property.py index 040a45f4..080ef280 100644 --- a/alphadia/calibration/property.py +++ b/alphadia/calibration/property.py @@ -75,6 +75,7 @@ def __init__( float(transform_deviation) if transform_deviation is not None else None ) self.is_fitted = False + self.metrics = None def __repr__(self) -> str: return f"" @@ -172,10 +173,12 @@ def fit(self, dataframe: pd.DataFrame, plot: bool = False, **kwargs): self.function.fit(input_values, target_value) self.is_fitted = True except Exception as e: - logging.error(f"Could not fit estimator {self.name}: {e}") + logging.exception(f"Could not fit estimator {self.name}: {e}") return - if plot is True: + self._save_metrics(dataframe) + + if plot: self.plot(dataframe, **kwargs) def predict(self, dataframe, inplace=True): @@ -200,13 +203,13 @@ def predict(self, dataframe, inplace=True): logging.warning( f"{self.name} prediction was skipped as it has not been fitted yet" ) - return + return None if not set(self.input_columns).issubset(dataframe.columns): logging.warning( f"{self.name} calibration was skipped as input column {self.input_columns} not found in dataframe" ) - return + return None input_values = dataframe[self.input_columns].values @@ -297,6 +300,13 @@ def deviation(self, dataframe: pd.DataFrame): axis=1, ) + def _save_metrics(self, dataframe): + deviation = self.deviation(dataframe) + self.metrics = { + "median_accuracy": np.median(np.abs(deviation[:, 1])), + "median_precision": np.median(np.abs(deviation[:, 2])), + } + def ci(self, dataframe, ci: float = 0.95): """Calculate the residual deviation at the given confidence interval. diff --git a/alphadia/cli.py b/alphadia/cli.py index 829e3390..4702d555 100644 --- a/alphadia/cli.py +++ b/alphadia/cli.py @@ -74,7 +74,6 @@ action="append", default=[], ) - parser.add_argument( "--config", "-c", @@ -83,7 +82,6 @@ nargs="?", default=None, ) - parser.add_argument( "--wsl", "-w", @@ -97,6 +95,13 @@ nargs="?", default="{}", ) +parser.add_argument( + "--quant-dir", + type=str, + help="Directory to save the quantification results (psm & frag parquet files) to be reused in a distributed search", + nargs="?", + default=None, +) def parse_config(args: argparse.Namespace) -> dict: @@ -167,6 +172,41 @@ def parse_output_directory(args: argparse.Namespace, config: dict) -> str: return output_directory +def parse_quant_dir(args: argparse.Namespace, config: dict) -> str: + """Parse custom quant path. + 1. Use custom quant path from config file if specified. + 2. Use custom quant path from command line if specified. + + Parameters + ---------- + + args : argparse.Namespace + Command line arguments. + + config : dict + Config dictionary. + + Returns + ------- + + quant_dir : str + path to quant directory. + """ + + quant_dir = None + if "quant_dir" in config: + quant_dir = ( + utils.windows_to_wsl(config["quant_dir"]) + if args.wsl + else config["quant_dir"] + ) + + if args.quant_dir is not None: + quant_dir = utils.windows_to_wsl(args.quant_dir) if args.wsl else args.quant_dir + + return quant_dir + + def parse_raw_path_list(args: argparse.Namespace, config: dict) -> list: """Parse raw file list. 1. Use raw file list from config file if specified. 
@@ -305,6 +345,8 @@ def run(*args, **kwargs): print("No output directory specified.") return + quant_dir = parse_quant_dir(args, config) + reporting.init_logging(output_directory) raw_path_list = parse_raw_path_list(args, config) @@ -321,7 +363,10 @@ def run(*args, **kwargs): for f in fasta_path_list: logger.progress(f" {f}") + # TODO rename all output_directory, output_folder => output_path, quant_dir->quant_path (except cli parameter) logger.progress(f"Saving output to: {output_directory}") + if quant_dir is not None: + logger.progress(f"Saving quantification output to {quant_dir=}") try: import matplotlib @@ -337,6 +382,7 @@ def run(*args, **kwargs): library_path=library_path, fasta_path_list=fasta_path_list, config=config, + quant_path=quant_dir, ) plan.run() diff --git a/alphadia/data/alpharaw.py b/alphadia/data/alpharaw_wrapper.py similarity index 91% rename from alphadia/data/alpharaw.py rename to alphadia/data/alpharaw_wrapper.py index ca8edc49..da5809a2 100644 --- a/alphadia/data/alpharaw.py +++ b/alphadia/data/alpharaw_wrapper.py @@ -1,26 +1,16 @@ # native imports import logging -import math import os import numba as nb - -# third party imports import numpy as np import pandas as pd -from alpharaw import mzml as alpharawmzml - -# TODO fix: "import resolves to its containing file" -from alpharaw import sciex as alpharawsciex - -# alpha family imports -from alpharaw import ( - thermo as alpharawthermo, -) +from alpharaw import ms_data_base as alpharaw_ms_data_base +from alpharaw import mzml as alpharaw_mzml +from alpharaw import sciex as alpharaw_sciex +from alpharaw import thermo as alpharaw_thermo -# alphadia imports from alphadia import utils -from alphadia.data.stats import log_stats logger = logging.getLogger() @@ -252,7 +242,7 @@ def calculate_valid_scans(quad_slices: np.ndarray, cycle: np.ndarray): return np.array(precursor_idx_list) -class AlphaRaw(alpharawthermo.MSData_Base): +class AlphaRaw(alpharaw_thermo.MSData_Base): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.has_mobility = False @@ -318,8 +308,6 @@ def filter_spectra(self, **kwargs): This function is implemented in the sub-class. 
""" - pass - def jitclass(self): return AlphaRawJIT( self.cycle, @@ -340,28 +328,32 @@ def jitclass(self): ) -class MzML(AlphaRaw, alpharawmzml.MzMLReader): +class AlphaRawBase(AlphaRaw, alpharaw_ms_data_base.MSData_Base): + def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs): + super().__init__(process_count=process_count) + self.load_hdf(raw_file_path) + self.process_alpharaw(**kwargs) + + +class MzML(AlphaRaw, alpharaw_mzml.MzMLReader): def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs): super().__init__(process_count=process_count) self.load_raw(raw_file_path) self.process_alpharaw(**kwargs) - log_stats(self.rt_values, self.cycle) -class Sciex(AlphaRaw, alpharawsciex.SciexWiffData): +class Sciex(AlphaRaw, alpharaw_sciex.SciexWiffData): def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs): super().__init__(process_count=process_count) self.load_raw(raw_file_path) self.process_alpharaw(**kwargs) - log_stats(self.rt_values, self.cycle) -class Thermo(AlphaRaw, alpharawthermo.ThermoRawData): +class Thermo(AlphaRaw, alpharaw_thermo.ThermoRawData): def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs): super().__init__(process_count=process_count) self.load_raw(raw_file_path) self.process_alpharaw(**kwargs) - log_stats(self.rt_values, self.cycle) def filter_spectra(self, cv: float = None, astral_ms1: bool = False, **kwargs): """ @@ -460,7 +452,9 @@ def __init__( self.scan_max_index = scan_max_index self.frame_max_index = frame_max_index - def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16): + def get_frame_indices( + self, rt_values: np.array, optimize_size: int = 16, min_size: int = 32 + ): """ Convert an interval of two rt values to a frame index interval. 
@@ -474,6 +468,9 @@ def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16): optimize_size : int, default = 16 To optimize for fft efficiency, we want to extend the precursor cycle to a multiple of 16 + min_size : int, default = 32 + The minimum number of dia cycles to include + Returns ------- np.ndarray, shape = (2,), dtype = int64 @@ -481,47 +478,18 @@ def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16): """ - if rt_values.shape != (2,): - raise ValueError("rt_values must be a numpy array of shape (2,)") - - frame_index = np.zeros(len(rt_values), dtype=np.int64) - for i, rt in enumerate(rt_values): - frame_index[i] = search_sorted_left(self.rt_values, rt) - - precursor_cycle_limits = (frame_index + self.zeroth_frame) // self.cycle.shape[ - 1 - ] - precursor_cycle_len = precursor_cycle_limits[1] - precursor_cycle_limits[0] - - # round up to the next multiple of 16 - optimal_len = int( - optimize_size * math.ceil(precursor_cycle_len / optimize_size) + return utils.get_frame_indices( + rt_values=rt_values, + rt_values_array=self.rt_values, + zeroth_frame=self.zeroth_frame, + cycle_len=self.cycle.shape[1], + precursor_cycle_max_index=self.precursor_cycle_max_index, + optimize_size=optimize_size, + min_size=min_size, ) - # by default, we extend the precursor cycle to the right - optimal_cycle_limits = np.array( - [precursor_cycle_limits[0], precursor_cycle_limits[0] + optimal_len], - dtype=np.int64, - ) - - # if the cycle is too long, we extend it to the left - if optimal_cycle_limits[1] > self.precursor_cycle_max_index: - optimal_cycle_limits[1] = self.precursor_cycle_max_index - optimal_cycle_limits[0] = self.precursor_cycle_max_index - optimal_len - - if optimal_cycle_limits[0] < 0: - optimal_cycle_limits[0] = ( - 0 if self.precursor_cycle_max_index % 2 == 0 else 1 - ) - - # second element is the index of the first whole cycle which should not be used - # precursor_cycle_limits[1] += 1 - # convert back to frame indices - frame_limits = optimal_cycle_limits * self.cycle.shape[1] + self.zeroth_frame - return utils.make_slice_1d(frame_limits) - def get_frame_indices_tolerance( - self, rt: float, tolerance: float, optimize_size: int = 16 + self, rt: float, tolerance: float, optimize_size: int = 16, min_size: int = 32 ): """ Determine the frame indices for a given retention time and tolerance. 
@@ -538,6 +506,9 @@ def get_frame_indices_tolerance( optimize_size : int, default = 16 To optimize for fft efficiency, we want to extend the precursor cycle to a multiple of 16 + min_size : int, default = 32 + The minimum number of dia cycles to include + Returns ------- np.ndarray, shape = (1, 3,), dtype = int64 @@ -547,7 +518,9 @@ def get_frame_indices_tolerance( rt_limits = np.array([rt - tolerance, rt + tolerance], dtype=np.float32) - return self.get_frame_indices(rt_limits, optimize_size=optimize_size) + return self.get_frame_indices( + rt_limits, optimize_size=optimize_size, min_size=min_size + ) def get_scan_indices(self, mobility_values: np.array, optimize_size: int = 16): return np.array([[0, 2, 1]], dtype=np.int64) diff --git a/alphadia/data/bruker.py b/alphadia/data/bruker.py index 5493761a..86ce1328 100644 --- a/alphadia/data/bruker.py +++ b/alphadia/data/bruker.py @@ -11,7 +11,6 @@ from numba.experimental import jitclass from alphadia import utils -from alphadia.data.stats import log_stats from alphadia.exceptions import NotDiaDataError logger = logging.getLogger() @@ -95,7 +94,6 @@ def __init__( # Precompile logger.info(f"Successfully imported data from {bruker_d_folder_name}") - log_stats(self.rt_values, self.cycle) def transpose(self): # abort if transposed data is already present @@ -288,7 +286,9 @@ def __init__( self.has_mobility = True - def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16): + def get_frame_indices( + self, rt_values: np.array, optimize_size: int = 16, min_size: int = 32 + ): """ Convert an interval of two rt values to a frame index interval. @@ -302,6 +302,9 @@ def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16): optimize_size : int, default = 16 To optimize for fft efficiency, we want to extend the precursor cycle to a multiple of 16 + min_size : int, default = 32 + The minimum number of dia cycles to include + Returns ------- np.ndarray, shape = (2,), dtype = int64 @@ -309,45 +312,18 @@ def get_frame_indices(self, rt_values: np.array, optimize_size: int = 16): """ - if rt_values.shape != (2,): - raise ValueError("rt_values must be a numpy array of shape (2,)") - - frame_index = np.searchsorted(self.rt_values, rt_values, "left") - - precursor_cycle_limits = (frame_index + self.zeroth_frame) // self.cycle.shape[ - 1 - ] - precursor_cycle_len = precursor_cycle_limits[1] - precursor_cycle_limits[0] - - # round up to the next multiple of 16 - optimal_len = int( - optimize_size * math.ceil(precursor_cycle_len / optimize_size) - ) - - # by default, we extend the precursor cycle to the right - optimal_cycle_limits = np.array( - [precursor_cycle_limits[0], precursor_cycle_limits[0] + optimal_len], - dtype=np.int64, + return utils.get_frame_indices( + rt_values=rt_values, + rt_values_array=self.rt_values, + zeroth_frame=self.zeroth_frame, + cycle_len=self.cycle.shape[1], + precursor_cycle_max_index=self.precursor_cycle_max_index, + optimize_size=optimize_size, + min_size=min_size, ) - # if the cycle is too long, we extend it to the left - if optimal_cycle_limits[1] > self.precursor_cycle_max_index: - optimal_cycle_limits[1] = self.precursor_cycle_max_index - optimal_cycle_limits[0] = self.precursor_cycle_max_index - optimal_len - - if optimal_cycle_limits[0] < 0: - optimal_cycle_limits[0] = ( - 0 if self.precursor_cycle_max_index % 2 == 0 else 1 - ) - - # second element is the index of the first whole cycle which should not be used - # precursor_cycle_limits[1] += 1 - # convert back to frame indices - frame_limits = 
optimal_cycle_limits * self.cycle.shape[1] + self.zeroth_frame - return utils.make_slice_1d(frame_limits) - def get_frame_indices_tolerance( - self, rt: float, tolerance: float, optimize_size: int = 16 + self, rt: float, tolerance: float, optimize_size: int = 16, min_size: int = 32 ): """ Determine the frame indices for a given retention time and tolerance. @@ -364,6 +340,9 @@ def get_frame_indices_tolerance( optimize_size : int, default = 16 To optimize for fft efficiency, we want to extend the precursor cycle to a multiple of 16 + min_size : int, default = 32 + The minimum number of dia cycles to include + Returns ------- np.ndarray, shape = (1, 3,), dtype = int64 @@ -373,7 +352,9 @@ def get_frame_indices_tolerance( rt_limits = np.array([rt - tolerance, rt + tolerance], dtype=np.float32) - return self.get_frame_indices(rt_limits, optimize_size=optimize_size) + return self.get_frame_indices( + rt_limits, optimize_size=optimize_size, min_size=min_size + ) def get_scan_indices(self, mobility_values: np.array, optimize_size: int = 16): """convert array of mobility values into scan indices, njit compatible. diff --git a/alphadia/data/stats.py b/alphadia/data/stats.py deleted file mode 100644 index 94a2f885..00000000 --- a/alphadia/data/stats.py +++ /dev/null @@ -1,45 +0,0 @@ -import logging - -import numpy as np - -logger = logging.getLogger() - - -def log_stats(rt_values: np.array, cycle: np.array): - """Log raw file statistics - - Parameters - ---------- - - rt_values: np.ndarray - retention time values in seconds for all frames - - cycle: np.ndarray - DIA cycle object describing the msms pattern - """ - - logger.info("============ Raw file stats ============") - - rt_limits = rt_values.min() / 60, rt_values.max() / 60 - rt_duration_sec = rt_values.max() - rt_values.min() - rt_duration_min = rt_duration_sec / 60 - - logger.info(f"{'RT (min)':<20}: {rt_limits[0]:.1f} - {rt_limits[1]:.1f}") - logger.info(f"{'RT duration (sec)':<20}: {rt_duration_sec:.1f}") - logger.info(f"{'RT duration (min)':<20}: {rt_duration_min:.1f}") - - cycle_length = cycle.shape[1] - cycle_duration = np.diff(rt_values[::cycle_length]).mean() - cycle_number = len(rt_values) // cycle_length - - logger.info(f"{'Cycle len (scans)':<20}: {cycle_length:.0f}") - logger.info(f"{'Cycle len (sec)':<20}: {cycle_duration:.2f}") - logger.info(f"{'Number of cycles':<20}: {cycle_number:.0f}") - - flat_cycle = cycle.flatten() - flat_cycle = flat_cycle[flat_cycle > 0] - msms_range = flat_cycle.min(), flat_cycle.max() - - logger.info(f"{'MS2 range (m/z)':<20}: {msms_range[0]:.1f} - {msms_range[1]:.1f}") - - logger.info("========================================") diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py index c8d4cc20..c35459a9 100644 --- a/alphadia/fdrexperimental.py +++ b/alphadia/fdrexperimental.py @@ -8,9 +8,8 @@ # third party imports import numpy as np import torch -import torch.nn as nn -import torch.optim as optim from sklearn import model_selection +from torch import nn, optim from torchmetrics.classification import BinaryAUROC from tqdm import tqdm @@ -30,7 +29,6 @@ class Classifier(ABC): @abstractmethod def fitted(self): """Return whether the classifier has been fitted.""" - pass @abstractmethod def fit(self, x: np.array, y: np.array): @@ -46,7 +44,6 @@ def fit(self, x: np.array, y: np.array): Target values of shape (n_samples,) or (n_samples, n_classes). 
""" - pass @abstractmethod def predict(self, x: np.array): @@ -65,7 +62,6 @@ def predict(self, x: np.array): Predicted class of shape (n_samples,). """ - pass @abstractmethod def predict_proba(self, x: np.array): @@ -84,7 +80,6 @@ def predict_proba(self, x: np.array): Predicted class probabilities of shape (n_samples, n_classes). """ - pass @abstractmethod def to_state_dict(self): @@ -97,7 +92,6 @@ def to_state_dict(self): state_dict : dict State dict of the classifier. """ - pass @abstractmethod def from_state_dict(self, state_dict: dict): @@ -111,7 +105,6 @@ def from_state_dict(self, state_dict: dict): State dict of the classifier. """ - pass class BinaryClassifier(Classifier): diff --git a/alphadia/libtransform.py b/alphadia/libtransform.py index 2dbd5021..805b4073 100644 --- a/alphadia/libtransform.py +++ b/alphadia/libtransform.py @@ -32,7 +32,6 @@ class ProcessingStep: def __init__(self) -> None: """Base class for processing steps. Each implementation must implement the `validate` and `forward` method. Processing steps can be chained together in a ProcessingPipeline.""" - pass def __call__(self, *args: typing.Any) -> typing.Any: """Run the processing step on the input object.""" diff --git a/alphadia/numba/fft.py b/alphadia/numba/fft.py index 8c93b57e..f43897b6 100644 --- a/alphadia/numba/fft.py +++ b/alphadia/numba/fft.py @@ -47,13 +47,13 @@ def rfft2(x: np.array, s: None | tuple = None) -> np.array: @overload(rfft2, fastmath=True) def _(x, s=None): if not isinstance(x, nb.types.Array): - return + return None if x.ndim != 2: - return + return None if x.dtype != nb.types.float32: - return + return None def funcx_impl(x, s=None): s, axes = ndshape_and_axes(x, s, (-2, -1)) @@ -98,13 +98,13 @@ def irfft2(x: np.array, s: None | tuple = None) -> np.array: @overload(irfft2, fastmath=True) def _(x, s=None): if not isinstance(x, nb.types.Array): - return + return None if x.ndim != 2: - return + return None if x.dtype != nb.types.complex64: - return + return None def funcx_impl(x, s=None): s, axes = ndshape_and_axes(x, s, (-2, -1)) @@ -161,16 +161,16 @@ def convolve_fourier(dense, kernel): @overload(convolve_fourier, fastmath=True) def _(dense, kernel): if not isinstance(dense, nb.types.Array): - return + return None if not isinstance(kernel, nb.types.Array): - return + return None if kernel.ndim != 2: - return + return None if dense.ndim < 2: - return + return None if dense.ndim == 2: diff --git a/alphadia/numba/fragments.py b/alphadia/numba/fragments.py index 0d0ed3ba..5746d5fa 100644 --- a/alphadia/numba/fragments.py +++ b/alphadia/numba/fragments.py @@ -331,7 +331,9 @@ def get_ion_group_mapping( score_group_intensity = np.zeros((len(ion_mz)), dtype=np.float32) - for precursor, mz, intensity in zip(ion_precursor, ion_mz, ion_intensity): # noqa: B905 ('strict' not supported by numba yet + for precursor, mz, intensity in zip( + ion_precursor, ion_mz, ion_intensity + ): # ('strict' not supported by numba yet # score_group_idx = precursor_group[precursor] if len(grouped_mz) == 0 or np.abs(grouped_mz[-1] - mz) > EPSILON: diff --git a/alphadia/outputaccumulator.py b/alphadia/outputaccumulator.py index 16b8b808..9f1e8547 100644 --- a/alphadia/outputaccumulator.py +++ b/alphadia/outputaccumulator.py @@ -78,7 +78,8 @@ def _calculate_fragment_position(self): def parse_output_folder( self, folder: str, - selected_precursor_columns: list[str] | None = None, + mandatory_precursor_columns: list[str] | None = None, + optional_precursor_columns: list[str] | None = None, ) -> tuple[pd.DataFrame, 
pd.DataFrame]: """ Parse the output folder to get a precursor and fragment dataframe in the flat format. @@ -87,7 +88,7 @@ def parse_output_folder( ---------- folder : str The output folder to be parsed. - selected_precursor_columns : list, optional + mandatory_precursor_columns : list, optional The columns to be selected from the precursor dataframe, by default ['precursor_idx', 'sequence', 'flat_frag_start_idx', 'flat_frag_stop_idx', 'charge', 'rt_library', 'mobility_library', 'mz_library', 'proteins', 'genes', 'mods', 'mod_sites', 'proba'] Returns @@ -99,8 +100,8 @@ def parse_output_folder( """ - if selected_precursor_columns is None: - selected_precursor_columns = [ + if mandatory_precursor_columns is None: + mandatory_precursor_columns = [ "precursor_idx", "sequence", "flat_frag_start_idx", @@ -108,12 +109,10 @@ def parse_output_folder( "charge", "rt_library", "rt_observed", - "rt_calibrated", "mobility_library", "mobility_observed", "mz_library", "mz_observed", - "mz_calibrated", "proteins", "genes", "mods", @@ -121,16 +120,28 @@ def parse_output_folder( "proba", "decoy", ] + + if optional_precursor_columns is None: + optional_precursor_columns = [ + "rt_calibrated", + "mz_calibrated", + ] + psm_df = pd.read_parquet(os.path.join(folder, "psm.parquet")) frag_df = pd.read_parquet(os.path.join(folder, "frag.parquet")) - assert set( - selected_precursor_columns - ).issubset( - psm_df.columns - ), f"selected_precursor_columns must be a subset of psm_df.columns didnt find {set(selected_precursor_columns) - set(psm_df.columns)}" - psm_df = psm_df[selected_precursor_columns] - # validate.precursors_flat_from_output(psm_df) + if not set(mandatory_precursor_columns).issubset(psm_df.columns): + raise ValueError( + f"mandatory_precursor_columns must be a subset of psm_df.columns didnt find {set(mandatory_precursor_columns) - set(psm_df.columns)}" + ) + + available_columns = sorted( + list( + set(mandatory_precursor_columns) + | (set(optional_precursor_columns) & set(psm_df.columns)) + ) + ) + psm_df = psm_df[available_columns] # get foldername of the output folder foldername = os.path.basename(folder) @@ -260,9 +271,6 @@ def __init__(self, folders: list, number_of_processes: int): self._lock = threading.Lock() # Lock to prevent two processes trying to update the same subscriber at the same time def subscribe(self, subscriber: BaseAccumulator): - assert isinstance( - subscriber, BaseAccumulator - ), f"subscriber must be an instance of BaseAccumulator, got {type(subscriber)}" self._subscribers.append(subscriber) def _update_subscriber( @@ -420,14 +428,21 @@ def post_process(self): Post process the consensus_speclibase by normalizing retention times. """ + norm_delta_max = self._norm_delta_max + if "rt_calibrated" not in self.consensus_speclibase.precursor_df.columns: + logger.warning( + "rt_calibrated not found in the precursor_df, delta-max normalization will not be performed" + ) + norm_delta_max = False + + logger.info("Performing quality control for transfer learning.") + logger.info(f"Normalize by delta: {norm_delta_max}") logger.info( - "Performing quality control for transfer learning." 
- + f"Normalize by delta: {self._norm_delta_max}" - + f"Precursor correlation cutoff: {self._precursor_correlation_cutoff}" - + f"Fragment correlation cutoff: {self._fragment_correlation_ratio}" + f"Precursor correlation cutoff: {self._precursor_correlation_cutoff}" ) + logger.info(f"Fragment correlation cutoff: {self._fragment_correlation_ratio}") - if self._norm_delta_max: + if norm_delta_max: self.consensus_speclibase = normalize_rt_delta_max( self.consensus_speclibase ) @@ -563,13 +578,20 @@ def ms2_quality_control( # calculate the median correlation for the precursor intensity_mask = flat_intensity > 0.0 - median_correlation = np.median(flat_correlation[intensity_mask]) + median_correlation = ( + np.median(flat_correlation[intensity_mask]) if intensity_mask.any() else 0.0 + ) # use the precursor for MS2 learning if the median correlation is above the cutoff use_for_ms2[i] = median_correlation > precursor_correlation_cutoff - fragment_intensity_view[:] = fragment_intensity_view * ( - fragment_correlation_view > median_correlation * fragment_correlation_ratio + # Fix: Use loc to modify the original DataFrame instead of the view + spec_lib_base.fragment_intensity_df.loc[start_idx:stop_idx] = ( + fragment_intensity_view.values + * ( + fragment_correlation_view + > median_correlation * fragment_correlation_ratio + ) ) spec_lib_base.precursor_df["use_for_ms2"] = use_for_ms2 diff --git a/alphadia/outputtransform.py b/alphadia/outputtransform.py index 504e14a4..d98ad9cd 100644 --- a/alphadia/outputtransform.py +++ b/alphadia/outputtransform.py @@ -1,6 +1,7 @@ # native imports import logging import os +from collections import defaultdict from collections.abc import Iterator import directlfq.config as lfqconfig @@ -25,6 +26,7 @@ ) from alphadia.transferlearning.train import FinetuneManager from alphadia.workflow import manager, peptidecentric +from alphadia.workflow.managers.raw_file_manager import RawFileManager logger = logging.getLogger() @@ -116,7 +118,7 @@ def accumulate_frag_df( raw_name, df = next(df_iterable, (None, None)) if df is None: logger.warning(f"no frag file found for {raw_name}") - return + return None df = prepare_df(df, self.psm_df, column=self.column) @@ -886,7 +888,7 @@ def build_library( if len(psm_df) == 0: logger.warning("No precursors found, skipping library building") - return + return None libbuilder = libtransform.MbrLibraryBuilder( fdr=0.01, @@ -938,53 +940,139 @@ def _build_run_stat_df( Dataframe containing the statistics """ - optimization_manager_path = os.path.join( - folder, peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH - ) if channels is None: channels = [0] - out_df = [] + all_stats = [] for channel in channels: channel_df = run_df[run_df["channel"] == channel] - base_dict = { + stats = { "run": raw_name, "channel": channel, "precursors": len(channel_df), "proteins": channel_df["pg"].nunique(), } - if "weighted_mass_error" in channel_df.columns: - base_dict["ms1_accuracy"] = np.mean(channel_df["weighted_mass_error"]) - + stats["fwhm_rt"] = np.nan if "cycle_fwhm" in channel_df.columns: - base_dict["fwhm_rt"] = np.mean(channel_df["cycle_fwhm"]) + stats["fwhm_rt"] = np.mean(channel_df["cycle_fwhm"]) + stats["fwhm_mobility"] = np.nan if "mobility_fwhm" in channel_df.columns: - base_dict["fwhm_mobility"] = np.mean(channel_df["mobility_fwhm"]) - - if os.path.exists(optimization_manager_path): + stats["fwhm_mobility"] = np.mean(channel_df["mobility_fwhm"]) + + # collect optimization stats + optimization_stats = defaultdict(lambda: np.nan) + if 
os.path.exists( + optimization_manager_path := os.path.join( + folder, + peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PKL_NAME, + ) + ): optimization_manager = manager.OptimizationManager( path=optimization_manager_path ) + optimization_stats["ms2_error"] = optimization_manager.ms2_error + optimization_stats["ms1_error"] = optimization_manager.ms1_error + optimization_stats["rt_error"] = optimization_manager.rt_error + optimization_stats["mobility_error"] = optimization_manager.mobility_error + else: + logger.warning(f"Error reading optimization manager for {raw_name}") + + prefix = "optimization." + for key in ["ms2_error", "ms1_error", "rt_error", "mobility_error"]: + stats[f"{prefix}{key}"] = optimization_stats[key] + + # collect calibration stats + calibration_stats = defaultdict(lambda: np.nan) + if os.path.exists( + calibration_manager_path := os.path.join( + folder, + peptidecentric.PeptideCentricWorkflow.CALIBRATION_MANAGER_PKL_NAME, + ) + ): + calibration_manager = manager.CalibrationManager( + path=calibration_manager_path + ) + + if ( + fragment_mz_estimator := calibration_manager.get_estimator( + "fragment", "mz" + ) + ) and (fragment_mz_metrics := fragment_mz_estimator.metrics): + calibration_stats["ms2_median_accuracy"] = fragment_mz_metrics[ + "median_accuracy" + ] + calibration_stats["ms2_median_precision"] = fragment_mz_metrics[ + "median_precision" + ] - base_dict["ms2_error"] = optimization_manager.ms2_error - base_dict["ms1_error"] = optimization_manager.ms1_error - base_dict["rt_error"] = optimization_manager.rt_error - base_dict["mobility_error"] = optimization_manager.mobility_error + if ( + precursor_mz_estimator := calibration_manager.get_estimator( + "precursor", "mz" + ) + ) and (precursor_mz_metrics := precursor_mz_estimator.metrics): + calibration_stats["ms1_median_accuracy"] = precursor_mz_metrics[ + "median_accuracy" + ] + calibration_stats["ms1_median_precision"] = precursor_mz_metrics[ + "median_precision" + ] else: - logger.warning(f"Error reading optimization manager for {raw_name}") - base_dict["ms2_error"] = np.nan - base_dict["ms1_error"] = np.nan - base_dict["rt_error"] = np.nan - base_dict["mobility_error"] = np.nan + logger.warning(f"Error reading calibration manager for {raw_name}") + + prefix = "calibration." + for key in [ + "ms2_median_accuracy", + "ms2_median_precision", + "ms1_median_accuracy", + "ms1_median_precision", + ]: + stats[f"{prefix}{key}"] = calibration_stats[key] + + # collect raw stats + raw_stats = defaultdict(lambda: np.nan) + if os.path.exists( + raw_file_manager_path := os.path.join( + folder, peptidecentric.PeptideCentricWorkflow.RAW_FILE_MANAGER_PKL_NAME + ) + ): + raw_stats = RawFileManager( + path=raw_file_manager_path, load_from_file=True + ).stats + else: + logger.warning(f"Error reading raw file manager for {raw_name}") + + # deliberately mapping explicitly to avoid coupling raw_stats to the output too tightly + prefix = "raw." 
+ + stats[f"{prefix}gradient_min_m"] = raw_stats["rt_limit_min"] / 60 + stats[f"{prefix}gradient_max_m"] = raw_stats["rt_limit_max"] / 60 + stats[f"{prefix}gradient_length_m"] = ( + raw_stats["rt_limit_max"] - raw_stats["rt_limit_min"] + ) / 60 + for key in [ + "cycle_length", + "cycle_duration", + "cycle_number", + "msms_range_min", + "msms_range_max", + ]: + stats[f"{prefix}{key}"] = raw_stats[key] + + all_stats.append(stats) + + return pd.DataFrame(all_stats) - out_df.append(base_dict) - return pd.DataFrame(out_df) +def _get_value_or_nan(d: dict, key: str): + try: + return d[key] + except KeyError: + return np.nan def _build_run_internal_df( @@ -1006,7 +1094,7 @@ def _build_run_internal_df( """ timing_manager_path = os.path.join( - folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH + folder_path, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PKL_NAME ) raw_name = os.path.basename(folder_path) diff --git a/alphadia/peakgroup/kernel.py b/alphadia/peakgroup/kernel.py index 8cd37e28..f62b3668 100644 --- a/alphadia/peakgroup/kernel.py +++ b/alphadia/peakgroup/kernel.py @@ -8,7 +8,7 @@ import numpy as np # alphadia imports -from alphadia.data import alpharaw, bruker +from alphadia.data import alpharaw_wrapper, bruker logger = logging.getLogger() @@ -54,7 +54,7 @@ def __init__( self, dia_data: bruker.TimsTOFTransposeJIT | bruker.TimsTOFTranspose - | alpharaw.AlphaRaw, + | alpharaw_wrapper.AlphaRaw, fwhm_rt: float = 10.0, sigma_scale_rt: float = 1.0, fwhm_mobility: float = 0.03, @@ -177,7 +177,6 @@ def get_dense_matrix(self, verbose: bool = True): mobility_resolution = np.mean(np.diff(self.dia_data.mobility_values[::-1])) if verbose: - pass logger.info( f"Duty cycle consists of {rt_datapoints} frames, {rt_resolution:.2f} seconds cycle time" ) @@ -189,7 +188,6 @@ def get_dense_matrix(self, verbose: bool = True): mobility_sigma = self.determine_mobility_sigma(mobility_resolution) if verbose: - pass logger.info( f"FWHM in RT is {self.fwhm_rt:.2f} seconds, sigma is {rt_sigma:.2f}" ) diff --git a/alphadia/peakgroup/search.py b/alphadia/peakgroup/search.py index dcfb9550..ff7e692e 100644 --- a/alphadia/peakgroup/search.py +++ b/alphadia/peakgroup/search.py @@ -351,7 +351,9 @@ def select_candidates( rt = precursor_container.rt[i] mobility = precursor_container.mobility[i] - frame_limits = jit_data.get_frame_indices_tolerance(rt, config.rt_tolerance) + frame_limits = jit_data.get_frame_indices_tolerance( + rt, config.rt_tolerance, min_size=config.kernel_size + ) scan_limits = jit_data.get_scan_indices_tolerance( mobility, config.mobility_tolerance ) @@ -688,7 +690,7 @@ def build_candidates( cycle_limits_list = np.zeros((peak_cycle_list.shape[0], 2), dtype="int32") for candidate_rank, (scan_relative, cycle_relative) in enumerate( - zip(peak_scan_list, peak_cycle_list) # noqa: B905 ('strict' not supported by numba yet) + zip(peak_scan_list, peak_cycle_list) # ('strict' not supported by numba yet) ): scan_limits_relative, cycle_limits_relative = numeric.symetric_limits_2d( score, @@ -738,7 +740,7 @@ def build_candidates( peak_score_list, scan_limits_list, cycle_limits_list, - ): # noqa: B905 ('strict' not supported by numba yet) + ): # ('strict' not supported by numba yet) # does not work anymore scan_limits_absolute = numeric.wrap1( diff --git a/alphadia/peakgroup/utils.py b/alphadia/peakgroup/utils.py index 788c8f7e..6921d192 100644 --- a/alphadia/peakgroup/utils.py +++ b/alphadia/peakgroup/utils.py @@ -18,16 +18,16 @@ def assemble_isotope_mz(mono_mz, charge, 
isotope_intensity): @overload(assemble_isotope_mz) def _(mono_mz, charge, isotope_intensity): if not isinstance(mono_mz, nb.types.Float): - return + return None if not isinstance(charge, nb.types.Integer): - return + return None if not isinstance(isotope_intensity, nb.types.Array): - return + return None if isotope_intensity.ndim != 1: - return + return None def funcx_impl(mono_mz, charge, isotope_intensity): offset = np.arange(len(isotope_intensity)) * 1.0033548350700006 / charge diff --git a/alphadia/planning.py b/alphadia/planning.py index ae4fd08d..c8dc429f 100644 --- a/alphadia/planning.py +++ b/alphadia/planning.py @@ -40,6 +40,7 @@ def __init__( fasta_path_list: list[str] | None = None, config: dict | None = None, config_base_path: str | None = None, + quant_path: str | None = None, ) -> None: """Highest level class to plan a DIA Search. Owns the input file list, speclib and the config. @@ -59,6 +60,9 @@ def __init__( config_update : dict, optional dict to update the default config. Can be used for debugging purposes etc. + quant_path : str, optional + path to directory to save the quantification results (psm & frag parquet files). If not provided, the results are saved in the usual workflow folder + """ if config is None: config = {} @@ -70,9 +74,9 @@ def __init__( reporting.init_logging(self.output_folder) logger.progress(" _ _ ___ ___ _ ") - logger.progress(" __ _| |_ __| |_ __ _| \_ _| /_\ ") - logger.progress(" / _` | | '_ \ ' \\/ _` | |) | | / _ \ ") - logger.progress(" \__,_|_| .__/_||_\__,_|___/___/_/ \_\\") + logger.progress(r" __ _| |_ __| |_ __ _| \_ _| /_\ ") + logger.progress(" / _` | | '_ \\ ' \\/ _` | |) | | / _ \\ ") + logger.progress(" \\__,_|_| .__/_||_\\__,_|___/___/_/ \\_\\") logger.progress(" |_| ") logger.progress("") @@ -80,6 +84,7 @@ def __init__( self.raw_path_list = raw_path_list self.library_path = library_path self.fasta_path_list = fasta_path_list + self.quant_path = quant_path logger.progress(f"version: {alphadia.__version__}") @@ -135,12 +140,12 @@ def raw_path_list(self, raw_path_list: list[str]): self._raw_path_list = raw_path_list @property - def config(self) -> dict: + def config(self) -> Config: """Dict with all configuration parameters for the extraction.""" return self._config @config.setter - def config(self, config: dict) -> None: + def config(self, config: Config) -> None: self._config = config @property @@ -318,11 +323,11 @@ def run( workflow = peptidecentric.PeptideCentricWorkflow( raw_name, self.config, + quant_path=self.quant_path, ) - workflow_folder_list.append(workflow.path) - # check if the raw file is already processed + workflow_folder_list.append(workflow.path) psm_location = os.path.join(workflow.path, "psm.parquet") frag_location = os.path.join(workflow.path, "frag.parquet") diff --git a/alphadia/plexscoring.py b/alphadia/plexscoring.py index 0a33c8fa..4ae4e674 100644 --- a/alphadia/plexscoring.py +++ b/alphadia/plexscoring.py @@ -11,7 +11,7 @@ # alphadia imports from alphadia import features, quadrupole, utils, validate -from alphadia.data import alpharaw, bruker +from alphadia.data import alpharaw_wrapper, bruker from alphadia.numba import config, fragments from alphadia.plotting.cycle import plot_cycle from alphadia.plotting.debug import ( @@ -1389,7 +1389,7 @@ class CandidateScoring: def __init__( self, - dia_data: bruker.TimsTOFTransposeJIT | alpharaw.AlphaRawJIT, + dia_data: bruker.TimsTOFTransposeJIT | alpharaw_wrapper.AlphaRawJIT, precursors_flat: pd.DataFrame, fragments_flat: pd.DataFrame, quadrupole_calibration: 
quadrupole.SimpleQuadrupole = None, diff --git a/alphadia/transferlearning/train.py b/alphadia/transferlearning/train.py index b9aa2426..9704d557 100644 --- a/alphadia/transferlearning/train.py +++ b/alphadia/transferlearning/train.py @@ -994,7 +994,7 @@ def finetune_ccs(self, psm_df: pd.DataFrame) -> pd.DataFrame: logger.error( "Failed to finetune CCS model. PSM dataframe does not contain mobility or ccs columns." ) - return + return None if "ccs" not in psm_df.columns: psm_df["ccs"] = mobility_to_ccs_for_df(psm_df, "mobility") elif "mobility" not in psm_df.columns: diff --git a/alphadia/utils.py b/alphadia/utils.py index 8408b584..7281f4d9 100644 --- a/alphadia/utils.py +++ b/alphadia/utils.py @@ -1,5 +1,6 @@ # native imports import logging +import math import platform import re from ctypes import Structure, c_double @@ -8,13 +9,13 @@ # alpha family imports import alphatims.bruker import alphatims.utils -import matplotlib.patches as patches import numba as nb import numpy as np # third party imports import pandas as pd import torch +from matplotlib import patches logger = logging.getLogger() @@ -681,3 +682,71 @@ def merge_missing_columns( # merge return left_df.merge(right_df[on + missing_from_left], on=on, how=how) + + +@nb.njit(inline="always") +def get_frame_indices( + rt_values: np.ndarray, + rt_values_array: np.ndarray, + zeroth_frame: int, + cycle_len: int, + precursor_cycle_max_index: int, + optimize_size: int = 16, + min_size: int = 32, +) -> np.ndarray: + """ + Convert an interval of two rt values to a frame index interval. + The length of the interval is rounded up so that a multiple of `optimize_size` cycles are included. + + Parameters + ---------- + rt_values : np.ndarray, shape = (2,), dtype = float32 + Array of rt values. + rt_values_array : np.ndarray + Array containing all rt values for searching. + zeroth_frame : int + Indicator if the first frame is zero. + cycle_len : int + The size of the cycle dimension. + precursor_cycle_max_index : int + Maximum index for precursor cycles. + optimize_size : int, default = 16 + Optimize for FFT efficiency by using multiples of this size. + min_size : int, default = 32 + Minimum number of DIA cycles to include. 
+ + Returns + ------- + np.ndarray, shape = (1, 3), dtype = int64 + Array of frame indices (start, stop, 1) + """ + if rt_values.shape != (2,): + raise ValueError("rt_values must be a numpy array of shape (2,)") + + frame_index = np.searchsorted(rt_values_array, rt_values, "left") + + precursor_cycle_limits = (frame_index + zeroth_frame) // cycle_len + precursor_cycle_len = precursor_cycle_limits[1] - precursor_cycle_limits[0] + + # Apply minimum size + optimal_len = max(precursor_cycle_len, min_size) + # Round up to the next multiple of `optimize_size` + optimal_len = int(optimize_size * math.ceil(optimal_len / optimize_size)) + + # By default, extend the precursor cycle to the right + optimal_cycle_limits = np.array( + [precursor_cycle_limits[0], precursor_cycle_limits[0] + optimal_len], + dtype=np.int64, + ) + + # If the cycle is too long, extend it to the left + if optimal_cycle_limits[1] > precursor_cycle_max_index: + optimal_cycle_limits[1] = precursor_cycle_max_index + optimal_cycle_limits[0] = precursor_cycle_max_index - optimal_len + + if optimal_cycle_limits[0] < 0: + optimal_cycle_limits[0] = 0 + + # Convert back to frame indices + frame_limits = optimal_cycle_limits * cycle_len + zeroth_frame + return make_slice_1d(frame_limits) diff --git a/alphadia/workflow/base.py b/alphadia/workflow/base.py index 08b16143..befee8ac 100644 --- a/alphadia/workflow/base.py +++ b/alphadia/workflow/base.py @@ -4,17 +4,18 @@ # alpha family imports from alphabase.spectral_library.base import SpecLibBase -from alphabase.spectral_library.flat import SpecLibFlat # alphadia imports -from alphadia.data import alpharaw, bruker +from alphadia.data import alpharaw_wrapper, bruker from alphadia.workflow import manager, reporting +from alphadia.workflow.config import Config +from alphadia.workflow.managers.raw_file_manager import RawFileManager # third party imports logger = logging.getLogger() -TEMP_FOLDER = ".progress" +QUANT_FOLDER_NAME = "quant" class WorkflowBase: @@ -22,16 +23,18 @@ class WorkflowBase: It also initializes the calibration_manager and fdr_manager for the workflow. """ - CALIBRATION_MANAGER_PATH = "calibration_manager.pkl" - OPTIMIZATION_MANAGER_PATH = "optimization_manager.pkl" - TIMING_MANAGER_PATH = "timing_manager.pkl" - FDR_MANAGER_PATH = "fdr_manager.pkl" - FIGURE_PATH = "figures" + RAW_FILE_MANAGER_PKL_NAME = "raw_file_manager.pkl" + CALIBRATION_MANAGER_PKL_NAME = "calibration_manager.pkl" + OPTIMIZATION_MANAGER_PKL_NAME = "optimization_manager.pkl" + TIMING_MANAGER_PKL_NAME = "timing_manager.pkl" + FDR_MANAGER_PKL_NAME = "fdr_manager.pkl" + FIGURES_FOLDER_NAME = "figures" def __init__( self, instance_name: str, - config: dict, + config: Config, + quant_path: str = None, ) -> None: """ Parameters @@ -40,18 +43,22 @@ def __init__( instance_name: str Name for the particular workflow instance. this will usually be the name of the raw file - parent_path: str - Path where the workflow folder will be created - config: dict Configuration for the workflow. 
This will be used to initialize the calibration manager and fdr manager + quant_path: str + path to directory holding quant folders, relevant for distributed searches + """ self._instance_name: str = instance_name - self._parent_path: str = os.path.join(config["output"], TEMP_FOLDER) - self._config: dict = config + self._parent_path: str = quant_path or os.path.join( + config["output"], QUANT_FOLDER_NAME + ) + self._config: Config = config self.reporter: reporting.Pipeline | None = None - self._dia_data: bruker.TimsTOFTranspose | alpharaw.AlphaRaw | None = None + self._dia_data: bruker.TimsTOFTranspose | alpharaw_wrapper.AlphaRaw | None = ( + None + ) self._spectral_library: SpecLibBase | None = None self._calibration_manager: manager.CalibrationManager | None = None self._optimization_manager: manager.OptimizationManager | None = None @@ -82,9 +89,17 @@ def load( self.reporter.context.__enter__() self.reporter.log_event("section_start", {"name": "Initialize Workflow"}) - self.reporter.log_event("loading_data", {"progress": 0}) # load the raw data - self._dia_data = self._get_dia_data_object(dia_data_path) + self.reporter.log_event("loading_data", {"progress": 0}) + raw_file_manager = RawFileManager( + self.config, + path=os.path.join(self.path, self.RAW_FILE_MANAGER_PKL_NAME), + reporter=self.reporter, + ) + + self._dia_data = raw_file_manager.get_dia_data_object(dia_data_path) + raw_file_manager.save() + self.reporter.log_event("loading_data", {"progress": 1}) # load the spectral library @@ -93,7 +108,7 @@ def load( # initialize the calibration manager self._calibration_manager = manager.CalibrationManager( self.config["calibration_manager"], - path=os.path.join(self.path, self.CALIBRATION_MANAGER_PATH), + path=os.path.join(self.path, self.CALIBRATION_MANAGER_PKL_NAME), load_from_file=self.config["general"]["reuse_calibration"], reporter=self.reporter, ) @@ -106,14 +121,14 @@ def load( self._optimization_manager = manager.OptimizationManager( self.config, gradient_length=self.dia_data.rt_values.max(), - path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PATH), + path=os.path.join(self.path, self.OPTIMIZATION_MANAGER_PKL_NAME), load_from_file=self.config["general"]["reuse_calibration"], - figure_path=os.path.join(self.path, self.FIGURE_PATH), + figure_path=os.path.join(self.path, self.FIGURES_FOLDER_NAME), reporter=self.reporter, ) self._timing_manager = manager.TimingManager( - path=os.path.join(self.path, self.TIMING_MANAGER_PATH), + path=os.path.join(self.path, self.TIMING_MANAGER_PKL_NAME), load_from_file=self.config["general"]["reuse_calibration"], ) @@ -135,7 +150,7 @@ def path(self) -> str: return os.path.join(self.parent_path, self.instance_name) @property - def config(self) -> dict: + def config(self) -> Config: """Configuration for the workflow.""" return self._config @@ -155,91 +170,13 @@ def timing_manager(self) -> manager.TimingManager: return self._timing_manager @property - def spectral_library(self) -> SpecLibFlat: + def spectral_library(self) -> SpecLibBase | None: """Spectral library for the workflow. Owns the spectral library data""" return self._spectral_library @property def dia_data( self, - ) -> bruker.TimsTOFTransposeJIT | alpharaw.AlphaRawJIT: + ) -> bruker.TimsTOFTranspose | alpharaw_wrapper.AlphaRawJIT: """DIA data for the workflow. 
Owns the DIA data""" return self._dia_data - - def _get_dia_data_object( - self, dia_data_path: str - ) -> bruker.TimsTOFTranspose | alpharaw.AlphaRaw: - """Get the correct data class depending on the file extension of the DIA data file. - - Parameters - ---------- - - dia_data_path: str - Path to the DIA data file - - Returns - ------- - typing.Union[bruker.TimsTOFTranspose, thermo.Thermo], - TimsTOFTranspose object containing the DIA data - - """ - file_extension = os.path.splitext(dia_data_path)[1] - - if self.config["general"]["wsl"]: - # copy file to /tmp - import shutil - - tmp_path = "/tmp" - tmp_dia_data_path = os.path.join(tmp_path, os.path.basename(dia_data_path)) - shutil.copyfile(dia_data_path, tmp_dia_data_path) - dia_data_path = tmp_dia_data_path - - if file_extension.lower() == ".d" or file_extension.lower() == ".hdf": - self.reporter.log_metric("raw_data_type", "bruker") - dia_data = bruker.TimsTOFTranspose( - dia_data_path, - mmap_detector_events=self.config["general"]["mmap_detector_events"], - ) - - elif file_extension.lower() == ".raw": - self.reporter.log_metric("raw_data_type", "thermo") - # check if cv selection exists - cv = None - if ( - "raw_data_loading" in self.config - and "cv" in self.config["raw_data_loading"] - ): - cv = self.config["raw_data_loading"]["cv"] - - dia_data = alpharaw.Thermo( - dia_data_path, - process_count=self.config["general"]["thread_count"], - astral_ms1=self.config["general"]["astral_ms1"], - cv=cv, - ) - - elif file_extension.lower() == ".mzml": - self.reporter.log_metric("raw_data_type", "mzml") - - dia_data = alpharaw.MzML( - dia_data_path, - process_count=self.config["general"]["thread_count"], - ) - - elif file_extension.lower() == ".wiff": - self.reporter.log_metric("raw_data_type", "sciex") - - dia_data = alpharaw.Sciex( - dia_data_path, - process_count=self.config["general"]["thread_count"], - ) - - else: - raise ValueError( - f"Unknown file extension {file_extension} for file at {dia_data_path}" - ) - - # remove tmp file if wsl - if self.config["general"]["wsl"]: - os.remove(tmp_dia_data_path) - return dia_data diff --git a/alphadia/workflow/config.py b/alphadia/workflow/config.py index 8b30904b..e84ef56a 100644 --- a/alphadia/workflow/config.py +++ b/alphadia/workflow/config.py @@ -528,6 +528,9 @@ def from_dict(self, config: dict[str, Any]) -> None: def to_dict(self) -> dict[str, Any]: return self.config + def get(self, key: str, default: Any = None) -> Any: + return self.config.get(key, default) + def __getitem__(self, key: str) -> Any: return self.config[key] diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index 6dc2e3e8..04c12bbf 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -2,6 +2,7 @@ import logging import os import pickle +import traceback import typing from collections import defaultdict from copy import deepcopy @@ -19,9 +20,12 @@ from alphadia import fdr from alphadia.calibration.property import Calibration, calibration_model_provider from alphadia.workflow import reporting +from alphadia.workflow.config import Config logger = logging.getLogger() +# TODO move all managers to dedicated modules + class BaseManager: def __init__( @@ -51,6 +55,7 @@ def __init__( self.reporter = reporting.LogBackend() if reporter is None else reporter if load_from_file: + # Note: be careful not to overwrite loaded values by initializing them in child classes after calling super().__init__() self.load() @property @@ -78,46 +83,51 @@ def is_fitted(self, value): def save(self): 
"""Save the state to pickle file.""" - if self.path is not None: - try: - with open(self.path, "wb") as f: - pickle.dump(self, f) - except Exception: - self.reporter.log_string( - f"Failed to save {self.__class__.__name__} to {self.path}", - verbosity="error", - ) + if self.path is None: + return + + try: + with open(self.path, "wb") as f: + pickle.dump(self, f) + except Exception as e: + self.reporter.log_string( + f"Failed to save {self.__class__.__name__} to {self.path}: {str(e)}", + verbosity="error", + ) + + self.reporter.log_string( + f"Traceback: {traceback.format_exc()}", verbosity="error" + ) def load(self): """Load the state from pickle file.""" - if self.path is not None: - if os.path.exists(self.path): - try: - with open(self.path, "rb") as f: - loaded_state = pickle.load(f) - - if loaded_state._version == self._version: - self.__dict__.update(loaded_state.__dict__) - self.is_loaded_from_file = True - else: - self.reporter.log_string( - f"Version mismatch while loading {self.__class__}: {loaded_state._version} != {self._version}. Will not load.", - verbosity="warning", - ) - except Exception: + if self.path is None or not os.path.exists(self.path): + self.reporter.log_string( + f"{self.__class__.__name__} not found at: {self.path}", + verbosity="warning", + ) + return + + try: + with open(self.path, "rb") as f: + loaded_state = pickle.load(f) + + if loaded_state._version == self._version: + self.__dict__.update(loaded_state.__dict__) + self.is_loaded_from_file = True self.reporter.log_string( - f"Failed to load {self.__class__.__name__} from {self.path}", - verbosity="error", + f"Loaded {self.__class__.__name__} from {self.path}" ) else: self.reporter.log_string( - f"Loaded {self.__class__.__name__} from {self.path}" + f"Version mismatch while loading {self.__class__}: {loaded_state._version} != {self._version}. 
Will not load.", + verbosity="warning", ) - else: - self.reporter.log_string( - f"{self.__class__.__name__} not found at: {self.path}", - verbosity="warning", - ) + except Exception: + self.reporter.log_string( + f"Failed to load {self.__class__.__name__} from {self.path}", + verbosity="error", + ) def fit(self): """Fit the workflow property of the manager.""" @@ -454,7 +464,7 @@ def fit_predict( class OptimizationManager(BaseManager): def __init__( self, - config: None | dict = None, + config: None | Config = None, gradient_length: None | float = None, path: None | str = None, load_from_file: bool = True, diff --git a/alphadia/workflow/managers/__init__.py b/alphadia/workflow/managers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/alphadia/workflow/managers/raw_file_manager.py b/alphadia/workflow/managers/raw_file_manager.py new file mode 100644 index 00000000..bada5b69 --- /dev/null +++ b/alphadia/workflow/managers/raw_file_manager.py @@ -0,0 +1,163 @@ +"""Manager handling the raw data file and its statistics.""" + +import logging +import os + +import numpy as np + +from alphadia.data import alpharaw_wrapper, bruker +from alphadia.workflow.config import Config +from alphadia.workflow.manager import BaseManager + +logger = logging.getLogger() + + +class RawFileManager(BaseManager): + def __init__( + self, + config: None | Config = None, + path: None | str = None, + load_from_file: bool = False, + **kwargs, + ): + """Handles raw file loading and contains information on the raw file.""" + self.stats = {} # needs to be before super().__init__ to avoid overwriting loaded values + + super().__init__(path=path, load_from_file=load_from_file, **kwargs) + + self._config: Config = config + + # deliberately not saving the actual raw data here to avoid the saved manager file being too large + + self.reporter.log_string(f"Initializing {self.__class__.__name__}") + self.reporter.log_event("initializing", {"name": f"{self.__class__.__name__}"}) + + def get_dia_data_object( + self, dia_data_path: str + ) -> bruker.TimsTOFTranspose | alpharaw_wrapper.AlphaRaw: + """Get the correct data class depending on the file extension of the DIA data file. 
+ + Parameters + ---------- + + dia_data_path: str + Path to the DIA data file + + Returns + ------- + typing.Union[bruker.TimsTOFTranspose, thermo.Thermo], + TimsTOFTranspose object containing the DIA data + + """ + file_extension = os.path.splitext(dia_data_path)[1] + + is_wsl = self._config["general"]["wsl"] + if is_wsl: + # copy file to /tmp # TODO check if WSL support can be dropped + import shutil + + tmp_path = "/tmp" + tmp_dia_data_path = os.path.join(tmp_path, os.path.basename(dia_data_path)) + shutil.copyfile(dia_data_path, tmp_dia_data_path) + dia_data_path = tmp_dia_data_path + + if file_extension.lower() == ".d": + raw_data_type = "bruker" + dia_data = bruker.TimsTOFTranspose( + dia_data_path, + mmap_detector_events=self._config["general"]["mmap_detector_events"], + ) + + elif file_extension.lower() == ".hdf": + raw_data_type = "alpharaw" + dia_data = alpharaw_wrapper.AlphaRawBase( + dia_data_path, + process_count=self._config["general"]["thread_count"], + ) + + elif file_extension.lower() == ".raw": + raw_data_type = "thermo" + + cv = self._config.get("raw_data_loading", {}).get("cv") + + dia_data = alpharaw_wrapper.Thermo( + dia_data_path, + process_count=self._config["general"]["thread_count"], + astral_ms1=self._config["general"]["astral_ms1"], + cv=cv, + ) + + elif file_extension.lower() == ".mzml": + raw_data_type = "mzml" + + dia_data = alpharaw_wrapper.MzML( + dia_data_path, + process_count=self._config["general"]["thread_count"], + ) + + elif file_extension.lower() == ".wiff": + raw_data_type = "sciex" + + dia_data = alpharaw_wrapper.Sciex( + dia_data_path, + process_count=self._config["general"]["thread_count"], + ) + + else: + raise ValueError( + f"Unknown file extension {file_extension} for file at {dia_data_path}" + ) + + self.reporter.log_metric("raw_data_type", raw_data_type) + + # remove tmp file if wsl + if is_wsl: + os.remove(tmp_dia_data_path) + + self._calc_stats(dia_data) + + self._log_stats() + + return dia_data + + def _calc_stats( + self, dia_data: bruker.TimsTOFTranspose | alpharaw_wrapper.AlphaRaw + ): + """Calculate statistics from the DIA data.""" + rt_values = dia_data.rt_values + cycle = dia_data.cycle + + stats = {} + stats["rt_limit_min"] = rt_values.min() + stats["rt_limit_max"] = rt_values.max() + + cycle_length = cycle.shape[1] + stats["cycle_length"] = cycle_length + stats["cycle_duration"] = np.diff(rt_values[::cycle_length]).mean() + stats["cycle_number"] = len(rt_values) // cycle_length + + flat_cycle = cycle.flatten() + flat_cycle = flat_cycle[flat_cycle > 0] + + stats["msms_range_min"] = flat_cycle.min() + stats["msms_range_max"] = flat_cycle.max() + + self.stats = stats + + def _log_stats(self): + """Log the statistics calculated from the DIA data.""" + rt_duration = self.stats["rt_limit_max"] - self.stats["rt_limit_min"] + + logger.info( + f"{'RT (min)':<20}: {self.stats['rt_limit_min']/60:.1f} - {self.stats['rt_limit_max']/60:.1f}" + ) + logger.info(f"{'RT duration (sec)':<20}: {rt_duration:.1f}") + logger.info(f"{'RT duration (min)':<20}: {rt_duration/60:.1f}") + + logger.info(f"{'Cycle len (scans)':<20}: {self.stats['cycle_length']:.0f}") + logger.info(f"{'Cycle len (sec)':<20}: {self.stats['cycle_duration']:.2f}") + logger.info(f"{'Number of cycles':<20}: {self.stats['cycle_number']:.0f}") + + logger.info( + f"{'MS2 range (m/z)':<20}: {self.stats['msms_range_min']:.1f} - {self.stats['msms_range_max']:.1f}" + ) diff --git a/alphadia/workflow/optimization.py b/alphadia/workflow/optimization.py index a446a87b..3f00b85a 100644 --- 
a/alphadia/workflow/optimization.py +++ b/alphadia/workflow/optimization.py @@ -57,12 +57,9 @@ def step(self, precursors_df: pd.DataFrame, fragments_df: pd.DataFrame): """ - pass - @abstractmethod def skip(self): """Record skipping of optimization. Can be overwritten with an empty method if there is no need to record skips.""" - pass def proceed_with_insufficient_precursors(self, precursors_df, fragments_df): self.workflow.reporter.log_string( @@ -80,7 +77,6 @@ def proceed_with_insufficient_precursors(self, precursors_df, fragments_df): @abstractmethod def plot(self): """Plots the progress of the optimization. Can be overwritten with an empty method if there is no need to plot the progress.""" - pass @abstractmethod def _update_workflow(): @@ -92,7 +88,6 @@ def _update_workflow(): and FWHM_mobility """ - pass @abstractmethod def _update_history(): @@ -107,7 +102,6 @@ def _update_history(): The filtered fragment dataframe for the search. """ - pass class AutomaticOptimizer(BaseOptimizer): @@ -503,7 +497,6 @@ def _get_feature_value( """ - pass class TargetedOptimizer(BaseOptimizer): @@ -611,11 +604,9 @@ def step( def skip(self): """See base class.""" - pass def plot(self): """See base class""" - pass def _update_workflow(self): pass diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 7e6b8f79..b03c05c0 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -17,6 +17,7 @@ from alphadia import fragcomp, plexscoring, utils from alphadia.peakgroup import search from alphadia.workflow import base, manager, optimization +from alphadia.workflow.config import Config logger = logging.getLogger() @@ -102,11 +103,13 @@ class PeptideCentricWorkflow(base.WorkflowBase): def __init__( self, instance_name: str, - config: dict, + config: Config, + quant_path: str = None, ) -> None: super().__init__( instance_name, config, + quant_path, ) self.optlock = None @@ -257,6 +260,36 @@ def norm_to_rt( else: raise ValueError(f"Unknown norm_rt_mode {mode}") + def get_precursor_mz_column(self): + """Get the precursor m/z column name. + This function will return `mz_calibrated` if precursor calibration has happened, otherwise it will return `mz_library`. + If no MS1 data is present, it will always return `mz_library`. + + Returns + ------- + str + Name of the precursor m/z column + + """ + return ( + f"mz_{self.optimization_manager.column_type}" + if self.dia_data.has_ms1 + else "mz_library" + ) + + def get_fragment_mz_column(self): + return f"mz_{self.optimization_manager.column_type}" + + def get_rt_column(self): + return f"rt_{self.optimization_manager.column_type}" + + def get_mobility_column(self): + return ( + f"mobility_{self.optimization_manager.column_type}" + if self.dia_data.has_mobility + else "mobility_library" + ) + def get_ordered_optimizers(self): """Select appropriate optimizers. Targeted optimization is used if a valid target value (i.e. a number greater than 0) is specified in the config; if a value less than or equal to 0 is supplied, automatic optimization is used. 
@@ -477,6 +510,7 @@ def search_parameter_optimization(self): log_string( "==============================================", verbosity="progress" ) + if insufficient_precursors_to_optimize: precursor_df_filtered, fragments_df_filtered = self.filter_dfs( precursor_df, self.optlock.fragments_df @@ -756,14 +790,10 @@ def extract_batch( batch_precursor_df, batch_fragment_df, config.jitclass(), - rt_column=f"rt_{self.optimization_manager.column_type}", - mobility_column=f"mobility_{self.optimization_manager.column_type}" - if self.dia_data.has_mobility - else "mobility_library", - precursor_mz_column=f"mz_{self.optimization_manager.column_type}" - if self.dia_data.has_ms1 - else "mz_library", - fragment_mz_column=f"mz_{self.optimization_manager.column_type}", + rt_column=self.get_rt_column(), + mobility_column=self.get_mobility_column(), + precursor_mz_column=self.get_precursor_mz_column(), + fragment_mz_column=self.get_fragment_mz_column(), fwhm_rt=self.optimization_manager.fwhm_rt, fwhm_mobility=self.optimization_manager.fwhm_mobility, ) @@ -803,14 +833,10 @@ def extract_batch( batch_precursor_df, batch_fragment_df, config=config, - rt_column=f"rt_{self.optimization_manager.column_type}", - mobility_column=f"mobility_{self.optimization_manager.column_type}" - if self.dia_data.has_mobility - else "mobility_library", - precursor_mz_column=f"mz_{self.optimization_manager.column_type}" - if self.dia_data.has_ms1 - else "mz_library", - fragment_mz_column=f"mz_{self.optimization_manager.column_type}", + rt_column=self.get_rt_column(), + mobility_column=self.get_mobility_column(), + precursor_mz_column=self.get_precursor_mz_column(), + fragment_mz_column=self.get_fragment_mz_column(), ) features_df, fragments_df = candidate_scoring( @@ -1048,12 +1074,10 @@ def requantify(self, psm_df): self.spectral_library.precursor_df_unfiltered, self.spectral_library.fragment_df, config=config, - precursor_mz_column="mz_calibrated", - fragment_mz_column="mz_calibrated", - rt_column="rt_calibrated", - mobility_column="mobility_calibrated" - if self.dia_data.has_mobility - else "mobility_library", + rt_column=self.get_rt_column(), + mobility_column=self.get_mobility_column(), + precursor_mz_column=self.get_precursor_mz_column(), + fragment_mz_column=self.get_fragment_mz_column(), ) multiplexed_candidates["rank"] = 0 @@ -1141,8 +1165,10 @@ def requantify_fragments( candidate_speclib_flat.precursor_df, candidate_speclib_flat.fragment_df, config=config, - precursor_mz_column="mz_calibrated", - fragment_mz_column="mz_calibrated", + rt_column=self.get_rt_column(), + mobility_column=self.get_mobility_column(), + precursor_mz_column=self.get_precursor_mz_column(), + fragment_mz_column=self.get_fragment_mz_column(), ) # we disregard the precursors, as we want to keep the original scoring from the top12 search diff --git a/alphadia/workflow/reporting.py b/alphadia/workflow/reporting.py index c45211db..40519453 100644 --- a/alphadia/workflow/reporting.py +++ b/alphadia/workflow/reporting.py @@ -595,7 +595,7 @@ def log_figure(self, name: str, figure: typing.Any, *args, **kwargs): for backend in self.backends: backend.log_figure(name, figure, *args, **kwargs) - def log_metric(self, name: str, value: float, *args, **kwargs): + def log_metric(self, name: str, value: float | str, *args, **kwargs): for backend in self.backends: backend.log_metric(name, value, *args, **kwargs) diff --git a/assets/distributed_search_schematic.svg b/assets/distributed_search_schematic.svg new file mode 100644 index 00000000..be9319ff --- /dev/null +++ 
b/assets/distributed_search_schematic.svg @@ -0,0 +1 @@ + diff --git a/docs/developer_guide.md b/docs/developer_guide.md index 74424fb0..54663e06 100644 --- a/docs/developer_guide.md +++ b/docs/developer_guide.md @@ -16,7 +16,9 @@ This package uses a shared release process defined in the ## Notes for developers ### Debugging -To debug e2e tests with PyCharm: +A good start for debugging is this notebook: `nvs/debug/debug_lvl1.ipynb` + +##### Debug e2e tests with PyCharm 1. Create a "Run/Debug configuration" with - "module": `alphadia.cli` - "script parameters": `--config /abs/path/to/tests/e2e_tests/basic/config.yaml` @@ -24,6 +26,29 @@ To debug e2e tests with PyCharm: 2. Uncomment the lines following the `uncomment for debugging` comment in `alphadia/cli.py`. 3. Run the configuration. +##### Debug e2e tests with VS Code +1. Create the following debug configuration (`launch.json`, see [here](https://code.visualstudio.com/docs/editor/debugging#_launch-configurations)): +```json +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "cwd": "/abs/path/to/tests/e2e_tests", + "program": "../../alphadia/cli.py", + "console": "integratedTerminal", + "args": [ + "--config", "/abs/path/to/tests/e2e_tests/basic/config.yaml" + ] + } + ] +} +``` +2. Uncomment the lines following the `uncomment for debugging` comment in `alphadia/cli.py`. +3. Run the configuration. + ### pre-commit hooks It is highly recommended to use the provided pre-commit hooks, as the CI pipeline enforces all checks therein to diff --git a/docs/methods.md b/docs/methods.md index 83e3ad8b..134a5984 100644 --- a/docs/methods.md +++ b/docs/methods.md @@ -3,8 +3,9 @@ ```{toctree} :maxdepth: 1 methods/command-line -methods/configuration methods/calibration methods/transfer-learning - +methods/output-format +methods/dist_search_setup +methods/search_parameter_optimization ``` diff --git a/docs/methods/dist_search_setup.md b/docs/methods/dist_search_setup.md new file mode 100644 index 00000000..401bea47 --- /dev/null +++ b/docs/methods/dist_search_setup.md @@ -0,0 +1,58 @@ +# Distributed Search + +This guide deals with setting up a distributed search in AlphaDIA, with the following prerequisites: +- A Linux (Ubuntu) HPCL system with Slurm Workload Manager installed. All resource management is handled by Slurm, AlphaDIA does not select, manage or monitor worker nodes. +- The distributed search requires absolute paths for each raw file, saved in the second column of a two-column .csv document. Simpler structures that e.g. process all files in a given directory are disfavored as large cohorts frequently consist of rawfiles spread across a number of subfolders. +- An Anaconda environment called "alphadia" with _mono_ and _alphadia_ installed (for installing _mono_, see [alpharaw](https://github.com/MannLabs/alpharaw#installation)) + +## Distributed search concept + +![Distributed_Search](../../assets/distributed_search_schematic.svg) + +Compared to a linear two step search, distributing the raw file search steps offers a significant advantage in speed. At the most extreme, N files could be searched on N machines in parallel, completely decoupling search time from the number of files. In practice, not all steps of a search can be easily parallelized (i.e. steps which require knowledge of all files at the same time). 
Additionally, the available processing nodes are never unlimited, requiring a chunking approach to ensure the maximum amount of parallelization with a given number of processing nodes. The diagram above summarizes these steps, indicating parallel AlphaDIA instances for the first and second pass of raw file searches. + +## Steps to set up a search + +1. Set up an empty search directory on your HPCL partition. One directory corresponds to one study, i.e. one set of raw files, fasta/library and search configuration. +2. Copy all files from [alphadia/misc/distributed_search](https://github.com/MannLabs/alphadia/tree/main/misc/distributed_search) into the search directory +3. If no .csv file with rawfile paths exists, it can be obtained by running **discover_project_files.py** from the search directory. +4. Set the first and second search configurations in **first_config.yaml** and **second_config.yaml**. For example, the number of precursor candidates and the inference strategy, as well as the mass tolerances, may differ between the first and second search. +Leave all the predefined settings in the two .yaml files as they are. +5. Set the search parameters in **outer.sh**. While these can also be provided as command line arguments, it is convenient to set them in **outer.sh** itself. This file requires the following settings: + - `input_directory`: the search directory + - `input_filename`: the .csv file containing rawfile paths + - `target_directory`: the directory where intermediate and final outputs are written (mind that slow read/write speeds to this location may slow down your search) + - `library_path` (optional, will be reannotated if `fasta_path` is provided and `predict_library` is set to 1): absolute path to a .hdf spectral library + - `fasta_path` (optional if `library_path` is provided and `predict_library` is set to 0): absolute path to a .fasta file + - `first_search_config_filename`: name of the .yaml file for the first search + - `second_search_config_filename`: name of the .yaml file for building the MBR library, the second search and LFQ +6. Run **outer.sh** with the following search settings: + - `--nnodes` (int): specifies how many nodes can be occupied. Rawfile search will be distributed across these nodes. If there are 5 nodes and 50 raw files, the search will take place on 5 nodes in chunks of 10 rawfiles each. + - `--ntasks_per_node` (int): defaults to 1; some HPCL systems allow multiple tasks to run on one node + - `--cpus` (int): defaults to 12, specifies how many CPUs shall be used per task + - `--mem` (str): defaults to '250G', specifies the RAM requirements for each task. + **HPCL systems may be set to restrict user resources to certain limits.
Make sure the above parameters comply with your HPCL setup.** + - `--predict_library` (bool): defaults to 1, whether to predict a spectral library from a given fasta + - `--first_search` (bool): defaults to 1, whether to search all files with the initial spectral library + - `--mbr_library` (bool): whether to aggregate the first search results into a focused "MBR" library + - `--second_search` (bool): whether to perform a second search with the focused MBR library + - `--lfq` (bool): whether to perform LFQ quantification of the second search results + +#### A typical call for running the search could look like this: + +```console +sbatch outer.sh --nnodes 3 --search_config search.config +``` +where 'search.config' contains the name of the .csv file with the rawfile paths and other settings, and +```console +--nnodes 3 +``` +indicates that the search will be parallelized across three nodes. + +#### Running the search creates five subdirectories in the target folder: + +- `_predicted_speclib_`: If spectral library prediction was set, this folder contains the .hdf spectral library +- `_first_search_`: Contains one subdirectory for each processing chunk. AlphaDIA subprocesses for the first search are run from these chunks and their specific config.yaml files. Precursor and fragment datasets from these searches are saved into the `_mbr_library_` folder +- `_mbr_library_`: Contains one chunk, since the library is built from all first search results. +- `_second_search_`: Analogous to `_first_search_`, one subdirectory is created for each chunk of rawfiles that are searched with the mbr_library. Precursor and fragment datasets from these searches are saved into the `_lfq_` folder. +- `_lfq_`: Analogous to `_mbr_library_`, contains one chunk which runs label-free quantification (LFQ) on each output from the second search. After all search steps are completed, the final precursor and protein tables are saved here. diff --git a/docs/methods/output-format.md b/docs/methods/output-format.md new file mode 100644 index 00000000..67954550 --- /dev/null +++ b/docs/methods/output-format.md @@ -0,0 +1,78 @@ +# Output format +AlphaDIA writes tab-separated values (TSV) files. + +## `stats.tsv` +The `stats.tsv` file contains summary statistics and quality metrics for each run and channel in the analysis. +It provides insights into the search results, calibration quality, and general performance metrics. + +Format: one row per run/channel combination.
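To get a quick overview, the file can be loaded with pandas; the path below is a placeholder for your own output directory, and the selected columns are described in the following sections.

```python
import pandas as pd

# Placeholder path: point this at the stats.tsv in your own output directory.
stats = pd.read_csv("/path/to/output_directory/stats.tsv", sep="\t")

# One row per run/channel combination; a few of the columns described below:
print(stats[["run", "channel", "precursors", "proteins"]])
```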
+ +### Columns + +#### Basic Information +- `run`: Name of the raw file/run +- `channel`: Channel number (0 for label-free data, or channel numbers for multiplexed data) + +#### Search Results Statistics +- `precursors`: Number of identified precursors in this run/channel +- `proteins`: Number of unique protein groups identified in this run/channel + +#### Peak Width Statistics +These columns are only present if the data contains the relevant measurements: +- `fwhm_rt`: Mean full width at half maximum (FWHM) of peaks in retention time dimension +- `fwhm_mobility`: Mean FWHM of peaks in mobility dimension (only for ion mobility data) + +#### Optimization Statistics +These metrics show the final values used for various search parameters after optimization: + +- `optimization.ms2_error`: Final MS2 mass error tolerance used +- `optimization.ms1_error`: Final MS1 mass error tolerance used +- `optimization.rt_error`: Final retention time tolerance used +- `optimization.mobility_error`: Final ion mobility tolerance used (only for ion mobility data) + +#### Calibration Statistics +These metrics show the quality of mass calibration: + +##### MS2 Level +- `calibration.ms2_median_accuracy`: Median mass accuracy for fragment ions +- `calibration.ms2_median_precision`: Median mass precision for fragment ions + +##### MS1 Level +- `calibration.ms1_median_accuracy`: Median mass accuracy for precursor ions +- `calibration.ms1_median_precision`: Median mass precision for precursor ions + +### Notes + +- All FWHM values are reported in the native units of the measurement (minutes for RT, mobility units for IM) +- Mass accuracy values are typically reported in parts per million (ppm) +- Some columns may be NaN if the corresponding measurements or calibrations were not performed +- For label-free data, there will typically be one row per run with channel=0 +- For multiplexed data, there will be multiple rows per run (one for each channel) + +#### Raw File Statistics +These metrics are derived from the raw data file analysis: + +- `raw.gradient_min_m`: Minimum retention time in the run (minutes) +- `raw.gradient_max_m`: Maximum retention time in the run (minutes) +- `raw.gradient_length_m`: Total duration of the run (minutes) +- `raw.cycle_length`: Number of scans per cycle +- `raw.cycle_duration`: Average duration of each cycle in seconds +- `raw.cycle_number`: Total number of cycles in the run +- `raw.msms_range_min`: Minimum MS2 m/z value measured +- `raw.msms_range_max`: Maximum MS2 m/z value measured + + +## `precursors.tsv` +TODO + +## `pg.matrix.tsv` +TODO + +## `internal.tsv` +TODO + +## `speclib.mbr.hdf ` +TODO + +## `quant folder` +TODO diff --git a/docs/methods/search_parameter_optimization.md b/docs/methods/search_parameter_optimization.md index 24519f69..c716e649 100644 --- a/docs/methods/search_parameter_optimization.md +++ b/docs/methods/search_parameter_optimization.md @@ -1,12 +1,36 @@ -# Search Parameter Optimization +# Optimization and Calibration +In peptide centric DIA search, calibration of the library and optimization of search parameters is required to maximize the number of confident identifications. AlphaDIA performs both calibration and optimization iteratively. Calibration removes the systematic deviation of observed and library values to account for technical variation from the LC or MS instrument. Optimization reduces the search space to improve the confidence in identifications and to accelerate search. 
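As a rough illustration of the difference (not AlphaDIA's actual implementation), the sketch below removes a synthetic systematic retention time shift; a simple polynomial fit stands in for the LOESS-based estimators described later in this document.

```python
import numpy as np

# Synthetic example: observed RT drifts systematically away from the library RT.
rng = np.random.default_rng(0)
rt_library = np.sort(rng.uniform(0, 120, 500))                    # minutes
rt_observed = 1.02 * rt_library + 1.5 + rng.normal(0, 0.2, 500)   # drift + noise

# Calibration: model the systematic part (polynomial stand-in for LOESS)
# and remove it from the observed values.
coefficients = np.polyfit(rt_library, rt_observed, deg=2)
systematic = np.polyval(coefficients, rt_library)
residual = rt_observed - systematic   # only non-systematic variation remains
print(f"median absolute residual: {np.median(np.abs(residual)):.2f} min")
```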
+:::{note} +Calibration and optimization are different, but both are connected to transfer learning. In [transfer learning](./transfer-learning.md) the residual (non-systematic) variation is learned and thereby reduced. This usually leads to better performance when used together with optimization and calibration. +::: -## Calibration and optimization +## Overview of optimization +AlphaDIA can perform optimization for the following parameters: `target_ms1_tolerance`, `target_ms2_tolerance`, `target_mobility_tolerance` and `target_rt_tolerance`. There are two optimization strategies: targeted and automatic. + +### Targeted optimization +The search space is progressively narrowed until a target tolerance is reached for a given parameter. + +To activate targeted optimization, for example for the fragment m/z tolerance, set `target_ms2_tolerance` to `10` to use a target tolerance of 10 ppm. +For retention time, the target value can either be set as an absolute value in seconds or as a fraction of the total retention time range. + +For example, setting `target_rt_tolerance` to `300` will result in a target tolerance of 300 seconds, while setting it to `0.3` will use 30% of the gradient length as the target tolerance. + +### Automatic optimization +In automatic optimization, the search space is reduced until an optimal value is detected. This optimization is currently performed for every raw file individually. The results of the optimization can be found in the [stats.tsv]() file in the output directory. + +To activate automatic optimization for a certain quantity, set the respective target to `0.0`, for example set `target_rt_tolerance=0.0` for automatic retention time optimization. + +:::{tip} +We recommend always using automatic optimization for retention time and ion mobility, as set by default. For the mass tolerances, we recommend using automatic optimization in the first pass and then using the optimized values in the second pass. +::: + +## Optimization and Calibration Algorithm ### Overall process -The first step of every AlphaDIA run is the optimization of search parameters and the calibration of the empirical or fully predicted spectral library to the observed values. This step has two main purposes: 1) removing the systematic deviation of observed and library values, and 2) optimizing the size of the search space to reflect the expected deviation from the library values. For DIA search this means calibration and optimization of certain parameters: retention time, ion mobility, precursor m/z and fragment m/z. The process of iterative calibration and optimization is illustrated below. +AlphaDIA performs iterative optimization and calibration of retention time, ion mobility, precursor m/z and fragment m/z parameters as illustrated below. -Optimization can be performed in either a targeted or automatic manner. In targeted optimization, the search space is progressively narrowed until a target tolerance is reached for a given parameter. In automatic optimization, the search space is progressively narrowed until an internal algorithm detects that further narrowing will reduce the confident identification of precursors (either by directly assessing the proportion of the library which has been detected or using a surrogate metric, such as the mean isotope intensity correlation for precursor m/z tolerance), at which point the optimal value is selected for search. Automatic optimization can be triggered by setting the target tolerance to 0.0 (or a negative value).
It is possible to use targeted optimization for some parameters and automatic optimization for others; currently, it is recommended to use targeted optimization for precursor m/z, fragment m/z and ion mobility, and automatic optimization for retention time. +Optimization can be performed in either a targeted or automatic manner. In targeted optimization, the search space is progressively narrowed until a target tolerance is reached for a given parameter. In automatic optimization, the search space is progressively narrowed until an internal algorithm detects that further narrowing will reduce the confident identification of precursors (either by directly assessing the proportion of the library which has been detected or using a surrogate metric, such as the mean isotope intensity correlation for precursor m/z tolerance), at which point the optimal value is selected for search. It is possible to use targeted optimization for some parameters and automatic optimization for others. AlphaDIA iteratively performs calibration and optimization based on a subset of the spectral library used for search. The size of this subset is adjusted according to an exponential batch plan to balance accuracy and efficiency. A defined number of precursors, set by the ``optimization_lock_target`` (default: 200), need to be identified at 1% FDR before calibration and optimization are performed. If fewer precursors than the target number are identified using a given step of the batch plan, AlphaDIA will search for precursors from the next step of the batch plan in addition to those already searched. If more precursors than the target number are identified, AlphaDIA will check if any previous step of the batch plan is also likely to yield at least the target number, in which case it will use the smallest such step of the batch plan for the next iteration of calibration and optimization. In this way, AlphaDIA ensures that calibration is always performed on sufficient precursors to be reliable, while calibrating on the smallest-possible subset of the library to maximize efficiency. @@ -20,48 +44,31 @@ If enough confident target precursors have been detected, they are calibrated to ### Optimization For optimizing the search space, tolerances like retention time, ion mobility and m/z ratios need to be reduced. The goal is to cover the expected spectrum space but reduce it as much as possible to accelerate search and gain statistical power. Search starts with initial tolerances as defined in `search_initial`. For targeted optimization, the 95% deviation after calibration is adopted as the new tolerance until the target tolerances defined in the `search` section are reached. For automatic optimization, the 99% deviation plus 10% of the absolute value of the tolerance is adopted as the new tolerance, and search continues until parameter-specific convergence rules are met. -The optimization is finished as soon as the minimum number of steps `min_steps` has passed and all tolerances have either 1) reached the target tolerances defined in `search` if using targeted optimization, or 2) have converged if using automatic optimization. +The optimization is finished as soon as the minimum number of steps `min_steps` has passed and all tolerances have either 1. reached the target tolerances defined in `search` if using targeted optimization, or 2. have converged if using automatic optimization. 
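Read as pseudocode, a single update step of this narrowing could look roughly like the sketch below; the helper name and the clamping to the target value are illustrative assumptions, not AlphaDIA's actual implementation.

```python
import numpy as np

def next_tolerance(residuals, current_tolerance, target_tolerance=None):
    """One tolerance update step after calibration (illustrative only).

    residuals: deviations between observed and calibrated values.
    target_tolerance: set for targeted optimization, None for automatic.
    """
    if target_tolerance is not None:
        # targeted: adopt the 95% deviation, down to the configured target
        return max(np.percentile(np.abs(residuals), 95), target_tolerance)
    # automatic: adopt the 99% deviation plus 10% of the current tolerance
    return np.percentile(np.abs(residuals), 99) + 0.1 * abs(current_tolerance)
```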
## Configuring calibration and optimization -The configuration below will perform targeted optimization of precursor m/z, fragment m/z and ion mobility, and automatic optimization of retention time. +By default, alphaDIA performs targeted optimization of precursor m/z, fragment m/z, and automatic optimization of retention time and ion mobility using the settings below. ```yaml -calibration: - # Number of precursors searched and scored per batch - batch_size: 8000 - - # minimum number of precursors to be found before search parameter optimization begins - optimization_lock_target: 200 - - # the maximum number of steps that a given optimizer is permitted to take - max_steps: 20 - - # the minimum number of steps that a given optimizer must take before it can be said to have converged - min_steps: 2 - -search_initial: +search: # Number of peak groups identified in the convolution score to classify with target decoy competition - initial_num_candidates: 1 - - # initial ms1 tolerance in ppm - initial_ms1_tolerance: 30 + target_num_candidates: 2 - # initial ms2 tolerance in ppm - initial_ms2_tolerance: 30 + # Targeted optimization of precursor m/z tolerance. + # Use absolute values in ppm (e.g. 15ppm) or set to 0 for automatic optimization. + target_ms1_tolerance: 5 - # initial ion mobility tolerance in 1/K_0 - initial_mobility_tolerance: 0.08 + # Targeted optimization of fragment m/z tolerance. + # Use absolute values in ppm (e.g. 15ppm) or set to 0 for automatic optimization. + target_ms2_tolerance: 10 - # initial retention time tolerance in seconds - initial_rt_tolerance: 240 + # Targeted optimization of ion mobility tolerance. + # Use absolute values in 1/K0 (e.g. 0.04 1/K0) or set to 0 for automatic optimization. + target_mobility_tolerance: 0 -search: - target_num_candidates: 2 - target_ms1_tolerance: 15 - target_ms2_tolerance: 15 - target_mobility_tolerance: 0.04 + # Targeted optimization of retention time tolerance. + # Use absolute values in seconds (e.g. 300s) or set to 0 for automatic optimization. target_rt_tolerance: 0 - ``` ## Calibration using LOESS @@ -87,6 +94,7 @@ calibration_manager: - mz_observed output_columns: - mz_calibrated + # display deviation in ppm transform_deviation: 1e6 - name: precursor estimators: @@ -100,6 +108,7 @@ calibration_manager: - mz_observed output_columns: - mz_calibrated + # display deviation in ppm transform_deviation: 1e6 - name: rt model: LOESSRegression diff --git a/docs/methods/transfer-learning.md b/docs/methods/transfer-learning.md index 5edaf684..e4423855 100644 --- a/docs/methods/transfer-learning.md +++ b/docs/methods/transfer-learning.md @@ -1,5 +1,5 @@ -## Transfer Learning +# Transfer Learning Generally, transfer learning refers to the process of leveraging the knowledge learned in one model for better performance on a different task. A task is a vague term, but it essentially includes learning a different objective, for example, transitioning from regression to classification. It can also involve learning the same objective with a different loss function or optimizer, or using the same loss and objective but with different data. In cases where the dataset is too small to train a model from scratch without overfitting, we start from a pretrained model that has good performance on a larger dataset. This last type is the transfer learning we are using in alphaDIA. 
@@ -26,12 +26,12 @@ Learning rates are crucial parameters that define the magnitude of updates made For alphaDIA, we use a custom learning rate scheduler with two phases: -### 1) Warmup Phase +### 1. Warmup Phase In this phase, the learning rate starts small and gradually increases over a certain number of "warmup epochs". Our default is **5**. This technique significantly helps in training transformers when using optimizers like Adam or SGD ([https://arxiv.org/abs/2002.04745](https://arxiv.org/abs/2002.04745)). Since we are not training from scratch, we set the default number of warmup epochs to 5. The user only needs to define the maximum learning rate and the number of epochs for warm-up. During this phase, the learning rate lr(t) is calculated as: $$\text{lr}(t) = \text{max lr} \times \left( \frac{t}{\text{number of warmup epochs}} \right)$$ -### 2) Reduce on Plateau LR Schedule +### 2. Reduce on Plateau LR Schedule After the warmup phase, the learning rate reaches the maximum value set by the user and remains there until the training loss reaches a plateau. A plateau is defined as the training loss not significantly improving for a certain number of epochs, referred to as "patience". For this phase, we use the PyTorch implementation `torch.optim.lr_scheduler.ReduceLROnPlateau` with a default patience value of 3 epochs. This approach makes the fine-tuning process less sensitive to the user-defined learning rate. If the model is not learning for 3 epochs, it is likely that the learning rate is too high, and the scheduler will then reduce the learning rate to encourage further learning. @@ -129,15 +129,13 @@ With the learning scheduler we are using, we could theoretically keep training i To address these issues, we implement two measures: -### 1) Maximum Number of Epochs +### 1. Maximum Number of Epochs We set the maximum number of epochs to 50. From our experiments, we find that 50 epochs are usually sufficient to achieve significant performance gains without spending unnecessary time/epochs on insignificant improvements. -### 2) Early Stopping +### 2. Early Stopping We use an Early Stopping implementation that monitors the validation loss and terminates the training if one of the following criteria is met for more than the patience epochs (this is different from the learning rate scheduler's patience value, but they are related, more on this later): - -a) The validation loss is increasing, which may indicate overfitting. - -b) The validation loss is not significantly improving, indicating no significant performance gains on the validation dataset. +- The validation loss is increasing, which may indicate overfitting. +- The validation loss is not significantly improving, indicating no significant performance gains on the validation dataset. The early stopping patience value represents the number of epochs we allow the model to meet the criteria without taking any action. This is because training neural networks with smaller batches can be a bit unstable, so we allow for some volatility before intervening. We set the early stopping patience to be a multiple of the learning rate scheduler patience. The idea is to give the learning rate scheduler a chance to address the problem before terminating the training. 
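A minimal PyTorch sketch of this two-phase schedule is shown below; a generic `LambdaLR` warmup stands in for alphaDIA's custom scheduler, `ReduceLROnPlateau` uses the patience of 3 described above, and the model and loss are placeholders.

```python
import torch

model = torch.nn.Linear(10, 1)                              # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)   # lr = max learning rate

warmup_epochs = 5
warmup = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda epoch: min(1.0, (epoch + 1) / warmup_epochs)
)
plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=3)

for epoch in range(50):                          # hard cap of 50 epochs
    train_loss = 1.0 / (epoch + 1)               # placeholder for the real training loss
    if epoch < warmup_epochs:
        warmup.step()                            # lr ramps up: max_lr * (epoch + 1) / warmup_epochs
    else:
        plateau.step(train_loss)                 # reduce lr once the loss plateaus
```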
diff --git a/gui/package.json b/gui/package.json index cfe6619d..739ef0d8 100644 --- a/gui/package.json +++ b/gui/package.json @@ -1,7 +1,7 @@ { "name": "alphadia", "productName": "alphadia-gui", - "version": "1.8.2", + "version": "1.9.0", "description": "Graphical user interface for DIA data analysis", "main": "dist/electron.js", "homepage": "./", diff --git a/gui/src/main/modules/profile.js b/gui/src/main/modules/profile.js index 13adde9b..0b07f113 100644 --- a/gui/src/main/modules/profile.js +++ b/gui/src/main/modules/profile.js @@ -3,7 +3,7 @@ const path = require("path") const { app, shell, BrowserWindow} = require("electron") const { dialog } = require('electron') -const VERSION = "1.8.2" +const VERSION = "1.9.0" const Profile = class { diff --git a/misc/.bumpversion.cfg b/misc/.bumpversion.cfg index 7ce38171..3cc6626b 100644 --- a/misc/.bumpversion.cfg +++ b/misc/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.8.2 +current_version = 1.9.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/misc/distributed_search/discover_project_files.py b/misc/distributed_search/discover_project_files.py new file mode 100644 index 00000000..17c58263 --- /dev/null +++ b/misc/distributed_search/discover_project_files.py @@ -0,0 +1,98 @@ +# Discover project files by searching a project regex. Attempted to make this +# as fast as possible by running rglob/glob search to get all filepaths with +# correct endings into a dict, and then searching that dict with the actual +# project-specific regex. Searching the dict is significantly faster than +# stepping through all directories and subdirectories and running re.search. + +import sys +from pathlib import Path + +import pandas as pd +import regex as re + + +def match_files( + project_regex: str, + source_directories: list, + search_recursively: bool = True, + file_ending: str = "raw", +): + # Convert search directories to paths and report + _search_dirs = [Path(d) for d in source_directories] + + # Collect all files with correct endings into dict + print( + f"--> Collecting '.{file_ending}' files from {source_directories} \n--> search_recursively = {search_recursively}" + ) + _file_dict = {} + for _dir in _search_dirs: + _dir_files = list( + _dir.rglob(f"*.{file_ending}") + if search_recursively + else _dir.glob(f"*.{file_ending}") + ) + for _file in _dir_files: + # assign path to filename-key, for quick & unique searching + _file_dict[str(_file)] = _file + print(f"--> Collected {len(_file_dict)} '.{file_ending}' files") + + # search project regex against file dict keys and return all matching paths + _matched_paths = [] + regex_pattern = re.compile(project_regex) + print(f"--> Searching files matching '{project_regex}'") + for _file, _path in _file_dict.items(): + if regex_pattern.search(_file): + _matched_paths.append(_path) + + # report + print( + f"--> Discovered {len(_matched_paths)} matching filepaths for {project_regex}." 
+ ) + + # suitable path dataframe + out_frame = pd.DataFrame( + columns=["project", "filepath"], index=range(len(_matched_paths)) + ) + out_frame["project"] = project_regex + out_frame["filepath"] = _matched_paths + + return out_frame + + +if __name__ == "__main__": + import argparse + import os + + parser = argparse.ArgumentParser( + prog="Discovering project filenames", + description="Search project files based on regex string and put them into a csv file for distributed processing", + ) + parser.add_argument( + "--project_regex", help="Regex string to match project files", default=".*" + ) + parser.add_argument( + "--source_directories", nargs="+", help="List of source directories" + ) + parser.add_argument("--search_recursively", action="store_true") + parser.add_argument("--file_ending", default="raw") + parser.add_argument("--output_filename", default="file_list.csv") + + if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + + args = parser.parse_args() + + out_frame = match_files( + args.project_regex, + args.source_directories, + args.search_recursively, + args.file_ending, + ) + + output_path = ( + args.output_filename + if os.path.isabs(args.output_filename) + else os.path.join("./", args.output_filename) + ) + out_frame.to_csv(output_path, index=False) diff --git a/misc/distributed_search/dist_search_setup.md b/misc/distributed_search/dist_search_setup.md new file mode 100644 index 00000000..528f6fa3 --- /dev/null +++ b/misc/distributed_search/dist_search_setup.md @@ -0,0 +1,50 @@ +Distributed AlphaDIA search on HPCL +================================================= + +This guide deals with setting up a distributed search in AlphaDIA, with the following prerequisites: +- A Linux (Ubuntu) HPCL system with Slurm Workload Manager installed. All resource management is handled by Slurm, AlphaDIA does not select, manage or monitor worker nodes. +- The distributed search requires absolute paths for each raw file, saved in the second column of a two-column .csv document. Simpler structures that e.g. process all files in a given directory are disfavored as large cohorts frequently consist of rawfiles spread across a number of subfolders. +- An Anaconda environment called "alphadia" with _mono_ and _alphadia_ installed (for installing _mono_, see https://github.com/MannLabs/alpharaw#installation) + +Distributed search concept +========================== + +![Distributed_Search](../../assets/distributed_search_schematic.svg) + +Compared to a linear two step search, distributing the raw file search steps offers a significant advantage in speed. At the most extreme, N files could be searched on N machines in parallel, completely decoupling search time from the number of files. In practice, not all steps of a search can be easily parallelized (i.e. steps which require knowledge of all files at the same time). Additionally, processing nodes available are never unlimited, requiring a chunking approach to ensure the maximum amount of parallelization with a given number of processing nodes. The diagram above summarizes these steps, indicating parallel AlphaDIA instances for first and second pass fo raw file searches. + +Steps to set up a search +======================== + +1. Set up an empty search directory on your HPCL partition. One directory corresponds to one study, i.e. one set of raw files, fasta/library and search configuration. +2. Copy all files from alphadia/misc/distributed_search into the search directory +3. 
If no .csv file with rawfile paths exists, it can be obtained by running **discover_project_files.py** from the search directory. +4. Set first and second search configurations in **first_config.yaml** and **second_config.yaml**. For example, number of precursor candidates and inference strategy, as well as mass tolerances may differ between first and second search. +Leave all the predefined settings in the two .yaml files as they are. +5. Set the search parameters in **outer.sh**. While these can also be provided as command line arguments, it is convenient to set them in **outer.sh** itself. This file requires the following settings: + - input_directory: the search directory + - input_filename: the .csv file containing rawfile paths + - target_directory: the directory where intermediate and final outputs are written (mind that slow read/write speeds to this location may slow down your search) + - library_path (optional, will be reannotated if fasta_path is provided and predict_library is set to 1): absolute path to a .hdf spectral library + - fasta_path (optional if library_path is provided and predict_library is set to 0): absolute path to .fasta file + - first_search_config_filename: name of .yaml file for the first search + - second_search_config_filename: name of the .yaml file for the building the MBR library, second search and LFQ +6. Run **outer.sh** with the following search settings: + - --nnodes (int): specifies how many nodes can be occupied. Rawfile search will be distributed across these nodes. If there are 5 nodes and 50 raw files, the search will take place on 5 nodes in chunks of 10 rawfiles each. + - --ntasks_per_node (int): default to 1, some HPCL systems allow for multiple tasks to run on one node + - --cpus (int): default to 12, specifies how many CPUs shall be used per task + - --mem (str): default to '250G', specifies RAM requirements for each task. + **HPCL systems may be set to restrict user resources to certain limits. Make sure the above parameters comply with your HPCL setup.** + - --predict_library (1/0): default to 1, whether to predict a spectral library from a given fasta + - --first_search (1/0): default to 1, whether to search all files with the initial spectral library + - --mbr_library (1/0): whether to aggregate first search results into a focused "MBR" library + - --second_search (1/0): whether to perform a second search with the focused MBR library + - --lfq (1/0): whether to perform LFQ quantification of the second search results + +Running the search creates five subdirectories in the target folder: + +- _predicted_speclib_: If spectral library prediction was set, this folder contains the .hdf spectral library +- _first_search_: Contains one subdirectory for each processing chunk. AlphaDIA subprocesses for the first search are run from these chunks and their specific config.yaml files. Precursor and fragment datasets from these searches are saved into the _mbr_library_ folder +- _mbr_library_: Contains one chunk, since the library is built from all first search results. +- _second_search_: Analogous to _first_search_, one subdirectory is created for each chunk of rawfiles that are searched with the mbr_library. Precursor and fragment datasets from these searches are saved into the _lfq_ folder. +- _lfq_: Analogous to _mbr_library_, contains one chunk which runs label free quantification (LFQ) on each output from the second search. After all search steps are completed, the final precursor and protein tables are saved here. 
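For reference, the two-column .csv consumed by these scripts (as written by **discover_project_files.py**) can also be assembled manually with pandas; the project name and raw file paths below are placeholders.

```python
import pandas as pd

# Hypothetical file list with the two columns written by discover_project_files.py;
# only the second column (absolute raw file paths) is used for chunking.
file_list = pd.DataFrame(
    {
        "project": ["myproject", "myproject"],
        "filepath": [
            "/fs/data/myproject/run_01.raw",   # placeholder paths
            "/fs/data/myproject/run_02.raw",
        ],
    }
)
file_list.to_csv("file_list.csv", index=False)
```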
diff --git a/misc/distributed_search/first_config.yaml b/misc/distributed_search/first_config.yaml new file mode 100755 index 00000000..492d5193 --- /dev/null +++ b/misc/distributed_search/first_config.yaml @@ -0,0 +1,4 @@ +# config template for "first search" +# (Default settings are suitable for the first search.) + +# adapt the rest of the config to your use case ... diff --git a/misc/distributed_search/inner.sh b/misc/distributed_search/inner.sh new file mode 100755 index 00000000..ad2c1c08 --- /dev/null +++ b/misc/distributed_search/inner.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# Run alphadia with custom quantification directory. The point here is to use +# the slurm_index to find the proper chunk folder. + +#SBATCH --job-name=alphaDIA +#SBATCH --time=21-00:00:00 +#SBATCH --output=./logs/%A_%a_%x-slurm.out + +# navigate to chunk directory +slurm_index=${SLURM_ARRAY_TASK_ID} +chunk_directory="${target_directory}/chunk_${slurm_index}/" +cd $chunk_directory || exit + +# config file fixed as config.yaml +config_filename="config.yaml" + +# run with or without custom quant_dir +if [ -z "${quant_dir}" ]; then + alphadia --config ${config_filename} +else + alphadia --config ${config_filename} --quant-dir ${quant_dir} +fi + +echo "AlphaDIA completed successfully" diff --git a/misc/distributed_search/outer.sh b/misc/distributed_search/outer.sh new file mode 100755 index 00000000..74f763f3 --- /dev/null +++ b/misc/distributed_search/outer.sh @@ -0,0 +1,254 @@ +#!/usr/bin/env bash + +# Main script, sets source and destination folders, which csv file to read +# in order to get rawfile paths. Walks through the steps of a distributed +# search, waiting for each one to be finished before initializing +# the next one. For searches, the "inner.sh" script is called. For predicting +# speclibs, alphadia is run directly without rawfiles. + +#SBATCH --job-name=dist_AD +#SBATCH --time=21-00:00:00 +#SBATCH --output=./logs/%j-%x-slurm.out + +# Set behavior when errors are encountered +# # TODO: unresolved issues with failing on error due to library generation steps expecting AlphaDIA to fail since there are no rawfiles. 
+set -u -x + +# Default search parameters +nnodes=1 +ntasks_per_node=1 +cpus=32 +mem='250G' + +# Default search flags +predict_library=1 +first_search=1 +mbr_library=1 +second_search=1 +lfq=1 + +# Default search configuration file +search_config="search.config" + +while [[ "$#" -gt 0 ]]; do + case $1 in + # Search parameters + --search_config) search_config="$2"; shift ;; + # SLURM parameters + --nnodes) nnodes="$2"; shift ;; + --ntasks_per_node) ntasks_per_node="$2"; shift ;; + --cpus) cpus="$2"; shift ;; + --mem) mem="$2"; shift ;; + # Search flags: if a flag is present, set to true + --predict_library) predict_library="$2"; shift ;; + --first_search) first_search="$2"; shift ;; + --mbr_library) mbr_library="$2"; shift ;; + --second_search) second_search="$2"; shift ;; + --lfq) lfq="$2"; shift ;; + *) echo "Unknown parameter $1"; shift ;; + esac + shift +done + +# Source search configuration +if [[ -n "${search_config}" ]]; then + source ./${search_config} + echo "Using search config: ${search_config}" +else + echo "No search config provided, exiting" + exit 1 +fi + +# report set parameters +echo "Search config: ${search_config}" +echo "SLURM parameters: nnodes=${nnodes}, ntasks_per_node=${ntasks_per_node}, cpus=${cpus}, mem=${mem}" +echo "Search flags: predict_library=${predict_library}, first_search=${first_search}, mbr_library=${mbr_library}, second_search=${second_search}, lfq=${lfq}" + +# if target directory is not empty, exit +if [ "$(ls -A ${target_directory})" ]; then + echo "Target directory is not empty, exiting" + exit 1 +fi + +# create logs directory if it does not exist +mkdir -p ./logs + +predicted_library_directory="${target_directory}/1_predicted_speclib" +mkdir -p ${predicted_library_directory} + +first_search_directory="${target_directory}/2_first_search" +mkdir -p ${first_search_directory} + +mbr_library_directory="${target_directory}/3_mbr_library" +mkdir -p ${mbr_library_directory} + +mbr_progress_directory="${target_directory}/3_mbr_library/chunk_0/quant" +mkdir -p ${mbr_progress_directory} + +second_search_directory="${target_directory}/4_second_search" +mkdir -p ${second_search_directory} + +lfq_directory="${target_directory}/5_lfq" +mkdir -p ${lfq_directory} + +lfq_progress_directory="${target_directory}/5_lfq/chunk_0/quant" +mkdir -p ${lfq_progress_directory} + +### PREDICT LIBRARY ### + +if [[ "$predict_library" -eq 1 ]]; then + + # generate config without rawfiles and with fasta + python ./speclib_config.py \ + --input_directory "${input_directory}" \ + --target_directory "${predicted_library_directory}" \ + --fasta_path "${fasta_path}" \ + --library_path "${library_path}" \ + --config_filename "${first_search_config_filename}" \ + + # log current directory and navigate to predicted speclib directory + home_directory=$(pwd) + cd "${predicted_library_directory}" + + # call alphadia to predict spectral library as one task + echo "Predicting spectral library with AlphaDIA..." 
+ sbatch --array="0-0%1" \ + --wait \ + --nodes=1 \ + --ntasks-per-node=${ntasks_per_node} \ + --cpus-per-task=${cpus} \ + --mem=${mem} \ + --output="${home_directory}/logs/%j-%x-speclib-slurm.out" \ + --export=ALL --wrap="alphadia --config=speclib_config.yaml" + + # navigate back to home directory + cd "${home_directory}" + + # if prediction took place, let the new speclib.hdf be the library path + library_path="${predicted_library_directory}/speclib.hdf" +else + echo "Skipping library prediction" +fi + +### FIRST SEARCH ### + +if [[ "$first_search" -eq 1 ]]; then + + # generate subdirectories for chunked first search + num_tasks=$(python ./parse_parameters.py \ + --input_directory "${input_directory}" \ + --input_filename "${input_filename}" \ + --config_filename "${first_search_config_filename}" \ + --target_directory "${first_search_directory}" \ + --nnodes ${nnodes} \ + --reuse_quant 0 \ + --library_path ${library_path}) + + # create slurm array for first search + slurm_array="0-$(($num_tasks-1))%${nnodes}" + echo "First search array: ${slurm_array}" + + # slurm passes the array index to the inner script, we add the target directory + echo "Performing first search in ${num_tasks} chunks..." + sbatch --array=${slurm_array} \ + --wait \ + --nodes=1 \ + --ntasks-per-node=${ntasks_per_node} \ + --cpus-per-task=${cpus} \ + --mem=${mem} \ + --export=ALL,target_directory=${first_search_directory},quant_dir=${mbr_progress_directory} ./inner.sh +else + echo "Skipping first search" +fi + +### MBR LIBRARY building ### --> simply reuse inner.sh with one chunk containing all files + +if [[ "$mbr_library" -eq 1 ]]; then + + # set mbr library directory to the quant files from the first search + num_tasks=$(python ./parse_parameters.py \ + --input_directory "${input_directory}" \ + --input_filename "${input_filename}" \ + --config_filename "${second_search_config_filename}" \ + --target_directory "${mbr_library_directory}" \ + --nnodes 1 \ + --reuse_quant 1 \ + --library_path ${library_path}) + + # create slurm array with one subdir, which is the quant files from the first search + slurm_array="0-$(($num_tasks-1))%${nnodes}" + echo "MBR library array: ${slurm_array}" + + # we force the single array to select the correct chunk and run the library building search + echo "Performing MBR library building in ${num_tasks} chunks on all quant files from first search..." + sbatch --array=${slurm_array} \ + --wait \ + --nodes=1 \ + --ntasks-per-node=${ntasks_per_node} \ + --cpus-per-task=${cpus} \ + --mem=${mem} \ + --export=ALL,target_directory=${mbr_library_directory} ./inner.sh +else + echo "Skipping MBR library building" +fi + +### SECOND SEARCH ### + +if [[ "$second_search" -eq 1 ]]; then + + num_tasks=$(python ./parse_parameters.py \ + --input_directory "${input_directory}" \ + --input_filename "${input_filename}" \ + --config_filename "${second_search_config_filename}" \ + --target_directory "${second_search_directory}" \ + --nnodes ${nnodes} \ + --reuse_quant 0 \ + --library_path "${mbr_library_directory}/chunk_0/speclib.mbr.hdf") + + # create slurm array for second search + slurm_array="0-$(($num_tasks-1))%${nnodes}" + echo "Second search array: ${slurm_array}" + + # slurm passes the array index to the inner script, we add the target directory + echo "Performing second search in ${num_tasks} chunks..." 
+ sbatch --array=${slurm_array} \ + --wait \ + --nodes=1 \ + --ntasks-per-node=${ntasks_per_node} \ + --cpus-per-task=${cpus} \ + --mem=${mem} \ + --export=ALL,target_directory=${second_search_directory},quant_dir=${lfq_progress_directory} ./inner.sh +else + echo "Skipping second search" +fi + +### LFQ ### + +if [[ "$lfq" -eq 1 ]]; then + + # set lfq directory to the quant files from the second search + num_tasks=$(python ./parse_parameters.py \ + --input_directory "${input_directory}" \ + --input_filename "${input_filename}" \ + --config_filename "${second_search_config_filename}" \ + --target_directory "${lfq_directory}" \ + --nnodes 1 \ + --reuse_quant 1 \ + --library_path "${mbr_library_directory}/chunk_0/speclib.mbr.hdf") + + # create slurm array with one subdir, which is the quant files from the second search + slurm_array="0-$(($num_tasks-1))%${nnodes}" + echo "LFQ array: ${slurm_array}" + + # we force the single array to select the correct chunk and run the library building search + echo "Performing LFQ in ${num_tasks} chunks on all quant files from second search..." + sbatch --array=${slurm_array} \ + --wait \ + --nodes=1 \ + --ntasks-per-node=${ntasks_per_node} \ + --cpus-per-task=${cpus} \ + --mem=${mem} \ + --export=ALL,target_directory=${lfq_directory} ./inner.sh +else + echo "Skipping LFQ" +fi diff --git a/misc/distributed_search/parse_parameters.py b/misc/distributed_search/parse_parameters.py new file mode 100755 index 00000000..476013e5 --- /dev/null +++ b/misc/distributed_search/parse_parameters.py @@ -0,0 +1,117 @@ +# Parse input parameters for distributed AlphaDIA search +# The objective is to pull as much logic from the outer and inner shell scripts +# into a python script for easier reading/debugging. This script splits filepaths +# into properly sized chunks, creates chunk folders, copies the spectral library +# to each chunk folder*, writes an adapted config.yaml file for each chunk +# (i.e. a config.yaml which contains the filepaths for that specific chunk +# for running inner.sh). +# *Temporary solution to avoid simultaneous reads of the same library file. 
+ +import argparse +import copy +import os +import sys + +import numpy as np +import pandas as pd +import yaml + + +# Add keys to config if they don't exist +def safe_add_key(config, parent_key, key, value): + """Safely add keys and values to config file dictionary""" + if parent_key is None: + config[key] = value + else: + if parent_key not in config: + config[parent_key] = {} + config[parent_key][key] = value + + +# parse input parameters +parser = argparse.ArgumentParser( + prog="DistributedAlphaDIAParams", + description="Parse input parameters into config files for chunked cluster processing of files with AlphaDIA", +) +parser.add_argument("--input_directory") +parser.add_argument("--input_filename") +parser.add_argument("--library_path") +parser.add_argument("--config_filename") +parser.add_argument("--target_directory") +parser.add_argument("--nnodes") +parser.add_argument("--reuse_quant") +args = parser.parse_args() + +# read the input filename +infile = pd.read_csv( + os.path.join(args.input_directory, args.input_filename), skiprows=0 +) + +# read the config .yaml file +with open(os.path.join(args.input_directory, args.config_filename)) as file: + config = yaml.safe_load(file) or {} + +# set requantition, False for searches, True for MBR, LFQ +safe_add_key(config, "general", "reuse_quant", args.reuse_quant == "1") + +# library must be predicted/annotated prior to chunking +safe_add_key(config, "library_prediction", "predict", False) + +# remove any fasta if one is present in the config file +config.pop("fasta_list", None) + +# determine chunk size: division of infile rowcount and number of nodes +chunk_size = int(np.ceil(infile.shape[0] / int(args.nnodes))) + +# determine maximum number of tasks, i.e. the number of chunks needed +max_tasks = int(np.ceil(infile.shape[0] / chunk_size)) + +# split the filepaths into chunks +all_filepaths = infile.iloc[:, 1].values +target_subdirectories = [] +for i in range(0, max_tasks): + # get current chunk indices + start_idx = chunk_size * i + end_idx = start_idx + chunk_size + + # copy original config for the current chunk + current_config = copy.deepcopy(config) + + # save current chunk indices into chunk-yaml as raw files + safe_add_key( + current_config, None, "raw_path_list", list(all_filepaths[start_idx:end_idx]) + ) + + # create folder for current chunk in target directory. Don't create the folder if it already exists. 
+ chunk_folder = os.path.join(args.target_directory, "chunk_" + str(i)) + os.makedirs(chunk_folder, exist_ok=True) + + # retrieve library path from config or arguments, set new library path in config + if os.path.exists(args.library_path) and os.path.basename( + args.library_path + ).endswith(".hdf"): + lib_source = args.library_path + else: + print( + "No valid library_path to a .hdf file provided and no valid library path to a .hdf file specified in config file, exiting...", + file=sys.stderr, + ) + sys.exit(1) + + # set library path in config + safe_add_key(current_config, None, "library", lib_source) + + # set chunk folder as output_directory in the config + safe_add_key(current_config, None, "output_directory", "./") + + # save the config with the current chunk's rawfiles belonging to the current chunk folder + with open(os.path.join(chunk_folder, "config.yaml"), "w") as file: + yaml.safe_dump( + current_config, file, default_style=None, default_flow_style=False + ) + + # save the target subdirectory + target_subdirectories.append(chunk_folder) + +# the only return value needed is number of tasks to be distributed by the scheduler +print(max_tasks) diff --git a/misc/distributed_search/search.config b/misc/distributed_search/search.config new file mode 100644 index 00000000..36562b95 --- /dev/null +++ b/misc/distributed_search/search.config @@ -0,0 +1,7 @@ +input_directory="/fs/home/brennsteiner/alphadia/distributed_search_test" +input_filename="file_list.csv" +target_directory="/fs/home/brennsteiner/alphadia/distributed_search_teset/search" +library_path="/fs/home/brennsteiner/alphadia/distributed_search_test/48_fraction_hela_PaSk_orbitrap_ms2.hdf" +fasta_path="/fs/home/brennsteiner/alphadia/distributed_search_test/uniprotkb_proteome_UP000005640_2023_11_12.fasta" +first_search_config_filename="config.yaml" +second_search_config_filename="config.yaml" diff --git a/misc/distributed_search/second_config.yaml b/misc/distributed_search/second_config.yaml new file mode 100755 index 00000000..82691dec --- /dev/null +++ b/misc/distributed_search/second_config.yaml @@ -0,0 +1,6 @@ +# config template for "second search" +fdr: + inference_strategy: library # do not change for "second search" +search: + target_num_candidates: 5 # do not change for "second search" +# adapt the rest of the config to your use case ... diff --git a/misc/distributed_search/speclib_config.py b/misc/distributed_search/speclib_config.py new file mode 100755 index 00000000..0ad79e67 --- /dev/null +++ b/misc/distributed_search/speclib_config.py @@ -0,0 +1,59 @@ +# Modify config.yaml file for spectral library prediction by removing any +# rawfiles that may be set, setting prediction to True and adding +# library/fasta paths. Since spectral library prediction should take place +# with the same settings as the first search, the config filename is hard-coded. 
+ +import argparse +import os + +import yaml + + +# Add keys to config if they don't exist +def safe_add_key(config, parent_key, key, value): + """Safely add keys and values to config file dictionary""" + if parent_key is None: + config[key] = value + else: + if parent_key not in config: + config[parent_key] = {} + config[parent_key][key] = value + + +# parse input parameters +parser = argparse.ArgumentParser( + prog="DistributedAlphaDIALibrary", + description="Append fasta file to config for library prediction.", +) +parser.add_argument("--input_directory") +parser.add_argument("--target_directory") +parser.add_argument("--fasta_path") +parser.add_argument("--library_path") +parser.add_argument("--config_filename") +args = parser.parse_args() + +# read the config.yaml file from the input directory +with open(os.path.join(args.input_directory, args.config_filename)) as file: + config = yaml.safe_load(file) or {} + +# if library and fasta are set, predicting will result in repredicted & annotated library +# add fasta_list to config +_new_fasta_list = [args.fasta_path] if args.fasta_path else [] +safe_add_key(config, None, "fasta_list", _new_fasta_list) + +# add library path to config +_new_library = args.library_path if args.library_path else None +safe_add_key(config, None, "library", _new_library) + +# set library prediction to True +safe_add_key(config, "library_prediction", "predict", True) + +# remove rawfiles for prediction step in case some are set +config.pop("raw_path_list", None) + +# set output directory for predicted spectral library +safe_add_key(config, None, "output_directory", os.path.join(args.target_directory)) + +# write speclib_config.yaml to input directory for the library prediction +with open(os.path.join(config["output_directory"], "speclib_config.yaml"), "w") as file: + yaml.safe_dump(config, file, default_style=None, default_flow_style=False) diff --git a/pyproject.toml b/pyproject.toml index 7407154d..711b3bf1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ version = {attr = "alphadia.__version__"} alphadia = "alphadia.cli:run" [tool.ruff] -extend-exclude = ["misc/.bumpversion.cfg"] +extend-exclude = ["misc/.bumpversion.cfg", "tests"] [tool.ruff.lint] select = [ @@ -76,10 +76,39 @@ select = [ "SIM", # isort "I", + #"ALL" ] ignore = [ + "D", + "ANN", + "SLF001", # Private member accessed TODO this needs to be fixed in alphabase + "E501", # Line too long (ruff wraps code, but not docstrings) "B028", # No explicit `stacklevel` keyword argument found (for warnings) - "B905" # This causes problems in numba code: `zip()` without an explicit `strict=` parameter + "B905", # This causes problems in numba code: `zip()` without an explicit `strict=` parameter + "COM812", #may cause conflicts when used with the formatter + "ISC001", #may cause conflicts when used with the formatter + "D211", # no-blank-line-before-class + "D213", # multi-line-summary-second-line + "S101", # Use of `assert` detected + "INP001", # implicit namespace package. 
+ "ERA001", # Found commented-out code + "D203", # 1 blank line required before class docstring + "TD002", "TD003", "FIX002", # things around TO-DO + "PT011", #pytest.raises(ValueError) is too broad + "G004", "EM102", # Logging statement uses f-string + "TRY003", # Avoid specifying long messages outside the exception class + "ANN101", # Missing type annotation for `self` in method + "ANN102", # Missing type annotation for `cls` in classmethod + "ANN002", # Missing type annotation for `*args` + "ANN003", # Missing type annotation for `**kwargs + "FA102", # Missing `from __future__ import annotations + "EM101", # Exception must not use a string literal, assign to variable first + "D104", # Missing docstring in public package + "ANN204", # Missing return type annotation for special method `__init__` + "D401", # First line of docstring should be in imperative mood + "B023", # Function definition does not bind loop variable + "PD901", # Avoid using the generic variable name `df` for DataFrames" + "TCH003", # Move standard library import into a type-checking block ] diff --git a/release/linux/build_installer_linux.sh b/release/linux/build_installer_linux.sh index 039bc277..27cec8ef 100755 --- a/release/linux/build_installer_linux.sh +++ b/release/linux/build_installer_linux.sh @@ -10,7 +10,7 @@ rm -rf dist build *.egg-info rm -rf dist_pyinstaller build_pyinstaller python -m build -pip install "dist/alphadia-1.8.2-py3-none-any.whl[stable]" +pip install "dist/alphadia-1.9.0-py3-none-any.whl[stable]" if [ "${CPU_OR_GPU}" != "GPU" ]; then pip install torch -U --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/release/linux/control b/release/linux/control index 6df1d0c7..3e8ef7da 100644 --- a/release/linux/control +++ b/release/linux/control @@ -1,5 +1,5 @@ Package: alphadia -Version: 1.8.2 +Version: 1.9.0 Architecture: all Maintainer: Mann Labs Description: alphadia diff --git a/release/macos/build_installer_macos.sh b/release/macos/build_installer_macos.sh index 6f3f48b2..f6bd301e 100755 --- a/release/macos/build_installer_macos.sh +++ b/release/macos/build_installer_macos.sh @@ -10,7 +10,7 @@ rm -rf dist_pyinstaller build_pyinstaller export EAGER_IMPORT=true # TODO check if this can be removed with newset peptdeep version w/out transformer dependenc python -m build -pip install "dist/alphadia-1.8.2-py3-none-any.whl[stable]" +pip install "dist/alphadia-1.9.0-py3-none-any.whl[stable]" # Creating the stand-alone pyinstaller folder pyinstaller release/pyinstaller/alphadia.spec --distpath dist_pyinstaller --workpath build_pyinstaller -y diff --git a/release/macos/build_package_macos.sh b/release/macos/build_package_macos.sh index 0460a227..30b56b52 100755 --- a/release/macos/build_package_macos.sh +++ b/release/macos/build_package_macos.sh @@ -7,10 +7,10 @@ set -e -u # Set up package name and version PACKAGE_NAME="alphadia" APP_NAME="alphadia" -PACKAGE_VERSION="1.8.2" +PACKAGE_VERSION="1.9.0" PKG_FOLDER="dist/$APP_NAME.app" -# BUILD_NAME is taken from environment variables, e.g. alphadia-1.8.2-macos-darwin-arm64 or alphadia-1.8.2-macos-darwin-x64 +# BUILD_NAME is taken from environment variables, e.g. 
alphadia-1.9.0-macos-darwin-arm64 or alphadia-1.9.0-macos-darwin-x64 rm -rf ${BUILD_NAME}.pkg # Cleanup the package folder diff --git a/release/macos/distribution.xml b/release/macos/distribution.xml index f9fe6faf..b11a01d2 100644 --- a/release/macos/distribution.xml +++ b/release/macos/distribution.xml @@ -1,6 +1,6 @@ - AlphaDIA 1.8.2 + AlphaDIA 1.9.0 diff --git a/release/macos/info.plist b/release/macos/info.plist index 788e1ffb..13571787 100644 --- a/release/macos/info.plist +++ b/release/macos/info.plist @@ -9,9 +9,9 @@ CFBundleIconFile alphadia.icns CFBundleIdentifier - alphadia.1.8.2 + alphadia.1.9.0 CFBundleShortVersionString - 1.8.2 + 1.9.0 CFBundleInfoDictionaryVersion 6.0 CFBundleName diff --git a/release/windows/alphadia_innoinstaller.iss b/release/windows/alphadia_innoinstaller.iss index 1797fe00..86f218a5 100644 --- a/release/windows/alphadia_innoinstaller.iss +++ b/release/windows/alphadia_innoinstaller.iss @@ -5,7 +5,7 @@ ; so all paths are given relative to the location of this .iss file. #define MyAppName "AlphaDIA" -#define MyAppVersion "1.8.2" +#define MyAppVersion "1.9.0" #define MyAppPublisher "Max Planck Institute of Biochemistry, Mann Labs" #define MyAppURL "https://github.com/MannLabs/alphadia" #define MyAppExeName "alphadia-gui.exe" @@ -29,7 +29,7 @@ PrivilegesRequired=lowest PrivilegesRequiredOverridesAllowed=dialog ; release workflow expects artifact at root of repository OutputDir=..\..\ -; example for BUILD_NAME: alphadia-1.8.2-win-x64 +; example for BUILD_NAME: alphadia-1.9.0-win-x64 OutputBaseFilename={#GetEnv('BUILD_NAME')} SetupIconFile=..\logos\alphadia.ico Compression=lzma diff --git a/release/windows/build_installer_windows.ps1 b/release/windows/build_installer_windows.ps1 index a8e75d7f..0805f0df 100644 --- a/release/windows/build_installer_windows.ps1 +++ b/release/windows/build_installer_windows.ps1 @@ -5,7 +5,7 @@ Remove-Item -Recurse -Force -ErrorAction SilentlyContinue ./build Remove-Item -Recurse -Force -ErrorAction SilentlyContinue ./dist python -m build -pip install "dist/alphadia-1.8.2-py3-none-any.whl[stable]" +pip install "dist/alphadia-1.9.0-py3-none-any.whl[stable]" # Creating the stand-alone pyinstaller folder pip install tbb==2021.13.1 diff --git a/tests/performance_tests/1_brunner_2022_1ng_all.py b/tests/performance_tests/1_brunner_2022_1ng_all.py index 6bd9f948..b40107a1 100644 --- a/tests/performance_tests/1_brunner_2022_1ng_all.py +++ b/tests/performance_tests/1_brunner_2022_1ng_all.py @@ -22,7 +22,7 @@ try: test_dir = os.environ["TEST_DATA_DIR"] except KeyError: - logging.error("TEST_DATA_DIR environtment variable not set") + logging.exception("TEST_DATA_DIR environtment variable not set") raise KeyError from None logging.info(f"Test data directory: {test_dir}") diff --git a/tests/performance_tests/diann_psm_extraction.py b/tests/performance_tests/diann_psm_extraction.py index 29118a7d..f45280a8 100644 --- a/tests/performance_tests/diann_psm_extraction.py +++ b/tests/performance_tests/diann_psm_extraction.py @@ -25,7 +25,7 @@ try: neptune_token = os.environ["NEPTUNE_TOKEN"] except KeyError: - logging.error("NEPTUNE_TOKEN environtment variable not set") + logging.exception("NEPTUNE_TOKEN environtment variable not set") raise KeyError from None run = neptune.init_run(project="MannLabs/alphaDIA", api_token=neptune_token) @@ -42,7 +42,7 @@ try: test_dir = os.environ["TEST_DATA_DIR"] except KeyError: - logging.error("TEST_DATA_DIR environtment variable not set") + logging.exception("TEST_DATA_DIR environtment variable not set") raise 
KeyError from None logging.info(f"Test data directory: {test_dir}") diff --git a/tests/unit_tests/test_calibration_property.py b/tests/unit_tests/test_calibration_property.py index cadee274..cf13183d 100644 --- a/tests/unit_tests/test_calibration_property.py +++ b/tests/unit_tests/test_calibration_property.py @@ -20,6 +20,8 @@ def test_uninitialized_calibration(): with pytest.raises(ValueError): mz_calibration.fit(mz_df) + assert mz_calibration.metrics is None + def test_fit_predict_linear(): library_mz = np.linspace(100, 1000, 100) @@ -38,6 +40,8 @@ def test_fit_predict_linear(): mz_calibration.predict(mz_df) assert "calibrated_mz" in mz_df.columns + assert "median_accuracy" in mz_calibration.metrics + assert "median_precision" in mz_calibration.metrics def test_fit_predict_loess(): @@ -57,6 +61,8 @@ def test_fit_predict_loess(): mz_calibration.predict(mz_df) assert "calibrated_mz" in mz_df.columns + assert "median_accuracy" in mz_calibration.metrics + assert "median_precision" in mz_calibration.metrics def test_save_load(): @@ -86,3 +92,5 @@ def test_save_load(): mz_calibration_loaded.predict(df_loaded) assert np.allclose(df_original["calibrated_mz"], df_loaded["calibrated_mz"]) + assert "median_accuracy" in mz_calibration.metrics + assert "median_precision" in mz_calibration.metrics diff --git a/tests/unit_tests/test_data.py b/tests/unit_tests/test_data.py index e1b643a0..ba3e972f 100644 --- a/tests/unit_tests/test_data.py +++ b/tests/unit_tests/test_data.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from alphadia.data import alpharaw, bruker +from alphadia.data import alpharaw_wrapper, bruker def test_transpose(): @@ -31,16 +31,16 @@ def test_cycle(): cycle = np.zeros(rand_cycle_start) cycle = np.append(cycle, np.tile(np.arange(rand_cycle_length), rand_num_cycles)) - cycle_length = alpharaw.get_cycle_length(cycle) - cycle_start = alpharaw.get_cycle_start(cycle, cycle_length) - cycle_valid = alpharaw.assert_cycle(cycle, cycle_length, cycle_start) + cycle_length = alpharaw_wrapper.get_cycle_length(cycle) + cycle_start = alpharaw_wrapper.get_cycle_start(cycle, cycle_length) + cycle_valid = alpharaw_wrapper.assert_cycle(cycle, cycle_length, cycle_start) assert cycle_valid assert cycle_length == rand_cycle_length assert cycle_start == rand_cycle_start -@pytest.mark.slow +@pytest.mark.slow() def test_raw_data(): if pytest.test_data is None: pytest.skip("No test data found") @@ -51,7 +51,7 @@ def test_raw_data(): if name == "bruker": jit_data = bruker.TimsTOFTranspose(file).jitclass() elif name == "thermo": - jit_data = alpharaw.Thermo(file).jitclass() + jit_data = alpharaw_wrapper.Thermo(file).jitclass() else: continue @@ -224,3 +224,155 @@ def fuzz_get_dense(jit_data): assert dense.dtype == np.float32 assert precursor_index.ndim == 1 assert precursor_index.dtype == np.int64 + + +@pytest.fixture +def mock_alpha_raw_jit(): + # Create mock data for AlphaRawJIT + cycle = np.zeros((1, 5, 1, 2), dtype=np.float64) + cycle[0, :, 0, 0] = [100.0, 200.0, 300.0, 400.0, 500.0] + cycle[0, :, 0, 1] = [200.0, 300.0, 400.0, 500.0, 600.0] + + rt_values = np.arange(0, 100, 1).astype(np.float32) + mobility_values = np.array([0.0, 0.0], dtype=np.float32) + zeroth_frame = 0 + + max_mz_value = 1000.0 + min_mz_value = 100.0 + + quad_max_mz_value = 500.0 + quad_min_mz_value = 100.0 + + precursor_cycle_max_index = 19 + + # 0, 10, 20, ..., 990 with length 100 + peak_start_idx_list = np.arange(0, 1000, 10, dtype=np.int64) + # 1, 11, 21, ..., 991 with length 100 + peak_stop_idx_list = peak_start_idx_list + 1 + +
mz_values = ( + np.random.rand(1000) * (max_mz_value - min_mz_value) + min_mz_value + ).astype(np.float32) + intensity_values = np.random.rand(1000).astype(np.float32) + + scan_max_index = 0 + + frame_max_index = 99 + + # Instantiate AlphaRawJIT + alpha_raw_jit = alpharaw_wrapper.AlphaRawJIT( + cycle=cycle, + rt_values=rt_values, + mobility_values=mobility_values, + zeroth_frame=zeroth_frame, + max_mz_value=max_mz_value, + min_mz_value=min_mz_value, + quad_max_mz_value=quad_max_mz_value, + quad_min_mz_value=quad_min_mz_value, + precursor_cycle_max_index=precursor_cycle_max_index, + peak_start_idx_list=peak_start_idx_list, + peak_stop_idx_list=peak_stop_idx_list, + mz_values=mz_values, + intensity_values=intensity_values, + scan_max_index=scan_max_index, + frame_max_index=frame_max_index, + ) + return alpha_raw_jit + + +def test_get_frame_indices(mock_alpha_raw_jit): + # given + optimize_size = 1 + min_size = 1 + rt_values = np.array([10.0, 20.0], dtype=np.float32) + expected_indices = np.array([[10, 20, 1]], dtype=np.int64) + + # when + frame_indices = mock_alpha_raw_jit.get_frame_indices( + rt_values, optimize_size, min_size + ) + + # then + assert np.array_equal(frame_indices, expected_indices) + + +def test_get_frame_indices_optimization_right(mock_alpha_raw_jit): + # given + optimize_size = 4 + min_size = 1 + rt_values = np.array([10.0, 20.0], dtype=np.float32) + expected_indices = np.array([[10, 30, 1]], dtype=np.int64) + + # when + frame_indices = mock_alpha_raw_jit.get_frame_indices( + rt_values, optimize_size, min_size + ) + + # then + assert np.array_equal(frame_indices, expected_indices) + + +def test_get_frame_indices_optimization_right_min_size(mock_alpha_raw_jit): + # given + optimize_size = 4 + min_size = 8 + rt_values = np.array([10.0, 20.0], dtype=np.float32) + expected_indices = np.array([[10, 50, 1]], dtype=np.int64) + + # when + frame_indices = mock_alpha_raw_jit.get_frame_indices( + rt_values, optimize_size, min_size + ) + + # then + assert np.array_equal(frame_indices, expected_indices) + + +def test_get_frame_indices_optimization_left(mock_alpha_raw_jit): + # given + optimize_size = 4 + min_size = 1 + rt_values = np.array([90.0, 95.0], dtype=np.float32) + expected_indices = np.array([[75, 95, 1]], dtype=np.int64) + + # when + frame_indices = mock_alpha_raw_jit.get_frame_indices( + rt_values, optimize_size, min_size + ) + + # then + assert np.array_equal(frame_indices, expected_indices) + + # test optimization and min size left + + +def test_get_frame_indices_optimization_left_min_size(mock_alpha_raw_jit): + # given + optimize_size = 4 + min_size = 8 + rt_values = np.array([90.0, 95.0], dtype=np.float32) + expected_indices = np.array([[55, 95, 1]], dtype=np.int64) + + # when + frame_indices = mock_alpha_raw_jit.get_frame_indices( + rt_values, optimize_size, min_size + ) + + # then + assert np.array_equal(frame_indices, expected_indices) + + +def test_get_frame_indices_optimization_left_min_size_overflow(mock_alpha_raw_jit): + # given + optimize_size = 4 + min_size = 1000 + rt_values = np.array([90.0, 95.0], dtype=np.float32) + expected_indices = np.array([[0, 95, 1]], dtype=np.int64) + + # when + frame_indices = mock_alpha_raw_jit.get_frame_indices( + rt_values, optimize_size, min_size + ) + + # then + assert np.array_equal(frame_indices, expected_indices) diff --git a/tests/unit_tests/test_fdr.py b/tests/unit_tests/test_fdr.py index 8042ce7d..d2aa30db 100644 --- a/tests/unit_tests/test_fdr.py +++ b/tests/unit_tests/test_fdr.py @@ -159,7 +159,7 @@ def 
test_get_q_values(): ) -@pytest.mark.slow +@pytest.mark.slow() def test_fdr(): matplotlib.use("Agg") diff --git a/tests/unit_tests/test_outputaccumulator.py b/tests/unit_tests/test_outputaccumulator.py index 5b4c683e..896aa06d 100644 --- a/tests/unit_tests/test_outputaccumulator.py +++ b/tests/unit_tests/test_outputaccumulator.py @@ -8,6 +8,7 @@ from conftest import mock_fragment_df, mock_precursor_df from alphadia import outputtransform +from alphadia.workflow.base import QUANT_FOLDER_NAME def prepare_input_data(): @@ -64,11 +65,11 @@ def prepare_input_data(): temp_folder = os.path.join(tempfile.gettempdir(), "alphadia") os.makedirs(temp_folder, exist_ok=True) - progress_folder = os.path.join(temp_folder, "progress") - os.makedirs(progress_folder, exist_ok=True) + quant_path = os.path.join(temp_folder, QUANT_FOLDER_NAME) + os.makedirs(quant_path, exist_ok=True) # setup raw folders - raw_folders = [os.path.join(progress_folder, run) for run in run_columns] + raw_folders = [os.path.join(quant_path, run) for run in run_columns] psm_base_df = mock_precursor_df(n_precursor=100, with_decoy=True) fragment_base_df = mock_fragment_df(n_precursor=200, n_fragments=10) diff --git a/tests/unit_tests/test_outputtransform.py b/tests/unit_tests/test_outputtransform.py index 23f5da92..eb582e93 100644 --- a/tests/unit_tests/test_outputtransform.py +++ b/tests/unit_tests/test_outputtransform.py @@ -8,6 +8,7 @@ from alphadia import outputtransform from alphadia.workflow import manager, peptidecentric +from alphadia.workflow.base import QUANT_FOLDER_NAME def test_output_transform(): @@ -54,11 +55,11 @@ def test_output_transform(): temp_folder = os.path.join(tempfile.gettempdir(), "alphadia") os.makedirs(temp_folder, exist_ok=True) - progress_folder = os.path.join(temp_folder, "progress") - os.makedirs(progress_folder, exist_ok=True) + quant_path = os.path.join(temp_folder, QUANT_FOLDER_NAME) + os.makedirs(quant_path, exist_ok=True) # setup raw folders - raw_folders = [os.path.join(progress_folder, run) for run in run_columns] + raw_folders = [os.path.join(quant_path, run) for run in run_columns] psm_base_df = mock_precursor_df(n_precursor=100) fragment_base_df = mock_fragment_df(n_precursor=200) @@ -79,13 +80,13 @@ def test_output_transform(): config, path=os.path.join( raw_folder, - peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PATH, + peptidecentric.PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PKL_NAME, ), ) timing_manager = manager.TimingManager( path=os.path.join( - raw_folder, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PATH + raw_folder, peptidecentric.PeptideCentricWorkflow.TIMING_MANAGER_PKL_NAME ) ) @@ -134,17 +135,16 @@ def test_output_transform(): os.path.join(temp_folder, f"{output.STAT_OUTPUT}.tsv"), sep="\t" ) assert len(stat_df) == 3 - assert stat_df["ms2_error"][0] == 6 - assert stat_df["rt_error"][0] == 200 + + assert stat_df["optimization.ms2_error"][0] == 6 + assert stat_df["optimization.rt_error"][0] == 200 assert all( [ col in stat_df.columns - for col in [ - "run", - "precursors", - "proteins", - ] + for col in ['run', 'channel', 'precursors', 'proteins', 'fwhm_rt', 'fwhm_mobility', 'optimization.ms2_error', + 'optimization.ms1_error', 'optimization.rt_error', 'optimization.mobility_error', 'calibration.ms2_median_accuracy', 'calibration.ms2_median_precision', 'calibration.ms1_median_accuracy', 'calibration.ms1_median_precision', 'raw.gradient_min_m', 'raw.gradient_max_m', 'raw.gradient_length_m', 'raw.cycle_length', 'raw.cycle_duration', 'raw.cycle_number', 
'raw.msms_range_min', 'raw.msms_range_max'] + ] ) diff --git a/tests/unit_tests/test_planning.py b/tests/unit_tests/test_planning.py index d9a23ce8..f46b92dc 100644 --- a/tests/unit_tests/test_planning.py +++ b/tests/unit_tests/test_planning.py @@ -9,7 +9,7 @@ from alphadia.test_data_downloader import DataShareDownloader -@pytest.mark.slow +@pytest.mark.slow() def test_fasta_digest(): # digest & predict new library common_contaminants = os.path.join(_const.CONST_FILE_FOLDER, "contaminants.fasta") @@ -45,7 +45,7 @@ def test_fasta_digest(): assert len(plan.spectral_library.fragment_df) > 0 -@pytest.mark.slow +@pytest.mark.slow() def test_library_loading(): temp_directory = tempfile.gettempdir() diff --git a/tests/unit_tests/test_workflow.py b/tests/unit_tests/test_workflow.py index 1e289b13..807c5518 100644 --- a/tests/unit_tests/test_workflow.py +++ b/tests/unit_tests/test_workflow.py @@ -293,7 +293,7 @@ def test_optimization_manager_fit(): os.remove(temp_path) -@pytest.mark.slow +@pytest.mark.slow() def test_workflow_base(): if pytest.test_data is None: pytest.skip("No test data found") @@ -320,7 +320,7 @@ def test_workflow_base(): assert my_workflow.config["output"] == config["output"] assert my_workflow.instance_name == workflow_name assert my_workflow.parent_path == os.path.join( - config["output"], base.TEMP_FOLDER + config["output"], base.QUANT_FOLDER_NAME ) assert my_workflow.path == os.path.join( my_workflow.parent_path, workflow_name @@ -434,16 +434,16 @@ def create_workflow_instance(): ) workflow._calibration_manager = manager.CalibrationManager( workflow.config["calibration_manager"], - path=os.path.join(workflow.path, workflow.CALIBRATION_MANAGER_PATH), + path=os.path.join(workflow.path, workflow.CALIBRATION_MANAGER_PKL_NAME), load_from_file=workflow.config["general"]["reuse_calibration"], reporter=workflow.reporter, ) workflow._optimization_manager = manager.OptimizationManager( TEST_OPTIMIZATION_CONFIG, - path=os.path.join(workflow.path, workflow.OPTIMIZATION_MANAGER_PATH), + path=os.path.join(workflow.path, workflow.OPTIMIZATION_MANAGER_PKL_NAME), load_from_file=workflow.config["general"]["reuse_calibration"], - figure_path=os.path.join(workflow.path, workflow.FIGURE_PATH), + figure_path=os.path.join(workflow.path, workflow.FIGURES_FOLDER_NAME), reporter=workflow.reporter, )
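
Note on the new get_frame_indices tests in tests/unit_tests/test_data.py above: they all encode a single expansion rule for the queried RT window. Below is a minimal standalone sketch of that rule, not the actual AlphaRawJIT implementation; the function name sketch_frame_indices and the rt_axis/cycle_len/frame_max_index parameters are illustrative only, and the example values assume the mock_alpha_raw_jit fixture (one frame per second, a 5-frame DIA cycle, frame_max_index = 99).

    import numpy as np

    def sketch_frame_indices(rt_values, rt_axis, cycle_len, frame_max_index,
                             optimize_size, min_size):
        # Map the queried RT window onto frame indices of the run's RT axis.
        start, stop = np.searchsorted(rt_axis, rt_values, side="left")
        # Round the covered cycle count up to a multiple of optimize_size,
        # but never below min_size.
        n_cycles = int(np.ceil((stop - start) / cycle_len))
        n_cycles = max(int(np.ceil(n_cycles / optimize_size)) * optimize_size, min_size)
        new_stop = start + n_cycles * cycle_len
        if new_stop > frame_max_index:
            # Not enough room on the right: grow the window to the left instead,
            # clamped at frame 0 (matches the "overflow" test case).
            start = max(stop - n_cycles * cycle_len, 0)
            new_stop = stop
        return np.array([[start, new_stop, 1]], dtype=np.int64)

    # Reproduces test_get_frame_indices_optimization_left_min_size: [[55 95  1]]
    rt_axis = np.arange(0, 100, dtype=np.float32)
    print(sketch_frame_indices(np.array([90.0, 95.0]), rt_axis, 5, 99, 4, 8))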