From 8f451922443b7495bc9719e2c047fc9334785b82 Mon Sep 17 00:00:00 2001 From: Helge Hecht Date: Wed, 30 Oct 2024 11:43:59 +0100 Subject: [PATCH 1/3] Add Simeons version of FindRecalSeries Fixes RECETOX/MFAssignR#69 --- R/FindRecalSeries.R | 50 ++++++++++++++++-- .../expected_FindRecalSeriesSimple.rds | Bin 0 -> 1327 bytes tests/testthat/test-findRecalSeries.R | 31 +++++++---- 3 files changed, 66 insertions(+), 15 deletions(-) create mode 100644 tests/testthat/test-data/expected_FindRecalSeriesSimple.rds diff --git a/R/FindRecalSeries.R b/R/FindRecalSeries.R index 7bf011d..03972e4 100644 --- a/R/FindRecalSeries.R +++ b/R/FindRecalSeries.R @@ -1,8 +1,8 @@ #' Filters the input dataframe -#' This function filters the input dataframe based on abundance score threshold and peak distance threshold; and +#' This function filters the input dataframe based on Abundance.Score threshold and peak distance threshold; and #' computes the length of the series. #' @param df DataFrame An output from RecalList, containing recalibrant CH2 series. -#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100. +#' @param abundance_score_threshold Float A threshold for filtering Abundance.Score parameter. The series with higher values #' are better. Default value is 100. #' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the #' better. #' @return DataFrame A filtered dataframe. @@ -173,13 +173,13 @@ find_final_series <- function(scores_df, number_of_combinations, fill_series) { #' This function takes on input the CH2 homologous recalibration series, which are provided by the RecalList function #' and tries to find the most suitable series combination for recalibration based on the following criteria: #' 1) Series should cover the full mass spectral range, #' 2) Series should be optimally long and combined have a “Tall Peak” at least every 100 m/z, -#' 3) Abundance score: the higher, the better, +#' 3) Abundance.Score: the higher, the better, #' 4) Peak score: the closer to 0, the better, #' 5) Peak Distance: the closer to 1, the better, #' 6) Series Score: the closer to this value, the better. #' #' The recal function can take up to 10 series - due to the size of the search space when looking for combinations of 10 -#' elements, a pre-filtering is done: only the series which have Abundance score > 100 are considered and the one #' +#' elements, a pre-filtering is done: only the series which have Abundance.Score > 100 are considered and the one #' #' having Peak Distance < 2. #' Combinations of 5 series are assembled, scores are computed for other metrics (in case of Peak proximity and Peak #' distance, an inverted score is computed) and these are summed. Finally, top 10 unique series having the highest @@ -192,7 +192,7 @@ find_final_series <- function(scores_df, number_of_combinations, fill_series) { #' @param global_max Float A higher bound of the instrument m/z range. #' @param number_of_combinations Integer Combinations of how many series should be computed. Default is 5, Recal function can #' take up to 10 series, but the more combinations, the longer computing time is expected (growing exponentially) -#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100. +#' @param abundance_score_threshold Float A threshold for filtering Abundance.Score parameter. The series with higher values #' are better. Default value is 100. #' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the #' better. #' @param coverage_threshold Integer How many % of the m/z range should be covered. Default is 90 %. @@ -232,3 +232,43 @@ FindRecalSeries <- function(df, # Return the top scoring series return(final_series) } + +#' Simple rewritten version of the FindRecalSeries function. +#' +#' @description This function is not based on combinations of series but simply computes the scores and returns +#' the 10 best seires. +#' @param Recal data.frame A dataframe containing the various recal series. +#' @return A dataframe of n-10 best-scoring series. +#' @export +FindRecalSeriesSimple <- function(Recal) { + Cal_Pick <- dplyr::filter(Recal, `Series.Score`>= 1 & `Peak.Distance` <= 3.3 & `Peak.Distance` >= 1) + Cal_Pick <- dplyr::mutate(Cal_Pick, `Peak.Distance` = floor(`Peak.Distance`), `Series.Score` = round(`Series.Score`, 1)) + + #The weighting for each of the terms was determined experimentally and via my experience choosing series and what is most important. + Cal_Pick <- dplyr::mutate( + Cal_Pick, + Num_Weight = `Number.Observed` / max(`Number.Observed`) *25, + Abund_weight = (((`Abundance.Score`) / max(`Abundance.Score`))*20), + TallP_Weight = `Tall.Peak`/30, + PeakS_Weight = abs((`Peak.Score`* 5 - 10)), + PeakD_Weight = -3 * `Peak.Distance` + 10, + Series_Weight = (`Series.Score` / (( `Series.Score` - 0.5 )))*5 + ) + + Cal_Pick <- dplyr::mutate(Cal_Pick, Total_score = Num_Weight + PeakS_Weight + PeakD_Weight + Abund_weight+ Series_Weight + TallP_Weight) + Cal_Pick <- Cal_Pick[order(-Cal_Pick$Total_score),] + + Cal_Pick_HM <- tidyr::separate(Cal_Pick, `Mass.Range`, into = c("Low", "High"), sep = "-", remove = FALSE) + Top10 <- dplyr::slice(Cal_Pick_HM, c(1:10)) + + Cal_Pick_HM2 <- dplyr::filter(Cal_Pick_HM, Low > as.numeric(max(Top10$High))-50 & Low < as.numeric(max(Top10$High))-10) + HM_Cal <- dplyr::filter(Cal_Pick_HM2, Total_score == max(Total_score)) + + Cal_Pick_LM <- dplyr::filter(Cal_Pick_HM, (Low > as.numeric(min(Cal_Pick_HM$Low)) & Low < as.numeric(min(Cal_Pick_HM$Low))+20) & High > as.numeric(min(Top10$Low))) + LM_Cal <- dplyr::filter(Cal_Pick_LM, Total_score == max(Total_score)) + + Picked_Cal_series <- dplyr::bind_rows(LM_Cal, Top10, HM_Cal) + Picked_Cal_series <- dplyr::select(Picked_Cal_series, -c(5,6)) + + Picked_Cal_series +} \ No newline at end of file diff --git a/tests/testthat/test-data/expected_FindRecalSeriesSimple.rds b/tests/testthat/test-data/expected_FindRecalSeriesSimple.rds new file mode 100644 index 0000000000000000000000000000000000000000..9e387ec2097f93191c52b47f58b7f57c3e104cbd GIT binary patch literal 1327 zcmV+~1fD(sDsBUg~fgkeQ%iclk3POC}8(h^IPIKj$kg2YKBJ8prrO4J8K<1`_jb|lSPuq0{^ zd9V^qsZk#+g=k()j(GS=@`vtzX~ID}~t&f0z6@`$veMIk4fj;Zqmq*L|Yo@?p`RyP2~o ztYP^Pu0wCgJ=QkNwWmA3SH-k&XRch@q5aOvowok7j;yGX95|*ei50mb?NyAY}lNaG<+EpLj8-+=2GW14Z`*?)i!wjH`l-4^@Tq$A=(>(K(~T?2tO-C`$j$0C6XC%B!!ZLA>F~$o@O+kXq3o zzcyG0@_;VNHTV-`yxY}(aA+4y`Mh##d2r+S`3&tJ?pv}2k}4MzHZK@~q^9(q)_aYR zxbe<(ga1-Ubgnz{{Lwy$BeE*5Hf6(P!Z3LH%6v$;rqG;NM8Ipe_HwqM!Eb4;s z3?#0NcGdT$O8&9JIg2RsLErV4>3mS~7jMHUEt+X9n&ki7k|It$kTvl@)+Bgc$m3xQ z--z}603<%K*fS)5(9YkaO~OOsca3az2Z=)Nr&PaSx+-|D0_{wdF6Xi1TeN$wyS0Gln@CxcGOD%R>SL(3X>n&!1 zesxTQYiwrADzQ#Js*TSz*_K%ZCPZ(tTJb`QX{G2kv$W1;HrdK7cu`rEUDSp05-%p) z!(>-DYI(6wQLnA2KVyldV%hRqQRS&&Z1CdvMhbm&e2Jp};j+EC@Ug1iWSG9H)?_t0 lJe_zSYKXn60T&OJf2I2K|B}(s(I*H-{s9i5-$#lF005exouL2# literal 0 HcmV?d00001 diff --git a/tests/testthat/test-findRecalSeries.R b/tests/testthat/test-findRecalSeries.R index 7b00368..f386fe8 100644 --- a/tests/testthat/test-findRecalSeries.R +++ b/tests/testthat/test-findRecalSeries.R @@ -57,7 +57,7 @@ patrick::with_parameters_test_that("Selection of the final series works", { if (mode == TRUE) { expect_equal(nrow(actual), 10) } else { - expect_equal(nrow(actual), n) + expect_equal(nrow(actual), n) } expect_equal(actual, expected) }, @@ -69,15 +69,26 @@ patrick::with_parameters_test_that("FindRecalSeries function works", { expected <- readRDS(file.path("test-data", paste0("findRecalSeries", mode, ".rds"))) n <- 3 - actual <- FindRecalSeries(df, - global_min = 100, - global_max = 500, - number_of_combinations = 3, - abundance_score_threshold = 100, - peak_distance_threshold = 2, - coverage_threshold = 60, - fill_series = mode) + actual <- FindRecalSeries( + df, + global_min = 100, + global_max = 500, + number_of_combinations = 3, + abundance_score_threshold = 100, + peak_distance_threshold = 2, + coverage_threshold = 60, + fill_series = mode) expect_equal(actual, expected) }, mode = c(TRUE, FALSE) -) \ No newline at end of file +) + +test_that("FindRecalSeriesSimple works", { + df <- readRDS("test-data/pos_recallist.rds") + actual <- FindRecalSeriesSimple(df) + + expected_path <- file.path("test-data", "expected_FindRecalSeriesSimple.rds") + expected <- readRDS(expected_path) + + expect_equal(actual, expected) +}) \ No newline at end of file From 2ad5bf134158481e0c7ac1c9123c0a552ce9d685 Mon Sep 17 00:00:00 2001 From: Helge Hecht Date: Wed, 30 Oct 2024 11:50:45 +0100 Subject: [PATCH 2/3] updated NAMESPACE --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 875fa7d..aba9bd5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(Even) export(FindCoreFormulae2) export(FindCoreFormulae2_Halo) export(FindRecalSeries) +export(FindRecalSeriesSimple) export(HighMoles) export(HistNoise) export(IsoFiltR) From dc2e179b7df10d0812b18a73efd3c47d09927a27 Mon Sep 17 00:00:00 2001 From: Helge Hecht Date: Wed, 30 Oct 2024 11:52:13 +0100 Subject: [PATCH 3/3] updated changelog --- changelog.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/changelog.txt b/changelog.txt index cc004ad..6db9de0 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,5 +1,8 @@ Package Updates +10/30/2024 Version 1.1.2 + - introduced new function to choose the best recal list [#72](https://github.com/RECETOX/MFAssignR/pull/72) + 09/12/2024 Version 1.1.1 - quick fix of hardcoded paths and imports [#59](https://github.com/RECETOX/MFAssignR/pull/59)