Merge pull request #72 from RECETOX/hechth/issue69

Add Simeons version of FindRecalSeries
RECETOX · Oct 30, 2024 · 1dc1678 · 1dc1678
2 parents e791c8a + 98fddc8
commit 1dc1678
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 15 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -10,6 +10,7 @@ export(Even)
 export(FindCoreFormulae2)
 export(FindCoreFormulae2_Halo)
 export(FindRecalSeries)
+export(FindRecalSeriesSimple)
 export(HighMoles)
 export(HistNoise)
 export(IsoFiltR)

diff --git a/R/FindRecalSeries.R b/R/FindRecalSeries.R
@@ -1,8 +1,8 @@
 #' Filters the input dataframe
-#' This function filters the input dataframe based on abundance score threshold and peak distance threshold; and
+#' This function filters the input dataframe based on Abundance.Score threshold and peak distance threshold; and
 #' computes the length of the series.
 #' @param df DataFrame An output from RecalList, containing recalibrant CH2 series.
-#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
+#' @param abundance_score_threshold Float A threshold for filtering Abundance.Score parameter. The series with higher values are better. Default value is 100.
 #' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
 #' better.
 #' @return DataFrame A filtered dataframe.
@@ -173,13 +173,13 @@ find_final_series <- function(scores_df, number_of_combinations, fill_series) {
 #' This function takes on input the CH2 homologous recalibration series, which are provided by the RecalList function, and tries to find the most suitable series combination for recalibration based on the following criteria:
 #' 1) Series should cover the full mass spectral range,
 #' 2) Series should be optimally long and combined have a “Tall Peak” at least every 100 m/z,
-#' 3) Abundance score: the higher, the better,
+#' 3) Abundance.Score: the higher, the better,
 #' 4) Peak score: the closer to 0, the better,
 #' 5) Peak Distance: the closer to 1, the better,
 #' 6) Series Score: the closer to this value, the better.
 #'
 #' The recal function can take up to 10 series - due to the size of the search space when looking for combinations of 10
-#' elements, a pre-filtering is done: only the series which have Abundance score > 100 are considered and the one #'
+#' elements, a pre-filtering is done: only the series which have Abundance.Score > 100 and
 #' Peak Distance < 2 are considered.
 #' Combinations of 5 series are assembled, scores are computed for other metrics (in case of Peak proximity and Peak
 #' distance, an inverted score is computed) and these are summed. Finally, top 10 unique series having the highest
@@ -192,7 +192,7 @@ find_final_series <- function(scores_df, number_of_combinations, fill_series) {
 #' @param global_max Float A higher bound of the instrument m/z range.
 #' @param number_of_combinations Integer Combinations of how many series should be computed. Default is 5, Recal function can
 #' take up to 10 series, but the more combinations, the longer computing time is expected (growing exponentially)
-#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
+#' @param abundance_score_threshold Float A threshold for filtering Abundance.Score parameter. The series with higher values are better. Default value is 100.
 #' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
 #' better.
 #' @param coverage_threshold Integer How many % of the m/z range should be covered. Default is 90 %.
@@ -232,3 +232,43 @@ FindRecalSeries <- function(df,
   # Return the top scoring series
   return(final_series)
 }
+
+#' Simple rewritten version of the FindRecalSeries function.
+#'
+#' @description This function is not based on combinations of series. It filters the
+#' candidate series, computes a weighted total score for each of them and returns the
+#' 10 best-scoring series, extended by the best-scoring series found in a window at the
+#' low end and at the high end of the mass range (when such series exist).
+#' @param Recal data.frame A dataframe containing the various recal series. The columns
+#' `Number.Observed`, `Mass.Range` (formatted as "low-high"), `Tall.Peak`,
+#' `Abundance.Score`, `Peak.Score`, `Peak.Distance` and `Series.Score` are used.
+#' @return A dataframe with the 10 best-scoring series plus, if found, one or more
+#' extra low-mass and high-mass series (ties are kept). The individual weights and
+#' `Total_score` are kept as additional columns.
+#' @export
+FindRecalSeriesSimple <- function(Recal) {
+  # Keep only series with Series.Score >= 1 and Peak.Distance within [1, 3.3],
+  # then coarsen both values before scoring.
+  Cal_Pick <- dplyr::filter(Recal, `Series.Score` >= 1 & `Peak.Distance` <= 3.3 & `Peak.Distance` >= 1)
+  Cal_Pick <- dplyr::mutate(Cal_Pick, `Peak.Distance` = floor(`Peak.Distance`), `Series.Score` = round(`Series.Score`, 1))
+
+  # The weighting for each of the terms was determined experimentally and via the
+  # original author's experience choosing series and what is most important.
+  Cal_Pick <- dplyr::mutate(
+    Cal_Pick,
+    Num_Weight = `Number.Observed` / max(`Number.Observed`) * 25,
+    Abund_weight = (`Abundance.Score` / max(`Abundance.Score`)) * 20,
+    TallP_Weight = `Tall.Peak` / 30,
+    PeakS_Weight = abs(`Peak.Score` * 5 - 10),
+    PeakD_Weight = -3 * `Peak.Distance` + 10,
+    Series_Weight = (`Series.Score` / (`Series.Score` - 0.5)) * 5
+  )
+
+  Cal_Pick <- dplyr::mutate(
+    Cal_Pick,
+    Total_score = Num_Weight + PeakS_Weight + PeakD_Weight + Abund_weight + Series_Weight + TallP_Weight
+  )
+  # Best-scoring series first.
+  Cal_Pick <- Cal_Pick[order(-Cal_Pick$Total_score), ]
+
+  # Split "low-high" into two helper columns. separate() returns character columns,
+  # so convert them explicitly: comparing them as strings would order the bounds
+  # lexicographically (e.g. "99" > "900") and select the wrong mass windows.
+  Cal_Pick_HM <- tidyr::separate(Cal_Pick, `Mass.Range`, into = c("Low", "High"), sep = "-", remove = FALSE)
+  Cal_Pick_HM <- dplyr::mutate(Cal_Pick_HM, Low = as.numeric(Low), High = as.numeric(High))
+  Top10 <- dplyr::slice(Cal_Pick_HM, 1:10)
+
+  # High-mass extension: best series starting 10-50 m/z below the highest upper
+  # bound already covered by the top 10.
+  top_high <- max(Top10$High)
+  Cal_Pick_HM2 <- dplyr::filter(Cal_Pick_HM, Low > top_high - 50 & Low < top_high - 10)
+  HM_Cal <- dplyr::filter(Cal_Pick_HM2, Total_score == max(Total_score))
+
+  # Low-mass extension: best series starting within 20 m/z above the lowest lower
+  # bound overall and reaching past the lowest lower bound of the top 10.
+  lowest_low <- min(Cal_Pick_HM$Low)
+  top_low <- min(Top10$Low)
+  Cal_Pick_LM <- dplyr::filter(Cal_Pick_HM, Low > lowest_low & Low < lowest_low + 20 & High > top_low)
+  LM_Cal <- dplyr::filter(Cal_Pick_LM, Total_score == max(Total_score))
+
+  Picked_Cal_series <- dplyr::bind_rows(LM_Cal, Top10, HM_Cal)
+  # Drop the helper columns by name rather than by position so the result does not
+  # depend on where Mass.Range sits in the input.
+  Picked_Cal_series <- dplyr::select(Picked_Cal_series, -dplyr::all_of(c("Low", "High")))
+
+  Picked_Cal_series
+}
diff --git a/changelog.txt b/changelog.txt
@@ -1,6 +1,7 @@
 Package Updates
 
 10/30/2024 Version 1.1.2
+ - introduced new function to choose the best recal list [#72](https://github.com/RECETOX/MFAssignR/pull/72)
  - aligned IsoFiltR function with upstream version [#70](https://github.com/RECETOX/MFAssignR/pull/70)
 
 09/12/2024 Version 1.1.1

diff --git a/tests/testthat/test-data/expected_FindRecalSeriesSimple.rds b/tests/testthat/test-data/expected_FindRecalSeriesSimple.rds
diff --git a/tests/testthat/test-findRecalSeries.R b/tests/testthat/test-findRecalSeries.R
@@ -57,7 +57,7 @@ patrick::with_parameters_test_that("Selection of the final series works", {
   if (mode == TRUE) {
     expect_equal(nrow(actual), 10)
   } else {
-    expect_equal(nrow(actual), n)  
+    expect_equal(nrow(actual), n)
   }
   expect_equal(actual, expected)
 },
@@ -69,15 +69,26 @@ patrick::with_parameters_test_that("FindRecalSeries function works", {
   expected <- readRDS(file.path("test-data", paste0("findRecalSeries", mode, ".rds")))
   n <- 3
 
-  actual <- FindRecalSeries(df,
-                        global_min = 100,
-                        global_max = 500,
-                        number_of_combinations = 3,
-                        abundance_score_threshold = 100,
-                        peak_distance_threshold = 2,
-                        coverage_threshold = 60,
-                        fill_series = mode)
+  actual <- FindRecalSeries(
+    df,
+    global_min = 100,
+    global_max = 500,
+    number_of_combinations = 3,
+    abundance_score_threshold = 100,
+    peak_distance_threshold = 2,
+    coverage_threshold = 60,
+    fill_series = mode)
   expect_equal(actual, expected)
 },
   mode = c(TRUE, FALSE)
-)
+)
+
+test_that("FindRecalSeriesSimple works", {
+  # Reference result stored next to the other test fixtures.
+  reference <- readRDS(file.path("test-data", "expected_FindRecalSeriesSimple.rds"))
+
+  # Run the function on the stored RecalList output.
+  recal_input <- readRDS("test-data/pos_recallist.rds")
+  result <- FindRecalSeriesSimple(recal_input)
+
+  expect_equal(result, reference)
+})