From 8f451922443b7495bc9719e2c047fc9334785b82 Mon Sep 17 00:00:00 2001
From: Helge Hecht <helge.hecht@recetox.muni.cz>
Date: Wed, 30 Oct 2024 11:43:59 +0100
Subject: [PATCH 1/3] Add Simeons version of FindRecalSeries Fixes
 RECETOX/MFAssignR#69

---
 R/FindRecalSeries.R                           |  50 ++++++++++++++++--
 .../expected_FindRecalSeriesSimple.rds        | Bin 0 -> 1327 bytes
 tests/testthat/test-findRecalSeries.R         |  31 +++++++----
 3 files changed, 66 insertions(+), 15 deletions(-)
 create mode 100644 tests/testthat/test-data/expected_FindRecalSeriesSimple.rds

diff --git a/R/FindRecalSeries.R b/R/FindRecalSeries.R
index 7bf011d..03972e4 100644
--- a/R/FindRecalSeries.R
+++ b/R/FindRecalSeries.R
@@ -1,8 +1,8 @@
 #' Filters the input dataframe
-#' This function filters the input dataframe based on abundance score threshold and peak distance threshold; and
+#' This function filters the input dataframe based on Abundance.Score threshold and peak distance threshold; and
 #' computes the length of the series.
 #' @param df DataFrame An output from RecalList, containing recalibrant CH2 series.
-#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
+#' @param abundance_score_threshold Float A threshold for filtering Abundance.Score parameter. The series with higher values #' are better. Default value is 100.
 #' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
 #' better.
 #' @return DataFrame A filtered dataframe.
@@ -173,13 +173,13 @@ find_final_series <- function(scores_df, number_of_combinations, fill_series) {
 #' This function takes on input the CH2 homologous recalibration series, which are provided by the RecalList function #' and tries to find the most suitable series combination for recalibration based on the following criteria:
 #' 1) Series should cover the full mass spectral range,
 #' 2) Series should be optimally long and combined have a “Tall Peak” at least every 100 m/z,
-#' 3) Abundance score: the higher, the better,
+#' 3) Abundance.Score: the higher, the better,
 #' 4) Peak score: the closer to 0, the better,
 #' 5) Peak Distance: the closer to 1, the better,
 #' 6) Series Score: the closer to this value, the better.
 #'
 #' The recal function can take up to 10 series - due to the size of the search space when looking for combinations of 10
-#' elements, a pre-filtering is done: only the series which have Abundance score > 100 are considered and the one #'
+#' elements, a pre-filtering is done: only the series which have Abundance.Score > 100 are considered and the one #'
 #' having Peak Distance < 2.
 #' Combinations of 5 series are assembled, scores are computed for other metrics (in case of Peak proximity and Peak
 #' distance, an inverted score is computed) and these are summed. Finally, top 10 unique series having the highest
@@ -192,7 +192,7 @@ find_final_series <- function(scores_df, number_of_combinations, fill_series) {
 #' @param global_max Float A higher bound of the instrument m/z range.
 #' @param number_of_combinations Integer Combinations of how many series should be computed. Default is 5, Recal function can
 #' take up to 10 series, but the more combinations, the longer computing time is expected (growing exponentially)
-#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
+#' @param abundance_score_threshold Float A threshold for filtering Abundance.Score parameter. The series with higher values #' are better. Default value is 100.
 #' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
 #' better.
 #' @param coverage_threshold Integer How many % of the m/z range should be covered. Default is 90 %.
@@ -232,3 +232,43 @@ FindRecalSeries <- function(df,
   # Return the top scoring series
   return(final_series)
 }
+
+#' Simple rewritten version of the FindRecalSeries function.
+#' 
+#' @description This function is not based on combinations of series but simply computes the scores and returns
+#' the 10 best seires.
+#' @param Recal data.frame A dataframe containing the various recal series.
+#' @return A dataframe of n-10 best-scoring series.
+#' @export
+FindRecalSeriesSimple <- function(Recal) {
+  Cal_Pick <- dplyr::filter(Recal, `Series.Score`>= 1 & `Peak.Distance` <= 3.3 & `Peak.Distance` >= 1)
+  Cal_Pick <- dplyr::mutate(Cal_Pick, `Peak.Distance` = floor(`Peak.Distance`), `Series.Score` = round(`Series.Score`, 1))
+
+  #The weighting for each of the terms was determined experimentally and via my experience choosing series and what is most important.
+  Cal_Pick <- dplyr::mutate(
+    Cal_Pick,
+    Num_Weight = `Number.Observed` / max(`Number.Observed`) *25,
+    Abund_weight = (((`Abundance.Score`) / max(`Abundance.Score`))*20),
+    TallP_Weight = `Tall.Peak`/30,
+    PeakS_Weight = abs((`Peak.Score`* 5 - 10)),
+    PeakD_Weight = -3 * `Peak.Distance` + 10,
+    Series_Weight = (`Series.Score` / (( `Series.Score` - 0.5 )))*5
+  )
+
+  Cal_Pick <- dplyr::mutate(Cal_Pick, Total_score = Num_Weight + PeakS_Weight + PeakD_Weight + Abund_weight+ Series_Weight + TallP_Weight)  
+  Cal_Pick <- Cal_Pick[order(-Cal_Pick$Total_score),]
+
+  Cal_Pick_HM <- tidyr::separate(Cal_Pick, `Mass.Range`, into = c("Low", "High"), sep = "-", remove = FALSE)  
+  Top10 <- dplyr::slice(Cal_Pick_HM, c(1:10))
+
+  Cal_Pick_HM2 <- dplyr::filter(Cal_Pick_HM, Low > as.numeric(max(Top10$High))-50 & Low < as.numeric(max(Top10$High))-10)  
+  HM_Cal <- dplyr::filter(Cal_Pick_HM2, Total_score == max(Total_score))
+
+  Cal_Pick_LM <- dplyr::filter(Cal_Pick_HM, (Low > as.numeric(min(Cal_Pick_HM$Low)) & Low < as.numeric(min(Cal_Pick_HM$Low))+20) & High > as.numeric(min(Top10$Low)))
+  LM_Cal <- dplyr::filter(Cal_Pick_LM, Total_score == max(Total_score))
+
+  Picked_Cal_series <- dplyr::bind_rows(LM_Cal, Top10, HM_Cal)
+  Picked_Cal_series <- dplyr::select(Picked_Cal_series, -c(5,6))
+
+  Picked_Cal_series
+}
\ No newline at end of file
diff --git a/tests/testthat/test-data/expected_FindRecalSeriesSimple.rds b/tests/testthat/test-data/expected_FindRecalSeriesSimple.rds
new file mode 100644
index 0000000000000000000000000000000000000000..9e387ec2097f93191c52b47f58b7f57c3e104cbd
GIT binary patch
literal 1327
zcmV+~1<?8*iwFP!0000019epkOjB1FzPx0>fD(<ugpG%Qz<Rll_VzZo-Ct#nLZM7b
z@m&ioik1Q`4<9iSW{k3_8|Wf#^Re!Q30qv$_?TKX8<BwqHztZvtU5QNiQ*%eDeT;G
z?`7@gN&kH3|IT;5|L?i&x${CX3=6;l1O2doAT-5-iuH5kEaE1jL?%QS7Knu)3^e8#
zX<;6$Q=yC$(+N5jrHqtOC{sxpFI7kz`8AT17QJnle{Brbk%ZSyPdctu@LSO=g6|O~
zVV(gp0dWB-0pAob-ba@%;4~j3FmZ%?&H%#}qNqTTh9VIKzq0_v3KV=bK=C$;6rXd8
zBq<svNtK*bsc{qn9YIqnoKVUYY9&t5A{$Auh*hcN3YNwxH6wW_5Uo(iX;z8T((y%7
zG>sDsBUg~fgkeQ%iclk3POC}8(h^IPIKj$kg2YKBJ8prrO4J8K<1`_jb|lSPuq0{^
zd9V^qsZk#+g=k()j(<gDtx(Hpr5d%zNKa)+=bDSpT_sRk-F5BE)Sa;USH}P`Zx<~4
z;_kP9G#`h0hc<3FH40X`%LYSw8L&CS-)gikg1VvW$FU(BtfjYX+kWvVtZpsHNOM+0
zdD16`)_?Pg=<k}PTNl)--3#`!g+HH*IuB9Z?%aJ}?g35I-Nj7yeOJQFz^=?yPht6c
z16R+k*FjB3?;7>GS=@`vtzX~ID}~t&f0z6@`$veMIk4fj;Zqmq*L|Yo@?p`RyP2~o
ztYP^Pu0wCgJ=QkNwWmA3SH-k&XRch@q5aOvowok<Xo2b(cSBowcu(eb?%w*^k#l|h
z+%LJA@9SUe=57tQJ!;(P<_?UQch_6nxU)4+m)133k^CQw{lr8B%o}~zU#9a#Uw;TK
z_l%eManHN?(ZdZ3`C}n}D(n;ahi&}WB|YD%ov}5Yv38iu-rR~lpy(?`!vJcbE?vV5
z;vhv>7j;yGX95|*ei50mb?NyAY}lNaG<+EpLj8-+=2GW14<Ynucy!Cbyi;!Tj0x?<
zVUTvOI9S%c4XA!)xUr~@I~6<q!NUD+2z0f&n-ZG28+wRQZGOjvZETST?X8#km#rvs
z9>Z`*?)i!wjH`l-4^@Tq$A=(>(K(~T?2tO-C`$j$0C6XC%B!!ZLA>F~$o@O+kXq3o
zzcyG0@_;VNHTV-`yxY}(aA+4y`Mh##d2r+S`3&tJ?pv}2k}4MzHZK@~q^9(q)_aYR
zxbe<(ga1-Ubgnz{{Lwy$BeE*5Hf6(P!Z3LH%6v$;rqG;NM8Ipe_H<s@>wqM!Eb4;s
z3?#0NcGdT$O8&9JIg2RsLErV4>3mS~7jMHUEt+X9n&ki7k|It$kTvl@)+Bgc$m3xQ
z--z}603<%K*fS)5(9Y<I-qsKKo097v2mAo}dgbp;DczubJn199jy%wAKYwF7!9woh
zIsS8*OORVIs#hOx0nMr%=9Kx>kaO~OOsca3az2Z=)Nr&Pa<X~{K5*;M%gP`5`w7=m
zP?^=_aA0HCvPg!SO|>Sx+-|D0_{wdF6Xi1TeN$wyS0Gln@CxcGOD%R>SL(3X>n&!1
zesxTQYiwrADzQ#Js*TSz*_K%ZCPZ(tTJb`QX{G2kv$W1;HrdK7cu`rEUDSp05-%p)
z!(>-DYI(6wQLnA2KVyldV%hRqQRS&&Z1CdvMhbm&e2Jp};j+EC@Ug1iWSG9H)?_t0
lJe_zSYKXn60T&OJf2I2K|B}(s(I*H-{s9i5-$#lF005exouL2#

literal 0
HcmV?d00001

diff --git a/tests/testthat/test-findRecalSeries.R b/tests/testthat/test-findRecalSeries.R
index 7b00368..f386fe8 100644
--- a/tests/testthat/test-findRecalSeries.R
+++ b/tests/testthat/test-findRecalSeries.R
@@ -57,7 +57,7 @@ patrick::with_parameters_test_that("Selection of the final series works", {
   if (mode == TRUE) {
     expect_equal(nrow(actual), 10)
   } else {
-    expect_equal(nrow(actual), n)  
+    expect_equal(nrow(actual), n)
   }
   expect_equal(actual, expected)
 },
@@ -69,15 +69,26 @@ patrick::with_parameters_test_that("FindRecalSeries function works", {
   expected <- readRDS(file.path("test-data", paste0("findRecalSeries", mode, ".rds")))
   n <- 3
 
-  actual <- FindRecalSeries(df,
-                        global_min = 100,
-                        global_max = 500,
-                        number_of_combinations = 3,
-                        abundance_score_threshold = 100,
-                        peak_distance_threshold = 2,
-                        coverage_threshold = 60,
-                        fill_series = mode)
+  actual <- FindRecalSeries(
+    df,
+    global_min = 100,
+    global_max = 500,
+    number_of_combinations = 3,
+    abundance_score_threshold = 100,
+    peak_distance_threshold = 2,
+    coverage_threshold = 60,
+    fill_series = mode)
   expect_equal(actual, expected)
 },
   mode = c(TRUE, FALSE)
-)
\ No newline at end of file
+)
+
+test_that("FindRecalSeriesSimple works", {
+  df <- readRDS("test-data/pos_recallist.rds")
+  actual <- FindRecalSeriesSimple(df)
+
+  expected_path <- file.path("test-data", "expected_FindRecalSeriesSimple.rds")
+  expected <- readRDS(expected_path)
+
+  expect_equal(actual, expected)
+})
\ No newline at end of file

From 2ad5bf134158481e0c7ac1c9123c0a552ce9d685 Mon Sep 17 00:00:00 2001
From: Helge Hecht <helge.hecht@recetox.muni.cz>
Date: Wed, 30 Oct 2024 11:50:45 +0100
Subject: [PATCH 2/3] updated NAMESPACE

---
 NAMESPACE | 1 +
 1 file changed, 1 insertion(+)

diff --git a/NAMESPACE b/NAMESPACE
index 875fa7d..aba9bd5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -10,6 +10,7 @@ export(Even)
 export(FindCoreFormulae2)
 export(FindCoreFormulae2_Halo)
 export(FindRecalSeries)
+export(FindRecalSeriesSimple)
 export(HighMoles)
 export(HistNoise)
 export(IsoFiltR)

From dc2e179b7df10d0812b18a73efd3c47d09927a27 Mon Sep 17 00:00:00 2001
From: Helge Hecht <helge.hecht@recetox.muni.cz>
Date: Wed, 30 Oct 2024 11:52:13 +0100
Subject: [PATCH 3/3] updated changelog

---
 changelog.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/changelog.txt b/changelog.txt
index cc004ad..6db9de0 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,5 +1,8 @@
 Package Updates
 
+10/30/2024 Version 1.1.2
+ - introduced new function to choose the best recal list [#72](https://github.com/RECETOX/MFAssignR/pull/72)
+
 09/12/2024 Version 1.1.1
  - quick fix of hardcoded paths and imports [#59](https://github.com/RECETOX/MFAssignR/pull/59)