Skip to content

Commit

Permalink
Merge pull request #72 from RECETOX/hechth/issue69
Browse files Browse the repository at this point in the history
Add Simeons version of FindRecalSeries
  • Loading branch information
hechth authored Oct 30, 2024
2 parents e791c8a + 98fddc8 commit 1dc1678
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 15 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(Even)
export(FindCoreFormulae2)
export(FindCoreFormulae2_Halo)
export(FindRecalSeries)
export(FindRecalSeriesSimple)
export(HighMoles)
export(HistNoise)
export(IsoFiltR)
Expand Down
50 changes: 45 additions & 5 deletions R/FindRecalSeries.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#' Filters the input dataframe
#' This function filters the input dataframe based on abundance score threshold and peak distance threshold; and
#' This function filters the input dataframe based on Abundance.Score threshold and peak distance threshold; and
#' computes the length of the series.
#' @param df DataFrame An output from RecalList, containing recalibrant CH2 series.
#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
#' @param abundance_score_threshold Float A threshold for filtering Abundance.Score parameter. The series with higher values #' are better. Default value is 100.
#' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
#' better.
#' @return DataFrame A filtered dataframe.
Expand Down Expand Up @@ -173,13 +173,13 @@ find_final_series <- function(scores_df, number_of_combinations, fill_series) {
#' This function takes on input the CH2 homologous recalibration series, which are provided by the RecalList function #' and tries to find the most suitable series combination for recalibration based on the following criteria:
#' 1) Series should cover the full mass spectral range,
#' 2) Series should be optimally long and combined have a “Tall Peak” at least every 100 m/z,
#' 3) Abundance score: the higher, the better,
#' 3) Abundance.Score: the higher, the better,
#' 4) Peak score: the closer to 0, the better,
#' 5) Peak Distance: the closer to 1, the better,
#' 6) Series Score: the closer to this value, the better.
#'
#' The recal function can take up to 10 series - due to the size of the search space when looking for combinations of 10
#' elements, a pre-filtering is done: only the series which have Abundance score > 100 are considered and the one #'
#' elements, a pre-filtering is done: only the series which have Abundance.Score > 100 are considered and the one #'
#' having Peak Distance < 2.
#' Combinations of 5 series are assembled, scores are computed for other metrics (in case of Peak proximity and Peak
#' distance, an inverted score is computed) and these are summed. Finally, top 10 unique series having the highest
Expand All @@ -192,7 +192,7 @@ find_final_series <- function(scores_df, number_of_combinations, fill_series) {
#' @param global_max Float A higher bound of the instrument m/z range.
#' @param number_of_combinations Integer Combinations of how many series should be computed. Default is 5, Recal function can
#' take up to 10 series, but the more combinations, the longer computing time is expected (growing exponentially)
#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
#' @param abundance_score_threshold Float A threshold for filtering Abundance.Score parameter. The series with higher values #' are better. Default value is 100.
#' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
#' better.
#' @param coverage_threshold Integer How many % of the m/z range should be covered. Default is 90 %.
Expand Down Expand Up @@ -232,3 +232,43 @@ FindRecalSeries <- function(df,
# Return the top scoring series
return(final_series)
}

#' Simple rewritten version of the FindRecalSeries function.
#'
#' @description This function is not based on combinations of series but simply computes the scores and returns
#' the 10 best seires.
#' @param Recal data.frame A dataframe containing the various recal series.
#' @return A dataframe of n-10 best-scoring series.
#' @export
FindRecalSeriesSimple <- function(Recal) {
Cal_Pick <- dplyr::filter(Recal, `Series.Score`>= 1 & `Peak.Distance` <= 3.3 & `Peak.Distance` >= 1)
Cal_Pick <- dplyr::mutate(Cal_Pick, `Peak.Distance` = floor(`Peak.Distance`), `Series.Score` = round(`Series.Score`, 1))

#The weighting for each of the terms was determined experimentally and via my experience choosing series and what is most important.
Cal_Pick <- dplyr::mutate(
Cal_Pick,
Num_Weight = `Number.Observed` / max(`Number.Observed`) *25,
Abund_weight = (((`Abundance.Score`) / max(`Abundance.Score`))*20),
TallP_Weight = `Tall.Peak`/30,
PeakS_Weight = abs((`Peak.Score`* 5 - 10)),
PeakD_Weight = -3 * `Peak.Distance` + 10,
Series_Weight = (`Series.Score` / (( `Series.Score` - 0.5 )))*5
)

Cal_Pick <- dplyr::mutate(Cal_Pick, Total_score = Num_Weight + PeakS_Weight + PeakD_Weight + Abund_weight+ Series_Weight + TallP_Weight)
Cal_Pick <- Cal_Pick[order(-Cal_Pick$Total_score),]

Cal_Pick_HM <- tidyr::separate(Cal_Pick, `Mass.Range`, into = c("Low", "High"), sep = "-", remove = FALSE)
Top10 <- dplyr::slice(Cal_Pick_HM, c(1:10))

Cal_Pick_HM2 <- dplyr::filter(Cal_Pick_HM, Low > as.numeric(max(Top10$High))-50 & Low < as.numeric(max(Top10$High))-10)
HM_Cal <- dplyr::filter(Cal_Pick_HM2, Total_score == max(Total_score))

Cal_Pick_LM <- dplyr::filter(Cal_Pick_HM, (Low > as.numeric(min(Cal_Pick_HM$Low)) & Low < as.numeric(min(Cal_Pick_HM$Low))+20) & High > as.numeric(min(Top10$Low)))
LM_Cal <- dplyr::filter(Cal_Pick_LM, Total_score == max(Total_score))

Picked_Cal_series <- dplyr::bind_rows(LM_Cal, Top10, HM_Cal)
Picked_Cal_series <- dplyr::select(Picked_Cal_series, -c(5,6))

Picked_Cal_series
}
1 change: 1 addition & 0 deletions changelog.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Package Updates

10/30/2024 Version 1.1.2
- introduced new function to choose the best recal list [#72](https://github.com/RECETOX/MFAssignR/pull/72)
- aligned IsoFiltR function with upstream version [#70](https://github.com/RECETOX/MFAssignR/pull/70)

09/12/2024 Version 1.1.1
Expand Down
Binary file not shown.
31 changes: 21 additions & 10 deletions tests/testthat/test-findRecalSeries.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ patrick::with_parameters_test_that("Selection of the final series works", {
if (mode == TRUE) {
expect_equal(nrow(actual), 10)
} else {
expect_equal(nrow(actual), n)
expect_equal(nrow(actual), n)
}
expect_equal(actual, expected)
},
Expand All @@ -69,15 +69,26 @@ patrick::with_parameters_test_that("FindRecalSeries function works", {
expected <- readRDS(file.path("test-data", paste0("findRecalSeries", mode, ".rds")))
n <- 3

actual <- FindRecalSeries(df,
global_min = 100,
global_max = 500,
number_of_combinations = 3,
abundance_score_threshold = 100,
peak_distance_threshold = 2,
coverage_threshold = 60,
fill_series = mode)
actual <- FindRecalSeries(
df,
global_min = 100,
global_max = 500,
number_of_combinations = 3,
abundance_score_threshold = 100,
peak_distance_threshold = 2,
coverage_threshold = 60,
fill_series = mode)
expect_equal(actual, expected)
},
mode = c(TRUE, FALSE)
)
)

test_that("FindRecalSeriesSimple works", {
df <- readRDS("test-data/pos_recallist.rds")
actual <- FindRecalSeriesSimple(df)

expected_path <- file.path("test-data", "expected_FindRecalSeriesSimple.rds")
expected <- readRDS(expected_path)

expect_equal(actual, expected)
})

0 comments on commit 1dc1678

Please sign in to comment.