From 04c793ad041a74b4c90b672195e9408980df6a11 Mon Sep 17 00:00:00 2001 From: yannsay-impact <79831444+yannsay-impact@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:32:38 +0100 Subject: [PATCH 1/2] fixing create_analysis_median the function was calculating the mean instead of the median when a subset of the dataframe had only NAs. All values were the mean, instead of the median. error was coming from the tryCatch that was not catching the correct error. The tryCatch was initially introduced to have the function run when only NA. It was now catching another error, i.e. when only subset has only missing value. --- DESCRIPTION | 4 +- R/create_analysis_median.R | 87 +++++++++++--------- tests/testthat/test-create_analysis_median.R | 20 +++++ 3 files changed, 68 insertions(+), 43 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 654dfc5..8f3e63b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,7 +21,7 @@ Imports: magrittr, purrr, rlang, - srvyr, + srvyr (>= 1.3.0), stringr, tidyr, tidyselect @@ -37,5 +37,5 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 URL: https://impact-initiatives.github.io/analysistools/ diff --git a/R/create_analysis_median.R b/R/create_analysis_median.R index 0ad4bdd..d64acc4 100644 --- a/R/create_analysis_median.R +++ b/R/create_analysis_median.R @@ -42,57 +42,62 @@ create_analysis_median <- function(design, group_var = NA, analysis_var, level = } # calculate + pre_design <- design %>% + dplyr::group_by(dplyr::across(dplyr::any_of(across_by))) - ## error handling - ## survey_median has an error with only NA it passes somewhere if(NA) - ## To handle this problem, the missing_value_catch will try to run summarise around - ## survey_quantile, if it does work, it will run survey_mean to get the NA/NaN. - ## The error also happens with svyby and svyquantile and cannot be swapped + # as 26.11.2024 + # currently srvyr::survey_median return error with only NAs in a group + # Caused by error in `h()`: + # ! error in evaluating the argument 'x' in selecting a method for function 't': + # missing value where TRUE/FALSE needed + # + # fix: + # - calculates median with filter(.preserve = FALSE) (default). it removes groups with all NA + # - calculates the counts and weigthed counts with .preserve = TRUE to have all groups + # - left_join on the counts to have all groups. + # + # - when all are missing return a dataframe with stat,stat_upp,stat_low as NaN as + # filter(.preserve = FALSE will break) - missing_value_catch <- function(expr) { - tryCatch( - error = function(cnd) { - design %>% - dplyr::group_by(dplyr::across(dplyr::any_of(across_by))) %>% - dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = T) %>% - srvyr::summarise( - stat = srvyr::survey_mean( - !!rlang::sym(analysis_var), - vartype = "ci", - level = as.numeric(level), - na.rm = T - ), - n = dplyr::n(), - n_w = srvyr::survey_total( - vartype = "ci", - level = as.numeric(level), - na.rm = T - ) - ) - }, - expr - ) - } - - results <- missing_value_catch( - design %>% - dplyr::group_by(dplyr::across(dplyr::any_of(across_by))) %>% - dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = T) %>% + # fix for 26.11.2024 bug + if (all(is.na(design[["variables"]][[analysis_var]]))) { + # edge case when all NA + results_median <- pre_design %>% + dplyr::filter(!is.na(!!rlang::sym(analysis_var))) %>% + dplyr::summarise( + stat = NaN, + stat_upp = NaN, + stat_low = NaN + ) + } else { + results_median <- pre_design %>% + dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = FALSE) %>% srvyr::summarise( stat = srvyr::survey_median( !!rlang::sym(analysis_var), vartype = "ci", level = as.numeric(level), na.rm = T - ), - n = dplyr::n(), - n_w = srvyr::survey_total( - vartype = "ci", - level = as.numeric(level), - na.rm = T ) ) - ) + } + + results_totals <- pre_design %>% + dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = TRUE) %>% + srvyr::summarise( + n = dplyr::n(), + n_w = srvyr::survey_total( + na.rm = T + ) + ) + + if (is.null(across_by)) { + results <- cbind(results_median, results_totals) + } else { + results <- results_totals %>% + dplyr::left_join(results_median) %>% + suppressMessages() + } results <- results %>% dplyr::mutate( diff --git a/tests/testthat/test-create_analysis_median.R b/tests/testthat/test-create_analysis_median.R index c7adffb..4e379ae 100644 --- a/tests/testthat/test-create_analysis_median.R +++ b/tests/testthat/test-create_analysis_median.R @@ -590,3 +590,23 @@ test_that("stat is set to NaN when there is no value", { expect_equal(results, expected_output, ignore_attr = T) }) + +test_that("When only missing values, the correct values are return", { + set.seed(3452) + repex_df <- data.frame(group = c(rep("a", 5), rep("b",3)), + value = c(runif(5), rep(NA, 3))) + expected_results <- repex_df |> + dplyr::group_by(group) |> + dplyr::summarise(stat = median(value, na.rm = T)) |> + dplyr::mutate(stat = dplyr::if_else(is.na(stat), NaN, stat)) + + data_survey_design <- srvyr::as_survey(repex_df) + + actual_results <- create_analysis_median(data_survey_design, + group_var = "group", + analysis_var = "value") + + expect_equal(actual_results$stat, expected_results$stat) + +}) + From 38ac4d245c1976a6e8f1cc249ce8a52b053fb528 Mon Sep 17 00:00:00 2001 From: yannsay-impact <79831444+yannsay-impact@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:51:04 +0100 Subject: [PATCH 2/2] changing default median for odd set calculation The default calculation from survey_median is to take the lower value when the median is between 2 value, e.g. c(1,2) will return 1. create_analysis_median will take the mean of the value by default, i.e. 1.5 Updating version number as well --- DESCRIPTION | 2 +- R/create_analysis_median.R | 12 +++++++----- man/create_analysis_median.Rd | 11 ++++++----- tests/testthat/test-create_analysis_median.R | 14 ++++++++++---- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8f3e63b..49c59cf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: analysistools Title: Tools to perform some analysis on survey data collected with ODK -Version: 0.0.0.902 +Version: 0.0.0.903 Authors@R: c( person("Yann", "Say", , "yann.say@impact-initiatives.org", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7390-4209")), diff --git a/R/create_analysis_median.R b/R/create_analysis_median.R index d64acc4..cc8ab30 100644 --- a/R/create_analysis_median.R +++ b/R/create_analysis_median.R @@ -9,11 +9,12 @@ #' @param analysis_var the independent variable, variable to summarise #' @param level the confidence level. 0.95 is default #' -#' @note The results may differ with median(). There are lots of ways to calculate the median and -#' the default calculation between stats::median and survey::svyquantile/srvyr::survey_median are -#' different. Default from *survey/srvyr* is "school" methodology and does not exist in *stats* -#' package. The default for *stats* is "hf7". *survey/srvyr* methodology is prefered as these -#' packages are built for complex survey design. +#' @note Default from *survey/srvyr* is "math" methodology. In case of odds number, it will return +#' the lower value. Default for stats::median will calculate the mean between the two points. +#' If there is a set of c(1,2), median(1,2) will return 1.5; survey_mean() will return 1 by default. +#' create_analysis_median has the "school" methodology set as default, the results will match the +#' default results from stats::median, pandas.median If want to calculate with the "math" +#' methodology, you should run your own analysis with survey_median. #' #' @return a data frame with the median for each group #' @export @@ -76,6 +77,7 @@ create_analysis_median <- function(design, group_var = NA, analysis_var, level = stat = srvyr::survey_median( !!rlang::sym(analysis_var), vartype = "ci", + qrule = "school", level = as.numeric(level), na.rm = T ) diff --git a/man/create_analysis_median.Rd b/man/create_analysis_median.Rd index f35230f..0f554ed 100644 --- a/man/create_analysis_median.Rd +++ b/man/create_analysis_median.Rd @@ -26,11 +26,12 @@ a data frame with the median for each group Calculate a median from a survey } \note{ -The results may differ with median(). There are lots of ways to calculate the median and -the default calculation between stats::median and survey::svyquantile/srvyr::survey_median are -different. Default from \emph{survey/srvyr} is "school" methodology and does not exist in \emph{stats} -package. The default for \emph{stats} is "hf7". \emph{survey/srvyr} methodology is prefered as these -packages are built for complex survey design. +Default from \emph{survey/srvyr} is "math" methodology. In case of odds number, it will return +the lower value. Default for stats::median will calculate the mean between the two points. +If there is a set of c(1,2), median(1,2) will return 1.5; survey_mean() will return 1 by default. +create_analysis_median has the "school" methodology set as default, the results will match the +default results from stats::median, pandas.median If want to calculate with the "math" +methodology, you should run your own analysis with survey_median. } \examples{ somedata <- data.frame( diff --git a/tests/testthat/test-create_analysis_median.R b/tests/testthat/test-create_analysis_median.R index 4e379ae..42f25db 100644 --- a/tests/testthat/test-create_analysis_median.R +++ b/tests/testthat/test-create_analysis_median.R @@ -14,7 +14,7 @@ test_that("create_analysis_median returns correct output, no weights", { ) svyquantile_results <- survey::svydesign(id = ~1, data = somedata) %>% - survey::svyquantile(~value, design = ., quantiles = c(.5)) %>% + survey::svyquantile(~value, design = ., quantiles = c(.5), qrule = "school") %>% suppressWarnings() expected_output <- svyquantile_results[["value"]] %>% @@ -295,7 +295,9 @@ test_that("create_analysis_median handles lonely PSU", { ~groups, design = ., FUN = survey::svyquantile, - quantiles = .5, vartype = "ci" + quantiles = .5, + vartype = "ci", + qrule = "school" ) %>% suppressWarnings() @@ -411,7 +413,9 @@ test_that("create_analysis_median returns correct output with 3 grouping variabl ~ group_a + group_b + group_c, design = ., FUN = survey::svyquantile, - quantiles = .5, vartype = "ci" + quantiles = .5, + vartype = "ci", + qrule = "school" ) %>% suppressWarnings() @@ -506,7 +510,9 @@ test_that("create_analysis_median returns correct output with 2 grouping variabl ~ group_a + group_b, design = ., FUN = survey::svyquantile, - quantiles = .5, vartype = "ci" + quantiles = .5, + vartype = "ci", + qrule = "school" ) %>% suppressWarnings()