Merge pull request #38 from yannsay-impact/update_create_analysis_median

fixing create_analysis_median
impact-initiatives · Nov 27, 2024 · d748f32 · d748f32
2 parents 9083c6b + 38ac4d2
commit d748f32
Show file tree

Hide file tree

Showing 4 changed files with 92 additions and 58 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: analysistools
 Title: Tools to perform some analysis on survey data collected with ODK
-Version: 0.0.0.902
+Version: 0.0.0.903
 Authors@R: c(
     person("Yann", "Say", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-7390-4209")),
@@ -21,7 +21,7 @@ Imports:
     magrittr,
     purrr,
     rlang,
-    srvyr,
+    srvyr (>= 1.3.0),
     stringr,
     tidyr,
     tidyselect
@@ -37,5 +37,5 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 URL: https://impact-initiatives.github.io/analysistools/
diff --git a/R/create_analysis_median.R b/R/create_analysis_median.R
@@ -9,11 +9,12 @@
 #' @param analysis_var the independent variable, variable to summarise
 #' @param level the confidence level. 0.95 is default
 #'
-#' @note The results may differ with median(). There are lots of ways to calculate the median and
-#' the default calculation between stats::median and survey::svyquantile/srvyr::survey_median are
-#' different. Default from *survey/srvyr* is "school" methodology and does not exist in *stats*
-#' package. The default for *stats* is "hf7". *survey/srvyr* methodology is prefered as these
-#' packages are built for complex survey design.
+#' @note Default from *survey/srvyr* is "math" methodology. In case of odds number, it will return
+#' the lower value. Default for stats::median will calculate the mean between the two points.
+#' If there is a set of c(1,2), median(1,2) will return 1.5; survey_mean() will return 1 by default.
+#' create_analysis_median has the "school" methodology set as default, the results will match the
+#' default results from stats::median, pandas.median If want to calculate with the "math"
+#' methodology, you should run your own analysis with survey_median.
 #'
 #' @return a data frame with the median for each group
 #' @export
@@ -42,57 +43,63 @@ create_analysis_median <- function(design, group_var = NA, analysis_var, level =
   }
 
   # calculate
+  pre_design <- design %>%
+    dplyr::group_by(dplyr::across(dplyr::any_of(across_by)))
 
-  ## error handling
-  ## survey_median has an error with only NA it passes somewhere if(NA)
-  ## To handle this problem, the missing_value_catch will try to run summarise around
-  ## survey_quantile, if it does work, it will run survey_mean to get the NA/NaN.
-  ## The error also happens with svyby and svyquantile and cannot be swapped
+  # as 26.11.2024
+  # currently srvyr::survey_median return error with only NAs in a group
+  # Caused by error in `h()`:
+  # ! error in evaluating the argument 'x' in selecting a method for function 't':
+  # missing value where TRUE/FALSE needed
+  #
+  # fix:
+  # - calculates median with filter(.preserve = FALSE) (default). it removes groups with all NA
+  # - calculates the counts and weigthed counts with .preserve = TRUE to have all groups
+  # - left_join on the counts to have all groups.
+  #
+  # - when all are missing return a dataframe with stat,stat_upp,stat_low as NaN as
+  # filter(.preserve = FALSE will break)
 
-  missing_value_catch <- function(expr) {
-    tryCatch(
-      error = function(cnd) {
-        design %>%
-          dplyr::group_by(dplyr::across(dplyr::any_of(across_by))) %>%
-          dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = T) %>%
-          srvyr::summarise(
-            stat = srvyr::survey_mean(
-              !!rlang::sym(analysis_var),
-              vartype = "ci",
-              level = as.numeric(level),
-              na.rm = T
-            ),
-            n = dplyr::n(),
-            n_w = srvyr::survey_total(
-              vartype = "ci",
-              level = as.numeric(level),
-              na.rm = T
-            )
-          )
-      },
-      expr
-    )
-  }
-
-  results <- missing_value_catch(
-    design %>%
-      dplyr::group_by(dplyr::across(dplyr::any_of(across_by))) %>%
-      dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = T) %>%
+  # fix for 26.11.2024 bug
+  if (all(is.na(design[["variables"]][[analysis_var]]))) {
+    # edge case when all NA
+    results_median <- pre_design %>%
+      dplyr::filter(!is.na(!!rlang::sym(analysis_var))) %>%
+      dplyr::summarise(
+        stat = NaN,
+        stat_upp = NaN,
+        stat_low = NaN
+      )
+  } else {
+    results_median <- pre_design %>%
+      dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = FALSE) %>%
       srvyr::summarise(
         stat = srvyr::survey_median(
           !!rlang::sym(analysis_var),
           vartype = "ci",
-          level = as.numeric(level),
-          na.rm = T
-        ),
-        n = dplyr::n(),
-        n_w = srvyr::survey_total(
-          vartype = "ci",
+          qrule = "school",
           level = as.numeric(level),
           na.rm = T
         )
       )
-  )
+  }
+
+  results_totals <- pre_design %>%
+    dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = TRUE) %>%
+    srvyr::summarise(
+      n = dplyr::n(),
+      n_w = srvyr::survey_total(
+        na.rm = T
+      )
+    )
+
+  if (is.null(across_by)) {
+    results <- cbind(results_median, results_totals)
+  } else {
+    results <- results_totals %>%
+      dplyr::left_join(results_median) %>%
+      suppressMessages()
+  }
 
   results <- results %>%
     dplyr::mutate(

diff --git a/man/create_analysis_median.Rd b/man/create_analysis_median.Rd
diff --git a/tests/testthat/test-create_analysis_median.R b/tests/testthat/test-create_analysis_median.R
@@ -14,7 +14,7 @@ test_that("create_analysis_median returns correct output, no weights", {
   )
 
   svyquantile_results <- survey::svydesign(id = ~1, data = somedata) %>%
-    survey::svyquantile(~value, design = ., quantiles = c(.5)) %>%
+    survey::svyquantile(~value, design = ., quantiles = c(.5), qrule = "school") %>%
     suppressWarnings()
 
   expected_output <- svyquantile_results[["value"]] %>%
@@ -295,7 +295,9 @@ test_that("create_analysis_median handles lonely PSU", {
       ~groups,
       design = .,
       FUN = survey::svyquantile,
-      quantiles = .5, vartype = "ci"
+      quantiles = .5,
+      vartype = "ci",
+      qrule = "school"
     ) %>%
     suppressWarnings()
 
@@ -411,7 +413,9 @@ test_that("create_analysis_median returns correct output with 3 grouping variabl
       ~ group_a + group_b + group_c,
       design = .,
       FUN = survey::svyquantile,
-      quantiles = .5, vartype = "ci"
+      quantiles = .5,
+      vartype = "ci",
+      qrule = "school"
     ) %>%
     suppressWarnings()
 
@@ -506,7 +510,9 @@ test_that("create_analysis_median returns correct output with 2 grouping variabl
       ~ group_a + group_b,
       design = .,
       FUN = survey::svyquantile,
-      quantiles = .5, vartype = "ci"
+      quantiles = .5,
+      vartype = "ci",
+      qrule = "school"
     ) %>%
     suppressWarnings()
 
@@ -590,3 +596,23 @@ test_that("stat is set to NaN when there is no value", {
 
   expect_equal(results, expected_output, ignore_attr = T)
 })
+
+test_that("When only missing values, the correct values are return", {
+  set.seed(3452)
+  repex_df <- data.frame(group = c(rep("a", 5), rep("b",3)),
+                         value = c(runif(5), rep(NA, 3)))
+  expected_results <- repex_df |>
+    dplyr::group_by(group) |>
+    dplyr::summarise(stat = median(value, na.rm = T)) |>
+    dplyr::mutate(stat = dplyr::if_else(is.na(stat), NaN, stat))
+
+  data_survey_design <- srvyr::as_survey(repex_df)
+
+  actual_results <- create_analysis_median(data_survey_design,
+                                           group_var = "group",
+                                           analysis_var = "value")
+
+  expect_equal(actual_results$stat, expected_results$stat)
+
+})
+