Skip to content

Commit

Permalink
Merge pull request #38 from yannsay-impact/update_create_analysis_median
Browse files Browse the repository at this point in the history
fixing create_analysis_median
  • Loading branch information
yannsay-impact authored Nov 27, 2024
2 parents 9083c6b + 38ac4d2 commit d748f32
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 58 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: analysistools
Title: Tools to perform some analysis on survey data collected with ODK
Version: 0.0.0.902
Version: 0.0.0.903
Authors@R: c(
person("Yann", "Say", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-7390-4209")),
Expand All @@ -21,7 +21,7 @@ Imports:
magrittr,
purrr,
rlang,
srvyr,
srvyr (>= 1.3.0),
stringr,
tidyr,
tidyselect
Expand All @@ -37,5 +37,5 @@ Config/testthat/edition: 3
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
URL: https://impact-initiatives.github.io/analysistools/
99 changes: 53 additions & 46 deletions R/create_analysis_median.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
#' @param analysis_var the independent variable, variable to summarise
#' @param level the confidence level. 0.95 is default
#'
#' @note The results may differ with median(). There are lots of ways to calculate the median and
#' the default calculation between stats::median and survey::svyquantile/srvyr::survey_median are
#' different. Default from *survey/srvyr* is "school" methodology and does not exist in *stats*
#' package. The default for *stats* is "hf7". *survey/srvyr* methodology is prefered as these
#' packages are built for complex survey design.
#' @note Default from *survey/srvyr* is "math" methodology. In case of odds number, it will return
#' the lower value. Default for stats::median will calculate the mean between the two points.
#' If there is a set of c(1,2), median(1,2) will return 1.5; survey_mean() will return 1 by default.
#' create_analysis_median has the "school" methodology set as default, the results will match the
#' default results from stats::median, pandas.median If want to calculate with the "math"
#' methodology, you should run your own analysis with survey_median.
#'
#' @return a data frame with the median for each group
#' @export
Expand Down Expand Up @@ -42,57 +43,63 @@ create_analysis_median <- function(design, group_var = NA, analysis_var, level =
}

# calculate
pre_design <- design %>%
dplyr::group_by(dplyr::across(dplyr::any_of(across_by)))

## error handling
## survey_median has an error with only NA it passes somewhere if(NA)
## To handle this problem, the missing_value_catch will try to run summarise around
## survey_quantile, if it does work, it will run survey_mean to get the NA/NaN.
## The error also happens with svyby and svyquantile and cannot be swapped
# as 26.11.2024
# currently srvyr::survey_median return error with only NAs in a group
# Caused by error in `h()`:
# ! error in evaluating the argument 'x' in selecting a method for function 't':
# missing value where TRUE/FALSE needed
#
# fix:
# - calculates median with filter(.preserve = FALSE) (default). it removes groups with all NA
# - calculates the counts and weigthed counts with .preserve = TRUE to have all groups
# - left_join on the counts to have all groups.
#
# - when all are missing return a dataframe with stat,stat_upp,stat_low as NaN as
# filter(.preserve = FALSE will break)

missing_value_catch <- function(expr) {
tryCatch(
error = function(cnd) {
design %>%
dplyr::group_by(dplyr::across(dplyr::any_of(across_by))) %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = T) %>%
srvyr::summarise(
stat = srvyr::survey_mean(
!!rlang::sym(analysis_var),
vartype = "ci",
level = as.numeric(level),
na.rm = T
),
n = dplyr::n(),
n_w = srvyr::survey_total(
vartype = "ci",
level = as.numeric(level),
na.rm = T
)
)
},
expr
)
}

results <- missing_value_catch(
design %>%
dplyr::group_by(dplyr::across(dplyr::any_of(across_by))) %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = T) %>%
# fix for 26.11.2024 bug
if (all(is.na(design[["variables"]][[analysis_var]]))) {
# edge case when all NA
results_median <- pre_design %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var))) %>%
dplyr::summarise(
stat = NaN,
stat_upp = NaN,
stat_low = NaN
)
} else {
results_median <- pre_design %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = FALSE) %>%
srvyr::summarise(
stat = srvyr::survey_median(
!!rlang::sym(analysis_var),
vartype = "ci",
level = as.numeric(level),
na.rm = T
),
n = dplyr::n(),
n_w = srvyr::survey_total(
vartype = "ci",
qrule = "school",
level = as.numeric(level),
na.rm = T
)
)
)
}

results_totals <- pre_design %>%
dplyr::filter(!is.na(!!rlang::sym(analysis_var)), .preserve = TRUE) %>%
srvyr::summarise(
n = dplyr::n(),
n_w = srvyr::survey_total(
na.rm = T
)
)

if (is.null(across_by)) {
results <- cbind(results_median, results_totals)
} else {
results <- results_totals %>%
dplyr::left_join(results_median) %>%
suppressMessages()
}

results <- results %>%
dplyr::mutate(
Expand Down
11 changes: 6 additions & 5 deletions man/create_analysis_median.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 30 additions & 4 deletions tests/testthat/test-create_analysis_median.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ test_that("create_analysis_median returns correct output, no weights", {
)

svyquantile_results <- survey::svydesign(id = ~1, data = somedata) %>%
survey::svyquantile(~value, design = ., quantiles = c(.5)) %>%
survey::svyquantile(~value, design = ., quantiles = c(.5), qrule = "school") %>%
suppressWarnings()

expected_output <- svyquantile_results[["value"]] %>%
Expand Down Expand Up @@ -295,7 +295,9 @@ test_that("create_analysis_median handles lonely PSU", {
~groups,
design = .,
FUN = survey::svyquantile,
quantiles = .5, vartype = "ci"
quantiles = .5,
vartype = "ci",
qrule = "school"
) %>%
suppressWarnings()

Expand Down Expand Up @@ -411,7 +413,9 @@ test_that("create_analysis_median returns correct output with 3 grouping variabl
~ group_a + group_b + group_c,
design = .,
FUN = survey::svyquantile,
quantiles = .5, vartype = "ci"
quantiles = .5,
vartype = "ci",
qrule = "school"
) %>%
suppressWarnings()

Expand Down Expand Up @@ -506,7 +510,9 @@ test_that("create_analysis_median returns correct output with 2 grouping variabl
~ group_a + group_b,
design = .,
FUN = survey::svyquantile,
quantiles = .5, vartype = "ci"
quantiles = .5,
vartype = "ci",
qrule = "school"
) %>%
suppressWarnings()

Expand Down Expand Up @@ -590,3 +596,23 @@ test_that("stat is set to NaN when there is no value", {

expect_equal(results, expected_output, ignore_attr = T)
})

test_that("When only missing values, the correct values are return", {
set.seed(3452)
repex_df <- data.frame(group = c(rep("a", 5), rep("b",3)),
value = c(runif(5), rep(NA, 3)))
expected_results <- repex_df |>
dplyr::group_by(group) |>
dplyr::summarise(stat = median(value, na.rm = T)) |>
dplyr::mutate(stat = dplyr::if_else(is.na(stat), NaN, stat))

data_survey_design <- srvyr::as_survey(repex_df)

actual_results <- create_analysis_median(data_survey_design,
group_var = "group",
analysis_var = "value")

expect_equal(actual_results$stat, expected_results$stat)

})

0 comments on commit d748f32

Please sign in to comment.