From 3d2992aaa116933e132577eccb867d62355abeb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Th=C3=A9riault?= <13123390+rempsyc@users.noreply.github.com> Date: Mon, 27 Jun 2022 02:35:51 -0400 Subject: [PATCH] Winsorize based on the MAD (#179) * addresses #177 & #49 & #47 for winsorizing based on the MAD * forgot to push updated documentation * new argument "method", updated NEWS, resolved failed test, #179 * update winsorize.numeric added raw method made the code easier to maintain by modularizing it made doc more explicit about the methods updated examples to visualize the effect update NEWS * minor modifications to docs * removed tidyr from Suggests, replaced `tidyr::pivot_longer` with `datawizard::data_to_long` in vignette * added new tests for new winsorization methods, insight::format_message(), data[] <- lapply... Co-authored-by: RemPsyc Co-authored-by: Mattan S. Ben-Shachar --- NEWS.md | 1 + R/winsorize.R | 94 +++++++++++++++---- man/winsorize.Rd | 37 +++++++- ..._data_to_numeric.md => data_to_numeric.md} | 6 +- tests/testthat/test-winsorization.R | 17 +++- vignettes/standardize_data.Rmd | 25 +++-- 6 files changed, 141 insertions(+), 39 deletions(-) rename tests/testthat/_snaps/{convert_data_to_numeric.md => data_to_numeric.md} (82%) diff --git a/NEWS.md b/NEWS.md index 9b22a1033..adef3d618 100644 --- a/NEWS.md +++ b/NEWS.md @@ -19,6 +19,7 @@ CHANGES * Some of the text formatting helpers (like `text_concatenate()`) gain an `enclose` argument, to wrap text elements with surrounding characters. +* `winsorize` now accepts "raw" and "zscore" methods (in addition to "percentile"). Additionally, when `robust` is set to `TRUE` together with `method = "zscore"`, winsorizes via the median and median absolute deviation (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47). NEW FUNCTIONS diff --git a/R/winsorize.R b/R/winsorize.R index 84a32e9e1..e74d5fae8 100644 --- a/R/winsorize.R +++ b/R/winsorize.R @@ -17,13 +17,33 @@ #' A dataframe with winsorized columns or a winsorized vector. #' #' @param data Dataframe or vector. -#' @param threshold The amount of winsorization. +#' @param threshold The amount of winsorization, depends on the value of `method`: +#' - For `method = "percentile"`: the amount to winsorize from *each* tail. +#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`) +#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization. #' @param verbose Toggle warnings. +#' @param method One of "percentile" (default), "zscore", or "raw". +#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation. #' @param ... Currently not used. #' #' @examples -#' winsorize(iris$Sepal.Length, threshold = 0.2) +#' hist(iris$Sepal.Length, main = "Original data") +#' +#' hist(winsorize(iris$Sepal.Length, threshold = 0.2), +#' xlim = c(4, 8), main = "Percentile Winsorization") +#' +#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"), +#' xlim = c(4, 8), main = "Mean (+/- SD) Winsorization") +#' +#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE), +#' xlim = c(4, 8), main = "Median (+/- MAD) Winsorization") +#' +#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"), +#' xlim = c(4, 8), main = "Raw Thresholds") +#' +#' # Also works on a data frame: #' winsorize(iris, threshold = 0.2) +#' #' @inherit data_rename seealso #' @export winsorize <- function(data, ...) { @@ -43,27 +63,65 @@ winsorize.character <- winsorize.factor winsorize.logical <- winsorize.factor #' @export -winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, ...) { - out <- sapply(data, winsorize, threshold = threshold, verbose = verbose) - as.data.frame(out) +winsorize.data.frame <- function(data, threshold = 0.2, method = "percentile", robust = FALSE, + verbose = TRUE, ...) { + data[] <- lapply(data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose) + data } #' @rdname winsorize #' @export -winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, ...) { - if (threshold < 0 || threshold > 1) { - if (isTRUE(verbose)) { - warning("'threshold' for winsorization must be a scalar between 0 and 1. Did not winsorize data.", call. = FALSE) +winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robust = FALSE, + verbose = TRUE, ...) { + method <- match.arg(method, choices = c("percentile", "zscore", "raw")) + + if (method == "raw") { + if (length(threshold) != 2L) { + if (isTRUE(verbose)) { + warning(insight::format_message("threshold must be of length 2 for lower and upper bound. Did not winsorize data."), call. = FALSE) + } + return(data) } - return(data) } - y <- sort(data) - n <- length(data) - ibot <- floor(threshold * n) + 1 - itop <- length(data) - ibot + 1 - xbot <- y[ibot] - xtop <- y[itop] - winval <- ifelse(data <= xbot, xbot, data) - ifelse(winval >= xtop, xtop, winval) + if(method == "percentile") { + if (threshold < 0 || threshold > 0.5) { + if (isTRUE(verbose)) { + warning(insight::format_message("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data."), call. = FALSE) + } + return(data) + } + + y <- sort(data) + n <- length(data) + ibot <- floor(threshold * n) + 1 + itop <- length(data) - ibot + 1 + + threshold <- c(y[ibot], y[itop]) + } + + if(method == "zscore") { + + if (threshold <= 0) { + if (isTRUE(verbose)) { + warning(insight::format_message("'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data."), call. = FALSE) + } + return(data) + } + + if(isTRUE(robust)) { + centeral <- stats::median(data, na.rm = TRUE) + deviation <- stats::mad(data, center = centeral, na.rm = TRUE) + } else { + centeral <- mean(data, na.rm = TRUE) + deviation <- stats::sd(data, na.rm = TRUE) + } + + threshold <- centeral + c(-1, 1) * deviation * threshold + } + + + data[data < threshold[1]] <- threshold[1] + data[data > threshold[2]] <- threshold[2] + return(data) } diff --git a/man/winsorize.Rd b/man/winsorize.Rd index 85a08920e..d13c8b6d6 100644 --- a/man/winsorize.Rd +++ b/man/winsorize.Rd @@ -7,14 +7,30 @@ \usage{ winsorize(data, ...) -\method{winsorize}{numeric}(data, threshold = 0.2, verbose = TRUE, ...) +\method{winsorize}{numeric}( + data, + threshold = 0.2, + method = "percentile", + robust = FALSE, + verbose = TRUE, + ... +) } \arguments{ \item{data}{Dataframe or vector.} \item{...}{Currently not used.} -\item{threshold}{The amount of winsorization.} +\item{threshold}{The amount of winsorization, depends on the value of \code{method}: +\itemize{ +\item For \code{method = "percentile"}: the amount to winsorize from \emph{each} tail. +\item For \code{method = "zscore"}: the number of \emph{SD}/\emph{MAD}-deviations from the \emph{mean}/\emph{median} (see \code{robust}) +\item For \code{method = "raw"}: a vector of length 2 with the lower and upper bound for winsorization. +}} + +\item{method}{One of "percentile" (default), "zscore", or "raw".} + +\item{robust}{Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.} \item{verbose}{Toggle warnings.} } @@ -36,8 +52,23 @@ percentile. Winsorized estimators are usually more robust to outliers than their more standard forms. } \examples{ -winsorize(iris$Sepal.Length, threshold = 0.2) +hist(iris$Sepal.Length, main = "Original data") + +hist(winsorize(iris$Sepal.Length, threshold = 0.2), + xlim = c(4, 8), main = "Percentile Winsorization") + +hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"), + xlim = c(4, 8), main = "Mean (+/- SD) Winsorization") + +hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE), + xlim = c(4, 8), main = "Median (+/- MAD) Winsorization") + +hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"), + xlim = c(4, 8), main = "Raw Thresholds") + +# Also works on a data frame: winsorize(iris, threshold = 0.2) + } \seealso{ \itemize{ diff --git a/tests/testthat/_snaps/convert_data_to_numeric.md b/tests/testthat/_snaps/data_to_numeric.md similarity index 82% rename from tests/testthat/_snaps/convert_data_to_numeric.md rename to tests/testthat/_snaps/data_to_numeric.md index 8ccb2e767..029f13c37 100644 --- a/tests/testthat/_snaps/convert_data_to_numeric.md +++ b/tests/testthat/_snaps/data_to_numeric.md @@ -1,7 +1,7 @@ # convert dataframe to numeric Code - convert_data_to_numeric(head(ToothGrowth)) + data_to_numeric(head(ToothGrowth)) Output len supp.OJ supp.VC dose 1 4.2 0 1 0.5 @@ -14,7 +14,7 @@ --- Code - convert_data_to_numeric(head(ToothGrowth), dummy_factors = FALSE) + data_to_numeric(head(ToothGrowth), dummy_factors = FALSE) Output len supp dose 1 4.2 2 0.5 @@ -27,7 +27,7 @@ # convert factor to numeric Code - convert_data_to_numeric(f) + data_to_numeric(f) Output a c i s t 1 0 0 0 1 0 diff --git a/tests/testthat/test-winsorization.R b/tests/testthat/test-winsorization.R index dcccfe888..f6ebb23f0 100644 --- a/tests/testthat/test-winsorization.R +++ b/tests/testthat/test-winsorization.R @@ -11,11 +11,23 @@ test_that("with missing values", { test_that("winsorize: threshold must be between 0 and 1", { expect_warning( winsorize(sample(1:10, 5), threshold = -0.1), - regexp = "must be a scalar between 0 and 1" + regexp = "must be a scalar between 0 and 0.5" ) expect_warning( winsorize(sample(1:10, 5), threshold = 1.1), - regexp = "must be a scalar between 0 and 1" + regexp = "must be a scalar between 0 and 0.5" + ) + expect_warning( + winsorize(sample(1:10, 5), method = "zscore", threshold = -3), + regexp = "must be a scalar greater than 0" + ) + expect_warning( + winsorize(sample(1:10, 5), method = "zscore", threshold = -3, robust = TRUE), + regexp = "must be a scalar greater than 0" + ) + expect_warning( + winsorize(sample(1:10, 5), method = "raw", threshold = 1.1), + regexp = "must be of length 2 for lower and upper bound" ) x <- sample(1:10, 5) suppressWarnings({ @@ -38,3 +50,4 @@ test_that("winsorize on data.frame", { ) expect_equal(names(iris2), names(iris)) }) + diff --git a/vignettes/standardize_data.Rmd b/vignettes/standardize_data.Rmd index afd22b249..8c3d815a0 100644 --- a/vignettes/standardize_data.Rmd +++ b/vignettes/standardize_data.Rmd @@ -73,17 +73,16 @@ We can see that different methods give different central and variation values: ```{r, eval=FALSE} library(dplyr) -library(tidyr) hardlyworking %>% select(starts_with("xtra_hours")) %>% - pivot_longer(everything()) %>% - group_by(name) %>% + data_to_long() %>% + group_by(Name) %>% summarise( - mean = mean(value), - sd = sd(value), - median = median(value), - mad = mad(value) + mean = mean(Value), + sd = sd(Value), + median = median(Value), + mad = mad(Value) ) ``` @@ -113,13 +112,13 @@ hardlyworking_z <- standardize(hardlyworking) ```{r, eval=FALSE} hardlyworking_z %>% select(-xtra_hours_z, -xtra_hours_zr) %>% - pivot_longer(everything()) %>% - group_by(name) %>% + data_to_long() %>% + group_by(Name) %>% summarise( - mean = mean(value), - sd = sd(value), - median = median(value), - mad = mad(value) + mean = mean(Value), + sd = sd(Value), + median = median(Value), + mad = mad(Value) ) ```