From 3d2992aaa116933e132577eccb867d62355abeb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Th=C3=A9riault?=
 <13123390+rempsyc@users.noreply.github.com>
Date: Mon, 27 Jun 2022 02:35:51 -0400
Subject: [PATCH] Winsorize based on the MAD (#179)

* addresses #177 & #49 & #47 for winsorizing based on the MAD

* forgot to push updated documentation

* new argument "method", updated NEWS, resolved failed test, #179

* update winsorize.numeric

added raw method

made the code easier to maintain by modularizing it

made doc more explicit about the methods

updated examples to visualize the effect

update NEWS

* minor modifications to docs

* removed tidyr from Suggests, replaced `tidyr::pivot_longer` with `datawizard::data_to_long` in vignette

* added new tests for new winsorization methods, insight::format_message(), data[] <- lapply...

Co-authored-by: RemPsyc <RemPsyc@users.noreply.github.com>
Co-authored-by: Mattan S. Ben-Shachar <matanshm@post.bgu.ac.il>
---
 NEWS.md                                       |  1 +
 R/winsorize.R                                 | 94 +++++++++++++++----
 man/winsorize.Rd                              | 37 +++++++-
 ..._data_to_numeric.md => data_to_numeric.md} |  6 +-
 tests/testthat/test-winsorization.R           | 17 +++-
 vignettes/standardize_data.Rmd                | 25 +++--
 6 files changed, 141 insertions(+), 39 deletions(-)
 rename tests/testthat/_snaps/{convert_data_to_numeric.md => data_to_numeric.md} (82%)

diff --git a/NEWS.md b/NEWS.md
index 9b22a1033..adef3d618 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -19,6 +19,7 @@ CHANGES
 
 * Some of the text formatting helpers (like `text_concatenate()`) gain an
   `enclose` argument, to wrap text elements with surrounding characters.
+* `winsorize` now accepts "raw" and "zscore" methods (in addition to "percentile"). Additionally, when `robust` is set to `TRUE` together with `method = "zscore"`, winsorizes via the median and median absolute deviation (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47).
 
 NEW FUNCTIONS
 
diff --git a/R/winsorize.R b/R/winsorize.R
index 84a32e9e1..e74d5fae8 100644
--- a/R/winsorize.R
+++ b/R/winsorize.R
@@ -17,13 +17,33 @@
 #' A dataframe with winsorized columns or a winsorized vector.
 #'
 #' @param data Dataframe or vector.
-#' @param threshold The amount of winsorization.
+#' @param threshold The amount of winsorization, depends on the value of `method`:
+#' - For `method = "percentile"`: the amount to winsorize from *each* tail.
+#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`)
+#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization.
 #' @param verbose Toggle warnings.
+#' @param method One of "percentile" (default), "zscore", or "raw".
+#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.
 #' @param ... Currently not used.
 #'
 #' @examples
-#' winsorize(iris$Sepal.Length, threshold = 0.2)
+#' hist(iris$Sepal.Length, main = "Original data")
+#'
+#' hist(winsorize(iris$Sepal.Length, threshold = 0.2),
+#'      xlim = c(4, 8), main = "Percentile Winsorization")
+#'
+#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
+#'      xlim = c(4, 8), main = "Mean (+/- SD) Winsorization")
+#'
+#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
+#'      xlim = c(4, 8), main = "Median (+/- MAD) Winsorization")
+#'
+#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
+#'      xlim = c(4, 8), main = "Raw Thresholds")
+#'
+#' # Also works on a data frame:
 #' winsorize(iris, threshold = 0.2)
+#'
 #' @inherit data_rename seealso
 #' @export
 winsorize <- function(data, ...) {
@@ -43,27 +63,65 @@ winsorize.character <- winsorize.factor
 winsorize.logical <- winsorize.factor
 
 #' @export
-winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, ...) {
-  out <- sapply(data, winsorize, threshold = threshold, verbose = verbose)
-  as.data.frame(out)
+winsorize.data.frame <- function(data, threshold = 0.2, method = "percentile", robust = FALSE,
+                                 verbose = TRUE, ...) {
+  data[] <- lapply(data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose)
+  data
 }
 
 #' @rdname winsorize
 #' @export
-winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, ...) {
-  if (threshold < 0 || threshold > 1) {
-    if (isTRUE(verbose)) {
-      warning("'threshold' for winsorization must be a scalar between 0 and 1. Did not winsorize data.", call. = FALSE)
+winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robust = FALSE,
+                              verbose = TRUE, ...) {
+  method <- match.arg(method, choices = c("percentile", "zscore", "raw"))
+
+  if (method == "raw") {
+    if (length(threshold) != 2L) {
+      if (isTRUE(verbose)) {
+        warning(insight::format_message("threshold must be of length 2 for lower and upper bound. Did not winsorize data."), call. = FALSE)
+      }
+      return(data)
     }
-    return(data)
   }
 
-  y <- sort(data)
-  n <- length(data)
-  ibot <- floor(threshold * n) + 1
-  itop <- length(data) - ibot + 1
-  xbot <- y[ibot]
-  xtop <- y[itop]
-  winval <- ifelse(data <= xbot, xbot, data)
-  ifelse(winval >= xtop, xtop, winval)
+  if(method == "percentile") {
+    if (threshold < 0 || threshold > 0.5) {
+      if (isTRUE(verbose)) {
+        warning(insight::format_message("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data."), call. = FALSE)
+      }
+      return(data)
+    }
+
+    y <- sort(data)
+    n <- length(data)
+    ibot <- floor(threshold * n) + 1
+    itop <- length(data) - ibot + 1
+
+    threshold <- c(y[ibot], y[itop])
+  }
+
+  if(method == "zscore") {
+
+    if (threshold <= 0) {
+      if (isTRUE(verbose)) {
+        warning(insight::format_message("'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data."), call. = FALSE)
+      }
+      return(data)
+    }
+
+    if(isTRUE(robust)) {
+      centeral <- stats::median(data, na.rm = TRUE)
+      deviation <- stats::mad(data, center = centeral, na.rm = TRUE)
+    } else {
+      centeral <- mean(data, na.rm = TRUE)
+      deviation <- stats::sd(data, na.rm = TRUE)
+    }
+
+    threshold <- centeral + c(-1, 1) * deviation * threshold
+  }
+
+
+  data[data < threshold[1]] <- threshold[1]
+  data[data > threshold[2]] <- threshold[2]
+  return(data)
 }
diff --git a/man/winsorize.Rd b/man/winsorize.Rd
index 85a08920e..d13c8b6d6 100644
--- a/man/winsorize.Rd
+++ b/man/winsorize.Rd
@@ -7,14 +7,30 @@
 \usage{
 winsorize(data, ...)
 
-\method{winsorize}{numeric}(data, threshold = 0.2, verbose = TRUE, ...)
+\method{winsorize}{numeric}(
+  data,
+  threshold = 0.2,
+  method = "percentile",
+  robust = FALSE,
+  verbose = TRUE,
+  ...
+)
 }
 \arguments{
 \item{data}{Dataframe or vector.}
 
 \item{...}{Currently not used.}
 
-\item{threshold}{The amount of winsorization.}
+\item{threshold}{The amount of winsorization, depends on the value of \code{method}:
+\itemize{
+\item For \code{method = "percentile"}: the amount to winsorize from \emph{each} tail.
+\item For \code{method = "zscore"}: the number of \emph{SD}/\emph{MAD}-deviations from the \emph{mean}/\emph{median} (see \code{robust})
+\item For \code{method = "raw"}: a vector of length 2 with the lower and upper bound for winsorization.
+}}
+
+\item{method}{One of "percentile" (default), "zscore", or "raw".}
+
+\item{robust}{Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.}
 
 \item{verbose}{Toggle warnings.}
 }
@@ -36,8 +52,23 @@ percentile. Winsorized estimators are usually more robust to outliers than
 their more standard forms.
 }
 \examples{
-winsorize(iris$Sepal.Length, threshold = 0.2)
+hist(iris$Sepal.Length, main = "Original data")
+
+hist(winsorize(iris$Sepal.Length, threshold = 0.2),
+     xlim = c(4, 8), main = "Percentile Winsorization")
+
+hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
+     xlim = c(4, 8), main = "Mean (+/- SD) Winsorization")
+
+hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
+     xlim = c(4, 8), main = "Median (+/- MAD) Winsorization")
+
+hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
+     xlim = c(4, 8), main = "Raw Thresholds")
+
+# Also works on a data frame:
 winsorize(iris, threshold = 0.2)
+
 }
 \seealso{
 \itemize{
diff --git a/tests/testthat/_snaps/convert_data_to_numeric.md b/tests/testthat/_snaps/data_to_numeric.md
similarity index 82%
rename from tests/testthat/_snaps/convert_data_to_numeric.md
rename to tests/testthat/_snaps/data_to_numeric.md
index 8ccb2e767..029f13c37 100644
--- a/tests/testthat/_snaps/convert_data_to_numeric.md
+++ b/tests/testthat/_snaps/data_to_numeric.md
@@ -1,7 +1,7 @@
 # convert dataframe to numeric
 
     Code
-      convert_data_to_numeric(head(ToothGrowth))
+      data_to_numeric(head(ToothGrowth))
     Output
          len supp.OJ supp.VC dose
       1  4.2       0       1  0.5
@@ -14,7 +14,7 @@
 ---
 
     Code
-      convert_data_to_numeric(head(ToothGrowth), dummy_factors = FALSE)
+      data_to_numeric(head(ToothGrowth), dummy_factors = FALSE)
     Output
          len supp dose
       1  4.2    2  0.5
@@ -27,7 +27,7 @@
 # convert factor to numeric
 
     Code
-      convert_data_to_numeric(f)
+      data_to_numeric(f)
     Output
          a c i s t
       1  0 0 0 1 0
diff --git a/tests/testthat/test-winsorization.R b/tests/testthat/test-winsorization.R
index dcccfe888..f6ebb23f0 100644
--- a/tests/testthat/test-winsorization.R
+++ b/tests/testthat/test-winsorization.R
@@ -11,11 +11,23 @@ test_that("with missing values", {
 test_that("winsorize: threshold must be between 0 and 1", {
   expect_warning(
     winsorize(sample(1:10, 5), threshold = -0.1),
-    regexp = "must be a scalar between 0 and 1"
+    regexp = "must be a scalar between 0 and 0.5"
   )
   expect_warning(
     winsorize(sample(1:10, 5), threshold = 1.1),
-    regexp = "must be a scalar between 0 and 1"
+    regexp = "must be a scalar between 0 and 0.5"
+  )
+  expect_warning(
+    winsorize(sample(1:10, 5), method = "zscore", threshold = -3),
+    regexp = "must be a scalar greater than 0"
+  )
+  expect_warning(
+    winsorize(sample(1:10, 5), method = "zscore", threshold = -3, robust = TRUE),
+    regexp = "must be a scalar greater than 0"
+  )
+  expect_warning(
+    winsorize(sample(1:10, 5), method = "raw", threshold = 1.1),
+    regexp = "must be of length 2 for lower and upper bound"
   )
   x <- sample(1:10, 5)
   suppressWarnings({
@@ -38,3 +50,4 @@ test_that("winsorize on data.frame", {
   )
   expect_equal(names(iris2), names(iris))
 })
+
diff --git a/vignettes/standardize_data.Rmd b/vignettes/standardize_data.Rmd
index afd22b249..8c3d815a0 100644
--- a/vignettes/standardize_data.Rmd
+++ b/vignettes/standardize_data.Rmd
@@ -73,17 +73,16 @@ We can see that different methods give different central and variation values:
 
 ```{r, eval=FALSE}
 library(dplyr)
-library(tidyr)
 
 hardlyworking %>%
   select(starts_with("xtra_hours")) %>%
-  pivot_longer(everything()) %>%
-  group_by(name) %>%
+  data_to_long() %>%
+  group_by(Name) %>%
   summarise(
-    mean = mean(value),
-    sd = sd(value),
-    median = median(value),
-    mad = mad(value)
+    mean = mean(Value),
+    sd = sd(Value),
+    median = median(Value),
+    mad = mad(Value)
   )
 ```
 
@@ -113,13 +112,13 @@ hardlyworking_z <- standardize(hardlyworking)
 ```{r, eval=FALSE}
 hardlyworking_z %>%
   select(-xtra_hours_z, -xtra_hours_zr) %>%
-  pivot_longer(everything()) %>%
-  group_by(name) %>%
+  data_to_long() %>%
+  group_by(Name) %>%
   summarise(
-    mean = mean(value),
-    sd = sd(value),
-    median = median(value),
-    mad = mad(value)
+    mean = mean(Value),
+    sd = sd(Value),
+    median = median(Value),
+    mad = mad(Value)
   )
 ```