Skip to content

Commit

Permalink
Winsorize based on the MAD (#179)
Browse files Browse the repository at this point in the history
* addresses #177 & #49 & #47 for winsorizing based on the MAD

* forgot to push updated documentation

* new argument "method", updated NEWS, resolved failed test, #179

* update winsorize.numeric

added raw method

made the code easier to maintain by modularizing it

made doc more explicit about the methods

updated examples to visualize the effect

update NEWS

* minor modifications to docs

* removed tidyr from Suggests, replaced `tidyr::pivot_longer` with `datawizard::data_to_long` in vignette

* added new tests for new winsorization methods, insight::format_message(), data[] <- lapply...

Co-authored-by: RemPsyc <[email protected]>
Co-authored-by: Mattan S. Ben-Shachar <[email protected]>
  • Loading branch information
3 people authored Jun 27, 2022
1 parent e986a8d commit 3d2992a
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 39 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ CHANGES

* Some of the text formatting helpers (like `text_concatenate()`) gain an
`enclose` argument, to wrap text elements with surrounding characters.
* `winsorize` now accepts "raw" and "zscore" methods (in addition to "percentile"). Additionally, when `robust` is set to `TRUE` together with `method = "zscore"`, winsorizes via the median and median absolute deviation (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47).

NEW FUNCTIONS

Expand Down
94 changes: 76 additions & 18 deletions R/winsorize.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,33 @@
#' A dataframe with winsorized columns or a winsorized vector.
#'
#' @param data Dataframe or vector.
#' @param threshold The amount of winsorization.
#' @param threshold The amount of winsorization, depends on the value of `method`:
#' - For `method = "percentile"`: the amount to winsorize from *each* tail.
#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`)
#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization.
#' @param verbose Toggle warnings.
#' @param method One of "percentile" (default), "zscore", or "raw".
#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.
#' @param ... Currently not used.
#'
#' @examples
#' winsorize(iris$Sepal.Length, threshold = 0.2)
#' hist(iris$Sepal.Length, main = "Original data")
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 0.2),
#' xlim = c(4, 8), main = "Percentile Winsorization")
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
#' xlim = c(4, 8), main = "Mean (+/- SD) Winsorization")
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
#' xlim = c(4, 8), main = "Median (+/- MAD) Winsorization")
#'
#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
#' xlim = c(4, 8), main = "Raw Thresholds")
#'
#' # Also works on a data frame:
#' winsorize(iris, threshold = 0.2)
#'
#' @inherit data_rename seealso
#' @export
winsorize <- function(data, ...) {
Expand All @@ -43,27 +63,65 @@ winsorize.character <- winsorize.factor
winsorize.logical <- winsorize.factor

#' @export
winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, ...) {
out <- sapply(data, winsorize, threshold = threshold, verbose = verbose)
as.data.frame(out)
winsorize.data.frame <- function(data, threshold = 0.2, method = "percentile", robust = FALSE,
verbose = TRUE, ...) {
data[] <- lapply(data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose)
data
}

#' @rdname winsorize
#' @export
winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, ...) {
if (threshold < 0 || threshold > 1) {
if (isTRUE(verbose)) {
warning("'threshold' for winsorization must be a scalar between 0 and 1. Did not winsorize data.", call. = FALSE)
winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robust = FALSE,
verbose = TRUE, ...) {
method <- match.arg(method, choices = c("percentile", "zscore", "raw"))

if (method == "raw") {
if (length(threshold) != 2L) {
if (isTRUE(verbose)) {
warning(insight::format_message("threshold must be of length 2 for lower and upper bound. Did not winsorize data."), call. = FALSE)
}
return(data)
}
return(data)
}

y <- sort(data)
n <- length(data)
ibot <- floor(threshold * n) + 1
itop <- length(data) - ibot + 1
xbot <- y[ibot]
xtop <- y[itop]
winval <- ifelse(data <= xbot, xbot, data)
ifelse(winval >= xtop, xtop, winval)
if(method == "percentile") {
if (threshold < 0 || threshold > 0.5) {
if (isTRUE(verbose)) {
warning(insight::format_message("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data."), call. = FALSE)
}
return(data)
}

y <- sort(data)
n <- length(data)
ibot <- floor(threshold * n) + 1
itop <- length(data) - ibot + 1

threshold <- c(y[ibot], y[itop])
}

if(method == "zscore") {

if (threshold <= 0) {
if (isTRUE(verbose)) {
warning(insight::format_message("'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data."), call. = FALSE)
}
return(data)
}

if(isTRUE(robust)) {
centeral <- stats::median(data, na.rm = TRUE)
deviation <- stats::mad(data, center = centeral, na.rm = TRUE)
} else {
centeral <- mean(data, na.rm = TRUE)
deviation <- stats::sd(data, na.rm = TRUE)
}

threshold <- centeral + c(-1, 1) * deviation * threshold
}


data[data < threshold[1]] <- threshold[1]
data[data > threshold[2]] <- threshold[2]
return(data)
}
37 changes: 34 additions & 3 deletions man/winsorize.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# convert dataframe to numeric

Code
convert_data_to_numeric(head(ToothGrowth))
data_to_numeric(head(ToothGrowth))
Output
len supp.OJ supp.VC dose
1 4.2 0 1 0.5
Expand All @@ -14,7 +14,7 @@
---

Code
convert_data_to_numeric(head(ToothGrowth), dummy_factors = FALSE)
data_to_numeric(head(ToothGrowth), dummy_factors = FALSE)
Output
len supp dose
1 4.2 2 0.5
Expand All @@ -27,7 +27,7 @@
# convert factor to numeric

Code
convert_data_to_numeric(f)
data_to_numeric(f)
Output
a c i s t
1 0 0 0 1 0
Expand Down
17 changes: 15 additions & 2 deletions tests/testthat/test-winsorization.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,23 @@ test_that("with missing values", {
test_that("winsorize: threshold must be between 0 and 1", {
expect_warning(
winsorize(sample(1:10, 5), threshold = -0.1),
regexp = "must be a scalar between 0 and 1"
regexp = "must be a scalar between 0 and 0.5"
)
expect_warning(
winsorize(sample(1:10, 5), threshold = 1.1),
regexp = "must be a scalar between 0 and 1"
regexp = "must be a scalar between 0 and 0.5"
)
expect_warning(
winsorize(sample(1:10, 5), method = "zscore", threshold = -3),
regexp = "must be a scalar greater than 0"
)
expect_warning(
winsorize(sample(1:10, 5), method = "zscore", threshold = -3, robust = TRUE),
regexp = "must be a scalar greater than 0"
)
expect_warning(
winsorize(sample(1:10, 5), method = "raw", threshold = 1.1),
regexp = "must be of length 2 for lower and upper bound"
)
x <- sample(1:10, 5)
suppressWarnings({
Expand All @@ -38,3 +50,4 @@ test_that("winsorize on data.frame", {
)
expect_equal(names(iris2), names(iris))
})

25 changes: 12 additions & 13 deletions vignettes/standardize_data.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,16 @@ We can see that different methods give different central and variation values:

```{r, eval=FALSE}
library(dplyr)
library(tidyr)
hardlyworking %>%
select(starts_with("xtra_hours")) %>%
pivot_longer(everything()) %>%
group_by(name) %>%
data_to_long() %>%
group_by(Name) %>%
summarise(
mean = mean(value),
sd = sd(value),
median = median(value),
mad = mad(value)
mean = mean(Value),
sd = sd(Value),
median = median(Value),
mad = mad(Value)
)
```

Expand Down Expand Up @@ -113,13 +112,13 @@ hardlyworking_z <- standardize(hardlyworking)
```{r, eval=FALSE}
hardlyworking_z %>%
select(-xtra_hours_z, -xtra_hours_zr) %>%
pivot_longer(everything()) %>%
group_by(name) %>%
data_to_long() %>%
group_by(Name) %>%
summarise(
mean = mean(value),
sd = sd(value),
median = median(value),
mad = mad(value)
mean = mean(Value),
sd = sd(Value),
median = median(Value),
mad = mad(Value)
)
```

Expand Down

0 comments on commit 3d2992a

Please sign in to comment.