-
-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Winsorize based on the MAD #179
Changes from 6 commits
3cb5a12
807b8fe
0a7cc4e
658b2b4
c626eca
d45ca3f
03f85bf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,13 +17,33 @@ | |
#' A dataframe with winsorized columns or a winsorized vector. | ||
#' | ||
#' @param data Dataframe or vector. | ||
#' @param threshold The amount of winsorization. | ||
#' @param threshold The amount of winsorization, depends on the value of `method`: | ||
#' - For `method = "percentile"`: the amount to winsorize from *each* tail. | ||
#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`) | ||
#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization. | ||
#' @param verbose Toggle warnings. | ||
#' @param method One of "percentile" (default), "zscore", or "raw". | ||
#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation. | ||
#' @param ... Currently not used. | ||
#' | ||
#' @examples | ||
#' winsorize(iris$Sepal.Length, threshold = 0.2) | ||
#' hist(iris$Sepal.Length, main = "Original data") | ||
#' | ||
#' hist(winsorize(iris$Sepal.Length, threshold = 0.2), | ||
#' xlim = c(4, 8), main = "Percentile Winsorization") | ||
#' | ||
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"), | ||
#' xlim = c(4, 8), main = "Mean (+/- SD) Winsorization") | ||
#' | ||
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE), | ||
#' xlim = c(4, 8), main = "Median (+/- MAD) Winsorization") | ||
#' | ||
#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"), | ||
#' xlim = c(4, 8), main = "Raw Thresholds") | ||
#' | ||
#' # Also works on a data frame: | ||
#' winsorize(iris, threshold = 0.2) | ||
#' | ||
#' @inherit data_rename seealso | ||
#' @export | ||
winsorize <- function(data, ...) { | ||
|
@@ -43,27 +63,65 @@ winsorize.character <- winsorize.factor | |
winsorize.logical <- winsorize.factor | ||
|
||
#' @export | ||
winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, ...) { | ||
out <- sapply(data, winsorize, threshold = threshold, verbose = verbose) | ||
as.data.frame(out) | ||
winsorize.data.frame <- function(data, threshold = 0.2, method = "percentile", robust = FALSE, | ||
verbose = TRUE, ...) { | ||
data <- lapply(data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose) | ||
as.data.frame(data) | ||
} | ||
|
||
#' @rdname winsorize | ||
#' @export | ||
winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, ...) { | ||
if (threshold < 0 || threshold > 1) { | ||
if (isTRUE(verbose)) { | ||
warning("'threshold' for winsorization must be a scalar between 0 and 1. Did not winsorize data.", call. = FALSE) | ||
winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robust = FALSE, | ||
verbose = TRUE, ...) { | ||
method <- match.arg(method, choices = c("percentile", "zscore", "raw")) | ||
|
||
if (method == "raw") { | ||
if (length(threshold) != 2L) { | ||
if (isTRUE(verbose)) { | ||
warning("threshold must be of length 2 for lower and upper bound. Did not winsorize data.", call. = FALSE) | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suggest wrapping in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
return(data) | ||
} | ||
} | ||
|
||
if(method == "percentile") { | ||
if (threshold < 0 || threshold > 0.5) { | ||
if (isTRUE(verbose)) { | ||
warning("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data.", call. = FALSE) | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here, use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
return(data) | ||
} | ||
return(data) | ||
|
||
y <- sort(data) | ||
n <- length(data) | ||
ibot <- floor(threshold * n) + 1 | ||
itop <- length(data) - ibot + 1 | ||
|
||
threshold <- c(y[ibot], y[itop]) | ||
} | ||
|
||
if(method == "zscore") { | ||
|
||
if (threshold <= 0) { | ||
if (isTRUE(verbose)) { | ||
warning("'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data.", call. = FALSE) | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See above. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
return(data) | ||
} | ||
|
||
if(isTRUE(robust)) { | ||
centeral <- stats::median(data, na.rm = TRUE) | ||
deviation <- stats::mad(data, center = centeral, na.rm = TRUE) | ||
} else { | ||
centeral <- mean(data, na.rm = TRUE) | ||
deviation <- stats::sd(data, na.rm = TRUE) | ||
} | ||
|
||
threshold <- centeral + c(-1, 1) * deviation * threshold | ||
} | ||
|
||
y <- sort(data) | ||
n <- length(data) | ||
ibot <- floor(threshold * n) + 1 | ||
itop <- length(data) - ibot + 1 | ||
xbot <- y[ibot] | ||
xtop <- y[itop] | ||
winval <- ifelse(data <= xbot, xbot, data) | ||
ifelse(winval >= xtop, xtop, winval) | ||
|
||
data[data < threshold[1]] <- threshold[1] | ||
data[data > threshold[2]] <- threshold[2] | ||
return(data) | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,11 +11,11 @@ test_that("with missing values", { | |
test_that("winsorize: threshold must be between 0 and 1", { | ||
expect_warning( | ||
winsorize(sample(1:10, 5), threshold = -0.1), | ||
regexp = "must be a scalar between 0 and 1" | ||
regexp = "must be a scalar between 0 and 0.5" | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might be that when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually this has been solved (essentially, we had modified the warning message within the function but not within the test checks. Harmonizing them fixed it) |
||
expect_warning( | ||
winsorize(sample(1:10, 5), threshold = 1.1), | ||
regexp = "must be a scalar between 0 and 1" | ||
regexp = "must be a scalar between 0 and 0.5" | ||
) | ||
x <- sample(1:10, 5) | ||
suppressWarnings({ | ||
|
@@ -38,3 +38,4 @@ test_that("winsorize on data.frame", { | |
) | ||
expect_equal(names(iris2), names(iris)) | ||
}) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suggest using
data[] <- lapply...
and remove the next line withas.data.frame()
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, but in that way it didn't return an output, so I had to add a line to return the dataframe anyway.