From 3cb5a12623ad963825f390060d0e0853ad94a6ff Mon Sep 17 00:00:00 2001 From: RemPsyc Date: Fri, 24 Jun 2022 23:23:18 -0400 Subject: [PATCH 1/7] addresses #177 & #49 & #47 for winsorizing based on the MAD --- R/winsorize.R | 57 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/R/winsorize.R b/R/winsorize.R index 84a32e9e1..6cf500157 100644 --- a/R/winsorize.R +++ b/R/winsorize.R @@ -19,10 +19,12 @@ #' @param data Dataframe or vector. #' @param threshold The amount of winsorization. #' @param verbose Toggle warnings. +#' @param robust Logical, if TRUE, winsorizing is done via the median absolute deviation (MAD). #' @param ... Currently not used. #' #' @examples #' winsorize(iris$Sepal.Length, threshold = 0.2) +#' winsorize(iris$Sepal.Length, threshold = 3, robust = TRUE) #' winsorize(iris, threshold = 0.2) #' @inherit data_rename seealso #' @export @@ -43,27 +45,50 @@ winsorize.character <- winsorize.factor winsorize.logical <- winsorize.factor #' @export -winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, ...) { - out <- sapply(data, winsorize, threshold = threshold, verbose = verbose) +winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, robust = FALSE, ...) { + out <- sapply(data, winsorize, threshold = threshold, verbose = verbose, robust = robust) as.data.frame(out) } #' @rdname winsorize #' @export -winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, ...) { - if (threshold < 0 || threshold > 1) { - if (isTRUE(verbose)) { - warning("'threshold' for winsorization must be a scalar between 0 and 1. Did not winsorize data.", call. = FALSE) - } - return(data) +winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, robust = FALSE, ...) { + if(robust == FALSE) { + + if (threshold < 0 || threshold > 0.5) { + if (isTRUE(verbose)) { + warning("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data.", call. = FALSE) + } + return(data) + } + + y <- sort(data) + n <- length(data) + ibot <- floor(threshold * n) + 1 + itop <- length(data) - ibot + 1 + xbot <- y[ibot] + xtop <- y[itop] + + winval <- data + winval[winval <= xbot] <- xbot + winval[winval >= xtop] <- xtop + return(winval) } - y <- sort(data) - n <- length(data) - ibot <- floor(threshold * n) + 1 - itop <- length(data) - ibot + 1 - xbot <- y[ibot] - xtop <- y[itop] - winval <- ifelse(data <= xbot, xbot, data) - ifelse(winval >= xtop, xtop, winval) + if(robust == TRUE) { + + if (threshold <= 0) { + if (isTRUE(verbose)) { + warning("'threshold' for winsorization must be a scalar greater than 1. Did not winsorize data.", call. = FALSE) + } + return(data) + } + + med <- median(data, na.rm = TRUE) + y <- data - med + sc <- mad(y, center = 0, na.rm = TRUE) * threshold + y[y > sc] <- sc + y[y < -sc] <- -sc + y + med + } } From 807b8fe99ef099ee05f191b2b7d9a8fac5cc76f1 Mon Sep 17 00:00:00 2001 From: RemPsyc Date: Fri, 24 Jun 2022 23:36:27 -0400 Subject: [PATCH 2/7] forgot to push updated documentation --- R/winsorize.R | 6 +++--- man/winsorize.Rd | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/R/winsorize.R b/R/winsorize.R index 6cf500157..89f69b5ab 100644 --- a/R/winsorize.R +++ b/R/winsorize.R @@ -79,14 +79,14 @@ winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, robust = FA if (threshold <= 0) { if (isTRUE(verbose)) { - warning("'threshold' for winsorization must be a scalar greater than 1. Did not winsorize data.", call. = FALSE) + warning("'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data.", call. = FALSE) } return(data) } - med <- median(data, na.rm = TRUE) + med <- stats::median(data, na.rm = TRUE) y <- data - med - sc <- mad(y, center = 0, na.rm = TRUE) * threshold + sc <- stats::mad(y, center = 0, na.rm = TRUE) * threshold y[y > sc] <- sc y[y < -sc] <- -sc y + med diff --git a/man/winsorize.Rd b/man/winsorize.Rd index 85a08920e..4d082facc 100644 --- a/man/winsorize.Rd +++ b/man/winsorize.Rd @@ -7,7 +7,7 @@ \usage{ winsorize(data, ...) -\method{winsorize}{numeric}(data, threshold = 0.2, verbose = TRUE, ...) +\method{winsorize}{numeric}(data, threshold = 0.2, verbose = TRUE, robust = FALSE, ...) } \arguments{ \item{data}{Dataframe or vector.} @@ -17,6 +17,8 @@ winsorize(data, ...) \item{threshold}{The amount of winsorization.} \item{verbose}{Toggle warnings.} + +\item{robust}{Logical, if TRUE, winsorizing is done via the median absolute deviation (MAD).} } \value{ A dataframe with winsorized columns or a winsorized vector. @@ -37,6 +39,7 @@ their more standard forms. } \examples{ winsorize(iris$Sepal.Length, threshold = 0.2) +winsorize(iris$Sepal.Length, threshold = 3, robust = TRUE) winsorize(iris, threshold = 0.2) } \seealso{ From 0a7cc4eb37a5f50ec9a5d11645d6f16dbea21222 Mon Sep 17 00:00:00 2001 From: RemPsyc Date: Sat, 25 Jun 2022 20:05:19 -0400 Subject: [PATCH 3/7] new argument "method", updated NEWS, resolved failed test, #179 --- NEWS.md | 1 + R/winsorize.R | 35 ++++++++++++++++++++--------- man/winsorize.Rd | 16 ++++++++++--- tests/testthat/test-winsorization.R | 5 +++-- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9b22a1033..5d363f620 100644 --- a/NEWS.md +++ b/NEWS.md @@ -19,6 +19,7 @@ CHANGES * Some of the text formatting helpers (like `text_concatenate()`) gain an `enclose` argument, to wrap text elements with surrounding characters. +* `winsorize` now accepts "zscore" method (in addition to "percentile"). Additionally, when `robust` is also set to `TRUE`, winsorizes via the median and median absolute deviation (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47). NEW FUNCTIONS diff --git a/R/winsorize.R b/R/winsorize.R index 89f69b5ab..590d762e7 100644 --- a/R/winsorize.R +++ b/R/winsorize.R @@ -19,12 +19,14 @@ #' @param data Dataframe or vector. #' @param threshold The amount of winsorization. #' @param verbose Toggle warnings. -#' @param robust Logical, if TRUE, winsorizing is done via the median absolute deviation (MAD). +#' @param method One of "percentile" or "zscore". +#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation. #' @param ... Currently not used. #' #' @examples #' winsorize(iris$Sepal.Length, threshold = 0.2) -#' winsorize(iris$Sepal.Length, threshold = 3, robust = TRUE) +#' winsorize(iris$Sepal.Length, threshold = 2, method = "zscore") +#' winsorize(iris$Sepal.Length, threshold = 2, method = "zscore", robust = TRUE) #' winsorize(iris, threshold = 0.2) #' @inherit data_rename seealso #' @export @@ -52,8 +54,8 @@ winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, robust = #' @rdname winsorize #' @export -winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, robust = FALSE, ...) { - if(robust == FALSE) { +winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, method = "percentile", robust = FALSE, ...) { + if(method == "percentile") { if (threshold < 0 || threshold > 0.5) { if (isTRUE(verbose)) { @@ -75,7 +77,7 @@ winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, robust = FA return(winval) } - if(robust == TRUE) { + if(method == "zscore") { if (threshold <= 0) { if (isTRUE(verbose)) { @@ -84,11 +86,22 @@ winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, robust = FA return(data) } - med <- stats::median(data, na.rm = TRUE) - y <- data - med - sc <- stats::mad(y, center = 0, na.rm = TRUE) * threshold - y[y > sc] <- sc - y[y < -sc] <- -sc - y + med + if(isTRUE(robust)) { + med <- stats::median(data, na.rm = TRUE) + y <- data - med + winval <- stats::mad(y, center = 0, na.rm = TRUE) * threshold + y[y > winval] <- winval + y[y < -winval] <- -winval + return(y + med) + } + + if(isFALSE(robust)) { + m <- mean(data, na.rm = TRUE) + y <- data - m + winval <- stats::sd(y, na.rm = TRUE) * threshold + y[y > winval] <- winval + y[y < -winval] <- -winval + y + m + } } } diff --git a/man/winsorize.Rd b/man/winsorize.Rd index 4d082facc..aa284ac22 100644 --- a/man/winsorize.Rd +++ b/man/winsorize.Rd @@ -7,7 +7,14 @@ \usage{ winsorize(data, ...) -\method{winsorize}{numeric}(data, threshold = 0.2, verbose = TRUE, robust = FALSE, ...) +\method{winsorize}{numeric}( + data, + threshold = 0.2, + verbose = TRUE, + method = "percentile", + robust = FALSE, + ... +) } \arguments{ \item{data}{Dataframe or vector.} @@ -18,7 +25,9 @@ winsorize(data, ...) \item{verbose}{Toggle warnings.} -\item{robust}{Logical, if TRUE, winsorizing is done via the median absolute deviation (MAD).} +\item{method}{One of "percentile" or "zscore".} + +\item{robust}{Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.} } \value{ A dataframe with winsorized columns or a winsorized vector. @@ -39,7 +48,8 @@ their more standard forms. } \examples{ winsorize(iris$Sepal.Length, threshold = 0.2) -winsorize(iris$Sepal.Length, threshold = 3, robust = TRUE) +winsorize(iris$Sepal.Length, threshold = 2, method = "zscore") +winsorize(iris$Sepal.Length, threshold = 2, method = "zscore", robust = TRUE) winsorize(iris, threshold = 0.2) } \seealso{ diff --git a/tests/testthat/test-winsorization.R b/tests/testthat/test-winsorization.R index dcccfe888..c618ad7c9 100644 --- a/tests/testthat/test-winsorization.R +++ b/tests/testthat/test-winsorization.R @@ -11,11 +11,11 @@ test_that("with missing values", { test_that("winsorize: threshold must be between 0 and 1", { expect_warning( winsorize(sample(1:10, 5), threshold = -0.1), - regexp = "must be a scalar between 0 and 1" + regexp = "must be a scalar between 0 and 0.5" ) expect_warning( winsorize(sample(1:10, 5), threshold = 1.1), - regexp = "must be a scalar between 0 and 1" + regexp = "must be a scalar between 0 and 0.5" ) x <- sample(1:10, 5) suppressWarnings({ @@ -38,3 +38,4 @@ test_that("winsorize on data.frame", { ) expect_equal(names(iris2), names(iris)) }) + From 658b2b4965a6378c1ccbfc3a53ae478546f5e518 Mon Sep 17 00:00:00 2001 From: "Mattan S. Ben-Shachar" Date: Sun, 26 Jun 2022 11:15:00 +0300 Subject: [PATCH 4/7] update winsorize.numeric added raw method made the code easier to maintain by modularizing it made doc more explicit about the methods updated examples to visualize the effect update NEWS --- NEWS.md | 2 +- R/winsorize.R | 84 ++++++++++++++++++++++++++++++------------------ man/winsorize.Rd | 34 +++++++++++++++----- 3 files changed, 79 insertions(+), 41 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5d363f620..adef3d618 100644 --- a/NEWS.md +++ b/NEWS.md @@ -19,7 +19,7 @@ CHANGES * Some of the text formatting helpers (like `text_concatenate()`) gain an `enclose` argument, to wrap text elements with surrounding characters. -* `winsorize` now accepts "zscore" method (in addition to "percentile"). Additionally, when `robust` is also set to `TRUE`, winsorizes via the median and median absolute deviation (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47). +* `winsorize` now accepts "raw" and "zscore" methods (in addition to "percentile"). Additionally, when `robust` is set to `TRUE` together with `method = "zscore"`, winsorizes via the median and median absolute deviation (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47). NEW FUNCTIONS diff --git a/R/winsorize.R b/R/winsorize.R index 590d762e7..c13d921de 100644 --- a/R/winsorize.R +++ b/R/winsorize.R @@ -17,17 +17,33 @@ #' A dataframe with winsorized columns or a winsorized vector. #' #' @param data Dataframe or vector. -#' @param threshold The amount of winsorization. +#' @param threshold The amount of winsorization, depends on the value of `method`: +#' - For `method = "percentile"`: the amount to windzorize from *each* tail. +#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`) +#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization. #' @param verbose Toggle warnings. -#' @param method One of "percentile" or "zscore". +#' @param method One of "percentile" (default), "zscore" or "raw". #' @param robust Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation. #' @param ... Currently not used. #' #' @examples -#' winsorize(iris$Sepal.Length, threshold = 0.2) -#' winsorize(iris$Sepal.Length, threshold = 2, method = "zscore") -#' winsorize(iris$Sepal.Length, threshold = 2, method = "zscore", robust = TRUE) +#' hist(iris$Sepal.Length, main = "Original data") +#' +#' hist(winsorize(iris$Sepal.Length, threshold = 0.2), +#' xlim = c(4, 8), main = "Percentile Winz") +#' +#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"), +#' xlim = c(4, 8), main = "Mean+-SD Winz") +#' +#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE), +#' xlim = c(4, 8), main = "Median+-MAD Winz") +#' +#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"), +#' xlim = c(4, 8), main = "Raw Thresholds") +#' +#' # Also works on a data frame: #' winsorize(iris, threshold = 0.2) +#' #' @inherit data_rename seealso #' @export winsorize <- function(data, ...) { @@ -47,34 +63,41 @@ winsorize.character <- winsorize.factor winsorize.logical <- winsorize.factor #' @export -winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, robust = FALSE, ...) { - out <- sapply(data, winsorize, threshold = threshold, verbose = verbose, robust = robust) - as.data.frame(out) +winsorize.data.frame <- function(data, threshold = 0.2, method = "percentile", robust = FALSE, + verbose = TRUE, ...) { + data <- lapply(data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose) + as.data.frame(data) } #' @rdname winsorize #' @export -winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, method = "percentile", robust = FALSE, ...) { - if(method == "percentile") { +winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robust = FALSE, + verbose = TRUE, ...) { + method <- match.arg(method, choices = c("percentile", "zscore", "raw")) + + if (method == "raw") { + if (length(threshold) != 2L) { + if (isTRUE(verbose)) { + warning("threshold must be of length 2 for lower and upper bound. Did not winsorize data.", call. = FALSE) + } + return(data) + } + } + if(method == "percentile") { if (threshold < 0 || threshold > 0.5) { if (isTRUE(verbose)) { warning("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data.", call. = FALSE) - } - return(data) } + return(data) + } y <- sort(data) n <- length(data) ibot <- floor(threshold * n) + 1 itop <- length(data) - ibot + 1 - xbot <- y[ibot] - xtop <- y[itop] - winval <- data - winval[winval <= xbot] <- xbot - winval[winval >= xtop] <- xtop - return(winval) + threshold <- c(y[ibot], y[itop]) } if(method == "zscore") { @@ -87,21 +110,18 @@ winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, method = "p } if(isTRUE(robust)) { - med <- stats::median(data, na.rm = TRUE) - y <- data - med - winval <- stats::mad(y, center = 0, na.rm = TRUE) * threshold - y[y > winval] <- winval - y[y < -winval] <- -winval - return(y + med) + centeral <- stats::median(data, na.rm = TRUE) + deviation <- stats::mad(data, center = centeral, na.rm = TRUE) + } else { + centeral <- mean(data, na.rm = TRUE) + deviation <- stats::sd(data, na.rm = TRUE) } - if(isFALSE(robust)) { - m <- mean(data, na.rm = TRUE) - y <- data - m - winval <- stats::sd(y, na.rm = TRUE) * threshold - y[y > winval] <- winval - y[y < -winval] <- -winval - y + m - } + threshold <- centeral + c(-1, 1) * deviation * threshold } + + + data[data < threshold[1]] <- threshold[1] + data[data > threshold[2]] <- threshold[2] + return(data) } diff --git a/man/winsorize.Rd b/man/winsorize.Rd index aa284ac22..0872163d5 100644 --- a/man/winsorize.Rd +++ b/man/winsorize.Rd @@ -10,9 +10,9 @@ winsorize(data, ...) \method{winsorize}{numeric}( data, threshold = 0.2, - verbose = TRUE, method = "percentile", robust = FALSE, + verbose = TRUE, ... ) } @@ -21,13 +21,18 @@ winsorize(data, ...) \item{...}{Currently not used.} -\item{threshold}{The amount of winsorization.} - -\item{verbose}{Toggle warnings.} +\item{threshold}{The amount of winsorization, depends on the value of \code{method}: +\itemize{ +\item For \code{method = "percentile"}: the amount to windzorize from \emph{each} tail. +\item For \code{method = "zscore"}: the number of \emph{SD}/\emph{MAD}-deviations from the \emph{mean}/\emph{median} (see \code{robust}) +\item For \code{method = "raw"}: a vector of length 2 with the lower and upper bound for winsorization. +}} -\item{method}{One of "percentile" or "zscore".} +\item{method}{One of "percentile" (default), "zscore" or "raw".} \item{robust}{Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.} + +\item{verbose}{Toggle warnings.} } \value{ A dataframe with winsorized columns or a winsorized vector. @@ -47,10 +52,23 @@ percentile. Winsorized estimators are usually more robust to outliers than their more standard forms. } \examples{ -winsorize(iris$Sepal.Length, threshold = 0.2) -winsorize(iris$Sepal.Length, threshold = 2, method = "zscore") -winsorize(iris$Sepal.Length, threshold = 2, method = "zscore", robust = TRUE) +hist(iris$Sepal.Length, main = "Original data") + +hist(winsorize(iris$Sepal.Length, threshold = 0.2), + xlim = c(4, 8), main = "Percentile Winz") + +hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"), + xlim = c(4, 8), main = "Mean+-SD Winz") + +hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE), + xlim = c(4, 8), main = "Median+-MAD Winz") + +hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"), + xlim = c(4, 8), main = "Raw Thresholds") + +# Also works on a data frame: winsorize(iris, threshold = 0.2) + } \seealso{ \itemize{ From c626eca6cd9bf31b6f00e5f2fa28665705c3d017 Mon Sep 17 00:00:00 2001 From: RemPsyc Date: Sun, 26 Jun 2022 07:21:43 -0400 Subject: [PATCH 5/7] minor modifications to docs --- DESCRIPTION | 1 + R/winsorize.R | 10 +++++----- man/winsorize.Rd | 10 +++++----- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e2b920e1e..244ee8839 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,6 +50,7 @@ Suggests: rmarkdown, rstanarm, see, + tidyr, testthat (>= 3.0.0) VignetteBuilder: knitr diff --git a/R/winsorize.R b/R/winsorize.R index c13d921de..50a0ac92e 100644 --- a/R/winsorize.R +++ b/R/winsorize.R @@ -18,11 +18,11 @@ #' #' @param data Dataframe or vector. #' @param threshold The amount of winsorization, depends on the value of `method`: -#' - For `method = "percentile"`: the amount to windzorize from *each* tail. +#' - For `method = "percentile"`: the amount to winsorize from *each* tail. #' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`) #' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization. #' @param verbose Toggle warnings. -#' @param method One of "percentile" (default), "zscore" or "raw". +#' @param method One of "percentile" (default), "zscore", or "raw". #' @param robust Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation. #' @param ... Currently not used. #' @@ -30,13 +30,13 @@ #' hist(iris$Sepal.Length, main = "Original data") #' #' hist(winsorize(iris$Sepal.Length, threshold = 0.2), -#' xlim = c(4, 8), main = "Percentile Winz") +#' xlim = c(4, 8), main = "Percentile Winsorization") #' #' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"), -#' xlim = c(4, 8), main = "Mean+-SD Winz") +#' xlim = c(4, 8), main = "Mean (+/- SD) Winsorization") #' #' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE), -#' xlim = c(4, 8), main = "Median+-MAD Winz") +#' xlim = c(4, 8), main = "Median (+/- MAD) Winsorization") #' #' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"), #' xlim = c(4, 8), main = "Raw Thresholds") diff --git a/man/winsorize.Rd b/man/winsorize.Rd index 0872163d5..d13c8b6d6 100644 --- a/man/winsorize.Rd +++ b/man/winsorize.Rd @@ -23,12 +23,12 @@ winsorize(data, ...) \item{threshold}{The amount of winsorization, depends on the value of \code{method}: \itemize{ -\item For \code{method = "percentile"}: the amount to windzorize from \emph{each} tail. +\item For \code{method = "percentile"}: the amount to winsorize from \emph{each} tail. \item For \code{method = "zscore"}: the number of \emph{SD}/\emph{MAD}-deviations from the \emph{mean}/\emph{median} (see \code{robust}) \item For \code{method = "raw"}: a vector of length 2 with the lower and upper bound for winsorization. }} -\item{method}{One of "percentile" (default), "zscore" or "raw".} +\item{method}{One of "percentile" (default), "zscore", or "raw".} \item{robust}{Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.} @@ -55,13 +55,13 @@ their more standard forms. hist(iris$Sepal.Length, main = "Original data") hist(winsorize(iris$Sepal.Length, threshold = 0.2), - xlim = c(4, 8), main = "Percentile Winz") + xlim = c(4, 8), main = "Percentile Winsorization") hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"), - xlim = c(4, 8), main = "Mean+-SD Winz") + xlim = c(4, 8), main = "Mean (+/- SD) Winsorization") hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE), - xlim = c(4, 8), main = "Median+-MAD Winz") + xlim = c(4, 8), main = "Median (+/- MAD) Winsorization") hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"), xlim = c(4, 8), main = "Raw Thresholds") From d45ca3f815b03b6319c417253e336fbf4b9fd10f Mon Sep 17 00:00:00 2001 From: RemPsyc Date: Sun, 26 Jun 2022 08:41:20 -0400 Subject: [PATCH 6/7] removed tidyr from Suggests, replaced `tidyr::pivot_longer` with `datawizard::data_to_long` in vignette --- DESCRIPTION | 1 - vignettes/standardize_data.Rmd | 25 ++++++++++++------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 244ee8839..e2b920e1e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,7 +50,6 @@ Suggests: rmarkdown, rstanarm, see, - tidyr, testthat (>= 3.0.0) VignetteBuilder: knitr diff --git a/vignettes/standardize_data.Rmd b/vignettes/standardize_data.Rmd index afd22b249..8c3d815a0 100644 --- a/vignettes/standardize_data.Rmd +++ b/vignettes/standardize_data.Rmd @@ -73,17 +73,16 @@ We can see that different methods give different central and variation values: ```{r, eval=FALSE} library(dplyr) -library(tidyr) hardlyworking %>% select(starts_with("xtra_hours")) %>% - pivot_longer(everything()) %>% - group_by(name) %>% + data_to_long() %>% + group_by(Name) %>% summarise( - mean = mean(value), - sd = sd(value), - median = median(value), - mad = mad(value) + mean = mean(Value), + sd = sd(Value), + median = median(Value), + mad = mad(Value) ) ``` @@ -113,13 +112,13 @@ hardlyworking_z <- standardize(hardlyworking) ```{r, eval=FALSE} hardlyworking_z %>% select(-xtra_hours_z, -xtra_hours_zr) %>% - pivot_longer(everything()) %>% - group_by(name) %>% + data_to_long() %>% + group_by(Name) %>% summarise( - mean = mean(value), - sd = sd(value), - median = median(value), - mad = mad(value) + mean = mean(Value), + sd = sd(Value), + median = median(Value), + mad = mad(Value) ) ``` From 03f85bf5ba50ab0b2a2a16d39d1489a9309ca548 Mon Sep 17 00:00:00 2001 From: RemPsyc Date: Sun, 26 Jun 2022 15:29:55 -0400 Subject: [PATCH 7/7] added new tests for new winsorization methods, insight::format_message(), data[] <- lapply... --- R/winsorize.R | 10 +++++----- ...convert_data_to_numeric.md => data_to_numeric.md} | 6 +++--- tests/testthat/test-winsorization.R | 12 ++++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) rename tests/testthat/_snaps/{convert_data_to_numeric.md => data_to_numeric.md} (82%) diff --git a/R/winsorize.R b/R/winsorize.R index 50a0ac92e..e74d5fae8 100644 --- a/R/winsorize.R +++ b/R/winsorize.R @@ -65,8 +65,8 @@ winsorize.logical <- winsorize.factor #' @export winsorize.data.frame <- function(data, threshold = 0.2, method = "percentile", robust = FALSE, verbose = TRUE, ...) { - data <- lapply(data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose) - as.data.frame(data) + data[] <- lapply(data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose) + data } #' @rdname winsorize @@ -78,7 +78,7 @@ winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robu if (method == "raw") { if (length(threshold) != 2L) { if (isTRUE(verbose)) { - warning("threshold must be of length 2 for lower and upper bound. Did not winsorize data.", call. = FALSE) + warning(insight::format_message("threshold must be of length 2 for lower and upper bound. Did not winsorize data."), call. = FALSE) } return(data) } @@ -87,7 +87,7 @@ winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robu if(method == "percentile") { if (threshold < 0 || threshold > 0.5) { if (isTRUE(verbose)) { - warning("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data.", call. = FALSE) + warning(insight::format_message("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data."), call. = FALSE) } return(data) } @@ -104,7 +104,7 @@ winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robu if (threshold <= 0) { if (isTRUE(verbose)) { - warning("'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data.", call. = FALSE) + warning(insight::format_message("'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data."), call. = FALSE) } return(data) } diff --git a/tests/testthat/_snaps/convert_data_to_numeric.md b/tests/testthat/_snaps/data_to_numeric.md similarity index 82% rename from tests/testthat/_snaps/convert_data_to_numeric.md rename to tests/testthat/_snaps/data_to_numeric.md index 8ccb2e767..029f13c37 100644 --- a/tests/testthat/_snaps/convert_data_to_numeric.md +++ b/tests/testthat/_snaps/data_to_numeric.md @@ -1,7 +1,7 @@ # convert dataframe to numeric Code - convert_data_to_numeric(head(ToothGrowth)) + data_to_numeric(head(ToothGrowth)) Output len supp.OJ supp.VC dose 1 4.2 0 1 0.5 @@ -14,7 +14,7 @@ --- Code - convert_data_to_numeric(head(ToothGrowth), dummy_factors = FALSE) + data_to_numeric(head(ToothGrowth), dummy_factors = FALSE) Output len supp dose 1 4.2 2 0.5 @@ -27,7 +27,7 @@ # convert factor to numeric Code - convert_data_to_numeric(f) + data_to_numeric(f) Output a c i s t 1 0 0 0 1 0 diff --git a/tests/testthat/test-winsorization.R b/tests/testthat/test-winsorization.R index c618ad7c9..f6ebb23f0 100644 --- a/tests/testthat/test-winsorization.R +++ b/tests/testthat/test-winsorization.R @@ -17,6 +17,18 @@ test_that("winsorize: threshold must be between 0 and 1", { winsorize(sample(1:10, 5), threshold = 1.1), regexp = "must be a scalar between 0 and 0.5" ) + expect_warning( + winsorize(sample(1:10, 5), method = "zscore", threshold = -3), + regexp = "must be a scalar greater than 0" + ) + expect_warning( + winsorize(sample(1:10, 5), method = "zscore", threshold = -3, robust = TRUE), + regexp = "must be a scalar greater than 0" + ) + expect_warning( + winsorize(sample(1:10, 5), method = "raw", threshold = 1.1), + regexp = "must be of length 2 for lower and upper bound" + ) x <- sample(1:10, 5) suppressWarnings({ y <- winsorize(x, threshold = -0.1)