Skip to content

Commit

Permalink
feat: add "Q6" and "ckmeans" classification methods to mf_get_breaks()
Browse files Browse the repository at this point in the history
  • Loading branch information
rCarto committed Oct 14, 2024
1 parent d8ec643 commit 5dfcb89
Show file tree
Hide file tree
Showing 6 changed files with 173 additions and 114 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Imports:
grDevices
Suggests:
terra,
Ckmeans.1d.dp,
png,
jpeg,
lwgeom,
Expand Down
233 changes: 139 additions & 94 deletions R/mf_get_breaks.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
#' @title Get class intervals
#' @name mf_get_breaks
#' @description A function to classify continuous variables.
#'
#' This function is a wrapper for
#' \code{\link[classInt:classIntervals]{classIntervals}}
#' with some additional methods.
#'
#' @param x a vector of numeric values. NA and Inf values are not used in the
#' classification.
#' @param nbreaks a number of classes
#' @param breaks a classification method; one of "fixed", "sd", "equal",
#' "pretty", "quantile",
#' "kmeans", "hclust", "bclust", "fisher", "jenks", "dpih", "q6", "geom",
#' "arith", "em" or "msd" (see Details).
#' "pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks",
#' "dpih", "q6", "Q6", geom", "arith", "em", "msd" or "ckmeans" (see Details)
#' @param k number of standard deviation for "msd" method (see Details)
#' @param central creation of a central class for "msd" method (see Details)
#' @param ... further arguments
Expand All @@ -18,11 +22,27 @@
#' "bclust", "fisher", "jenks" and "dpih"
#' are \code{\link[classInt:classIntervals]{classIntervals}}
#' methods. You may need to pass additional arguments for some of them.\cr\cr
#' Jenks ("jenks" method) and Fisher ("fisher" method) algorithms are
#' based on the same principle and give
#' quite similar results but Fisher is much faster. \cr\cr
#'
#' The "jenks", "fisher" and "ckmeans" methods are based on the same concept of
#' **natural breaks** and and produce similar groupings.
#'
#' * The "jenks" method produces class boundaries falling on data points and is
#' slow.
#' * The "fisher" method produces class boundaries located more conveniently
#' between data points, and is faster than the "jenks" method.
#' * The "ckmeans" method produces exactly the same class boundaries as the
#' "fisher" method, but is much faster. It uses the optimal univariate
#' k-means method from the `Ckmeans.1d.dp` package.
#' If the "ckmeans" method is selected but the `Ckmeans.1d.dp` package is not
#' installed then the "fisher" method is used.
#'
#' The relative speeds of these three methods may vary depending on the number
#' of data points and the number of classes.\cr\cr
#'
#' The "q6" method uses the following \code{\link[stats:quantile]{quantile}}
#' probabilities: 0, 0.05, 0.275, 0.5, 0.725, 0.95, 1.\cr\cr
#' The "Q6" method uses the following \code{\link[stats:quantile]{quantile}}
#' probabilities: 0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.\cr\cr
#' The "geom" method is based on a geometric progression along
#' the variable values, all values must be strictly greater than zero.\cr\cr
#' The "arith" method is based on an arithmetic progression along
Expand All @@ -35,13 +55,11 @@
#' the extent of each class in share of standard deviation.
#' If \code{central=TRUE} then
#' the mean value is the center of a class else the mean is a break value.
#' @note This function is mainly a wrapper
#' of \code{\link[classInt:classIntervals]{classIntervals}} +
#' "arith", "em", "q6", "geom" and "msd" methods.
#' @examples
#' mtq <- mf_get_mtq()
#' mf_get_breaks(x = mtq$MED, nbreaks = 6, breaks = "quantile")
#' @return A numeric vector of breaks
#' @md
#' @export
mf_get_breaks <- function(x, nbreaks, breaks, k = 1, central = FALSE, ...) {
if (is.numeric(breaks)) {
Expand All @@ -51,115 +69,142 @@ mf_get_breaks <- function(x, nbreaks, breaks, k = 1, central = FALSE, ...) {

x <- as.vector(na.omit(x))
x <- x[is.finite(x)]
custom_methods <- c("geom", "arith", "q6", "em", "msd")
custom_methods <- c("geom", "arith", "q6", "Q6", "em", "msd", "ckmeans")

# default number of classes
if (missing(nbreaks)) {
nbreaks <- round(1 + 3.3 * log10(length(x)), 0)
}

if (breaks == "ckmeans") {
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
warning(
paste0(
"The 'Ckmeans.1d.dp' package is needed for ",
"this classification method. Please install it.\n",
"The 'fisher' method will be used instead."
),
call. = FALSE
)
breaks <- "fisher"
} else {
bks <- Ckmeans.1d.dp::Ckmeans.1d.dp(x = x, k = nbreaks)
intervals <- Ckmeans.1d.dp::ahist(
x = bks, style = "midpoints",
data = x, plot = FALSE
)$breaks
}
}

if (!breaks %in% custom_methods) {
intervals <- classInt::classIntervals(
var = x,
n = nbreaks,
style = breaks, ...
)$brks
} else {
if (breaks == "geom") {
intervals <- min(x)
if (intervals <= 0) {
stop(
paste0(
"All values must be strictly greater ",
"than 0 when using the 'geom' method."
),
call. = FALSE
)
}
intervals <- c(intervals, max(x))
r <- exp((log(max(x)) - log(min(x))) / nbreaks) # raison
tmp <- min(x)
for (i in 1:(nbreaks - 1)) {
intervals <- c(intervals, tmp * r)
tmp <- tmp * r
intervals <- sort(intervals)
}
}

if (breaks == "geom") {
intervals <- min(x)
if (intervals <= 0) {
stop(
paste0(
"All values must be strictly greater ",
"than 0 when using the 'geom' method."
),
call. = FALSE
)
}
if (breaks == "arith") {
intervals <- min(x)
intervals <- c(intervals, max(x))
r <- (max(x) - min(x)) / sum(1:nbreaks) # raison
tmp <- min(x)
for (i in 1:(nbreaks - 1)) {
intervals <- c(intervals, tmp + r)
tmp <- tmp + r
intervals <- sort(intervals)
}
intervals <- c(intervals, max(x))
r <- exp((log(max(x)) - log(min(x))) / nbreaks) # raison
tmp <- min(x)
for (i in 1:(nbreaks - 1)) {
intervals <- c(intervals, tmp * r)
tmp <- tmp * r
intervals <- sort(intervals)
}
if (breaks == "q6") {
intervals <- as.vector(quantile(x,
probs =
c(0, 5, 27.5, 50, 72.5, 95, 100) / 100
))
}

if (breaks == "arith") {
intervals <- min(x)
intervals <- c(intervals, max(x))
r <- (max(x) - min(x)) / sum(1:nbreaks) # raison
tmp <- min(x)
for (i in 1:(nbreaks - 1)) {
intervals <- c(intervals, tmp + r)
tmp <- tmp + r
intervals <- sort(intervals)
}
if (breaks == "em") {
t <- bitwAnd(nbreaks, (nbreaks - 1))
if (t != 0) {
stop("The number of classes must be a power of 2")
} else {
min_vec <- min(x)
max_vec <- max(x)
it <- log2(nbreaks)
}

int <- c(min_vec, max_vec)
if (breaks == "q6") {
intervals <- as.vector(
quantile(x = x, probs = c(0, .05, .275, .5, .725, .95, 1))
)
}

for (a in 1:it) {
valprv <- c()
for (i in 1:(length(int) - 1)) {
if (i == 1) {
sub_vec <- x[x >= int[i] & x <= int[i + 1]]
} else {
sub_vec <- x[x > int[i] & x <= int[i + 1]]
}
valprv <- c(valprv, mean(sub_vec))
if (breaks == "Q6") {
intervals <- as.vector(
quantile(x = x, probs = c(0, .05, .25, .5, .75, .95, 1))
)
}

if (breaks == "em") {
t <- bitwAnd(nbreaks, (nbreaks - 1))
if (t != 0) {
stop("The number of classes must be a power of 2")
} else {
min_vec <- min(x)
max_vec <- max(x)
it <- log2(nbreaks)

int <- c(min_vec, max_vec)

for (a in 1:it) {
valprv <- c()
for (i in 1:(length(int) - 1)) {
if (i == 1) {
sub_vec <- x[x >= int[i] & x <= int[i + 1]]
} else {
sub_vec <- x[x > int[i] & x <= int[i + 1]]
}
int <- c(int, valprv)
int <- int[order(int)]
valprv <- c(valprv, mean(sub_vec))
}
intervals <- int
int <- c(int, valprv)
int <- int[order(int)]
}
intervals <- int
}
if (breaks == "msd") {
min_vec <- min(x)
max_vec <- max(x)
avg_vec <- mean(x)
sd_vec <- sqrt(sum((x - avg_vec)^2) / length(x))

if (central == FALSE) {
pose <- ceiling((max_vec - avg_vec) / (sd_vec * k))
nege <- ceiling((avg_vec - min_vec) / (sd_vec * k))
}

if (breaks == "msd") {
min_vec <- min(x)
max_vec <- max(x)
avg_vec <- mean(x)
sd_vec <- sqrt(sum((x - avg_vec)^2) / length(x))
if (central == FALSE) {
pose <- ceiling((max_vec - avg_vec) / (sd_vec * k))
nege <- ceiling((avg_vec - min_vec) / (sd_vec * k))
avg_vec + (1:pose) * (sd_vec * k)
bks <- c(
avg_vec - (1:nege) * (sd_vec * k),
avg_vec,
avg_vec + (1:pose) * (sd_vec * k)
bks <- c(
avg_vec - (1:nege) * (sd_vec * k),
avg_vec,
avg_vec + (1:pose) * (sd_vec * k)
)
intervals <- c(min_vec, bks[bks > min_vec & bks < max_vec], max_vec)
} else {
pose <- ceiling((max_vec - (avg_vec + 0.5 * sd_vec * k)) / (sd_vec * k))
nege <- ceiling(((avg_vec - 0.5 * sd_vec * k) - min_vec) / (sd_vec * k))

bks <- c(
(avg_vec - 0.5 * sd_vec * k) - (1:nege) * (sd_vec * k),
(avg_vec - 0.5 * sd_vec * k),
(avg_vec + 0.5 * sd_vec * k),
(avg_vec + 0.5 * sd_vec * k) + (1:pose) * (sd_vec * k)
)
intervals <- c(min_vec, bks[bks > min_vec & bks < max_vec], max_vec)
}
intervals <- intervals[order(intervals)]
)
intervals <- c(min_vec, bks[bks > min_vec & bks < max_vec], max_vec)
} else {
pose <- ceiling((max_vec - (avg_vec + 0.5 * sd_vec * k)) / (sd_vec * k))
nege <- ceiling(((avg_vec - 0.5 * sd_vec * k) - min_vec) / (sd_vec * k))
bks <- c(
(avg_vec - 0.5 * sd_vec * k) - (1:nege) * (sd_vec * k),
(avg_vec - 0.5 * sd_vec * k),
(avg_vec + 0.5 * sd_vec * k),
(avg_vec + 0.5 * sd_vec * k) + (1:pose) * (sd_vec * k)
)
intervals <- c(min_vec, bks[bks > min_vec & bks < max_vec], max_vec)
}
intervals <- intervals[order(intervals)]
}

return(unique(intervals))
}
6 changes: 1 addition & 5 deletions R/mf_map.R
Original file line number Diff line number Diff line change
Expand Up @@ -158,14 +158,10 @@
#' add = TRUE)
#' }
#'
#' ## Breaks limits
#' ## Class boundaries
#' Breaks defined by a numeric vector or a classification method are
#' left-closed: breaks defined by \code{c(2, 5, 10, 15, 20)}
#' will be mapped as [2 - 5[, [5 - 10[, [10 - 15[, \[15 - 20].
#' The "jenks" method is an exception and has to be right-closed.
#' Jenks breaks computed as \code{c(2, 5, 10, 15, 20)}
#' will be mapped as \[2 - 5], ]5 - 10], ]10 - 15], ]15 - 20].
#'
#'
#'
#' @eval my_params(c(
Expand Down
5 changes: 5 additions & 0 deletions inst/tinytest/test_breaks.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,9 @@ expect_equal(mf_get_breaks(x = x, breaks = c(1, 10, 20, 100), k = 1),
c(1, 10, 20, 100))
expect_equal(mf_get_breaks(x = x, breaks = "fisher"),
c(11929, 13953, 15685.5, 17372, 18622, 20354.5, 21761))
expect_equal(mf_get_breaks(x = x, breaks = "ckmeans"),
c(11929, 13953, 15685.5, 17372, 18622, 20354.5, 21761))
expect_error(mf_get_breaks(x = c(0, x), breaks = "geom"))
expect_equal(mf_get_breaks(x = x, breaks = "Q6"),
c(11929, 13168.65, 14457.25, 15685.5, 17979.75, 20159.35, 21761))

37 changes: 26 additions & 11 deletions man/mf_get_breaks.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 5dfcb89

Please sign in to comment.