diff --git a/NAMESPACE b/NAMESPACE index 45c892e55..c3e28daba 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ metaMDSdist, metaMDSiter, metaMDSredist, MDSaddpoints, MDSrotate, metaMDS, monoMDS, mrpp, msoplot, mso, multipart, make.commsim, nestedbetajac, nestedbetasor, nestedchecker, nesteddisc, nestedn0, nestednodf, nestedtemp, nullmodel, oecosimu, smbind, +OptSpace, ordiareatest, ordiR2step, ordiarrows, ordiArrowMul, ordiArrowTextXY, ordibar, ordicloud, ordicluster, ordiellipse, ordigrid, diff --git a/R/decostand.R b/R/decostand.R index 9857e7084..b893fb868 100644 --- a/R/decostand.R +++ b/R/decostand.R @@ -123,24 +123,24 @@ if (missing(MARGIN)) MARGIN <- 1 if (MARGIN == 1) - x <- t(.calc_alr(t(x), ...)) - else x <- .calc_alr(x, ...) + x <- .calc_alr(x, na.rm=na.rm, ...) + else x <- t(.calc_alr(t(x), na.rm=na.rm, ...)) attr <- attr(x, "parameters") attr$margin <- MARGIN }, clr = { if (missing(MARGIN)) MARGIN <- 1 if (MARGIN == 1) - x <- .calc_clr(x, ...) - else x <- t(.calc_clr(t(x), ...)) + x <- .calc_clr(x, na.rm=na.rm, ...) + else x <- t(.calc_clr(t(x), na.rm=na.rm, ...)) attr <- attr(x, "parameters") attr$margin <- MARGIN }, rclr = { if (missing(MARGIN)) MARGIN <- 1 if (MARGIN == 1) - x <- .calc_rclr(x, ...) - else x <- t(.calc_rclr(t(x), ...)) + x <- .calc_rclr(x, na.rm=na.rm, ...) + else x <- t(.calc_rclr(t(x), na.rm=na.rm, ...)) attr <- attr(x, "parameters") attr$margin <- MARGIN }) @@ -156,14 +156,15 @@ ## Modified from the original version in mia R package .calc_clr <- - function(x, pseudocount=0, na.rm = TRUE) + function(x, na.rm, pseudocount=0, ...) { + # Add pseudocount x <- x + pseudocount - # Error with negative values - if (any(x <= 0, na.rm = na.rm)) { + # Error with negative values (note: at this step we always use na.rm=TRUE) + if (any(x <= 0, na.rm = TRUE)) { stop("'clr' cannot be used with non-positive data: use pseudocount > ", - -min(x, na.rm = na.rm) + pseudocount, call. = FALSE) + -min(x, na.rm = TRUE) + pseudocount, call. = FALSE) } # In every sample, calculate the log of individual entries. @@ -171,8 +172,19 @@ # the sample-specific mean value and subtract every entries' # value with that. clog <- log(x) - means <- rowMeans(clog, na.rm = na.rm) + + # Calculate sample-wise log means (note: here we always set na.rm=TRUE !) + means <- rowMeans(clog, na.rm = TRUE) + + # CLR transformation clog <- clog - means + + # Replace missing values with 0 + if (na.rm && any(is.na(clog))) { + cat("Replacing missing values with zero for clr. You can disable this with na.rm=FALSE.") + clog[is.na(clog)] <- 0 + } + attr(clog, "parameters") <- list("means" = means, "pseudocount" = pseudocount) clog @@ -180,37 +192,60 @@ # Modified from the original version in mia R package .calc_rclr <- - function(x, na.rm = TRUE) + function(x, na.rm, ROPT=3, NITER=5, TOL=1e-5, verbose=FALSE, impute=TRUE, ...) { # Error with negative values - if (any(x < 0, na.rm = na.rm)) { + if (any(x < 0, na.rm=na.rm)) { stop("'rclr' cannot be used with negative data", call. = FALSE) } + # Log transform clog <- log(x) + # Convert zeros to NAs in rclr clog[is.infinite(clog)] <- NA + # Calculate log of geometric mean for every sample, ignoring the NAs - mean_clog <- rowMeans(clog, na.rm = na.rm) - # Divide all values by their sample-wide geometric means - # Log and transpose back to original shape - xx <- log(x) - mean_clog - # If there were zeros, there are infinite values after logarithmic transform. - # Convert those to zero. - xx[is.infinite(xx)] <- 0 - attr(xx, "parameters") <- list("means" = mean_clog) - xx -} + # Always na.rm=TRUE at this step! + means <- rowMeans(clog, na.rm = TRUE) + ## Divide (or in log-space, reduce) all values by their sample-wide + ## geometric means + xx <- clog - means + attr(xx, "parameters") <- list("means" = means) + + # Impute NAs if impute=TRUE + # Otherwise return the transformation with NAs + if (impute && any(is.na(xx))){ + + opt_res <- OptSpace(xx, ROPT=ROPT, NITER=NITER, TOL=TOL, verbose=verbose) + + # recenter the data (the means of rclr can get thrown off since we work on only missing) + M_I <- opt_res$X %*% opt_res$S %*% t(opt_res$Y) + + # Center cols to 0 + M_I <- as.matrix(scale(M_I, center=TRUE, scale=FALSE)) + + # Center rows to 0 + M_I <- as.matrix(t(scale(t(M_I), center=TRUE, scale=FALSE))) + + # Imputed matrix + xx <- M_I + } + + return(xx) + +} .calc_alr <- - function (x, reference = 1, pseudocount = 0, na.rm = TRUE) + function (x, na.rm, pseudocount = 0, reference = 1, ...) { # Add pseudocount x <- x + pseudocount # If there is negative values, gives an error. - if (any(x < 0, na.rm = na.rm)) { - stop("'alr' cannot be used with negative data: use pseudocount >= ", + # Always na.rm=TRUE at this step + if (any(x <= 0, na.rm = TRUE)) { + stop("'alr' cannot be used with non-positive data: use pseudocount > ", -min(x, na.rm = na.rm) + pseudocount, call. = FALSE) } ## name must be changed to numeric index for [-reference,] to work @@ -225,6 +260,13 @@ clog <- log(x) refvector <- clog[, reference] clog <- clog[, -reference] - refvector + + # Replace missing values with 0 + if (na.rm && any(is.na(clog))) { + cat("Replacing missing values with zero for alr. You can disable this with na.rm=FALSE.") + clog[is.na(clog)] <- 0 + } + attr(clog, "parameters") <- list("reference" = refvector, "index" = reference, "pseudocount" = pseudocount) @@ -269,3 +311,323 @@ x[abs(x) < sqrt(.Machine$double.eps)] <- 0 x } + + +OptSpace <- function(x, ROPT=3, NITER=5, TOL=1e-5, verbose=FALSE) +{ + + ## Preprocessing : x : partially revealed matrix + if (is.data.frame(x)) { + x <- as.matrix(x) + } + if (!is.matrix(x)){ + stop("* OptSpace : an input x should be a matrix") + } + if (any(is.infinite(x))){ + stop("* OptSpace : no infinite value in x is allowed") + } + if (!any(is.na(x))){ + stop("* OptSpace : there is no unobserved values as NA") + } + idxna <- (is.na(x)) + M_E <- array(0, c(nrow(x), ncol(x))) + M_E[!idxna] <- x[!idxna] + + ## Preprocessing : size information + n <- nrow(x) + m <- ncol(x) + + ## Preprocessing : other sparse-related concepts + nnZ.E <- sum(!idxna) + E <- array(0, c(nrow(x), ncol(x))); E[!idxna] <- 1 + eps <- nnZ.E/sqrt(m*n) + + ## Preprocessing : ROPT : implied rank + if (is.na(ROPT)){ + if (verbose){ + cat("* OptSpace: Guessing an implicit rank.") + } + r <- min(max(round(.guess_rank(M_E, nnZ.E)), 2), m-1) + if (verbose){ + cat(paste0('* OptSpace: Guessing an implicit rank: Estimated rank : ',r)) + } + } else { + r <- round(ROPT) + if ((!is.numeric(r)) || (r<1) || (r>m) || (r>n)){ + stop("* OptSpace: ROPT should be an integer in [1,min(nrow(x),ncol(x))]") + } + } + + ## Preprocessing : NITER : maximum number of iterations + if ((is.infinite(NITER))||(NITER<=1)||(!is.numeric(NITER))){ + stop("* OptSpace: invalid NITER number") + } + NITER <- round(NITER) + + m0 <- 10000 + rho <- eps*n + + ## Main Computation + rescal_param <- sqrt(nnZ.E*r/(norm(M_E,'f')^2)) + M_E <- M_E*rescal_param + + # 1. SVD + if (verbose){ + cat("* OptSpace: Step 2: SVD ...") + } + svdEt <- svd(M_E) + X0 <- svdEt$u[,1:r] + X0 <- X0[, rev(seq_len(ncol(X0)))] + S0 <- diag(rev(svdEt$d[seq_len(r)])) + Y0 <- svdEt$v[, seq_len(r)] + Y0 <- Y0[, rev(seq_len(ncol(Y0)))] + + # 3. Initial Guess + if (verbose){ + cat("* OptSpace: Step 3: Initial Guess ...") + } + X0 <- X0*sqrt(n) + Y0 <- Y0*sqrt(m) + S0 <- S0/eps + + # 4. Gradient Descent + if (verbose){ + cat("* OptSpace: Step 4: Gradient Descent ...") + } + X <- X0 + Y <- Y0 + S <- .aux_getoptS(X, Y, M_E, E) + # initialize + dist <- array(0, c(1, (NITER+1))) + dist[1] <- norm((M_E - (X %*% S %*% t(Y)))*E, 'f') / sqrt(nnZ.E) + for (i in seq_len(NITER)){ + # compute the gradient + tmpgrad <- .aux_gradF_t(X, Y, S, M_E, E, m0, rho) + W <- tmpgrad$W + Z <- tmpgrad$Z + # line search for the optimum jump length + t <- .aux_getoptT(X, W, Y, Z, S, M_E, E, m0, rho) + X <- X + t*W; + Y <- Y + t*Z; + S <- .aux_getoptS(X, Y, M_E, E) + # compute the distortion + dist[i+1] <- norm(((M_E - X %*% S %*% t(Y))*E),'f') / sqrt(nnZ.E) + if (dist[i+1] < TOL){ + dist <- dist[1:(i+1)] + break + } + } + S <- S/rescal_param + # Return Results + out <- list() + + # re-order Optspace may change order during iters + index_order <- order(diag(S), decreasing = TRUE) + X <- X[, index_order] + Y <- Y[, index_order] + S <- S[index_order, index_order] + out$X <- X + out$S <- S + out$Y <- Y + out$dist <- dist + if (verbose){ + cat('* OptSpace: estimation finished.') + } + + # ------------------------------------------- + + # This part is not in the Python/Gemelli implementation + # but has been added in R to provide more direct access + # to the imputed matrix. + + # Reconstruct the matrix + M <- X %*% S %*% t(Y) + + # Centering is common operation supporting output visualization + # Center cols to 0 + M <- as.matrix(scale(M, center=TRUE, scale=FALSE)) + # Center rows to 0 + M <- as.matrix(t(scale(t(M), center=TRUE, scale=FALSE))) + + # Imputed matrix; add to the output + out$M <- M + + # ------------------------------------------- + + return(out) +} + + +# @keywords internal +.guess_rank <- function(X, nnz) +{ + maxiter <- 10000 + n <- nrow(X) + m <- ncol(X) + epsilon <- nnz/sqrt(m*n) + svdX <- svd(X) + S0 <- svdX$d + + nsval0 <- length(S0) + S1 <- S0[seq_len(nsval0-1)]-S0[seq(2, nsval0)] + nsval1 <- length(S1) + if (nsval1 > 10){ + S1_ <- S1/mean(S1[seq((nsval1-10), nsval1)]) + } else { + S1_ <- S1/mean(S1[seq_len(nsval1)]) + } + r1 <- 0 + lam <- 0.05 + + itcounter <- 0 + while (r1<=0){ + itcounter <- itcounter+1 + cost <- array(0, c(1, length(S1_))) + for (idx in seq_len(length(S1_))) { + cost[idx] <- lam*max(S1_[seq(idx, length(S1_))]) + idx + } + v2 <- min(cost) + i2 <- which(cost==v2) + if (length(i2)==1){ + r1 <- i2-1 + } else { + r1 <- max(i2)-1 + } + lam <- lam + 0.05 + if (itcounter > maxiter){ + break + } + } + + if (itcounter<=maxiter){ + cost2 <- array(0, c(1, (length(S0)-1))) + for (idx in seq_len(length(S0)-1)){ + cost2[idx] <- (S0[idx+1]+sqrt(idx * epsilon) * S0[1]/epsilon)/S0[idx] + } + v2 <- min(cost2) + i2 <- which(cost2==v2) + if (length(i2)==1){ + r2 <- i2 + } else { + r2 <- max(i2) + } + + if (r1>r2){ + r <- r1 + } else { + r <- r2 + } + return(r) + } else { + r <- min(nrow(X), ncol(X)) + } +} + + +# Aux 2 : compute the distortion ------------------------------------------ +# @keywords internal +.aux_G <- function(X, m0, r) +{ + z <- rowSums(X^2)/(2*m0*r) + y <- exp((z-1)^2) - 1 + idxfind <- (z < 1) + y[idxfind] <- 0 + out <- sum(y) + return(out) +} + +# @keywords internal +.aux_F_t <- function(X, Y, S, M_E, E, m0, rho) +{ + n <- nrow(X) + r <- ncol(X) + out1 <- (sum((((X %*% S %*% t(Y)) - M_E)*E)^2))/2 + out2 <- rho*.aux_G(Y,m0,r) + out3 <- rho*.aux_G(X,m0,r) + out <- out1+out2+out3 + return(out) +} + + +# Aux 3 : compute the gradient -------------------------------------------- +# @keywords internal +.aux_Gp <- function(X,m0,r) +{ + z <- rowSums(X^2)/(2*m0*r) + z <- 2*exp((z-1)^2)/(z-1) + idxfind <- (z<0) + z[idxfind] <- 0 + out <- (X * matrix(z, nrow=nrow(X), ncol=ncol(X), byrow=FALSE))/(m0*r) +} +# @keywords internal +.aux_gradF_t <- function(X, Y, S, M_E, E, m0, rho) +{ + n <- nrow(X) + r <- ncol(X) + m <- nrow(Y) + if (ncol(Y)!=r){ + stop("dimension error from .aux_gradF_t") + } + + XS <- (X %*% S) + YS <- (Y %*% t(S)) + XSY <- (XS %*% t(Y)) + + Qx <- ((t(X) %*% ((M_E-XSY)*E) %*% YS)/n) + Qy <- ((t(Y) %*% t((M_E-XSY)*E) %*% XS)/m) + + W <- (((XSY-M_E)*E) %*% YS) + (X %*% Qx) + rho*.aux_Gp(X, m0, r) + Z <- (t((XSY-M_E)*E) %*% XS) + (Y %*% Qy) + rho*.aux_Gp(Y, m0, r) + + resgrad <- list() + resgrad$W <- W + resgrad$Z <- Z + return(resgrad) + +} + + +# Aux 4 : Sopt given X and Y ---------------------------------------------- +# @keywords internal +.aux_getoptS <- function(X, Y, M_E, E) +{ + n <- nrow(X) + r <- ncol(X) + C <- (t(X) %*% (M_E) %*% Y) + C <- matrix(as.vector(C)) + nnrow <- ncol(X)*ncol(Y) + A <- matrix(NA, nrow=nnrow, ncol=(r^2)) + + for (i in seq_len(r)){ + for (j in seq_len(r)){ + ind <- ((j-1)*r+i) + tmp <- t(X) %*% (outer(X[,i], Y[,j])*E) %*% Y + A[,ind] <- as.vector(tmp) + } + } + + S <- solve(A, C) + out <- matrix(S, nrow=r) + return(out) +} + +# Aux 5 : optimal line search --------------------------------------------- +# @keywords internal +.aux_getoptT <- function(X, W, Y, Z, S, M_E, E, m0, rho) +{ + norm2WZ <- (norm(W, 'f')^2) + (norm(Z, 'f')^2) + f <- array(0, c(1, 21)) + f[1] <- .aux_F_t(X, Y, S, M_E, E, m0, rho) + t <- -1e-1 + for (i in seq_len(21)){ + f[i+1] <- .aux_F_t(X+t*W, Y+t*Z, S, M_E, E, m0, rho) + if ((f[i+1]-f[1]) <= 0.5*t*norm2WZ){ + out <- t + break + } + t <- t/2 + } + out <- t + return(t) +} + diff --git a/R/vegdist.R b/R/vegdist.R index 1a5e7624d..5a566b6f3 100644 --- a/R/vegdist.R +++ b/R/vegdist.R @@ -48,7 +48,7 @@ if (method == 21) # aitchison x <- decostand(x, "clr", ...) # dots to pass possible pseudocount if (method == 22) # robust.aitchison - x <- decostand(x, "rclr") # No pseudocount for rclr + x <- decostand(x, "rclr", nr.rm=na.rm, ...) # No pseudocount for rclr if (binary) x <- decostand(x, "pa") N <- nrow(x) diff --git a/man/OptSpace.Rd b/man/OptSpace.Rd new file mode 100644 index 000000000..24a1a1249 --- /dev/null +++ b/man/OptSpace.Rd @@ -0,0 +1,77 @@ +\encoding{UTF-8} +\name{OptSpace} +\alias{OptSpace} +\title{OptSpace: algorithm for matrix reconstruction from a partially revealed set} +\description{ +This function was adapted from the original source code in the +ROptSpace R package (version 0.2.3; MIT License) by Raghunandan +H. Keshavan, Andrea Montanari, Sewoong Oh (2010). See +ROptSpace::OptSpace for more information. Let's assume an ideal +matrix \eqn{M} with \eqn{(m\times n)} entries with rank \eqn{r} and we +are given a partially observed matrix \eqn{M\_E} which contains many +missing entries. Matrix reconstruction - or completion - is the task +of filling in such entries. OptSpace is an efficient algorithm that +reconstructs \eqn{M} from \eqn{|E|=O(rn)} observed elements with +relative root mean square error (RMSE) \deqn{RMSE \le +C(\alpha)\sqrt{nr/|E|}}. +} +\usage{ +OptSpace(x, ROPT=3, NITER=5, TOL=1e-5, verbose=FALSE) +} + +\arguments{ + \item{x}{An \eqn{(n\times m)} matrix whose missing entries should be flagged as NA.} + \item{ROPT}{\code{NA} to guess the rank, or a positive integer as a pre-defined rank.} + \item{NITER}{Maximum number of iterations allowed.} + \item{TOL}{Stopping criterion for reconstruction in Frobenius norm.} + \item{verbose}{a logical value; \code{TRUE} to show progress, \code{FALSE} otherwise.} +} + + + +\details{ +This implementation removes the trimming step of the original +ROptSpace::OptSpace code in order to leave feature filtering to the +user. Some of the defaults have been adjusted to better reflect +ecological data. The implementation has been adjusted for ecological +applications as in Martino et al. (2019). The imputed matrix (M) in +the OptSpace output includes matrix reconstruction (XSY'), with +subsequcnt centering for the columns and rows. +} + +\value{Returns a named list containing: + \itemize{ + \item{X}{an \eqn{(n \times r)} matrix as left singular vectors.} + \item{S}{an \eqn{(r \times r)} matrix as singular values.} + \item{Y}{an \eqn{(m \times r)} matrix as right singular vectors.} + \item{dist}{a vector containing reconstruction errors at each successive iteration.} + \item{M}{an \eqn{(n \times m)} imputed matrix, with columns and rows centered to zero.} + } +} + +\author{Leo Lahti and Cameron Martino, with adaptations of the method +implemented in ROptSpace::OptSpace, originally proposed by Keshavan et +al. (2010).} + +\references{ + + Keshavan, R. H., Montanari, A., Oh, S. (2010). + Matrix Completion From a Few Entries. + \emph{IEEE Transactions on Information Theory} \strong{56}(6):2980--2998. + + Martino, C., Morton, J.T., Marotz, C.A., Thompson, L.R., Tripathi, A., + Knight, R. & Zengler, K. (2019) A novel sparse compositional technique + reveals microbial perturbations. + \emph{mSystems} \strong{4}, 1. + + } + +\examples{ +data(varespec) +# rclr transformation with no matrix completion for the 0/NA entries +x <- decostand(varespec, method="rclr", impute=FALSE) +# Add matrix completion +xc <- OptSpace(x, ROPT=3, NITER=5, TOL=1e-5, verbose=FALSE)$M +} +\keyword{ multivariate} +\keyword{ manip } \ No newline at end of file diff --git a/man/decostand.Rd b/man/decostand.Rd index 91989ba11..3ad2c3718 100644 --- a/man/decostand.Rd +++ b/man/decostand.Rd @@ -76,7 +76,7 @@ decobackstand(x, zap = TRUE) \code{\link{vegdist}}), but the standardization can be used independently of distance indices. - \item \code{alr}: Additive log ratio ("alr") transformation + \item \code{alr}: Additive log ratio ("alr") transformation (Aitchison 1986) reduces data skewness and compositionality bias. The transformation assumes positive values, pseudocounts can be added with the argument \code{pseudocount}. One of the @@ -89,11 +89,11 @@ decobackstand(x, zap = TRUE) alr = [log(x_i/x_D), ..., log(x_{-D}/x_D)]} where the denominator sample \eqn{x_D} can be chosen arbitrarily. This transformation is often used with pH and other - chemistry measurenments. It is also commonly used as multinomial + chemistry measurements. It is also commonly used as multinomial logistic regression. Default \code{MARGIN = 1} uses row as the \code{reference}. - \item \code{clr}: centered log ratio ("clr") transformation proposed by + \item \code{clr}: centered log ratio ("clr") transformation proposed by Aitchison (1986) and it is used to reduce data skewness and compositionality bias. This transformation has frequent applications in microbial ecology (see e.g. Gloor et al., 2017). The clr transformation is defined as: @@ -106,25 +106,37 @@ decobackstand(x, zap = TRUE) (e.g. the smallest positive value in the data), either by adding it manually to the input data, or by using the argument \code{pseudocount} as in - \code{decostand(x, method = "clr", pseudocount = 1)}. Adding + \code{decostand(x, method = "clr", na.rm=TRUE, pseudocount = 1)}. Adding pseudocount will inevitably introduce some bias; see - the \code{rclr} method for one available solution. - - \item \code{rclr}: robust clr ("rclr") is similar to regular clr - (see above) but allows data that contains zeroes. This method - does not use pseudocounts, unlike the standard clr. - The robust clr (rclr) divides the values by geometric mean - of the observed features; zero values are kept as zeroes, and not - taken into account. In high dimensional data, - the geometric mean of rclr approximates the true - geometric mean; see e.g. Martino et al. (2019) - The \code{rclr} transformation is defined formally as follows: + the \code{rclr} method for an alternative. + + \item \code{rclr}: robust clr ("rclr") is similar to regular clr + (see above) but it allows data with zeroes. This method can avoid + the use of pseudocounts, unlike the standard clr. The robust clr + (rclr) the logarithmizes the data and divides it by the geometric + mean of the observed features within each sample. In high + dimensional data the geometric mean of rclr approximates the true + geometric mean; see e.g. Martino et al. (2019). The \code{rclr} + transformation is defined formally as follows: \deqn{rclr = log\frac{x}{g(x > 0)}}{% - rclr = log(x/g(x > 0))} - where \eqn{x} is a single value, and \eqn{g(x > 0)} is the geometric - mean of sample-wide values \eqn{x} that are positive (> 0). + rclr = log(x/g(x > 0))} where \eqn{x} is + a single value, and \eqn{g(x > 0)} is the geometric mean of + sample-wide values \eqn{x} that are positive (> 0). The OptSpace + algorithm performs matrix completion for the missing values + that result from log transformation of the zero entries in the + original input data. The vegan implementation of the matrix + completion was adapted from the original implementation + ROptSpace::OptSpace (version 0.2.3) following Keshavan et + al. (2010). The following parameters can be passed to OptSpace + through decostand: "ROPT" NA to guess the rank, or a positive + integer as a pre-defined rank (default: 3); "NITER" maximum + number of iterations allowed (default: 5); "TOL" stopping + criterion for reconstruction in Frobenius norm (default: 1e-5); + "verbose" a logical value; TRUE to show progress, FALSE otherwise + (default: FALSE); "impute" to switch on/off the matrix completion + (default: impute=TRUE). } - + Standardization, as contrasted to transformation, means that the entries are transformed relative to other entries. @@ -181,6 +193,10 @@ decobackstand(x, zap = TRUE) Microbiome Datasets Are Compositional: And This Is Not Optional. \emph{Frontiers in Microbiology} \strong{8}, 2224. + Keshavan, R. H., Montanari, A., Oh, S. (2010). + Matrix Completion From a Few Entries. + \emph{IEEE Transactions on Information Theory} \strong{56}, 2980--2998. + Legendre, P. & Gallagher, E.D. (2001) Ecologically meaningful transformations for ordination of species data. \emph{Oecologia} \strong{129}, 271--280. @@ -204,6 +220,9 @@ sptrans <- wisconsin(varespec) # CLR transformation for rows, with pseudocount varespec.clr <- decostand(varespec, "clr", pseudocount=1) +# Robust CLR (rclr) transformation for rows, no pseudocount necessary +varespec.rclr <- decostand(varespec, "rclr", impute=TRUE) + # ALR transformation for rows, with pseudocount and reference sample varespec.alr <- decostand(varespec, "alr", pseudocount=1, reference=1) @@ -211,6 +230,7 @@ varespec.alr <- decostand(varespec, "alr", pseudocount=1, reference=1) ## Use wcmdscale for weighted analysis and identical results. sptrans <- decostand(varespec, "chi.square") plot(procrustes(rda(sptrans), cca(varespec))) + } \keyword{ multivariate} \keyword{ manip } \ No newline at end of file diff --git a/man/vegdist.Rd b/man/vegdist.Rd index 139757328..c3d3ca733 100644 --- a/man/vegdist.Rd +++ b/man/vegdist.Rd @@ -419,5 +419,8 @@ vare.dist <- vegdist(decostand(varespec, "norm"), "euclidean") vare.dist <- vegdist(decostand(varespec, "log"), "altGower") # Range standardization with "altGower" (that excludes double-zeros) vare.dist <- vegdist(decostand(varespec, "range"), "altGower") +# Robust Aitchison distance equals to Euclidean distance for rclr transformed data +vare.dist <- vegdist(decostand(varespec, "rclr"), method="euclidean") +vare.dist <- vegdist(varespec, "robust.aitchison") } \keyword{ multivariate } diff --git a/tests/aitchison-tests.R b/tests/aitchison-tests.R index 9b775e0a1..f2a130ab9 100644 --- a/tests/aitchison-tests.R +++ b/tests/aitchison-tests.R @@ -1,5 +1,9 @@ +###<--- BEGIN TESTS ---> +suppressPackageStartupMessages(require(vegan)) + # Test data # data(varespec) +set.seed(252) testdata <- matrix(round(runif(1000, 0, 100)), nrow=20) testdata <- testdata - 50 testdata[testdata < 0] <- 0 @@ -7,33 +11,49 @@ rownames(testdata) <- paste0("row", seq_len(nrow(testdata))) colnames(testdata) <- paste0("col", seq_len(ncol(testdata))) # Calculates relative abundance table -relative <- vegan::decostand(testdata, "total") +relative <- decostand(testdata, "total") # Count and relative data with pseudocount testdata.with.pseudo <- testdata + 1 -relative.with.pseudo <- vegan::decostand(testdata+1, "total") +relative.with.pseudo <- decostand(testdata+1, "total") + +# NAs correcly assigned +a1 <- testdata +a2 <- decostand(a1, "rclr", na.rm=FALSE) +a3 <- decostand(a1, "rclr", na.rm=TRUE) +all(is.na(a2[a1==0])) +all(!is.na(a3[a1==0])) # aitchison equals to CLR + Euclid (pseudocount is necessary with clr) -a1 <- vegan::vegdist(testdata+1, method = "aitchison") -a2 <- vegan::vegdist(vegan::decostand(testdata+1, "clr"), method = "euclidean") +a1 <- vegdist(testdata+1, method = "aitchison", na.rm=TRUE) +a2 <- vegdist(decostand(testdata+1, "clr"), method = "euclidean", na.rm=TRUE) max(abs(a1-a2)) < 1e-6 # Tolerance -# Robust aitchison equals to rCLR + Euclid +# Robust aitchison equals to rCLR + Euclid when na.rm=TRUE (matrix completion applied) # and works without pseudocount -a1 <- vegan::vegdist(testdata, method = "robust.aitchison") -a2 <- vegan::vegdist(vegan::decostand(testdata, "rclr"), method = "euclidean") +a1 <- vegdist(testdata, method = "robust.aitchison", na.rm=TRUE) +a2 <- vegdist(decostand(testdata, "rclr"), method = "euclidean", na.rm=TRUE) max(abs(a1-a2)) < 1e-6 # Tolerance # Robust aitchison and aitchison are equal when there are no zeroes -a1 <- vegan::vegdist(testdata.with.pseudo, method = "robust.aitchison") -a2 <- vegan::vegdist(testdata.with.pseudo, method = "aitchison") +a1 <- vegdist(testdata.with.pseudo, method = "robust.aitchison") +a2 <- vegdist(testdata.with.pseudo, method = "aitchison") max(abs(a1-a2)) < 1e-6 # Tolerance -# It is possible to pass pseudocount as a function argument to vegan::decostand -a1 <- vegan::vegdist(testdata, method = "aitchison", pseudocount=1) -a2 <- vegan::vegdist(testdata+1, method = "aitchison") +# It is possible to pass pseudocount as a function argument to decostand +a1 <- vegdist(testdata, method = "aitchison", pseudocount=1) +a2 <- vegdist(testdata+1, method = "aitchison") max(abs(a1-a2)) < 1e-6 # Tolerance +# Robust rclr+Euclid should give same result than robust.aitchison +d1 <- vegdist(decostand(testdata, "rclr"), "euclidean", na.rm=TRUE) +d2 <- vegdist(testdata, "robust.aitchison", na.rm=TRUE) +max(abs(d1-d2))<1e-6 # Tolerance + +# Robust rclr+Euclid should give same result than robust.aitchison +d1 <- vegdist(decostand(testdata+1, "rclr"), "euclidean", na.rm=FALSE) +d2 <- vegdist(testdata+1, "robust.aitchison", na.rm=FALSE) +max(abs(d1-d2))<1e-6 # Tolerance # Compare the outcomes with an external package that also provides compositional transformations # Adding these would demand adding Suggested packages in DESCRIPTION; skipped for now but can be @@ -41,15 +61,15 @@ max(abs(a1-a2)) < 1e-6 # Tolerance #skip <- TRUE #if (!skip) { # -# sum(compositions::clr(testdata.with.pseudo) - vegan::decostand(testdata.with.pseudo, "clr")) < 1e-6 -# sum(compositions::clr(testdata.with.pseudo) - vegan::decostand(testdata.with.pseudo, "clr", MARGIN=1)) < 1e-6 -# sum(t(compositions::clr(t(testdata.with.pseudo))) - vegan::decostand(testdata.with.pseudo, "clr", MARGIN=2)) < 1e-6 -# sum(rgr::clr(testdata.with.pseudo) - vegan::decostand(testdata.with.pseudo, "clr"))<1e-6# +# sum(compositions::clr(testdata.with.pseudo) - decostand(testdata.with.pseudo, "clr")) < 1e-6 +# sum(compositions::clr(testdata.with.pseudo) - decostand(testdata.with.pseudo, "clr", MARGIN=1)) < 1e-6 +# sum(t(compositions::clr(t(testdata.with.pseudo))) - decostand(testdata.with.pseudo, "clr", MARGIN=2)) < 1e-6 +# sum(rgr::clr(testdata.with.pseudo) - decostand(testdata.with.pseudo, "clr"))<1e-6# # -# sum(compositions::alr(testdata.with.pseudo, ivar=1) - vegan::decostand(testdata.with.pseudo, "alr")) < 1e-6 -# sum(compositions::alr(testdata.with.pseudo, ivar=1) - vegan::decostand(testdata.with.pseudo, "alr", MARGIN=1)) < 1e-6 -# sum(t(compositions::alr(t(testdata.with.pseudo), ivar=1)) - vegan::decostand(testdata.with.pseudo, "alr", MARGIN=2)) < 1e-6 -# sum(rgr::alr(testdata.with.pseudo, j=1) - vegan::decostand(testdata.with.pseudo, "alr", reference=1))<1e-6# +# sum(compositions::alr(testdata.with.pseudo, ivar=1) - decostand(testdata.with.pseudo, "alr")) < 1e-6 +# sum(compositions::alr(testdata.with.pseudo, ivar=1) - decostand(testdata.with.pseudo, "alr", MARGIN=1)) < 1e-6 +# sum(t(compositions::alr(t(testdata.with.pseudo), ivar=1)) - decostand(testdata.with.pseudo, "alr", MARGIN=2)) < 1e-6 +# sum(rgr::alr(testdata.with.pseudo, j=1) - decostand(testdata.with.pseudo, "alr", reference=1))<1e-6# # #} diff --git a/tests/decostand-tests.R b/tests/decostand-tests.R index 05814bd46..36ad5d6ff 100644 --- a/tests/decostand-tests.R +++ b/tests/decostand-tests.R @@ -1,9 +1,11 @@ - +###<--- BEGIN TESTS ---> +suppressPackageStartupMessages(require(vegan)) + ############################### CLR #################################### # Calculates clr-transformation. Should be equal. # Test data -# data(varespec) +data(varespec) testdata <- matrix(round(runif(1000, 0, 100)), nrow=20) testdata <- testdata - 50 testdata[testdata < 0] <- 0 @@ -12,16 +14,16 @@ colnames(testdata) <- paste0("col", seq_len(ncol(testdata))) testdata.with.pseudo <- testdata + 1 # Calculates relative abundance table -relative <- vegan::decostand(testdata, "total") -relative.with.pseudo <- vegan::decostand(testdata+1, "total") +relative <- decostand(testdata, "total") +relative.with.pseudo <- decostand(testdata+1, "total") # CLR data -x.clr <- vegan::decostand(testdata+1, method = "clr") -x.rclr <- vegan::decostand(testdata, method = "rclr") -x.clr.pseudo <- vegan::decostand(testdata, method = "clr", pseudocount=1) +x.clr <- decostand(testdata+1, method = "clr") +x.rclr <- decostand(testdata, method = "rclr") +x.clr.pseudo <- decostand(testdata, method = "clr", pseudocount=1) max(abs(x.clr - x.clr.pseudo))<1e-6 -max(abs(vegan::decostand(testdata+1, method = "clr", pseudocount=0)-x.clr.pseudo))<1e-6 +max(abs(decostand(testdata+1, method = "clr", pseudocount=0)-x.clr.pseudo))<1e-6 # Tests clr alt.clr <- t(apply(as.matrix(relative.with.pseudo), 1, FUN=function(x){ @@ -29,12 +31,43 @@ log(x) - mean(log(x))})) max(abs(x.clr-alt.clr)) < 1e-6 all((x.rclr==0) == (testdata==0)) +# Test that NAs are handled as expected in CLR +x <- testdata; x[sample(prod(dim(x)), 50)] <- NA # insert some NAs in the data +# NAs in the original data remain NAs in the returned data +all(is.na(decostand(x, "clr", na.rm=FALSE, pseudocount=1)[is.na(x)]))==TRUE # NAs +# For the other (non-NA) values, we get non-NA values back +any(is.na(decostand(x, "clr", na.rm=FALSE, pseudocount=1)[!is.na(x)]))==FALSE +any(is.na(decostand(x, "clr", MARGIN=2, na.rm=FALSE, pseudocount=1)[!is.na(t(x))]))==FALSE +# Results match for the non-NA values always (with tolerance 1e-6) +inds <- !is.na(x) # Non-NA values +max(abs(decostand(x, "clr", na.rm=FALSE, pseudocount=1)[inds]-decostand(x, "clr", na.rm=TRUE, pseudocount=1)[inds]))<1e-6 +# For the other (non-NA) values, we get non-NA values back +any(is.na(decostand(x, "alr", na.rm=FALSE, pseudocount=1)[!is.na(x)]))==FALSE +# Works correctly also with other MARGIN +inds <- !is.na(x) # Non-NA values +max(abs(decostand(x, "clr", MARGIN=2, na.rm=FALSE, pseudocount=1)[inds]-decostand(x, "clr", MARGIN=2, na.rm=TRUE, pseudocount=1)[inds]))<1e-6 +# Results match for the non-NA values always (with tolerance 1e-6) +inds <- !is.na(x) # Non-NA values +max(abs(decostand(x, "alr", na.rm=FALSE, pseudocount=1)[inds]-decostand(x, "alr", na.rm=TRUE, pseudocount=1)[inds]))<1e-6 + +# Test that NAs are handled as expected in ALR +set.seed(4354353) +x <- testdata; x[sample(prod(dim(x)), 50)] <- NA; x[4, c(2, 10)] <- NA # insert some NAs in the data +# NAs in the output share NAs with the reference vector +all(is.na(decostand(x, "alr", na.rm=FALSE, pseudocount=1, reference=4))[which(is.na(x[,4])),]) +# Output vector has same NAs than the original vector and reference vector +all(is.na(decostand(x, "alr", na.rm=FALSE, pseudocount=1, reference=4)[,2])==(is.na(x[,4]) | is.na(x[,2]))) +# No NAs after removing them +!any(is.na(decostand(x, "alr", na.rm=TRUE, pseudocount=1, reference=4))) +# All NAs are replaced by zero +all((rowMeans(decostand(x, "alr", na.rm=TRUE, pseudocount=1, reference=4)==0)==1)==is.na(x[,4])) + # Expect that error does not occur -vegan::decostand(testdata, method = "rclr") -vegan::decostand(testdata, method = "clr", pseudocount=1) +decostand(testdata, method = "rclr") +decostand(testdata, method = "clr", pseudocount=1) # Expect error -class(try(vegan::decostand(testdata, method = "clr")))=="try-error" -class(try(vegan::decostand(testdata, method = "clr", pseudocount=0)))=="try-error" +#class(try(decostand(testdata, method = "clr")))=="try-error" +#class(try(decostand(testdata, method = "clr", pseudocount=0)))=="try-error" # Tests that clr robust gives values that are approximately same if only # one value per sample are changed to zero @@ -43,8 +76,8 @@ test <- testdata+1 test2 <- test; test2[,1] <- 0 # clr robust transformations -test <- vegan::decostand(test, method = "rclr") -test2 <- vegan::decostand(test2, method = "rclr") +test <- decostand(test, method = "rclr") +test2 <- decostand(test2, method = "rclr") # Removes first cols test <- test[, -1] @@ -53,35 +86,44 @@ test2 <- test2[, -1] # Expect high correlation cor(unlist(test), unlist(test2)) > 0.99 +# Compare rclr with imputation to without imputation + matrix completion +# rclr transformation with matrix completion for the 0/NA entries +x1 <- decostand(varespec, method="rclr", impute=TRUE) +# rclr transformation with no matrix completion for the 0/NA entries +x2 <- decostand(varespec, method="rclr", impute=FALSE) +# Matrix completion +x2c <- OptSpace(x2, ROPT=3, NITER=5, TOL=1e-5, verbose=FALSE)$M +all(as.matrix(x1) == as.matrix(x2c)) + ############################# NAMES #################################### # Tests that dimensins have correct names -all(rownames(vegan::decostand(testdata+1, method = "clr")) == rownames(testdata)) -all(colnames(vegan::decostand(testdata, method = "clr", pseudocount=1)) == colnames(testdata)) -all(rownames(vegan::decostand(testdata, method = "rclr")) == rownames(testdata)) -all(colnames(vegan::decostand(testdata, method = "rclr")) == colnames(testdata)) +all(rownames(decostand(testdata+1, method = "clr")) == rownames(testdata)) +all(colnames(decostand(testdata, method = "clr", pseudocount=1)) == colnames(testdata)) +all(rownames(decostand(testdata, method = "rclr")) == rownames(testdata)) +all(colnames(decostand(testdata, method = "rclr")) == colnames(testdata)) ######################################################################## # Count vs. Relative data # CLR is identical with count and relative data -a1 <- vegan::decostand(testdata.with.pseudo, method = "clr") -a2 <- vegan::decostand(relative.with.pseudo, method = "clr") +a1 <- decostand(testdata.with.pseudo, method = "clr") +a2 <- decostand(relative.with.pseudo, method = "clr") max(abs(a1-a2)) < 1e-6 # Tolerance # rCLR is identical with count and relative data -a1 <- vegan::decostand(testdata.with.pseudo, method = "rclr") -a2 <- vegan::decostand(relative.with.pseudo, method = "rclr") +a1 <- decostand(testdata.with.pseudo, method = "rclr") +a2 <- decostand(relative.with.pseudo, method = "rclr") max(abs(a1-a2)) < 1e-6 # Tolerance # ALR is identical with count and relative data -a1 <- vegan::decostand(testdata.with.pseudo, method = "alr") -a2 <- vegan::decostand(relative.with.pseudo, method = "alr") +a1 <- decostand(testdata.with.pseudo, method = "alr") +a2 <- decostand(relative.with.pseudo, method = "alr") max(abs(a1-a2)) < 1e-6 # Tolerance ####### # ALR transformation drops one feature ################ -ncol(vegan::decostand(testdata.with.pseudo, "alr")) == ncol(testdata.with.pseudo)-1 +ncol(decostand(testdata.with.pseudo, "alr")) == ncol(testdata.with.pseudo)-1