diff --git a/DESCRIPTION b/DESCRIPTION
index adfde7243..08d285d4a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: xcms
-Version: 2.99.5
-Date: 2017-07-14
+Version: 2.99.6
+Date: 2017-08-09
 Title: LC/MS and GC/MS Data Analysis
 Author: Colin A. Smith ,
     Ralf Tautenhahn ,
diff --git a/NAMESPACE b/NAMESPACE
index 9cbb81481..38863488d 100755
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -69,7 +69,8 @@ export(
     "xcmsRaw",
     "xcmsSet",
     "xcmsFragments",
-    "xcmsPapply"
+    "xcmsPapply",
+    "phenoDataFromPaths"
 )
 S3method(plot, xcmsEIC)
@@ -212,7 +213,8 @@ export(
     "highlightChromPeaks",
     "plotChromPeakDensity",
     "plotChromPeaks",
-    "plotChromPeakImage"
+    "plotChromPeakImage",
+    "isCalibrated"
 )
 ## New analysis methods
@@ -236,7 +238,8 @@ exportClasses(
     "PeakGroupsParam",
     "ObiwarpParam",
     "GenericParam",
-    "FillChromPeaksParam"
+    "FillChromPeaksParam",
+    "CalibrantMassParam"
 )
 ## Param methods
 exportMethods(
@@ -386,7 +389,7 @@ exportMethods(
 export("CentWaveParam", "MatchedFilterParam", "MassifquantParam", "MSWParam",
        "CentWavePredIsoParam", "PeakDensityParam", "MzClustParam",
        "NearestPeaksParam", "PeakGroupsParam", "ObiwarpParam", "GenericParam",
-       "FillChromPeaksParam")
+       "FillChromPeaksParam", "CalibrantMassParam")
 ## Param class methods.
 ## New Classes
diff --git a/R/DataClasses.R b/R/DataClasses.R
index 89359fcfb..719de098d 100644
--- a/R/DataClasses.R
+++ b/R/DataClasses.R
@@ -187,12 +187,14 @@ setClass("xcmsPeaks", contains = "matrix")
 .PROCSTEP.PEAK.GROUPING <- "Peak grouping"
 .PROCSTEP.RTIME.CORRECTION <- "Retention time correction"
 .PROCSTEP.PEAK.FILLING <- "Missing peak filling"
+.PROCSTEP.CALIBRATION <- "Calibration"
 .PROCSTEPS <- c(
     .PROCSTEP.UNKNOWN,
     .PROCSTEP.PEAK.DETECTION,
     .PROCSTEP.PEAK.GROUPING,
     .PROCSTEP.RTIME.CORRECTION,
-    .PROCSTEP.PEAK.FILLING
+    .PROCSTEP.PEAK.FILLING,
+    .PROCSTEP.CALIBRATION
 )
 ############################################################
@@ -2405,3 +2407,117 @@ setClass("XCMSnExp",
          }
          )
+#' @aliases mz,CalibrantMassParam
+#'
+#' @title Calibrant mass based calibration of chromatographic peaks
+#'
+#' @description Calibrate peaks using mz values of known masses/calibrants.
+#' mz values of identified peaks are adjusted based on peaks that are close
+#' to the provided mz values. See details below for more information.
+#'
+#' @param mz a `numeric` or `list` of `numeric` vectors with reference mz
+#' values. If a `numeric` vector is provided, this is used for each sample
+#' in the `XCMSnExp` object. If a `list` is provided, its length has to be
+#' equal to the number of samples in the experiment.
+#'
+#' @param mzabs `numeric(1)` the absolute error/deviation for matching peaks to
+#' calibrants (in Da).
+#'
+#' @param mzppm `numeric(1)` the relative error for matching peaks to calibrants
+#' in ppm (parts per million).
+#'
+#' @param neighbors `integer(1)` with the maximal number of peaks within the
+#' permitted distance to the calibrants that are considered. Among these the
+#' mz value of the peak with the largest intensity is used in the
+#' calibration function estimation.
+#'
+#' @param method `character(1)` defining the method that should be used to
+#' estimate the calibration function. Can be `"shift"`, `"linear"` (default)
+#' or `"edgeshift"`.
+#'
+#' @details The method first identifies peaks that are close to the provided
+#' mz values and, given that their difference to the calibrants is smaller
+#' than the user-provided cut-off (based on arguments `mzabs` and `mzppm`),
+#' replaces their mz values with the provided calibrant mz values. The mz
+#' values of all other peaks are either globally shifted (for
+#' `method = "shift"`) or adjusted based on a linear model estimated through
+#' all calibrants (for `method = "linear"`).
+#' Peaks are considered close to a calibrant mz if the difference between
+#' the calibrant and its mz is `<= mzabs + mz * mzppm / 1e6` (e.g. at most
+#' 0.0026 for a peak at m/z 500 with the defaults `mzabs = 0.0001` and
+#' `mzppm = 5`).
+#'
+#' **Adjustment methods**: the adjustment function/factor is estimated using
+#' the difference between calibrant and peak mz values only for peaks
+#' that are close enough to the calibrants. The available methods are:
+#' * `shift`: shifts the m/z of each peak by a global factor which
+#' corresponds to the average difference between peak mz and calibrant mz.
+#' * `linear`: fits a linear model through the differences between
+#' calibrant and peak mz values and adjusts the mz values of all peaks
+#' using this model.
+#' * `edgeshift`: performs the same adjustment as `linear` for peaks that are
+#' within the mz range of the calibrants and applies a constant shift to
+#' peaks outside of it.
+#'
+#' For more information, details and examples refer to the
+#' *xcms-direct-injection* vignette.
+#'
+#' @note `CalibrantMassParam` classes don't have exported getter or setter
+#' methods.
+#'
+#' @return For `CalibrantMassParam`: a `CalibrantMassParam` instance.
+#' For `calibrate`: an [XCMSnExp] object with chromatographic peaks being
+#' calibrated. **Be aware** that the actual raw mz values are not (yet)
+#' calibrated, but **only** the identified chromatographic peaks.
+#'
+#' @author Joachim Bargsten, Johannes Rainer
+#'
+#' @md
+#'
+#' @rdname calibrate-calibrant-mass
+setClass("CalibrantMassParam",
+         slots = c(
+             mz = "list",
+             mzabs = "numeric",
+             mzppm = "numeric",
+             neighbors = "integer",
+             method = "character"
+         ),
+         contains = c("Param"),
+         prototype = prototype(
+             mz = list(),
+             mzabs = 0.0001,
+             mzppm = 5,
+             neighbors = 3L,
+             method = "linear"
+         ),
+         validity = function(object) {
+             msg <- character()
+             if (length(object@mz)) {
+                 is_num <- vapply(object@mz, FUN = is.numeric,
+                                  FUN.VALUE = logical(1), USE.NAMES = FALSE)
+                 if (any(!is_num))
+                     msg <- c(msg, paste0("'mz' has to be a list of numeric",
+                                          " vectors"))
+                 is_unsorted <- vapply(object@mz, FUN = is.unsorted,
+                                       FUN.VALUE = logical(1),
+                                       USE.NAMES = FALSE)
+                 if (any(is_unsorted))
+                     msg <- c(msg, paste0("the mz values in 'mz' have to be ",
+                                          "increasingly ordered"))
+             }
+             if (length(object@mzppm) != 1 | any(object@mzppm < 0))
+                 msg <- c(msg, paste0("'mzppm' has to be positive numeric",
+                                      " of length 1."))
+             if (length(object@mzabs) != 1 | any(object@mzabs < 0))
+                 msg <- c(msg, paste0("'mzabs' has to be positive numeric",
+                                      " of length 1."))
+             if (length(object@neighbors) != 1 | any(object@neighbors <= 0))
+                 msg <- c(msg, paste0("'neighbors' has to be positive integer",
+                                      " of length 1."))
+             if (length(object@method) != 1)
+                 msg <- c(msg, paste0("'method' has to be of length 1."))
+             if (!all(object@method %in% c("linear", "shift", "edgeshift")))
+                 msg <- c(msg, paste0("'method' should be one of 'linear'",
+                                      ", 'shift' or 'edgeshift'."))
+             if (length(msg))
+                 msg
+             else
+                 TRUE
+         })
diff --git a/R/do_groupChromPeaks-functions.R b/R/do_groupChromPeaks-functions.R
index 930fd065c..d87c101d0 100644
--- a/R/do_groupChromPeaks-functions.R
+++ b/R/do_groupChromPeaks-functions.R
@@ -120,7 +120,8 @@ do_groupChromPeaks_density <- function(peaks, sampleGroups,
     densFrom <- rtRange[1] - 3 * bw
     densTo <- rtRange[2] + 3 * bw
-    densN <- max(512, 2^(ceiling(log2(diff(rtRange) / (bw / 2)))))
+    ## Increase the number of sampling points for the density distribution.
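+    ## A larger n gives a finer grid for the density estimate (the grid
+    ## spacing is (densTo - densFrom) / (n - 1)), which makes it less likely
+    ## that close-eluting peak groups are merged into one feature. For
+    ## illustration only: with diff(rtRange) = 2500 seconds and bw = 30,
+    ## 2^ceiling(log2(2500 / 15)) is 256, so doubling it reaches the
+    ## 512-point minimum.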
+ densN <- max(512, 2 * 2^(ceiling(log2(diff(rtRange) / (bw / 2))))) endIdx <- 0 num <- 0 gcount <- integer(nSampleGroups) diff --git a/R/functions-OnDiskMSnExp.R b/R/functions-OnDiskMSnExp.R index a5ea487f6..d0a1deb56 100644 --- a/R/functions-OnDiskMSnExp.R +++ b/R/functions-OnDiskMSnExp.R @@ -54,12 +54,13 @@ findChromPeaks_Spectrum_list <- function(x, method = "centWave", param, rt) { if (is.unsorted(rt)) stop("Spectra are not ordered by retention time!") mzs <- lapply(x, mz) + vals_per_spect <- lengths(mzs, FALSE) procDat <- date() res <- do.call(method, args = c(list(mz = unlist(mzs, use.names = FALSE), int = unlist(lapply(x, intensity), use.names = FALSE), - valsPerSpect = lengths(mzs, FALSE), + valsPerSpect = vals_per_spect, scantime = rt), as(param, "list"))) ## Ensure that we call the garbage collector to eventually clean unused stuff @@ -301,8 +302,10 @@ findPeaks_MSW_Spectrum_list <- function(x, method = "MSW", param) { mzvals <- length(mzs) cntrVals <- length(cntrPr$profMat) curVals <- length(curP$profMat) - if ((mzvals * valscantime1) != cntrVals | (mzvals * valscantime2) != curVals - | cntrVals != curVals) + if ((mzvals * valscantime1) != cntrVals | (mzvals * valscantime2) != curVals) + ## Here the question is if we REALLY need to have the same numbers + ## of values in both. This caused the problems in issue #196 + ## | cntrVals != curVals) stop("Dimensions of profile matrices of files ", basename(fileNames(cntr)), " and ", basename(fileNames(z)), " do not match!") diff --git a/R/functions-Params.R b/R/functions-Params.R index 6ea734c77..8d581f7b8 100644 --- a/R/functions-Params.R +++ b/R/functions-Params.R @@ -301,3 +301,34 @@ FillChromPeaksParam <- function(expandMz = 0, expandRt = 0, ppm = 0) { return(new("FillChromPeaksParam", expandMz = expandMz, expandRt = expandRt, ppm = ppm)) } + + +#' @return The `CalibrantMassParam` function returns an instance of +#' the `CalibrantMassParam` class with all settings and properties set. +#' +#' @md +#' +#' @rdname calibrate-calibrant-mass +CalibrantMassParam <- function(mz = list(), mzabs = 0.0001, mzppm = 5, + neighbors = 3, method = "linear") { + if (!is.list(mz)) + mz <- list(mz) + mz <- lapply(mz, sort) + new("CalibrantMassParam", mz = mz, mzabs = mzabs, mzppm = mzppm, + neighbors = as.integer(neighbors), method = method) +} + +.mzabs <- function(x) + x@mzabs + +.mzppm <- function(x) + x@mzppm + +.neighbors <- function(x) + x@neighbors + +.method <- function(x) + x@method + +.mz <- function(x) + x@mz diff --git a/R/functions-XCMSnExp.R b/R/functions-XCMSnExp.R index bb56a23ba..69e2984ca 100644 --- a/R/functions-XCMSnExp.R +++ b/R/functions-XCMSnExp.R @@ -992,56 +992,74 @@ plotAdjustedRtime <- function(object, col = "#00000080", lty = 1, type = "l", #' @title Plot chromatographic peak density along the retention time axis #' #' @description Plot the density of chromatographic peaks along the retention -#' time axis and indicate which peaks would be grouped into the same feature -#' based using the \emph{peak density} correspondence method. Settings for -#' the \emph{peak density} method can be passed with an -#' \code{\link{PeakDensityParam}} object to parameter \code{param}. -#' -#' @details The \code{plotChromPeakDensity} function allows to evaluate -#' different settings for the \emph{peak density} on an mz slice of +#' time axis and indicate which peaks would be (or were) grouped into the +#' same feature based using the *peak density* correspondence method. 
+#' Settings for the *peak density* method can be passed with an +#' [PeakDensityParam] object to parameter `param`. If the `object` contains +#' correspondence results and the correspondence was performed with the +#' *peak groups* method, the results from that correspondence can be +#' visualized setting `simulate = FALSE`. +#' +#' @details The `plotChromPeakDensity` function allows to evaluate +#' different settings for the *peak density* on an mz slice of #' interest (e.g. containing chromatographic peaks corresponding to a known #' metabolite). #' The plot shows the individual peaks that were detected within the -#' specified \code{mz} slice at their retention time (x-axis) and sample in +#' specified `mz` slice at their retention time (x-axis) and sample in #' which they were detected (y-axis). The density function is plotted as a -#' black line. Parameters for the \code{density} function are taken from the -#' \code{param} object. Grey rectangles indicate which chromatographic peaks -#' would be grouped into a feature by the \emph{peak density} correspondence -#' method. Parameters for the algorithm are also taken from \code{param}. -#' See \code{\link{groupChromPeaks-density}} for more information about the +#' black line. Parameters for the `density` function are taken from the +#' `param` object. Grey rectangles indicate which chromatographic peaks +#' would be grouped into a feature by the `peak density` correspondence +#' method. Parameters for the algorithm are also taken from `param`. +#' See [groupChromPeaks-density()] for more information about the #' algorithm and its supported settings. #' -#' @param object A \code{\link{XCMSnExp}} object with identified +#' @param object A [XCMSnExp] object with identified #' chromatographic peaks. #' -#' @param mz \code{numeric(2)} defining an mz range for which the peak density +#' @param mz `numeric(2)` defining an mz range for which the peak density #' should be plotted. #' -#' @param rt \code{numeric(2)} defining an optional rt range for which the +#' @param rt `numeric(2)` defining an optional rt range for which the #' peak density should be plotted. Defaults to the absolute retention time -#' range of \code{object}. +#' range of `object`. #' -#' @param param \code{\link{PeakDensityParam}} from which parameters for the -#' \emph{peak density} correspondence algorithm can be extracted. +#' @param param [PeakDensityParam] from which parameters for the +#' *peak density* correspondence algorithm can be extracted. If not provided +#' and if `object` contains feature definitions with the correspondence/ +#' peak grouping being performed by the *peak density* method, the +#' corresponding parameter class stored in `object` is used. +#' +#' @param simulate `logical(1)` defining whether correspondence should be +#' simulated within the specified m/z / rt region or (with +#' `simulate = FALSE`) whether the results from an already performed +#' correspondence should be shown. #' #' @param col Color to be used for the individual samples. Length has to be 1 -#' or equal to the number of samples in \code{object}. +#' or equal to the number of samples in `object`. +#' +#' @param xlab `character(1)` with the label for the x-axis. #' -#' @param xlab \code{character(1)} with the label for the x-axis. +#' @param ylab `character(1)` with the label for the y-axis. #' -#' @param ylab \code{character(1)} with the label for the y-axis. +#' @param xlim `numeric(2)` representing the limits for the x-axis. +#' Defaults to the range of the `rt` parameter. 
#' -#' @param xlim \code{numeric(2)} representing the limits for the x-axis. -#' Defaults to the range of the \code{rt} parameter. +#' @param main `character(1)` defining the title of the plot. By default +#' (for `main = NULL`) the mz-range is used. #' -#' @param ... Additional parameters to be passed to the \code{plot} function. +#' @param ... Additional parameters to be passed to the `plot` function. Data +#' point specific parameters such as `bg` or `pch` have to be of length 1 +#' or equal to the number of samples. #' #' @return The function is called for its side effect, i.e. to create a plot. #' #' @author Johannes Rainer #' -#' @seealso \code{\link{groupChromPeaks-density}} for details on the -#' \emph{peak density} correspondence method and supported settings. +#' @seealso [groupChromPeaks-density()] for details on the +#' *peak density* correspondence method and supported settings. +#' +#' @md #' #' @examples #' @@ -1076,9 +1094,10 @@ plotAdjustedRtime <- function(object, col = "#00000080", lty = 1, type = "l", #' ## Require the chromatographic peak to be present in all samples of a group #' plotChromPeakDensity(res, mz = mzr, pch = 16, #' param = PeakDensityParam(minFraction = 1)) -plotChromPeakDensity <- function(object, mz, rt, param = PeakDensityParam(), +plotChromPeakDensity <- function(object, mz, rt, param, simulate = TRUE, col = "#00000080", xlab = "retention time", - ylab = "sample", xlim = range(rt), ...) { + ylab = "sample", xlim = range(rt), + main = NULL, ...) { if (missing(object)) stop("Required parameter 'object' is missing") if (!is(object, "XCMSnExp")) @@ -1089,6 +1108,22 @@ plotChromPeakDensity <- function(object, mz, rt, param = PeakDensityParam(), mz <- c(-Inf, Inf) if (missing(rt)) rt <- range(rtime(object)) + if (!simulate & !hasFeatures(object)) { + warning("Falling back to 'simulate = TRUE' because no correspondence ", + "results are present") + simulate <- TRUE + } + if (missing(param)) { + ## Use the internal parameter - if available. + if (hasFeatures(object)) { + ph <- processHistory(object, type = .PROCSTEP.PEAK.GROUPING) + param <- processParam(ph[[length(ph)]]) + } else { + param = PeakDensityParam() + } + } + if (!is(param, "PeakDensityParam")) + stop("'param' has to be a 'PeakDensityParam'") mz <- range(mz) rt <- range(rt) ## Get all the data we require. @@ -1099,8 +1134,13 @@ plotChromPeakDensity <- function(object, mz, rt, param = PeakDensityParam(), if (nrow(pks)) { ## Extract parameters from the param object bw = bw(param) + ## Ensure the density is calculated exactly as it would in the real + ## case. + full_rt_range <- range(chromPeaks(object)[, "rt"]) + dens_from <- full_rt_range[1] - 3 * bw + dens_to <- full_rt_range[2] + 3 * bw ## That's Jan Stanstrup's fix (issue #161). 
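+        ## Using the full rt range of all detected peaks (instead of the
+        ## plotted rt window) together with the doubled number of sampling
+        ## points makes the simulated grouping below match what
+        ## do_groupChromPeaks_density would return for the same parameters.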
- densN <- max(512, 2^(ceiling(log2(diff(rt) / (bw / 2))))) + densN <- max(512, 2 * 2^(ceiling(log2(diff(full_rt_range) / (bw / 2))))) sample_groups <- sampleGroups(param) if (length(sample_groups) == 0) sample_groups <- rep(1, nsamples) @@ -1109,38 +1149,66 @@ plotChromPeakDensity <- function(object, mz, rt, param = PeakDensityParam(), "class has to have the same length than there are samples ", "in 'object'") sample_groups_table <- table(sample_groups) - dens_from <- rt[1] - 3 * bw - dens_to <- rt[2] + 3 * bw dens <- density(pks[, "rt"], bw = bw, from = dens_from, to = dens_to, n = densN) yl <- c(0, max(dens$y)) - ypos <- seq(from = 0, to = yl[2], length.out = nsamples) + ypos <- seq(from = yl[1], to = yl[2], length.out = nsamples) + if (is.null(main)) + main <- paste0(format(mz, digits = 7), collapse = " - ") ## Plot the peaks as points. - plot(x = pks[, "rt"], y = ypos[pks[, "sample"]], xlim = xlim, - col = col[pks[, "sample"]], xlab = xlab, yaxt = "n", ylab = ylab, - main = paste0(format(mz, digits = 7), collapse = " - "), ylim = yl, - ...) + ## Fix assignment of point types for each sample. + dots <- list(...) + if (any(names(dots) == "bg")) { + bg <- dots$bg + if (length(bg) != nsamples) + bg <- rep_len(bg[1], nsamples) + dots$bg <- bg[pks[, "sample"]] + } + if (any(names(dots) == "pch")) { + pch <- dots$pch + if (length(pch) != nsamples) + pch <- rep_len(pch[1], nsamples) + dots$pch <- pch[pks[, "sample"]] + } + do.call("plot", args = c(list(x = pks[, "rt"], + y = ypos[pks[, "sample"]], xlim = xlim, + col = col[pks[, "sample"]], xlab = xlab, + yaxt = "n", ylab = ylab, + main = main, ylim = yl), dots)) + ## plot(x = pks[, "rt"], y = ypos[pks[, "sample"]], xlim = xlim, + ## col = col[pks[, "sample"]], xlab = xlab, yaxt = "n", ylab = ylab, + ## main = main, ylim = yl, + ## ...) axis(side = 2, at = ypos, labels = 1:nsamples) points(x = dens$x, y = dens$y, type = "l") ## Estimate what would be combined to a feature ## Code is taken from do_groupChromPeaks_density dens_max <- max(dens$y) dens_y <- dens$y - snum <- 0 - while(dens_y[max_y <- which.max(dens_y)] > dens_max / 20 && - snum < maxFeatures(param)) { - feat_range <- xcms:::descendMin(dens_y, max_y) - dens_y[feat_range[1]:feat_range[2]] <- 0 - feat_idx <- which(pks[, "rt"] >= dens$x[feat_range[1]] & - pks[, "rt"] <= dens$x[feat_range[2]]) - tt <- table(sample_groups[pks[feat_idx, "sample"]]) - if (!any(tt / sample_groups_table[names(tt)] >= - minFraction(param) & tt >= minSamples(param))) - next - rect(xleft = min(pks[feat_idx, "rt"]), ybottom = 0, - xright = max(pks[feat_idx, "rt"]), ytop = yl[2], - border = "#00000040", col = "#00000020") - } + if (simulate) { + snum <- 0 + while(dens_y[max_y <- which.max(dens_y)] > dens_max / 20 && + snum < maxFeatures(param)) { + feat_range <- descendMin(dens_y, max_y) + dens_y[feat_range[1]:feat_range[2]] <- 0 + feat_idx <- which(pks[, "rt"] >= dens$x[feat_range[1]] & + pks[, "rt"] <= dens$x[feat_range[2]]) + tt <- table(sample_groups[pks[feat_idx, "sample"]]) + if (!any(tt / sample_groups_table[names(tt)] >= + minFraction(param) & tt >= minSamples(param))) + next + rect(xleft = min(pks[feat_idx, "rt"]), ybottom = yl[1], + xright = max(pks[feat_idx, "rt"]), ytop = yl[2], + border = "#00000040", col = "#00000020") + } + } else { + ## Plot all features in the region. 
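+            ## featureDefinitions() is called with the mz and rt limits of
+            ## the plot, so only features overlapping the shown slice are
+            ## drawn: each grey rectangle spans a feature's rtmin-rtmax and
+            ## the dashed vertical line marks its median rt (rtmed).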
+            fts <- featureDefinitions(object, mz = mz, rt = rt)
+            rect(xleft = fts$rtmin, xright = fts$rtmax,
+                 ybottom = rep(yl[1], nrow(fts)), ytop = rep(yl[2], nrow(fts)),
+                 border = "#00000040", col = "#00000020")
+            abline(v = fts$rtmed, col = "#00000040", lty = 2)
+        }
     } else {
         plot(3, 3, pch = NA, xlim = rt, xlab = xlab,
              main = paste0(format(mz, digits = 7), collapse = " - "))
@@ -1413,6 +1481,18 @@ plotChromPeakImage <- function(x, binSize = 30, xlim = NULL, log = FALSE,
     }
 }
+#' @rdname calibrate-calibrant-mass
+#'
+#' @description The `isCalibrated` function returns `TRUE` if chromatographic
+#' peaks of the [XCMSnExp] object `object` were calibrated and `FALSE`
+#' otherwise.
+#'
+#' @md
+isCalibrated <- function(object) {
+    if (length(processHistory(object, type = .PROCSTEP.CALIBRATION)))
+        TRUE
+    else
+        FALSE
+}
 ## Find mz ranges with multiple peaks per sample.
 ## Use the density distribution for that? with a bandwidth = 0.001, check
diff --git a/R/functions-normalization.R b/R/functions-normalization.R
index e32a04b1b..d4acab811 100644
--- a/R/functions-normalization.R
+++ b/R/functions-normalization.R
@@ -192,9 +192,9 @@ adjustDriftWithModel <- function(y, data = NULL, model = y ~ injection_idx,
     data_fit <- data_fit[fitOnSubset, , drop = FALSE]
     ## First fitting the model.
     message("Fitting the model to the features ... ", appendLF = FALSE)
-    lms <- xcms:::fitModel(formula = model, data = data_fit,
-                           y = y[, fitOnSubset, drop = FALSE],
-                           minVals = minVals, method = method)
+    lms <- fitModel(formula = model, data = data_fit,
+                    y = y[, fitOnSubset, drop = FALSE],
+                    minVals = minVals, method = method)
     message("OK")
     message("Applying models to adjust values ... ", appendLF = FALSE)
     if (is.null(rownames(y)))
diff --git a/R/functions-xcmsSet.R b/R/functions-xcmsSet.R
index aba8d0b8b..19e3e47e3 100644
--- a/R/functions-xcmsSet.R
+++ b/R/functions-xcmsSet.R
@@ -365,6 +365,25 @@ split.xcmsSet <- function(x, f, drop = TRUE, ...) {
 ############################################################
 ## phenoDataFromPaths
 ## derive experimental design from set of file paths
+#' @title Derive experimental design from file paths
+#'
+#' @description The `phenoDataFromPaths` function builds a `data.frame`
+#' representing the experimental design from the folder structure in which
+#' the files of the experiment are located.
+#'
+#' @note This function is used by the *old* `xcmsSet` function to guess
+#' the experimental design (i.e. group assignment of the files) from the
+#' folders in which the files of the experiment can be found.
+#'
+#' @param paths `character` representing the file names (including the full
+#' path) of the experiment's files.
+#'
+#' @md
+#'
+#' @examples
+#' ## List the files available in the faahKO package
+#' base_dir <- system.file("cdf", package = "faahKO")
+#' cdf_files <- list.files(base_dir, recursive = TRUE, full.names = TRUE)
+#' ## Derive the experimental design from the KO/WT subfolder structure
+#' phenoDataFromPaths(cdf_files)
 phenoDataFromPaths <- function(paths) {
     ## create factors from filesystem hierarchy
     sclass <- gsub("^\\.$", "sample", dirname(paths))
diff --git a/R/matchpeaks.R b/R/matchpeaks.R
index 41dbcc712..a2eb1b2d4 100644
--- a/R/matchpeaks.R
+++ b/R/matchpeaks.R
@@ -1,25 +1,72 @@
-matchpeaks <- function(peaklist, masslist, mzabs=0.0001, mzppm=5, neighbours=3, intensity="into"){
+#' @title Identify peaks close to calibrants' mz values
+#'
+#' @description Define which peaks are closest to the provided calibrants' mz
+#' values and return their index, mz value and difference between their and
+#' the calibrant's mz value.
+#' +#' @note The mz values of the calibrants are first sorted. *Cave* the function +#' requires the `peaklist` to be sorted by mz value (issue #200). +#' +#' @param peaklist `matrix` with the identified peaks of a single sample. +#' Required columns are: `"mz"` and `intensity` +#' +#' @param masslist `numeric` with the masses of the calibrants. +#' +#' @param mzabs `numeric(1)` defining the absolute mz deviation. +#' +#' @param mzppm `numeric(1)` defining the mz deviation in ppm. +#' +#' @param neighbours `numeric(1)` with the number of neighbors. +#' +#' @param intensity `character(1)` specifying the column in `peaklist` that +#' contains the peaks' intensity values. +#' +#' @return A `matrix` with 3 columns: +#' + `"pos"`: the position of the peak closest to the corresponding +#' calibrant's mz value. +#' + `"mass"`: the m/z of the peak. +#' + `"dif"`: the difference between the calibrant's and peak's mz. +#' The number of rows corresponds to the number of calibrants, unless no +#' peak was found to be close enough to a calibrant. In the latter case +#' the calibrant is removed from the result table. +#' +#' @md +#' +#' @noRd +matchpeaks <- function(peaklist, masslist, mzabs = 0.0001, mzppm = 5, + neighbours = 3, intensity = "into") { + ## Dangerous, we're nowhere checking for masslist being correct. masslist <- masslist[order(masslist)] - dif <- matrix(nrow=length(masslist), ncol=neighbours, data=rep(1000,length(masslist)*neighbours)) - pos <- matrix(nrow=length(masslist), ncol=neighbours, data=rep(0,length(masslist)*neighbours)) - mzu <- peaklist[,"mz"] - itu <- peaklist[,intensity] - mdif <- NA - mpos <- NA + dif <- matrix(nrow = length(masslist), ncol = neighbours, + data = rep(1000, length(masslist) * neighbours)) + pos <- matrix(nrow = length(masslist), ncol = neighbours, + data = rep(0, length(masslist) * neighbours)) + mzu <- peaklist[, "mz"] + itu <- peaklist[, intensity] + mdif <- NA ## smallest difference between a peak and a calibrant. + mpos <- NA ## position of the entry in the peak list. cdifs <- NA cposs <- NA - lastval<-1 + lastval <- 1 + ## Loop over masslist. 
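+    ## For each calibrant the loop below keeps the `neighbours` closest peak
+    ## mz values; among those within mzabs + mz * mzppm / 1e6 of the
+    ## calibrant, the most intense peak is retained. Calibrants without any
+    ## peak within that distance yield NA and are dropped from the result.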
for (b in 1:length(masslist)){ - ## finding for each entry in masslist the ##neighbours closest masses in the spectra + ## finding for each entry in masslist the ##neighbours closest masses + ## in the peaklist a <- lastval - sf <- FALSE ## is set to true if the neighbour-box is filled the first time for a entry in masslist + sf <- FALSE ## is set to true if the neighbour-box is filled the first + ## time for a entry in masslist while (a < length(mzu)){ - mxd <- dif[b,max(which(abs(dif[b,])==max(abs(dif[b,]))))] ## the currently biggest distance - mxp <- max(which(abs(dif[b,])==max(abs(dif[b,])))) ## the position of the distance - if (abs(mzu[a]-masslist[b]) <= abs(mxd)){ - dif[b,mxp] <- (mzu[a]-masslist[b]) - pos[b,mxp] <- a - lastval <- min(pos[b,]) + ## loop over peaks + ## the currently biggest distance: + mxd <- dif[b, max(which(abs(dif[b, ]) == max(abs(dif[b,]))))] + ## the position of the distance + mxp <- max(which(abs(dif[b, ]) == max(abs(dif[b,])))) + ## If the difference between the peak and the current mass is + ## smaller than the largest difference in dif + if (abs(mzu[a] - masslist[b]) <= abs(mxd)) { + dif[b, mxp] <- (mzu[a] - masslist[b]) + pos[b, mxp] <- a + lastval <- min(pos[b, ]) sf <- TRUE }else{ ## no more masses are smaller, switching to end of mzu if (sf) a <- length(mzu) @@ -27,40 +74,196 @@ matchpeaks <- function(peaklist, masslist, mzabs=0.0001, mzppm=5, neighbours=3, a <- a + 1 } ## cat(min(abs(dif[b,])),"\n") - ## checking the treshold and finding the candidate with the biggest intensity - smalldiffs <- which(abs(dif[b,]) <= (mzabs+(mzu[pos[b,]]/1000000*mzppm))) - cdifs <- dif[b,smalldiffs] - cposs <- pos[b,smalldiffs] - if (length(cdifs)>0){ + ## checking the treshold and finding the candidate with the biggest + ## intensity + smalldiffs <- which(abs(dif[b, ]) <= + (mzabs + (mzu[pos[b, ]] / 1000000 * mzppm))) + cdifs <- dif[b, smalldiffs] + cposs <- pos[b, smalldiffs] + if (length(cdifs) > 0){ mcdi <- smalldiffs[which(itu[cposs] == max(itu[cposs]))] - mdif[b] <- dif[b,mcdi] - mpos[b] <- pos[b,mcdi] + mdif[b] <- dif[b, mcdi] ## mz - mz_masslist + mpos[b] <- pos[b, mcdi] ## The index of the mz in the peaklist. }else{ - mdif[b]<-NA - mpos[b]<-NA + ## No peak close. + mdif[b] <- NA + mpos[b] <- NA } } mdiffs <- mdif[!is.na(mdif)] mposs <- mpos[!is.na(mdif)] - retdata <- matrix(ncol=3, nrow=length(mdiffs), data=c(mposs,mzu[mposs],mdiffs)) - colnames(retdata) = c("pos","mass","dif") + retdata <- matrix(ncol = 3, nrow = length(mdiffs), + data = c(mposs, mzu[mposs], mdiffs)) + colnames(retdata) = c("pos", "mass", "dif") retdata } -estimate <- function(dtable,method="linear"){ - mdiffs <- dtable[,"dif"] - mposs <- dtable[,"mass"] +#' @title Identify peaks close to calibrants' mz values +#' +#' @description Define which peaks are closest to the provided calibrants' mz +#' values and return their index, mz value and difference between their and +#' the calibrant's mz value. +#' +#' @details This function is faster than the original `matchpeaks` function and +#' in addition does by default return the results for all calibrants' mz +#' values, i.e. `nrow` of the result `matrix` is always equal to +#' `length(masslist)`. To mimic the old function use `na.rm = TRUE`. +#' +#' @note The mz values of the calibrants are first sorted. +#' +#' @param peaklist `matrix` with the identified peaks of a single sample. +#' Required columns are: `"mz"` and `intensity` +#' +#' @param masslist `numeric` with the masses of the calibrants. 
+#' +#' @param mzabs `numeric(1)` defining the absolute mz deviation. +#' +#' @param mzppm `numeric(1)` defining the mz deviation in ppm. +#' +#' @param neighbours `numeric(1)` with the number of neighbors. +#' +#' @param intensity `character(1)` specifying the column in `peaklist` that +#' contains the peaks' intensity values. +#' +#' @param na.rm `logical(1)` whether results for calibrants for which no peak +#' close enough were found should be excluded from the result `matrix`. +#' @return A `matrix` with 3 columns: +#' + `"pos"`: the position of the peak closest to the corresponding +#' calibrant's mz value. +#' + `"mass"`: the m/z of the peak. +#' + `"dif"`: the difference between the calibrant's and peak's mz. +#' The number of rows matches by default (i.e. with `na.rm = FALSE`) the +#' `length(masslist)`. The `matrix` contains `NA` values for calibrants for +#' which no peak close to the calibrant's mz was found. +#' +#' @md +#' +#' @author Johannes Rainer +#' +#' @noRd +.matchpeaks2 <- function(peaklist, masslist, mzabs = 0.0001, mzppm = 5, + neighbours = 3, intensity = "into", na.rm = FALSE) { + masslist <- sort(masslist) + res_pos <- res_dif <- rep(NA_real_, length(masslist)) + peak_mz <- peaklist[, "mz"] + peak_int <- peaklist[, "into"] + mzppm <- mzppm / 1e6 + peak_mz_maxdiff <- mzabs + (peak_mz * mzppm) + for (i in seq_along(masslist)) { + mzdiffs <- peak_mz - masslist[i] + idx_mzdiffs <- which(abs(mzdiffs) <= peak_mz_maxdiff) + if (length(idx_mzdiffs)) { + the_idx <- idx_mzdiffs + if (length(idx_mzdiffs) > 1) { + ## The neighbours closest. + idx_mzdiffs <- idx_mzdiffs[order(abs(mzdiffs)[idx_mzdiffs])] + idx_mzdiffs <- idx_mzdiffs[1:min(length(idx_mzdiffs), + neighbours)] + ## Select the one with the largest intensity. + the_idx <- idx_mzdiffs[order(peak_int[idx_mzdiffs], + decreasing = TRUE)][1] + } + res_pos[i] <- the_idx + res_dif[i] <- mzdiffs[the_idx] + } + } + not_na <- !is.na(res_pos) + if (!any(not_na)) + stop("Not a single peak close to any of the specified calibrants") + if (na.rm) { + res <- cbind(pos = res_pos[not_na], + mass = peak_mz[res_pos[not_na]], + dif = res_dif[not_na]) + } else { + res <- cbind(pos = res_pos, + mass = peak_mz[res_pos], + dif = res_dif) + } + res +} - if (method!="shift"){ +#' @description Estimates the correction *model*. +#' +#' @param dtable `matrix` as returned by `matchpeaks` or `.matchpeaks2`. +#' +#' @param method `character(1)` defining the method that will be used to +#' perform the calibration. +#' +#' @return A `numeric(2)` with slope and intercept of the linear model. If +#' `method = "shift"` the slope will be `0`. Note that the first element +#' is the slope and the second the intercept! +#' +#' @md +#' +#' @noRd +estimate <- function(dtable, method = c("linear")) { + mdiffs <- dtable[, "dif"] + mposs <- dtable[, "mass"] + + if (method != "shift") { ## complete regression regg <- lm(mdiffs ~ mposs) a <- regg$coefficients[2] b <- regg$coefficients[1] - }else{ + } else { ## only global shift a <- 0 b <- mean(mdiffs) } - if (method != "shift") return (c(a,b)) - else return(c(0,b)) + if (method != "shift") + c(a, b) + else + c(0, b) +} + +#' @description Simple function to calibrate mz values using the provided +#' parameters. +#' +#' @note `mz` values have to be increasingly sorted. At some point we might +#' want to have a more generic implementation that supports, e.g. also +#' loess fits. +#' +#' @param mz `numeric` with the mz values to be calibrated. +#' +#' @param method `character` defining the adjustment method. 
Can be either +#' `"linear"`, `"shift"` or `"edgeshift"`. +#' +#' @param minMz `numeric(1)` with the minimal mz value. Required for +#' `method = "edgeshift"`. +#' +#' @param maxMz `numeric(1)` with the maximal mz value. Required for +#' `method = "edgeshift"`. +#' +#' @param slope `numeric(1)` with the slope. +#' +#' @param intercept `numeric(1)` with the intercept. +#' +#' @md +#' +#' @author Johannes Rainer +#' +#' @noRd +.calibrate_mz <- function(mz, method = "linear", minMz, maxMz, slope = 0, + intercept) { + if (is.unsorted(mz)) + stop("'mz' values have to be sorted") + if (missing(intercept)) + stop("'intercept' is a required parameter") + method <- match.arg(method, c("shift", "linear", "edgeshift")) + if (method == "edgeshift") { + if (missing(minMz) | missing(maxMz)) + stop("Parameters 'minMz' and 'maxMz' are required for method ", + "'edgeshift'") + shift_lower <- mz < minMz + shift_upper <- mz > maxMz + mz_ <- mz + if (any(shift_lower)) + mz_[shift_lower] <- mz[(max(which(shift_lower)) + 1)] + if (any(shift_upper)) + mz_[shift_upper] <- mz[(min(which(shift_upper)) - 1)] + mz <- mz - (slope * mz_ + intercept) + } else { + mz <- mz - (slope * mz + intercept) + } + mz } diff --git a/R/methods-MsFeatureData.R b/R/methods-MsFeatureData.R index 254eccc2f..2d56dc8e0 100644 --- a/R/methods-MsFeatureData.R +++ b/R/methods-MsFeatureData.R @@ -26,38 +26,35 @@ setMethod("show", "MsFeatureData", function(object) { ## adjustedRtime: getter and setter for the adjustedRtime list. -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setMethod("hasAdjustedRtime", "MsFeatureData", function(object) { - return(!is.null(object$adjustedRtime)) - ## return(any(ls(object) == "adjustedRtime")) + !is.null(object$adjustedRtime) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setMethod("hasFeatures", "MsFeatureData", function(object) { - return(!is.null(object$featureDefinitions)) - ## return(any(ls(object) == "featureDefinitions")) + !is.null(object$featureDefinitions) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setMethod("hasChromPeaks", "MsFeatureData", function(object) { - return(!is.null(object$chromPeaks)) - ## return(any(ls(object) == "chromPeaks")) + !is.null(object$chromPeaks) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setMethod("adjustedRtime", "MsFeatureData", function(object) { if (hasAdjustedRtime(object)) return(object$adjustedRtime) warning("No adjusted retention times available.") return(NULL) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setReplaceMethod("adjustedRtime", "MsFeatureData", function(object, value) { object$adjustedRtime <- value if (validObject(object)) return(object) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setMethod("dropAdjustedRtime", "MsFeatureData", function(object) { if (hasAdjustedRtime(object)) { rm(list = "adjustedRtime", envir = object) @@ -65,40 +62,40 @@ setMethod("dropAdjustedRtime", "MsFeatureData", function(object) { return(object) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setMethod("featureDefinitions", "MsFeatureData", function(object) { if (hasFeatures(object)) return(object$featureDefinitions) warning("No aligned feature information available.") return(NULL) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setReplaceMethod("featureDefinitions", "MsFeatureData", function(object, value) { object$featureDefinitions <- value if (validObject(object)) return(object) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setMethod("dropFeatureDefinitions", 
"MsFeatureData", function(object) { if (hasFeatures(object)) rm(list = "featureDefinitions", envir = object) return(object) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setMethod("chromPeaks", "MsFeatureData", function(object) { if (hasChromPeaks(object)) return(object$chromPeaks) warning("No chromatographic peaks available.") return(NULL) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setReplaceMethod("chromPeaks", "MsFeatureData", function(object, value) { object$chromPeaks <- value if (validObject(object)) return(object) }) -##' @rdname XCMSnExp-class +#' @rdname XCMSnExp-class setMethod("dropChromPeaks", "MsFeatureData", function(object) { if (hasChromPeaks(object)) rm(list = "chromPeaks", envir = object) diff --git a/R/methods-XCMSnExp.R b/R/methods-XCMSnExp.R index d07e051d0..ae9a3545d 100644 --- a/R/methods-XCMSnExp.R +++ b/R/methods-XCMSnExp.R @@ -158,6 +158,8 @@ setReplaceMethod("adjustedRtime", "XCMSnExp", function(object, value) { #' #' @description \code{featureDefinitions}, \code{featureDefinitions<-}: extract #' or set the correspondence results, i.e. the mz-rt features (peak groups). +#' Similar to the \code{chromPeaks} it is possible to extract features for +#' specified m/z and/or rt ranges (see \code{chromPeaks} for more details). #' #' @return For \code{featureDefinitions}: a \code{DataFrame} with peak grouping #' information, each row corresponding to one mz-rt feature (grouped peaks @@ -171,8 +173,35 @@ setReplaceMethod("adjustedRtime", "XCMSnExp", function(object, value) { #' returns \code{NULL} if no feature definitions are present. #' #' @rdname XCMSnExp-class -setMethod("featureDefinitions", "XCMSnExp", function(object) { - return(featureDefinitions(object@msFeatureData)) +setMethod("featureDefinitions", "XCMSnExp", function(object, mz = numeric(), + rt = numeric(), ppm = 0, + type = "any") { + feat_def <- featureDefinitions(object@msFeatureData) + type <- match.arg(type, c("any", "within")) + ## Select features within rt range. + if (length(rt)) { + rt <- range(rt) + if (type == "within") + keep <- which(feat_def$rtmin >= rt[1] & feat_def$rtmax <= rt[2]) + else + keep <- which(feat_def$rtmax >= rt[1] & feat_def$rtmin <= rt[2]) + feat_def <- feat_def[keep, , drop = FALSE] + } + ## Select peaks within mz range, considering also ppm + if (length(mz) && nrow(feat_def)) { + mz <- range(mz) + ## Increase mz by ppm. 
+ if (is.finite(mz[1])) + mz[1] <- mz[1] - mz[1] * ppm / 1e6 + if (is.finite(mz[2])) + mz[2] <- mz[2] + mz[2] * ppm / 1e6 + if (type == "within") + keep <- which(feat_def$mzmin >= mz[1] & feat_def$mzmax <= mz[2]) + else + keep <- which(feat_def$mzmax >= mz[1] & feat_def$mzmin <= mz[2]) + feat_def <- feat_def[keep, , drop = FALSE] + } + feat_def }) #' @aliases featureDefinitions<- #' @@ -535,10 +564,7 @@ setMethod("dropChromPeaks", "XCMSnExp", function(object) { object <- dropProcessHistories(object, type = .PROCSTEP.RTIME.CORRECTION) object <- dropProcessHistories(object, type = .PROCSTEP.PEAK.GROUPING) object <- dropProcessHistories(object, type = .PROCSTEP.PEAK.FILLING) - ## idx_fd <- which(unlist(lapply(processHistory(object), processType)) == - ## .PROCSTEP.PEAK.DETECTION) - ## if (length(idx_fd) > 0) - ## object@.processHistory <- object@.processHistory[-idx_fd] + object <- dropProcessHistories(object, type = .PROCSTEP.CALIBRATION) newFd <- new("MsFeatureData") newFd@.xData <- .copy_env(object@msFeatureData) newFd <- dropChromPeaks(newFd) @@ -2410,3 +2436,88 @@ setMethod("extractMsData", "XCMSnExp", object <- as(object, "OnDiskMSnExp") extractMsData(object, rt = rt, mz = mz) }) + + +#' @rdname calibrate-calibrant-mass +#' +#' @param object An [XCMSnExp] object. +#' +#' @param param The `CalibrantMassParam` object with the calibration settings. +#' +#' @return The `calibrate` method returns an [XCMSnExp] object with the +#' chromatographic peaks being calibrated. Note that **only** the detected +#' peaks are calibrated, but not the individual mz values in each spectrum. +#' +#' @md +setMethod("calibrate", "XCMSnExp", function(object, param) { + if (missing(param)) { + stop("Argument 'param' is missing") + } else { + if (!is(param, "CalibrantMassParam")) + stop("The calibrate method for 'XCMSnExp' objects requires a ", + "'CalibrantMassParam' object to be passed with argument ", + "'param'") + } + if (isCalibrated(object)) + stop("'object' is already calibrated! Recurrent calibrations are not ", + "supported") + mzs <- .mz(param) + n_samps <- length(fileNames(object)) + if (length(mzs) == 1) + mzs <- replicate(mzs[[1]], n = n_samps, simplify = FALSE) + if (length(mzs) != n_samps) + stop("Number of calibrant mz vectors differs from the number of samples") + startDate <- date() + ## Dropping grouping results. + object <- dropFeatureDefinitions(object) + pks <- chromPeaks(object) + adj_models <- vector("list", length = n_samps) + method <- .method(param) + for (i in 1:n_samps) { + ## This could also be done with indices... + pk_mz <- pks[pks[, "sample"] == i, c("mz", "into")] + order_mz <- order(pk_mz[, 1]) + close_pks <- .matchpeaks2(pk_mz[order_mz, ], mzs[[i]], + mzabs = .mzabs(param), mzppm = .mzppm(param), + neighbours = .neighbors(param)) + if (nrow(close_pks) == 0) { + warning("Sample ", i, ": can not calibrate as no peaks are close to", + "provided mz values") + next + } else { + if (nrow(close_pks) == 1 & method != "shift") { + warning("Sample ", i, ": only a single peak found, falling ", + "back to method = 'shift'") + method <- "shift" + } + } + + prms <- estimate(close_pks, method) + adj_models[[i]] <- prms + a <- prms[1] # slope + b <- prms[2] # intercept + mz_ <- pk_mz[order_mz, 1] + mz_min <- mz_[min(close_pks[, "pos"])] + mz_max <- mz_[max(close_pks[, "pos"])] + pk_mz[order_mz, "mz"] <- .calibrate_mz(mz_, method = method, + minMz = mz_min, maxMz = mz_max, + slope = a, intercept = b) + pks[pks[, "sample"] == i, "mz"] <- pk_mz[, "mz"] + } + + ## Set the new peak definitions. 
Careful to not drop additional stuff here. + newFd <- new("MsFeatureData") + newFd@.xData <- .copy_env(object@msFeatureData) + chromPeaks(newFd) <- pks + lockEnvironment(newFd, bindings = TRUE) + object@msFeatureData <- newFd + ## Add param to processHistory + xph <- XProcessHistory(param = param, date. = startDate, + type. = .PROCSTEP.CALIBRATION, + fileIndex = 1:n_samps) + object <- addProcessHistory(object, xph) + if (validObject(object)) + object +}) + + diff --git a/R/methods-xcmsSet.R b/R/methods-xcmsSet.R index 284e29b6f..a7b52ad16 100644 --- a/R/methods-xcmsSet.R +++ b/R/methods-xcmsSet.R @@ -241,41 +241,47 @@ setReplaceMethod("profinfo", "xcmsSet", function(object, value) { ############################################################ ## calibrate -setMethod("calibrate", "xcmsSet", function(object,calibrants,method="linear", - mzabs=0.0001, mzppm=5, - neighbours=3, plotres=FALSE) { +setMethod("calibrate", "xcmsSet", function(object, calibrants, + method = "linear", + mzabs = 0.0001, mzppm = 5, + neighbours = 3, plotres = FALSE) { nsamp = length(unique(object@peaks[,"sample"])) - if (!sum(method == c("shift","linear","edgeshift"))) - stop("unknown calibration method!") + match.arg(method, c("shift", "linear", "edgeshift")) if (is.list(calibrants)) if (length(calibrants) != nsamp) stop("Error: Number of masslists differs with number of samples") - for (s in 1:nsamp){ - peaklist = object@peaks[which(object@peaks[,"sample"]==s),] + ## Loop over samples, estimate calibration and apply it. + for (s in 1:nsamp) { + peaklist = object@peaks[which(object@peaks[,"sample"] == s), ] if (is.list(calibrants)) { masslist <- calibrants[s] - }else{ + } else { masslist <- calibrants } + ## Check that peaks are ordered by mz value (issue #200) + if (is.unsorted(peaklist[, "mz"])) + warning("Peaks in sample ", s, " are not sorted by mz value") - masses <- matchpeaks(peaklist,masslist,mzabs,mzppm,neighbours) - if (length(masses)==0){ + masses <- matchpeaks(peaklist, masslist, mzabs, mzppm, neighbours) + if (length(masses) == 0) { warning("No masses close enough!\n") next } - if (nrow(masses)==1 & method!="shift") { - cat("Warning: only one peak found, fallback to shift.") - method="shift" + if (nrow(masses) == 1 & method != "shift") { + warning("Sample ", s, ": only one peak found, falling back to ", + "method = 'shift'") + method = "shift" } - params <- estimate (masses, method) - mzu <- peaklist[,"mz"] - mposs <- masses[,"pos"] - mdiffs <- masses[,"dif"] + ## Estimate the adjustment. 
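+        ## estimate() returns c(slope, intercept) of a linear fit of the mz
+        ## differences (peak mz - calibrant mz) against the matched peak mz;
+        ## for method = "shift" the slope is 0 and the intercept is the mean
+        ## difference. This error is subtracted from all peak mz values below.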
+ params <- estimate(masses, method) + mzu <- peaklist[, "mz"] + mposs <- masses[, "pos"] + mdiffs <- masses[, "dif"] a <- params[1] b <- params[2] @@ -284,23 +290,32 @@ setMethod("calibrate", "xcmsSet", function(object,calibrants,method="linear", if (method != "edgeshift"){ mzu <- mzu - (a * mzu + b) } else { - mzu[c(1:(min(mposs)-1))] <- mzu[c(1:(min(mposs)-1))] - (a * mzu[min(mposs)] + b) + ## Different adjustment for peaks below the smallest and peaks above + ## the largest mz + mzu[c(1:(min(mposs) - 1))] <- mzu[c(1:(min(mposs) - 1))] - + (a * mzu[min(mposs)] + b) mzu[c((min(mposs)):(max(mposs)))] <- - mzu[c((min(mposs)):(max(mposs)))] - (a * mzu[c((min(mposs)):(max(mposs)))] + b) + mzu[c((min(mposs)):(max(mposs)))] - + (a * mzu[c((min(mposs)):(max(mposs)))] + b) mzu[c((max(mposs)+1):length(mzu))] <- mzu[c((max(mposs)+1):length(mzu))] - (a * mzu[max(mposs)] + b) } peaklist[,"mz"] <- mzu - object@peaks[which(object@peaks[,"sample"]==s),] <- peaklist + object@peaks[which(object@peaks[,"sample"] == s), ] <- peaklist } if (plotres) { plot(mzu[mposs],mdiffs, xlim=c(min(mzu),max(mzu))) - if (method!="edgeshift") {abline(b,a)}else{ - lines(c(min(mzu),mzu[min(mposs)]),c(a * mzu[min(mposs)] + b,a * mzu[min(mposs)] + b)) - lines(c(mzu[min(mposs)],mzu[max(mposs)]),c(a * mzu[min(mposs)] + b,a * mzu[max(mposs)] + b)) - lines(c(mzu[max(mposs)],max(mzu)),c(a * mzu[max(mposs)] + b,a * mzu[max(mposs)] + b)) + if (method!="edgeshift") { + abline(b,a) + } else { + lines(c(min(mzu), mzu[min(mposs)]), + c(a * mzu[min(mposs)] + b,a * mzu[min(mposs)] + b)) + lines(c(mzu[min(mposs)], mzu[max(mposs)]), + c(a * mzu[min(mposs)] + b,a * mzu[max(mposs)] + b)) + lines(c(mzu[max(mposs)], max(mzu)), + c(a * mzu[max(mposs)] + b,a * mzu[max(mposs)] + b)) } } @@ -836,42 +851,72 @@ setMethod("retcor.obiwarp", "xcmsSet", function(object, plottype = c("none", "de idx <- which(seq(1,N) != center) obj1 <- xcmsRaw(object@filepaths[center], profmethod="bin", profstep=0) - ## added t automatically find the correct scan range from the xcmsSet object - if(length(obj1@scantime) != length(object@rt$raw[[center]])){ + ## Scanrange checking: fix for issue #194 + ## Check if we've got the scanrange slot. + scanrange <- NULL + if (.hasSlot(object, "scanrange")) { + if (length(object@scanrange) == 2) + scanrange <- object@scanrange + } else { + if(length(obj1@scantime) != length(object@rt$raw[[center]])){ ## This is in case the xcmsSet was read using a scanrange, i.e. if - ## the data was read in with defining a scan range, then we would have a - ## mismatch here. This code essentially ensures that the retention time - ## of the raw object would match the retention time present in the xcmsSet. - ## This was before the days in which @scanrange was added as a slot to - ## xcmsSet. - ##figure out the scan time range - scantime.start <-object@rt$raw[[center]][1] - scantime.end <-object@rt$raw[[center]][length(object@rt$raw[[center]])] - - scanrange.start <-which.min(abs(obj1@scantime - scantime.start)) - scanrange.end <-which.min(abs(obj1@scantime - scantime.end)) - scanrange<-c(scanrange.start, scanrange.end) - obj1 <- xcmsRaw(object@filepaths[center], profmethod="bin", profstep=0, scanrange=scanrange) - } else{ - scanrange<-NULL - } - + ## the data was read in with defining a scan range, then we would + ## have a mismatch here. This code essentially ensures that the + ## retention time of the raw object would match the retention time + ## present in the xcmsSet. 
+ ## This was before the days in which @scanrange was added as a slot + ## to xcmsSet. + ##figure out the scan time range + scantime.start <- object@rt$raw[[center]][1] + scantime.end <- object@rt$raw[[center]][length(object@rt$raw[[center]])] + scanrange.start <- which.min(abs(obj1@scantime - scantime.start)) + scanrange.end <- which.min(abs(obj1@scantime - scantime.end)) + scanrange <- c(scanrange.start, scanrange.end) + } + } + ## Subset the object if scanrange not NULL + if (!is.null(scanrange)) + obj1 <- obj1[scanrange[1]:scanrange[2]] + + ## ## added t automatically find the correct scan range from the xcmsSet object + ## if(length(obj1@scantime) != length(object@rt$raw[[center]])){ + ## ## This is in case the xcmsSet was read using a scanrange, i.e. if + ## ## the data was read in with defining a scan range, then we would have a + ## ## mismatch here. This code essentially ensures that the retention time + ## ## of the raw object would match the retention time present in the xcmsSet. + ## ## This was before the days in which @scanrange was added as a slot to + ## ## xcmsSet. + ## ##figure out the scan time range + ## scantime.start <-object@rt$raw[[center]][1] + ## scantime.end <-object@rt$raw[[center]][length(object@rt$raw[[center]])] + + ## scanrange.start <-which.min(abs(obj1@scantime - scantime.start)) + ## scanrange.end <-which.min(abs(obj1@scantime - scantime.end)) + ## scanrange<-c(scanrange.start, scanrange.end) + ## obj1 <- xcmsRaw(object@filepaths[center], profmethod="bin", + ## profstep=0, scanrange=scanrange) + ## } else{ + ## scanrange<-NULL + ## } + for (si in 1:length(idx)) { s <- idx[si] cat(samples[s], " ") - + ## ## Might be better to just get the profile matrix from the center object ## outside of the for loop and then modifying a internal variable within ## the loop - avoids creation of two profile matrices in each iteration. - profStepPad(obj1) <- profStep ## (re-)generate profile matrix, since it might have been modified during previous iteration - if(is.null(scanrange)){ - obj2 <- xcmsRaw(object@filepaths[s], profmethod="bin", profstep=0) - } else{ - obj2 <- xcmsRaw(object@filepaths[s], profmethod="bin", profstep=0, scanrange=scanrange) - } + profStepPad(obj1) <- profStep ## (re-)generate profile matrix, since it + ## might have been modified during previous iteration + if(is.null(scanrange)){ + obj2 <- xcmsRaw(object@filepaths[s], profmethod="bin", profstep=0) + } else{ + obj2 <- xcmsRaw(object@filepaths[s], profmethod="bin", profstep=0, + scanrange = scanrange) + } profStepPad(obj2) <- profStep ## generate profile matrix - + mzmin <- min(obj1@mzrange[1], obj2@mzrange[1]) mzmax <- max(obj1@mzrange[2], obj2@mzrange[2]) @@ -931,23 +976,38 @@ setMethod("retcor.obiwarp", "xcmsSet", function(object, plottype = c("none", "de ## Now ensure that the nrow of the profile matrix matches. ## Add empty rows at the beginning if(mzmin < obj1@mzrange[1]) { - seqlen <- length(seq(mzmin, obj1@mzrange[1], profStep))-1 - x <- matrix(0, seqlen,dim(obj1@env$profile)[2]) + ## The profile matrices should not get larger than mz! + max_missing_rows <- mzval - nrow(obj1@env$profile) + low_mz <- seq(mzmin, obj1@mzrange[1], profStep) + ## keep all mz bins that are smaller than mzrange, but ensure that + ## we're not adding more rows than needed. 
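+                ## max_missing_rows is the number of rows still missing to
+                ## reach the expected mzval rows; taking the minimum ensures
+                ## the padded matrix never grows beyond mzval rows (part of
+                ## the issue #194 fix).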
+                seqlen <- min(sum(low_mz < obj1@mzrange[1]), max_missing_rows)
+                ## seqlen <- length(seq(mzmin, obj1@mzrange[1], profStep)) - 1
+                x <- matrix(0, seqlen, dim(obj1@env$profile)[2])
                 obj1@env$profile <- rbind(x, obj1@env$profile)
             }
             ## Add emtpy rows at the end.
             if(mzmax > obj1@mzrange[2]){
-                seqlen <- length(seq(obj1@mzrange[2], mzmax, profStep))-1
+                max_missing_rows <- mzval - nrow(obj1@env$profile)
+                high_mz <- seq(obj1@mzrange[2], mzmax, profStep)
+                seqlen <- min(sum(high_mz > obj1@mzrange[2]), max_missing_rows)
+                ## seqlen <- length(seq(obj1@mzrange[2], mzmax, profStep)) - 1
                 x <- matrix(0, seqlen, dim(obj1@env$profile)[2])
                 obj1@env$profile <- rbind(obj1@env$profile, x)
             }
             if(mzmin < obj2@mzrange[1]){
-                seqlen <- length(seq(mzmin, obj2@mzrange[1], profStep))-1
+                max_missing_rows <- mzval - nrow(obj2@env$profile)
+                low_mz <- seq(mzmin, obj2@mzrange[1], profStep)
+                seqlen <- min(sum(low_mz < obj2@mzrange[1]), max_missing_rows)
+                ## seqlen <- length(seq(mzmin, obj2@mzrange[1], profStep))-1
                 x <- matrix(0, seqlen, dim(obj2@env$profile)[2])
                 obj2@env$profile <- rbind(x, obj2@env$profile)
             }
             if(mzmax > obj2@mzrange[2]){
-                seqlen <- length(seq(obj2@mzrange[2], mzmax, profStep))-1
+                max_missing_rows <- mzval - nrow(obj2@env$profile)
+                high_mz <- seq(obj2@mzrange[2], mzmax, profStep)
+                seqlen <- min(sum(high_mz > obj2@mzrange[2]), max_missing_rows)
+                ## seqlen <- length(seq(obj2@mzrange[2], mzmax, profStep)) - 1
                 x <- matrix(0, seqlen, dim(obj2@env$profile)[2])
                 obj2@env$profile <- rbind(obj2@env$profile, x)
             }
@@ -956,7 +1016,10 @@ setMethod("retcor.obiwarp", "xcmsSet", function(object, plottype = c("none", "de
             intensity1 <- obj1@env$profile
             intensity2 <- obj2@env$profile
-            if ((mzval * valscantime1 != length(intensity1)) || (mzval * valscantime2 != length(intensity2)))
+            ## Final check to ensure that our expansion of profile matrix rows
+            ## was correct.
+            if ((mzval * valscantime1 != length(intensity1)) ||
+                (mzval * valscantime2 != length(intensity2)))
                 stop("Dimensions of profile matrices do not match !\n")
             ## Would it be possible to supply non-binned data too???
diff --git a/inst/NEWS b/inst/NEWS
index fcedb8c84..cebabc787 100644
--- a/inst/NEWS
+++ b/inst/NEWS
@@ -1,3 +1,29 @@
+CHANGES IN VERSION 2.99.6
+-------------------------
+
+NEW FEATURES:
+- calibrate,XCMSnExp method that allows to calibrate chromatographic peaks.
+
+USER VISIBLE CHANGES:
+- Export phenoDataFromPaths function (issue #195).
+- Add arguments mz and rt to featureDefinitions method allowing to extract
+  features within the specified ranges.
+- Increase n for the density function call in the density-based correspondence
+  analysis by a factor of 2.
+- Replace xcmsDirect.Rnw with rmarkdown-based vignette using the new user
+  interface.
+
+
+BUG FIXES:
+- issue #196: removed the unnecessary requirement for same-dimension profile
+  matrices in adjustRtime,XCMSnExp,ObiwarpParam.
+- issue #194: fixes in retcor.obiwarp: 1) subset raw data if scanrange != NULL.
+  2) if the mz ranges of the two files to be aligned differ, expand them
+  correctly. Depending on the profStep and the mz values/ranges the matrices
+  were not expanded correctly.
+- Fix potential problems in the plotChromPeakDensity function.
+ + CHANGES IN VERSION 2.99.5 ------------------------- diff --git a/inst/unitTests/runit.Param-classes.R b/inst/unitTests/runit.Param-classes.R index 6264bd505..7811e25fc 100644 --- a/inst/unitTests/runit.Param-classes.R +++ b/inst/unitTests/runit.Param-classes.R @@ -802,8 +802,6 @@ test_GenericParam <- function() { } test_FillChromPeaksParam <- function() { - library(xcms) - library(RUnit) ## Check getter/setter methods: p <- new("FillChromPeaksParam", expandMz = 0.8) checkEquals(expandMz(p), 0.8) @@ -832,3 +830,26 @@ test_FillChromPeaksParam <- function() { checkException(ppm(p) <- c(2, 2)) checkException(ppm(p) <- -2) } + + +test_CalibrantMassParam <- function() { + + p <- new("CalibrantMassParam") + checkTrue(validObject(p)) + p@method <- "other" + checkException(validObject(p)) + mzs <- rnorm(200, mean = 500) + p <- new("CalibrantMassParam") + p@mz <- list(mzs) + checkException(validObject(p)) + + ## Constructor. + p <- xcms:::CalibrantMassParam(mz = mzs, mzabs = 3, mzppm = 9, + neighbors = 4, method = "shift") + checkEquals(xcms:::.mz(p)[[1]], sort(mzs)) + checkEquals(xcms:::.mzabs(p), 3) + checkEquals(xcms:::.mzppm(p), 9) + checkEquals(xcms:::.neighbors(p), 4L) + checkEquals(xcms:::.method(p), "shift") + +} diff --git a/inst/unitTests/runit.XCMSnExp.R b/inst/unitTests/runit.XCMSnExp.R index 062ad07e0..5c7519f3d 100644 --- a/inst/unitTests/runit.XCMSnExp.R +++ b/inst/unitTests/runit.XCMSnExp.R @@ -193,6 +193,33 @@ test_XCMSnExp_class_accessors <- function() { checkTrue(hasChromPeaks(xod)) checkTrue(hasFeatures(xod)) checkEquals(featureDefinitions(xod), fd) + ## featureDefinitions with mz and/or rt range: + feat_def <- featureDefinitions(xod_xg) + ## Within + mzr <- c(300, 330) + keep_mz <- feat_def$mzmin > mzr[1] & feat_def$mzmax < mzr[2] + checkEquals(featureDefinitions(xod_xg, mz = mzr, type = "within"), + feat_def[keep_mz, ]) + rtr <- c(3000, 3800) + keep_rt <- feat_def$rtmin > rtr[1] & feat_def$rtmax < rtr[2] + checkEquals(featureDefinitions(xod_xg, rt = rtr, type = "within"), + feat_def[keep_rt, ]) + checkEquals(featureDefinitions(xod_xg, rt = rtr, mz = mzr, type = "within"), + feat_def[keep_rt & keep_mz, ]) + ## Any + mzr <- range(featureDefinitions(xod_xg)[2, "mzmed"]) + keep_mz <- feat_def$mzmax >= mzr[1] & feat_def$mzmin <= mzr[2] + checkEquals(featureDefinitions(xod_xg, mz = mzr, type = "any"), + feat_def[keep_mz, , drop = FALSE]) + rtr <- range(3420.006) + keep_rt <- feat_def$rtmax >= rtr[1] & feat_def$rtmin <= rtr[2] + checkTrue(nrow(featureDefinitions(xod_xg, rt = rtr, type = "within")) != + nrow(featureDefinitions(xod_xg, rt = rtr, type = "any"))) + checkEquals(featureDefinitions(xod_xg, rt = rtr, type = "any"), + feat_def[keep_rt, , drop = FALSE]) + checkEquals(featureDefinitions(xod_xg, rt = rtr, mz = mzr, type = "any"), + feat_def[keep_rt & keep_mz, , drop = FALSE]) + ## adjustedRtime checkTrue(!hasAdjustedRtime(xod)) xod2 <- xod diff --git a/inst/unitTests/runit.calibrate.R b/inst/unitTests/runit.calibrate.R new file mode 100644 index 000000000..f9bf019e7 --- /dev/null +++ b/inst/unitTests/runit.calibrate.R @@ -0,0 +1,67 @@ +test_calibrate_XCMSnExp <- function() { + do_plot <- FALSE + + tmp <- filterFile(faahko_xod, file = 1) + + ## Check shift calibration. 
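+    ## The calibrants are mz values of a few detected peaks shifted by
+    ## +0.0001, so after calibration with method = "shift" every peak mz
+    ## should be increased by exactly that amount while all other chromPeaks
+    ## columns stay unchanged.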
+ mzs <- chromPeaks(tmp)[c(3, 6, 7, 13, 17, 32, 45)] + mzs_shift <- mzs + 0.0001 + prm <- CalibrantMassParam(mz = mzs_shift, method = "shift") + res <- calibrate(tmp, prm) + checkTrue(isCalibrated(res)) + checkEquals(chromPeaks(tmp)[, -1], chromPeaks(res)[, -1]) + checkEquals(chromPeaks(tmp)[, 1] + 0.0001, chromPeaks(res)[, 1]) + diffs <- chromPeaks(res)[, "mz"] - chromPeaks(tmp)[, "mz"] + X <- chromPeaks(res)[, "mz"] + if (do_plot) + plot(X, diffs) + + ## Check linear. + mzs_lin <- mzs + 0.00005 + mzs * 0.000002 + max_dif <- max(mzs_lin - mzs) + prm <- CalibrantMassParam(mz = mzs_lin, method = "linear", mzabs = max_dif) + res <- calibrate(tmp, prm) + checkTrue(isCalibrated(res)) + diffs <- chromPeaks(res)[, "mz"] - chromPeaks(tmp)[, "mz"] + X <- chromPeaks(res)[, "mz"] + if (do_plot) + plot(X, diffs) + res_lm <- lm(diffs ~ X) + checkEquals(unname(coefficients(res_lm)[1]), 0.00005, tolerance = 1e-5) + checkEquals(unname(coefficients(res_lm)[2]), 0.000002, tolerance = 1e-5) + + ## edgeshift + prm <- CalibrantMassParam(mz = mzs_lin, method = "edgeshift", + mzabs = max_dif) + res <- calibrate(tmp, prm) + checkTrue(isCalibrated(res)) + diffs <- chromPeaks(res)[, "mz"] - chromPeaks(tmp)[, "mz"] + X <- chromPeaks(res)[, "mz"] + if (do_plot) + plot(X, diffs) + mz_sorted <- chromPeaks(tmp)[, "mz"] + ## Diff has to be constant before and after the linear range. + lower_idx <- which(chromPeaks(tmp)[, "mz"] < min(mzs)) + checkTrue(all(diffs[lower_idx] == diffs[lower_idx][1])) + upper_idx <- which(chromPeaks(tmp)[, "mz"] > max(mzs)) + checkTrue(all(diffs[upper_idx] == diffs[upper_idx][1])) + lin_idx <- 1:length(diffs) + lin_idx <- lin_idx[!(lin_idx %in% lower_idx)] + lin_idx <- lin_idx[!(lin_idx %in% upper_idx)] + lin_mod <- lm(diffs[lin_idx] ~ X[lin_idx]) + checkEquals(unname(coefficients(lin_mod)[1]), 0.00005, tolerance = 1e-5) + checkEquals(unname(coefficients(lin_mod)[2]), 0.000002, tolerance = 1e-5) + + ## Test with a single mass, fall back to shift. + prm <- CalibrantMassParam(mz = mzs_lin[1], method = "edgeshift", + mzabs = max_dif) + res <- calibrate(tmp, prm) + diffs <- chromPeaks(res)[, "mz"] - chromPeaks(tmp)[, "mz"] + min_diff <- min(abs(chromPeaks(tmp)[, "mz"] - mzs_lin[1])) + checkEquals(diffs, rep(min_diff, length(diffs))) + + ## Check errors. + checkException(calibrate(tmp, 4)) + checkException(calibrate(tmp, CalibrantMassParam(mz = list(mzs, mzs)))) +} + diff --git a/inst/unitTests/runit.matchpeaks.R b/inst/unitTests/runit.matchpeaks.R new file mode 100644 index 000000000..09f6e325f --- /dev/null +++ b/inst/unitTests/runit.matchpeaks.R @@ -0,0 +1,92 @@ +## Check functions in matchpeaks.R +dontrun_matchpeaks <- function() { + library(xcms) + library(RUnit) + library(msdata) + mzdatapath <- system.file("fticr", package = "msdata") + mzdatafiles <- list.files(mzdatapath, recursive = TRUE, full.names = TRUE) + + xs4 <- xcmsSet( + method = "MSW", + files = mzdatafiles[1], + scales = c(1,4, 9), + nearbyPeak = T, + verbose.columns = FALSE, + winSize.noise = 500, + SNR.method = "data.mean", + snthr = 10) + + ## Define the calibrants. 
+ masslist <- xs4@peaks[c(1, 4, 7), "mz"] + xs4@peaks[,"mz"] <- xs4@peaks[,"mz"] + + 0.00001*runif(1,0,0.4)*xs4@peaks[,"mz"] + 0.0001 + + xs4c <- calibrate(xs4, + calibrants=masslist, + method="edgeshift", + mzabs=0.0001, + mzppm=5, + neighbours=3, + plotres=FALSE + ) + ## Do the steps separately + pkl <- peaks(xs4) + mpks <- xcms:::matchpeaks(pkl, masslist) + estm <- xcms:::estimate(mpks) +} + +test_matchpeaks <- function() { + library(xcms) + faahko_file <- system.file('cdf/KO/ko15.CDF', package = "faahKO") + + faahko_xs <- xcmsSet(faahko_file, profparam = list(step = 0), + method = "centWave", noise = 10000, snthresh = 40) + pks <- peaks(faahko_xs) + + calibs <- pks[c(3, 5, 7, 13, 17, 29), "mz"] + + res <- xcms:::matchpeaks(pks, calibs) + res_2 <- xcms:::matchpeaks(pks[order(pks[, "mz"]), ], calibs) +} + +dontrun_implementation_matchpeaks <- function() { + ## Check that xcms:::matchpeaks and xcms:::.matchpeaks2 return the same + ## results. + set.seed(123) + pks <- cbind(mz = sort(abs(rnorm(300, mean = 200, sd = 3))), + into = abs(rnorm(300, mean = 2000, sd = 700))) + masses_idx <- sort(sample(1:nrow(pks), size = 50)) + masses <- pks[masses_idx, "mz"] + res_old <- xcms:::matchpeaks(pks, masses) + res_new <- xcms:::.matchpeaks2(pks, masses) + checkEquals(res_old, res_new) + + res_old <- xcms:::matchpeaks(pks, masses, mzabs = 0, mzppm = 0) + checkEquals(res_old[, "pos"], masses_idx) + res_new <- xcms:::.matchpeaks2(pks, masses, mzabs = 0, mzppm = 0) + checkEquals(res_old, res_new) + + res_old <- xcms:::matchpeaks(pks, masses, mzppm = 0, mzabs = 0.01) + res_new <- xcms:::.matchpeaks2(pks, masses, mzppm = 0, mzabs = 0.01) + checkEquals(res_old, res_new) + + + ## Real data... peaks have to be mz sorted! Report that as issue? + pks <- chromPeaks(faahko_xod) + pks_2 <- pks[pks[, "sample"] == 2, ] + masses_idx <- c(4, 13, 32, 33, 37, 41, 45, 53, 58, 67, 74, 88, 90) + + masses <- pks_2[masses_idx, "mz"] + masses_order <- order(masses) + res_old <- xcms:::matchpeaks(pks_2, masses) + res_new <- xcms:::.matchpeaks2(pks_2, masses) + + ## Why the heck does matchpeaks not work??? + + masses <- masses + 3 * masses / 1e6 + + library(microbenchmark) + microbenchmark(xcms:::matchpeaks(pks, masses), + xcms:::.matchpeaks2(pks, masses)) + ## 10x faster. 
+} diff --git a/inst/unitTests/runit.xcmsSet.R b/inst/unitTests/runit.xcmsSet.R index 37e2fc1c1..7ef0af193 100644 --- a/inst/unitTests/runit.xcmsSet.R +++ b/inst/unitTests/runit.xcmsSet.R @@ -39,3 +39,11 @@ test.xcmsSetParallel <- function() { checkTrue (nrow((peaks(xset1)@.Data)) == nrow((peaks(xset2)@.Data))) } + +test.phenoDataFromPaths <- function() { + base_dir <- system.file("cdf", package = "faahKO") + cdf_files <- list.files(base_dir, recursive = TRUE, full.names = TRUE) + pd <- phenoDataFromPaths(cdf_files) + checkTrue(colnames(pd) == "class") + checkEquals(levels(pd$class), c("KO", "WT")) +} diff --git a/man/XCMSnExp-class.Rd b/man/XCMSnExp-class.Rd index 6e92b408b..a31d0c54e 100644 --- a/man/XCMSnExp-class.Rd +++ b/man/XCMSnExp-class.Rd @@ -104,7 +104,8 @@ processHistoryTypes() \S4method{adjustedRtime}{XCMSnExp}(object) <- value -\S4method{featureDefinitions}{XCMSnExp}(object) +\S4method{featureDefinitions}{XCMSnExp}(object, mz = numeric(), + rt = numeric(), ppm = 0, type = "any") \S4method{featureDefinitions}{XCMSnExp}(object) <- value @@ -194,12 +195,12 @@ specifying the index of the files/samples for which the \item{bySample}{logical(1) specifying whether results should be grouped by sample.} -\item{rt}{optional \code{numeric(2)} defining the retention time range for -which chromatographic peaks should be returned.} - \item{mz}{optional \code{numeric(2)} defining the mz range for which chromatographic peaks should be returned.} +\item{rt}{optional \code{numeric(2)} defining the retention time range for +which chromatographic peaks should be returned.} + \item{ppm}{optional \code{numeric(1)} specifying the ppm by which the \code{mz} range should be extended. For a value of \code{ppm = 10}, all peaks within \code{mz[1] - ppm / 1e6} and \code{mz[2] + ppm / 1e6} are @@ -369,6 +370,8 @@ process histories. These can be passed with argument \code{type} to the \code{featureDefinitions}, \code{featureDefinitions<-}: extract or set the correspondence results, i.e. the mz-rt features (peak groups). + Similar to the \code{chromPeaks} it is possible to extract features for + specified m/z and/or rt ranges (see \code{chromPeaks} for more details). \code{chromPeaks}, \code{chromPeaks<-}: extract or set the matrix containing the information on identified chromatographic diff --git a/man/calibrate-calibrant-mass.Rd b/man/calibrate-calibrant-mass.Rd new file mode 100644 index 000000000..f4bbea29e --- /dev/null +++ b/man/calibrate-calibrant-mass.Rd @@ -0,0 +1,95 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataClasses.R, R/functions-Params.R, +% R/functions-XCMSnExp.R, R/methods-XCMSnExp.R +\docType{class} +\name{CalibrantMassParam-class} +\alias{CalibrantMassParam-class} +\alias{mz,CalibrantMassParam} +\alias{CalibrantMassParam} +\alias{isCalibrated} +\alias{calibrate,XCMSnExp-method} +\title{Calibrant mass based calibration of chromatgraphic peaks} +\usage{ +CalibrantMassParam(mz = list(), mzabs = 1e-04, mzppm = 5, neighbors = 3, + method = "linear") + +isCalibrated(object) + +\S4method{calibrate}{XCMSnExp}(object, param) +} +\arguments{ +\item{mz}{a \code{numeric} or \code{list} of \code{numeric} vectors with reference mz +values. If a \code{numeric} vector is provided, this is used for each sample +in the \code{XCMSnExp} object. 
If a \code{list} is provided, its length has to be
+equal to the number of samples in the experiment.}
+
+\item{mzabs}{\code{numeric(1)} the absolute error/deviation for matching peaks to
+calibrants (in Da).}
+
+\item{mzppm}{\code{numeric(1)} the relative error for matching peaks to calibrants
+in ppm (parts per million).}
+
+\item{neighbors}{\code{integer(1)} with the maximal number of peaks within the
+permitted distance to the calibrants that are considered. Among these the
+mz value of the peak with the largest intensity is used in the
+calibration function estimation.}
+
+\item{method}{\code{character(1)} defining the method that should be used to
+estimate the calibration function. Can be \code{"shift"}, \code{"linear"} (default)
+or \code{"edgeshift"}.}
+
+\item{object}{An \link{XCMSnExp} object.}
+
+\item{param}{The \code{CalibrantMassParam} object with the calibration settings.}
+}
+\value{
+For \code{CalibrantMassParam}: a \code{CalibrantMassParam} instance.
+For \code{calibrate}: an \link{XCMSnExp} object with chromatographic peaks being
+calibrated. \strong{Be aware} that the actual raw mz values are not (yet)
+calibrated, but \strong{only} the identified chromatographic peaks.
+
+The \code{CalibrantMassParam} function returns an instance of
+the \code{CalibrantMassParam} class with all settings and properties set.
+
+The \code{calibrate} method returns an \link{XCMSnExp} object with the
+chromatographic peaks being calibrated. Note that \strong{only} the detected
+peaks are calibrated, but not the individual mz values in each spectrum.
+}
+\description{
+Calibrate peaks using mz values of known masses/calibrants.
+mz values of identified peaks are adjusted based on peaks that are close
+to the provided mz values. See details below for more information.
+
+The \code{isCalibrated} function returns \code{TRUE} if the chromatographic
+peaks of the \link{XCMSnExp} \code{object} were calibrated and \code{FALSE} otherwise.
+}
+\details{
+The method first identifies peaks that are close to the provided
+mz values and, given that their difference to the calibrants is smaller
+than the user-provided cut-off (based on arguments \code{mzabs} and \code{mzppm}),
+replaces their mz values with the provided mz values. The mz values
+of all other peaks are either globally shifted (for \code{method = "shift"})
+or adjusted based on a linear model estimated through all calibrants
+(for \code{method = "linear"}).
+Peaks are considered close to a calibrant mz if the difference between
+the calibrant and the peak's mz is \code{<= mzabs + mz * mzppm / 1e6}.
+
+\strong{Adjustment methods}: the adjustment function/factor is estimated using
+the difference between calibrant and peak mz values, considering only peaks
+that are close enough to the calibrants. The available methods are:
+\itemize{
+\item \code{shift}: shifts the m/z of each peak by a global factor which
+corresponds to the average difference between peak mz and calibrant mz.
+\item \code{linear}: fits a linear model through the differences between
+calibrant and peak mz values and adjusts the mz values of all peaks
+using this model.
+\item \code{edgeshift}: performs the same adjustment as \code{linear} for peaks
+within the mz range of the calibrants and shifts peaks outside of this range
+by a constant factor.
+}
+}
+\note{
+\code{CalibrantMassParam} classes don't have exported getter or setter
+methods.
+} +\author{ +Joachim Bargsten, Johannes Rainer +} diff --git a/man/phenoDataFromPaths.Rd b/man/phenoDataFromPaths.Rd new file mode 100644 index 000000000..edb550196 --- /dev/null +++ b/man/phenoDataFromPaths.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions-xcmsSet.R +\name{phenoDataFromPaths} +\alias{phenoDataFromPaths} +\title{Derive experimental design from file paths} +\usage{ +phenoDataFromPaths(paths) +} +\arguments{ +\item{paths}{\code{character} representing the file names (including the full +path) of the experiment's files.} +} +\description{ +The \code{phenoDataFromPaths} function builds a \code{data.frame} +representing the experimental design from the folder structure in which +the files of the experiment are located. +} +\note{ +This function is used by the \emph{old} \code{xcmsSet} function to guess +the experimental design (i.e. group assignment of the files) from the +folders in which the files of the experiment can be found. +} +\examples{ +## List the files available in the faahKO package +base_dir <- system.file("cdf", package = "faahKO") +cdf_files <- list.files(base_dir, recursive = TRUE, full.names = TRUE) +} diff --git a/man/plotChromPeakDensity.Rd b/man/plotChromPeakDensity.Rd index 4909403c2..1bedc0548 100644 --- a/man/plotChromPeakDensity.Rd +++ b/man/plotChromPeakDensity.Rd @@ -4,12 +4,12 @@ \alias{plotChromPeakDensity} \title{Plot chromatographic peak density along the retention time axis} \usage{ -plotChromPeakDensity(object, mz, rt, param = PeakDensityParam(), +plotChromPeakDensity(object, mz, rt, param, simulate = TRUE, col = "#00000080", xlab = "retention time", ylab = "sample", - xlim = range(rt), ...) + xlim = range(rt), main = NULL, ...) } \arguments{ -\item{object}{A \code{\link{XCMSnExp}} object with identified +\item{object}{A \link{XCMSnExp} object with identified chromatographic peaks.} \item{mz}{\code{numeric(2)} defining an mz range for which the peak density @@ -19,8 +19,16 @@ should be plotted.} peak density should be plotted. Defaults to the absolute retention time range of \code{object}.} -\item{param}{\code{\link{PeakDensityParam}} from which parameters for the -\emph{peak density} correspondence algorithm can be extracted.} +\item{param}{\link{PeakDensityParam} from which parameters for the +\emph{peak density} correspondence algorithm can be extracted. If not provided +and if \code{object} contains feature definitions with the correspondence/ +peak grouping being performed by the \emph{peak density} method, the +corresponding parameter class stored in \code{object} is used.} + +\item{simulate}{\code{logical(1)} defining whether correspondence should be +simulated within the specified m/z / rt region or (with +\code{simulate = FALSE}) whether the results from an already performed +correspondence should be shown.} \item{col}{Color to be used for the individual samples. Length has to be 1 or equal to the number of samples in \code{object}.} @@ -32,32 +40,40 @@ or equal to the number of samples in \code{object}.} \item{xlim}{\code{numeric(2)} representing the limits for the x-axis. Defaults to the range of the \code{rt} parameter.} -\item{...}{Additional parameters to be passed to the \code{plot} function.} +\item{main}{\code{character(1)} defining the title of the plot. By default +(for \code{main = NULL}) the mz-range is used.} + +\item{...}{Additional parameters to be passed to the \code{plot} function. 
Data +point specific parameters such as \code{bg} or \code{pch} have to be of length 1 +or equal to the number of samples.} } \value{ The function is called for its side effect, i.e. to create a plot. } \description{ Plot the density of chromatographic peaks along the retention - time axis and indicate which peaks would be grouped into the same feature - based using the \emph{peak density} correspondence method. Settings for - the \emph{peak density} method can be passed with an - \code{\link{PeakDensityParam}} object to parameter \code{param}. +time axis and indicate which peaks would be (or were) grouped into the +same feature based using the \emph{peak density} correspondence method. +Settings for the \emph{peak density} method can be passed with an +\link{PeakDensityParam} object to parameter \code{param}. If the \code{object} contains +correspondence results and the correspondence was performed with the +\emph{peak groups} method, the results from that correspondence can be +visualized setting \code{simulate = FALSE}. } \details{ The \code{plotChromPeakDensity} function allows to evaluate - different settings for the \emph{peak density} on an mz slice of - interest (e.g. containing chromatographic peaks corresponding to a known - metabolite). - The plot shows the individual peaks that were detected within the - specified \code{mz} slice at their retention time (x-axis) and sample in - which they were detected (y-axis). The density function is plotted as a - black line. Parameters for the \code{density} function are taken from the - \code{param} object. Grey rectangles indicate which chromatographic peaks - would be grouped into a feature by the \emph{peak density} correspondence - method. Parameters for the algorithm are also taken from \code{param}. - See \code{\link{groupChromPeaks-density}} for more information about the - algorithm and its supported settings. +different settings for the \emph{peak density} on an mz slice of +interest (e.g. containing chromatographic peaks corresponding to a known +metabolite). +The plot shows the individual peaks that were detected within the +specified \code{mz} slice at their retention time (x-axis) and sample in +which they were detected (y-axis). The density function is plotted as a +black line. Parameters for the \code{density} function are taken from the +\code{param} object. Grey rectangles indicate which chromatographic peaks +would be grouped into a feature by the \code{peak density} correspondence +method. Parameters for the algorithm are also taken from \code{param}. +See \code{\link[=groupChromPeaks-density]{groupChromPeaks-density()}} for more information about the +algorithm and its supported settings. } \examples{ @@ -94,8 +110,8 @@ plotChromPeakDensity(res, mz = mzr, pch = 16, param = PeakDensityParam(minFraction = 1)) } \seealso{ -\code{\link{groupChromPeaks-density}} for details on the - \emph{peak density} correspondence method and supported settings. +\code{\link[=groupChromPeaks-density]{groupChromPeaks-density()}} for details on the +\emph{peak density} correspondence method and supported settings. } \author{ Johannes Rainer diff --git a/vignettes/new_functionality.org b/vignettes/new_functionality.org index 2898429ea..c5326db28 100644 --- a/vignettes/new_functionality.org +++ b/vignettes/new_functionality.org @@ -1675,4 +1675,36 @@ For each sample: - call the =matchpeaks= function on the peaks matrix and the calibrants (which is supposed to be a numeric vector of mz values. +Global concept: calibration is done on the peaks. 
Questions: ++ Is there a global calibration value for a file we could store into the + =XCMSnExp= object? If yes we could even apply the calibration to the individual + mz values of a file. Actually, yes, the calibration results could be stored on + a per-file basis in the =XCMSnExp=. Problem is we can not apply one global + calibration to all files. So adding this to the processing queue seems to be a + no-go. + ++ We can add a function to the =processingQueue= that applies different + adjustments depending on the =fileIdx=. Be aware! All subsetting/filtering + approaches do have to update the file index in the =processingQueue=. + +*Idea*: don't need the result class below - should be enough to add the +calibration function (inclusive parameters) to the =processingQueue= of the =MSnExp= object! + +*NOTE*: to enable calibration of =mz= values of a =Spectrum=: ++ Implement a =CalibrationResult= object with slots: + - method + - minMz + - maxMz + - fileIdx + - slope + - intercept ++ Enable adding a =list= of these objects into =MsFeatureData=. ++ Add methods to drop/delete such objects from =MsFeatureData=. ++ =dropChromPeaks= should also drop the =list=. ++ Add function to subset the =list= in the =MsFeatureData=. ++ On subsetting: do also subset the =list=. ++ Implement a =dropCalibration= method that does restore the original mz values. + + + * References diff --git a/vignettes/xcms-direct-injection.Rmd b/vignettes/xcms-direct-injection.Rmd new file mode 100644 index 000000000..575aeb1f8 --- /dev/null +++ b/vignettes/xcms-direct-injection.Rmd @@ -0,0 +1,242 @@ +--- +title: "Grouping FTICR-MS data with xcms" +author: +- name: Joachim Bargsten +- name: Johannes Rainer +package: xcms +output: + BiocStyle::html_document2: + toc_float: true +vignette: > + %\VignetteIndexEntry{Grouping FTICR-MS data with xcms} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteKeywords{Mass Spectrometry, MS, Metabolomics, Bioinformatics} + %\VignetteEncoding{UTF-8} + %\VignetteDepends{xcms,msdata,MassSpecWavelet,BiocStyle} +--- + +# Introduction + +```{r echo = FALSE, results = "hide", message = FALSE} +library(BiocStyle) +``` + +This document describes how to use `r Biocpkg("xcms")` for the analysis of +direct injection mass spec data, including peak detection, calibration and +correspondence (grouping of peaks across samples). + +# Peak detection + +Prior to any other analysis step, peaks have to be identified in the mass spec +data. In contrast to the typical metabolomics workflow, in which peaks are +identified in the chromatographic (time) dimension, in direct injection mass +spec data sets peaks are identified in the m/z dimension. `r Biocpkg("xcms")` +uses functionality from the `MassSpecWavelet` package to identify such peaks. + +Below we load the required packages and set the parallel processing up +(depending on the operating system). + +```{r load-libs, message = FALSE, results = "hide"} +library(xcms) +library(MassSpecWavelet) + +if (.Platform$OS.type == "unix") { + prm <- MulticoreParam(2) +} else { + prm <- SnowParam(2) +} +register(bpstart(prm)) + + +``` + +In this documentation we use an example data set from the `r Biocpkg("msdata")` +package. Assuming that `r Biocpkg("msdata")` is installed, we locate the path of +the package and load the data set. We create also a `data.frame` describing the +experimental setup based on the file names. 
+
+```{r load-data, message = FALSE, results = "hide"}
+mzdata_path <- system.file("fticr", package = "msdata")
+mzdata_files <- list.files(mzdata_path, recursive = TRUE, full.names = TRUE)
+
+## Create a data.frame assigning samples to sample groups, i.e. ham4 and ham5.
+grp <- rep("ham4", length(mzdata_files))
+grp[grep(basename(mzdata_files), pattern = "^HAM005")] <- "ham5"
+pd <- data.frame(filename = basename(mzdata_files), sample_group = grp)
+
+## Load the data.
+ham_raw <- readMSData2(files = mzdata_files,
+                       pdata = new("NAnnotatedDataFrame", pd))
+```
+
+The data files are from *direct injection* mass spectrometry experiments,
+i.e. we have only a single spectrum available for each sample and no retention
+times.
+
+```{r}
+## Only a single spectrum with an *artificial* retention time is available
+## for each sample
+rtime(ham_raw)
+```
+
+Peaks are identified within each spectrum using the *mass spec wavelet* method.
+
+```{r msw}
+## Define the parameters for the peak detection
+msw <- MSWParam(scales = c(1, 4, 9), nearbyPeak = TRUE, winSize.noise = 500,
+                SNR.method = "data.mean", snthresh = 10)
+
+ham_prep <- findChromPeaks(ham_raw, param = msw)
+
+head(chromPeaks(ham_prep))
+```
+
+# Calibration
+
+The `calibrate` method can be used to correct the m/z values of identified
+peaks. The currently implemented method requires identified peaks and a list of
+m/z values for known calibrants. The identified peaks' m/z values are then
+adjusted based on the differences between the calibrants' m/z values and the m/z
+values of the closest peaks (within a user-defined maximal distance). Note that
+this method presently calibrates only the identified peaks, but not the
+original m/z values in the spectra.
+
+Below we demonstrate the `calibrate` method on one of the data files with
+artificially defined calibration m/z values. We first subset the data set to the
+first data file, extract the m/z values of 3 peaks and modify these values
+slightly.
+
+```{r message = FALSE}
+## Subset to the first file.
+first_file <- filterFile(ham_prep, file = 1)
+
+## Extract 3 m/z values
+calib_mz <- chromPeaks(first_file)[c(1, 4, 7), "mz"]
+calib_mz <- calib_mz + 0.00001 * runif(1, 0, 0.4) * calib_mz + 0.0001
+```
+
+Next we calibrate the data set using the previously defined *artificial*
+calibrants. We use the `"edgeshift"` method for calibration, which adjusts all
+peaks within the m/z range of the calibrants using a linear interpolation and
+shifts all chromatographic peaks outside of that range by a constant factor
+(the difference between the lowest, respectively highest, calibrant m/z and the
+m/z of its closest peak). Note that in a *real* use case the m/z values would
+represent known m/z of calibrants and would not be derived from the actual data.
+
+```{r message = FALSE}
+## Set-up the parameter class for the calibration
+prm <- CalibrantMassParam(mz = calib_mz, method = "edgeshift",
+                          mzabs = 0.0001, mzppm = 5)
+first_file_calibrated <- calibrate(first_file, param = prm)
+```
+
+To evaluate the calibration we plot below the difference between the adjusted
+and raw m/z values (y-axis) against the raw m/z values.
+
+```{r calibration-result, fig = TRUE, fig.align = "center"}
+diffs <- chromPeaks(first_file_calibrated)[, "mz"] -
+    chromPeaks(first_file)[, "mz"]
+
+plot(x = chromPeaks(first_file)[, "mz"], xlab = expression(m/z[raw]),
+     y = diffs, ylab = expression(m/z[calibrated] - m/z[raw]))
+```
+
+# Correspondence
+
+Correspondence aims to group peaks across samples to define the *features* (ions
+with the same m/z values). Peaks from single-spectrum, direct injection MS
+experiments can be grouped with the *MZclust* method. Below we perform the
+correspondence analysis with the `groupChromPeaks` method using default
+settings.
+
+```{r correspondence, message = FALSE, results = "hide"}
+## Using default settings but define sample group assignment
+mzc_prm <- MzClustParam(sampleGroups = ham_prep$sample_group)
+ham_prep <- groupChromPeaks(ham_prep, param = mzc_prm)
+```
+
+Getting an overview of the performed processing steps:
+
+```{r}
+ham_prep
+```
+
+The peak group information, i.e. the *feature* definitions, can be accessed with
+the `featureDefinitions` method.
+
+```{r}
+featureDefinitions(ham_prep)
+```
+
+Plotting the raw data for direct injection samples involves a little more
+processing than for LC/GC-MS data, for which we can simply use the
+`chromatogram` method to extract the data. Below we extract the m/z-intensity
+pairs for the peaks associated with the first feature. We thus first identify
+the peaks for that feature and define the range of their m/z values. Using this
+range we can subsequently use the `filterMz` function to subset the full data
+set to the signal associated with the feature's peaks. On that object we can
+then call the `mz` and `intensity` functions to extract the data.
+
+```{r feature-FT01, fig = TRUE, fig.width = 6, fig.height = 4, fig.align = "center"}
+## Get the peaks belonging to the first feature
+pks <- chromPeaks(ham_prep)[featureDefinitions(ham_prep)$peakidx[[1]], ]
+
+## Define the m/z range
+mzr <- c(min(pks[, "mzmin"]) - 0.001, max(pks[, "mzmax"]) + 0.001)
+
+## Subset the object to the m/z range
+ham_prep_sub <- filterMz(ham_prep, mz = mzr)
+
+## Extract the mz and intensity values
+mzs <- mz(ham_prep_sub, bySample = TRUE)
+ints <- intensity(ham_prep_sub, bySample = TRUE)
+
+## Plot the data
+plot(3, 3, pch = NA, xlim = range(mzs), ylim = range(ints), main = "FT01",
+     xlab = "m/z", ylab = "intensity")
+## Define colors
+cols <- rep("#ff000080", length(mzs))
+cols[ham_prep_sub$sample_group == "ham5"] <- "#0000ff80"
+tmp <- mapply(mzs, ints, cols, FUN = function(x, y, col) {
+    points(x, y, col = col, type = "l")
+})
+```
+
+To access the actual intensity values of each feature in each sample the
+`featureValues` method can be used. The setting `value = "into"` tells the
+function to return the integrated signal (of one representative peak) for each
+feature and sample.
+
+```{r}
+feat_vals <- featureValues(ham_prep, value = "into")
+head(feat_vals)
+```
+
+`NA` is reported for features in samples for which no peak was identified at the
+feature's m/z value. In some instances there might still be a signal at the
+feature's position in the raw data files, but the peak detection failed to
+identify a peak. For these cases the signal can be recovered using the
+`fillChromPeaks` method, which integrates all raw signal at the feature's
+location. If there is no signal at that location an `NA` is reported.
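+
+Before filling in these missing values it can be helpful to know how many
+feature values are actually missing in each sample. Below is a quick,
+illustrative check (not part of the original workflow) using base R on the
+matrix returned by `featureValues`.
+
+```{r count-missing-per-sample}
+## Number of features without a detected peak in each sample.
+colSums(is.na(featureValues(ham_prep, value = "into")))
+```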
+ +```{r fillpeaks, message = FALSE} +ham_prep <- fillChromPeaks(ham_prep, param = FillChromPeaksParam()) + +head(featureValues(ham_prep, value = "into")) +``` + +# Further analysis + +Further analysis, i.e. detection of features/metabolites with significantly +different abundances, or PCA analyses can be performed on the feature matrix +using functionality from other R packages, such as `r Biocpkg("limma")`. + diff --git a/vignettes/xcmsDirect.Rnw b/vignettes/xcmsDirect.Rnw deleted file mode 100644 index c8779eecd..000000000 --- a/vignettes/xcmsDirect.Rnw +++ /dev/null @@ -1,187 +0,0 @@ -% -% NOTE -- ONLY EDIT THE .Rnw FILE!!! The .tex file is -% likely to be overwritten. -% -%\VignetteIndexEntry{Grouping FTICR-MS data with xcms} -%\VignetteKeywords{preprocess, analysis} -%\VignettePackage{xcms} -\documentclass[12pt]{article} - -\usepackage{hyperref} - -\newcommand{\Robject}[1]{{\texttt{#1}}} -\newcommand{\Rfunction}[1]{{\texttt{#1}}} -\newcommand{\Rpackage}[1]{{\textit{#1}}} -\newcommand{\Rclass}[1]{{\textit{#1}}} -\newcommand{\Rmethod}[1]{{\textit{#1}}} -\newcommand{\Rfunarg}[1]{{\textit{#1}}} - -\textwidth=6.2in -\textheight=8.5in -%\parskip=.3cm -\oddsidemargin=.1in -\evensidemargin=.1in -\headheight=-.3in - -\begin{document} -\title{Grouping FTICR-MS data with xcms} -\author{J. Bargsten} -\maketitle - -\section*{Introduction} - -This document describes how to use \Rpackage{xcms} for aligning multiple MS -spectra against each other. - -\section{Prerequisites} -Lots of Preprocessing has to be done before the data is ready for aligning. -First of all \Rpackage{xcms} and \Rpackage{MassSpecWavelet} -are needed for further processing. - -<>= -library(xcms) -library(MassSpecWavelet) - -if (.Platform$OS.type == "unix") { - prm <- MulticoreParam(2) -} else { - prm <- SnowParam(2) -} -register(bpstart(prm)) - -@ - -This documentation uses raw mzdata files from \Rpackage{msdata} as example data -set. Assuming that \Rpackage{msdata} is installed, we locate the path of the -package and extract the datafiles. - -<>= -library(msdata) -mzdatapath <- system.file("fticr", package = "msdata") -mzdatafiles <- list.files(mzdatapath, recursive = TRUE, full.names = TRUE) -cat("Starting xcmsDirect.Rnw") -@ - -The \Rmethod{xcmsSet}-Constructor parses the given files and applies -peakpicking using the MassSpecWavelet algorithm, leading to a \Robject{xcmsSet} -object with 2 sampleclasses, ham4 and ham5, and 5 samples, respectively. - -<>= -data.mean <- "data.mean" -xs <- xcmsSet( - method="MSW", - files=mzdatafiles, - scales=c(1,4,9), - nearbyPeak=T, - verbose.columns = FALSE, - winSize.noise=500, - SNR.method="data.mean", - snthr=10 -) -@ -\section{Calibration} -\Rmethod{calibrate} can be used to correct the m/z values in a \Robject{xcmsSet}. It needs a xcmsSet and a list of m/z value which should be found in the object. To show this on a example a sample of ham4 is created and discalibrated a bit after getting some m/z: - -<>= - -xs4 <- xcmsSet( - method = "MSW", - files = mzdatafiles[1], - scales = c(1,4, 9), - nearbyPeak = T, - verbose.columns = FALSE, - winSize.noise = 500, - SNR.method = "data.mean", - snthr = 10) - -masslist <- xs4@peaks[c(1,4,7),"mz"] -xs4@peaks[,"mz"] <- xs4@peaks[,"mz"] + 0.00001*runif(1,0,0.4)*xs4@peaks[,"mz"] + 0.0001 -@ - -The \Robject{xcmsSet} now can be calibrated again with the m/z from the masslist. The plot shows the reference masses with the distances to the found ones and the regression-line. 
- -<>= -xs4c <- calibrate(xs4, - calibrants=masslist, - method="edgeshift", - mzabs=0.0001, - mzppm=5, - neighbours=3, - plotres=TRUE - ) -@ - - - -The method "shift" adds a value to each m/z, "linear" does a regression and edgeshift does a regression but uses a shift before the smallest and after the biggest m/z from the calibrants. -\\ -These steps are necessary to create a usable input for \Rmethod{mzClust}. -However, if you have already stored the data in a \Robject{xcmsSet}, you can -skip the steps above. - -\section{Aligning} -Now we can align \Robject{xs} with \Rmethod{mzClust}. The result is a clone of -\Robject{xs} enhanced by the result of \Rmethod{mzClust}. For a description of -the arguments \Rmethod{mzClust} takes, see helppage of the function. - -<>= -xsg <- group(xs, method="mzClust") -xsg -@ - -\Rmethod{mzClust} stores the grouping information like the standard -\Rmethod{group} method of \Rpackage{xcms} suited for retrieval via -\Rmethod{groups} and \Rmethod{groupidx}. An example is shown below. - -<>= -groups(xsg)[1:10,] -peaks(xsg)[groupidx(xsg)[[1]]] -@ - - - -\section{Postprocessing} -In most cases not all samples are in one group. This can be the origin of -serious problems in code, which is based on e.g. -\Rmethod{groupval}. \Rmethod{groupval} sets missing peaks to NA. The solution -is \Rmethod{fillPeaks}. It changes all NA values to random noise based on the raw -data file. -<>= -groupval(xsg)[1,] -xsgf <- fillPeaks(xsg, method="MSW") -groupval(xsgf, "medret", "into")[1:10,] -@ - -The results are suited for instance for heatmaps, etc. - - -\section{Analyzing and Visualizing Results} - -A report showing the most statistically significant differences in -analyte intensities can be generated with the \Rmethod{diffreport} -method. It will automatically sho wthe superimposed peaks in the -spectra for a given number of them, in this case 10. Several of those -chromatograms are shown in Figure~\ref{eic}. - -\begin{figure} -\begin{center} -\begin{tabular}{cc} -\includegraphics[width=0.49\textwidth]{example_spec/001}& -\includegraphics[width=0.49\textwidth]{example_spec/002}\\ -\includegraphics[width=0.49\textwidth]{example_spec/003}& -\includegraphics[width=0.49\textwidth]{example_spec/004}\\ -\end{tabular} -\end{center} -\caption{\label{eic} -Auto-generated extracted spectra for the top three -differentially regulated ions. Darkened lines indicate where the -peaks were integrated for quantitation.} -\end{figure} - -<>= -reporttab <- diffreport(xsgf, "ham4", "ham5", "example", eicmax=4, - h=480, w=640) -reporttab[1:4,] -@ - -\end{document} diff --git a/vignettes/xcmsInstall.Rnw b/vignettes/xcmsInstall.Rnw deleted file mode 100644 index 262408622..000000000 --- a/vignettes/xcmsInstall.Rnw +++ /dev/null @@ -1,203 +0,0 @@ -% -% NOTE -- ONLY EDIT THE .Rnw FILE!!! The .tex file is -% likely to be overwritten. -% -%\VignetteIndexEntry{Installation Instructions for xcms} -%\VignetteDepends{} -%\VignetteKeywords{install} -%\VignettePackage{xcms} -\documentclass[12pt]{article} - -\usepackage{hyperref} - -\newcommand{\Robject}[1]{{\texttt{#1}}} -\newcommand{\Rfunction}[1]{{\texttt{#1}}} -\newcommand{\Rpackage}[1]{{\textit{#1}}} -\newcommand{\Rclass}[1]{{\textit{#1}}} -\newcommand{\Rmethod}[1]{{\textit{#1}}} -\newcommand{\Rfunarg}[1]{{\textit{#1}}} - -\textwidth=6.2in -\textheight=8.5in -%\parskip=.3cm -\oddsidemargin=.1in -\evensidemargin=.1in -\headheight=-.3in - -\begin{document} -\title{Installation Instructions for xcms} -\author{Colin A. 
Smith} -\maketitle - -\section*{Introduction} - -This document describes how to install \Rpackage{xcms} and, if -necessary, also obtain and install \texttt{R}. The \Rpackage{xcms} -package includes \texttt{C} code which needs to be compiled, and -also uses the NetCDF library for reading AIA format NetCDF mass -spectral data files. Pre-compiled binaries which include the NetCDF -library are available for Windows and Mac OS X. Users of Linux and -other platforms must install the NetCDF library themselves. - -The \Rpackage{xcms} package and a demonstration data package, -\Rpackage{faahKO}, are currently available from several sources. -The first is the web site of the Bioconductor open source software -project. (\url{http://www.bioconductor.org/}) The second is the web -site of the METLIN Metabolite Database. -(\url{http://metlin.scripps.edu/download/}) Both sites contain -source and binary distributions, although due to differences in -distribution, one may host a slightly more up-to-date version than -the other. - -\texttt{R} is available for download through the Comprehensive R -Archive Network (CRAN). Visitors are encouraged to use one of the -many local mirrors of the CRAN site for efficient downloading. -(\url{http://cran.r-project.org/mirrors.html}) While the vignettes -included with \Rpackage{xcms} give many examples of \texttt{R} use -and syntax, new users are strongly encouraged to skim some of the -introductory material in the Manuals section of the \texttt{R} web -site. (\url{http://www.r-project.org/}) ``An Introduction to R'' -is an especially useful starting point. - -\section{Windows Installation} - -To make installation as straightforward as possible for all users, -\Rpackage{xcms} includes a binary version of the NetCDF library in -the \texttt{inst/netcdfdll} subdirectory. The version currently -included is 3.6.1-beta1, as available from the Unidata web -site\footnote{\url{http://www.unidata.ucar.edu/packages/netcdf/}}. - -\begin{enumerate} - -\item Download and install the current version of \texttt{R}. -Detailed instructions for doing so are available on the CRAN site. - -\item Once you have \texttt{R} installed, launch it and select -Biodconductor repository using the \texttt{Packages > Select -repositories...} menu item. - -\item Use the package installer to automatically download and install -\Rpackage{multtest} with the \texttt{Packages > Install package(s)...} -menu item. - -\item Download the \Rpackage{xcms} and \Rpackage{faahKO} Windows -binaries from one of the sources listed in the introduction. - -\item Install \Rpackage{xcms} and \Rpackage{faahKO} using the -\texttt{Packages > Install package(s) from local zip files...} menu -item. - -\end{enumerate} - -\section{Mac OS X Installation} - -The Mac OS X binaries of \Rpackage{xcms} may be installed without -the Developer Tools and without a separate installation of the -NetCDF library. Its code is pre-compiled and statically linked -against the NetCDF library version 3.6.0-p1. If you wish to compile -it yourself, you may obtain the library from software distribution -projects such as Fink\footnote{\url{http://fink.sourceforge.net/}} -or DarwinPorts\footnote{\url{http://netcdf.darwinports.com/}}. -Alternatively, you may compile it yourself using the instructions -provided below. - -\begin{enumerate} - -\item Several of the functions in \Rpackage{xcms} which generate -PNG images require an X11 display device. 
If it is not already -installed, you will need to install -X11\footnote{\url{http://www.apple.com/downloads/macosx/apple/x11formacosx.html}}. - -\item Download and install the current version of \texttt{R}. -Detailed instructions for doing so are available on the CRAN site. - -\item Download the \Rpackage{xcms} and \Rpackage{faahKO} Mac OS X -binaries from one of the sources listed in the introduction. - -\item Launch \texttt{R} and open the R Package Installer using the -\texttt{Packages \& Data > Package Installer} menu item. Select the -``At User Level'' radio button. - -\item Select ``BioConductor (binaries)'' from the pop up menu and -use the package installer to automatically download and install -\Rpackage{Biobase} and \Rpackage{multtest}. - -If Bioconductor is preparing for its next release and binaries -aren't yet available for your version of \texttt{R}, you will need -to install the Apple Developer -Tools\footnote{\url{http://connect.apple.com/}} to allow installation -of source Bioconductor packages. Use the ``BioConductor (sources)'' -option instead. - -\item Select ``Local Source Package'' from the popup menu and install -the \Rpackage{xcms} and \Rpackage{faahKO} packages you previously -downloaded. - -\end{enumerate} - -\section{Obtaining the NetCDF Library} - -If you are using Linux or another UNIX-like operating system, you -will have to obtain the NetCDF library before installing and using -\Rpackage{xcms}. Many Linux distributions include that package so -in most cases, locating and installing the version that comes with -your distribution will be the best option. However, you may also -install the NetCDF package using the following instructions: - -\begin{enumerate} - -\item Download the gzipped tar file of the NetCDF source -(\texttt{netcdf.tar.gz}) from the Unidata web-site\footnotemark[1]. -Then extract the archive and change to the source directory. - -\begin{verbatim} -tar -xzf netcdf.tar.gz -cd netcdf-*/src -\end{verbatim} - -\item Configure, compile, and install the library. By default it -is installed in the directory you extracted in the previous step. -If you wish to install it in another location, use the \texttt{---prefix} -argument with the configuration script. - -\begin{verbatim} -./configure --prefix=/usr/local -make -make install -\end{verbatim} - -\end{enumerate} - -It is important to note that, when compiled for the x86\_64 -architecture, the NetCDF library must be compiled with the -\texttt{-fPIC} flag to produce position-independent code. Certain -Linux distributions, such as SuSE Professional 9.2, do not yet -enable that option when packaging the NetCDF library. In that case -you must compile and install the library yourself using the -instructions above. To enable that flag, set the \texttt{CFLAGS} -environment variable prior to compilation. - -\begin{verbatim} -export CFLAGS=-fPIC (for sh, bash, etc.) -setenv CFLAGS -fPIC (for csh, tcsh, etc.) -\end{verbatim} - -\section{Obtaining the \Rpackage{rgl} Package} - -Though not listed in any of the dependencies of \Rpackage{xcms}, -the \Rpackage{rgl} package does get limited use. It provides an -interface for creating interactive, 3D graphics using OpenGL. The -\Rpackage{rgl} package is currently under development and does not -yet provide sufficient functionality or stability to warrant its -listing as an official dependency. It is used by a single -method, \Rmethod{plotSurf}, for plotting out a 3D surface representation -of part of an LC/MS or GC/MS experiment. 
Due to limitations in -\Rpackage{rgl}, axis labels are currently not implemented. However, -it can be instructive for visualizing a small number of peaks in -three dimensions. - -The best place to obtain source and Windows binary packages of -\Rpackage{rgl} is directly from CRAN. Mac OS X binaries are available -in the same place you download the \texttt{R} installer. - -\end{document}