Performance improvement for chromPeakData,XcmsExperimentHdf5

sneumann · Nov 12, 2024 · 4ef9c2b · 4ef9c2b
1 parent 9255c63
commit 4ef9c2b
Show file tree

Hide file tree

Showing 6 changed files with 109 additions and 35 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -66,7 +66,8 @@ Imports:
     Spectra (>= 1.15.7),
     progress,
     RColorBrewer,
-    MetaboCoreUtils (>= 1.11.2)
+    MetaboCoreUtils (>= 1.11.2),
+    data.table
 Suggests:
     BiocStyle,
     caTools,

diff --git a/NAMESPACE b/NAMESPACE
@@ -57,6 +57,8 @@ importFrom("utils", "flush.console", "head", "object.size",
 
 importFrom("MassSpecWavelet", "peakDetectionCWT", "tuneInPeakInfo")
 
+importFrom("data.table", "rbindlist")
+
 ## MSnbase:
 importClassesFrom("MSnbase", "MSnExp", "pSet", "OnDiskMSnExp", "Chromatogram",
                   "MChromatograms", "MSpectra")

diff --git a/R/XcmsExperimentHdf5-functions.R b/R/XcmsExperimentHdf5-functions.R
@@ -302,22 +302,31 @@ NULL
     } else idx_columns <- NULL
     ids <- rep(x@sample_id, length(msLevel))
     msl <- rep(msLevel, each = length(x@sample_id))
+    if (by_sample) {
+        sample_idx <- integer()
+    } else {
+        ## pass the sample index to the import function to add the "sample" col
+        sample_idx <- match(ids, x@sample_id)
+        names(sample_idx) <- paste0("/", ids, "/ms_", msl, "/chrom_peaks")
+    }
     res <- .h5_read_data(x@hdf5_file, id = ids, name = "chrom_peaks",
                          ms_level = msl, read_colnames = TRUE,
                          read_rownames = TRUE, j = idx_columns,
-                         rt = rt, mz = mz, ppm = ppm, type = type)
+                         rt = rt, mz = mz, ppm = ppm, type = type,
+                         sample_index = sample_idx)
     if (by_sample) {
         names(res) <- ids
         res
     } else {
-        l <- vapply(res, nrow, 1L)
-        cbind(do.call(rbind, res), sample = rep(match(ids, x@sample_id), l))
+        do.call(base::rbind, res)
     }
 }
 
 #' Extract the `chromPeakData` data.frame. Using `peaks` allows to reduce memory
 #' demand because only data from the specified chrom peaks is returned. This
 #' assumes that `chromPeaks()` was called before to get the IDs of the peaks.
+#' We're using the data.table::rbindlist to combine the data.frames because its
+#' much faster - but unfortunately drops also the rownames.
 #'
 #' @param x `XcmsExperimentHdf5`
 #'
@@ -330,24 +339,25 @@ NULL
 #' @param by_sample `logical(1)` whether results should be `rbind` or returned
 #'     as a `list` of `data.frame`.
 #'
+#' @importFrom data.table rbindlist
+#'
 #' @noRd
 .h5_chrom_peak_data <- function(x, msLevel = integer(), columns = character(),
                                 peaks = character(), by_sample = TRUE) {
     ids <- rep(x@sample_id, length(msLevel))
     msl <- rep(msLevel, each = length(x@sample_id))
-    ## Eventually pass chrom peak ids along to read only specicic data...
+    names(msl) <- paste0("/", ids, "/ms_", msl, "/chrom_peak_data")
     res <- .h5_read_data(x@hdf5_file, id = ids, name = "chrom_peak_data",
-                         ms_level = msl, read_rownames = TRUE, peaks = peaks)
+                         ms_level = msl, read_rownames = TRUE, peaks = peaks,
+                         ms_levels = msl)
     if (by_sample) {
         names(res) <- ids
-        res <- mapply(FUN = function(a, b) {
-            a$ms_level <- b
-            a
-        }, res, msl, SIMPLIFY = FALSE)
         res
     } else {
-        l <- vapply(res, nrow, 1L)
-        cbind(do.call(rbind, res), ms_level = rep(msl, l))
+        rn <- unlist(lapply(res, rownames), use.names= FALSE, recursive = FALSE)
+        res <- base::as.data.frame(data.table::rbindlist(res))
+        attr(res, "row.names") <- rn
+        res
     }
 }
 
@@ -358,6 +368,15 @@ NULL
                   drop = TRUE)
 }
 
+.h5_chrom_peaks_rownames <- function(x, msLevel = x@chrom_peaks_ms_level) {
+    ids <- rep(x@sample_id, length(msLevel))
+    msl <- rep(msLevel, each = length(x@sample_id))
+    h5 <- rhdf5::H5Fopen(x@hdf5_file)
+    on.exit(invisible(rhdf5::H5Fclose(h5)))
+    d <- paste0("/", ids, "/ms_", msl, "/chrom_peaks_rownames")
+    lapply(d, FUN = rhdf5::h5read, file = h5, drop = TRUE)
+}
+
 .h5_chrom_peak_data_colnames <- function(x, msLevel = 1L) {
     h5 <- rhdf5::H5Fopen(x@hdf5_file)
     on.exit(rhdf5::H5Fclose(h5))
@@ -594,7 +613,7 @@ NULL
                 ms_level = ms_level, read_colnames = TRUE, i = idx,
                 read_rownames = FALSE)[[1L]]
             b$ms_level <- rep(ms_level, length(idx))
-            rownames(b) <- rownames(a)
+            attr(b, "row.names") <- rownames(a)
             tmp <- chr@.Data[j, i][[1L]]
             slot(tmp, "chromPeaks", check = FALSE) <- a
             slot(tmp, "chromPeakData", check = FALSE) <- as(b, "DataFrame")
@@ -697,14 +716,19 @@ NULL
                                         read_rownames = FALSE,
                                         rownames = paste0(name, "_rownames"),
                                         rt = numeric(), mz = numeric(),
-                                        ppm = 0, type = "any") {
+                                        ppm = 0, type = "any",
+                                        sample_index = integer()) {
     read_colnames <- read_colnames || length(rt) > 0 || length(mz) > 0
-    d <- .h5_read_matrix2(name, h5, index, read_colnames, read_rownames,
-                          rownames)
+    d <- .h5_read_matrix2(
+        name, h5, index, read_colnames, read_rownames, rownames)
     if (length(rt) | length(mz))
-        d[.is_chrom_peaks_within_mz_rt(d, rt = rt, mz = mz,
-                                       ppm = ppm, type = type), , drop = FALSE]
-    else d
+        d <- d[.is_chrom_peaks_within_mz_rt(d, rt = rt, mz = mz,
+                                            ppm = ppm, type = type), ,
+               drop = FALSE]
+    ## If sample_index is provided add a column "sample" with the index.
+    if (length(sample_index))
+        d <- cbind(d, sample = sample_index[name])
+    d
 }
 
 #' Read a single `data.frame` from the HDF5 file. With
@@ -737,15 +761,25 @@ NULL
         d <- lapply(d, as.vector)
     d <- as.data.frame(d)
     if (read_rownames)
-        rownames(d) <- rhdf5::h5read(h5, rownames, drop = TRUE)
+        attr(d, "row.names") <- rhdf5::h5read(h5, rownames, drop = TRUE)
     if (is.null(index[[1L]]))
         d
     else d[index[[1L]], , drop = FALSE]
 }
 
+#' Function to read the chromPeakData `data.frame` for one sample.
+#'
+#' @param peaks optional `character` with the chrom peak IDs for which the
+#'     data should be extracted.
+#'
+#' @param ms_levels optional **named** `integer` with the MS levels of all
+#'     imported data sets. At least one of the `names(ms_levels)` should match
+#'     param `name`. If provided, a column `$ms_level` is added to the result.
+#'
+#' @noRd
 .h5_read_chrom_peak_data <- function(name, h5, index = list(NULL, NULL),
                                      read_rownames = FALSE, peaks = character(),
-                                     ...) {
+                                     ms_levels = integer(), ...) {
     cd <- .h5_read_data_frame(
         name, h5, read_rownames = read_rownames || length(peaks) > 0,
         index = index, rownames = sub("_data", "s_rownames", name))
@@ -755,6 +789,8 @@ NULL
         if (!read_rownames)
             rownames(cd) <- NULL
     }
+    if (length(ms_levels))
+        cd$ms_level <- rep(unname(ms_levels[name]), nrow(cd))
     cd
 }
 

diff --git a/R/XcmsExperimentHdf5.R b/R/XcmsExperimentHdf5.R
@@ -42,14 +42,28 @@
 #' chromatographic peak in the chrom peak matrix **of that sample** and
 #' MS level.
 #'
+#' HDF5 does not support parallel processing, thus preprocessing results need
+#' to be loaded sequentially.
+#'
+#' All functionality for `XcmsExperimentHdf5` objects is optimized to reduce
+#' memory demand at the cost of eventually lower performance.
+#'
 #' @section Functionality related to chromatographic peaks:
 #'
-#' - `chromPeakData()` gains a new parameter `peaks` which allows to specify
-#'   from which chromatographic peaks data should be returned. For these
-#'   chromatographic peaks the ID (row name in `chromPeaks()`) should be
-#'   provided with the `peaks` parameter. This can reduce the memory
+#' - `chromPeaks()` gains parameter `bySample = FALSE` that, if set to `TRUE`
+#'   returns a `list` of `chromPeaks` matrices, one for each sample. Due to
+#'   the way data is organized in `XcmsExperimentHdf5` objects this is more
+#'   efficient than `bySample = FALSE`. Thus, in cases where chrom peak data
+#'   is subsequently evaluated or processed by sample, it is suggested to
+#'   use `bySample = TRUE`.
+#'
+#' - `chromPeakData()` gains a new parameter `peaks = character()` which allows
+#'   to specify from which chromatographic peaks data should be returned.
+#'   For these chromatographic peaks the ID (row name in `chromPeaks()`)
+#'   should be provided with the `peaks` parameter. This can reduce the memory
 #'   requirement for cases in which only data of some selected chromatographic
-#'   peaks needs to be extracted.
+#'   peaks needs to be extracted. Also, `chromPeakData()` supports the
+#'   `bySample` parameter described for `chromPeaks()` above.
 #'
 #' @section Retention time alignment:
 #'
@@ -133,7 +147,7 @@ setMethod("show", "XcmsExperimentHdf5", function(object) {
 ##
 ################################################################################
 
-#' @rdname XcmsExperiment
+#' @rdname hidden_aliases
 setMethod("[", "XcmsExperimentHdf5", function(x, i, j, ...) {
     if (!missing(j))
         stop("subsetting by j not supported")
@@ -241,7 +255,7 @@ setMethod(
     "chromPeaks", "XcmsExperimentHdf5",
     function(object, rt = numeric(), mz = numeric(), ppm = 0,
              msLevel = integer(),  type = c("any", "within", "apex_within"),
-             isFilledColumn = FALSE, columns = character()) {
+             isFilledColumn = FALSE, columns = character(), bySample = FALSE) {
         if (isFilledColumn)
             stop("Parameter 'isFilledColumn = TRUE' is not supported")
         type <- match.arg(type)
@@ -255,7 +269,7 @@ setMethod(
         if (length(msLevel))
             msl <- msl[msl %in% msLevel]
         .h5_chrom_peaks(object, msLevel = msl, columns = columns,
-                        by_sample = FALSE, mz = mz, rt = rt, ppm = ppm,
+                        by_sample = bySample, mz = mz, rt = rt, ppm = ppm,
                         type = type)
     })
 
@@ -270,8 +284,9 @@ setReplaceMethod(
 setMethod(
     "chromPeakData", "XcmsExperimentHdf5",
     function(object, msLevel = integer(), peaks = character(),
-             return.type = c("DataFrame", "data.frame")) {
+             return.type = c("DataFrame", "data.frame"), bySample = FALSE) {
         return.type <- match.arg(return.type)
+        .h5_require_rhdf5()
         if (!length(object))
             return(as(object@chromPeakData, return.type))
         .h5_check_mod_count(object@hdf5_file, object@hdf5_mod_count)
@@ -283,7 +298,7 @@ setMethod(
             as(.h5_chrom_peak_data(object, msLevel, peaks = peaks,
                                    by_sample = FALSE), "DataFrame")
         else .h5_chrom_peak_data(object, msLevel, peaks = peaks,
-                                 by_sample = FALSE)
+                                 by_sample = bySample)
     })
 
 ## #' @rdname refineChromPeaks

diff --git a/tests/testthat/test_XcmsExperimentHdf5-functions.R b/tests/testthat/test_XcmsExperimentHdf5-functions.R
@@ -361,6 +361,19 @@ test_that(".h5_read_chrom_peaks_matrix works", {
         read_colnames = TRUE, read_rownames = FALSE,
         mz = c(300, 350), type = "within")
     expect_true(all(res[, "mz"] > 300 & res[, "mz"] < 350))
+
+    ## with sample_index
+    sidx <- c(4L, 5L, 6L)
+    names(sidx) <- c("/S1/ms_1/chrom_peaks", "/S2/ms_1/chrom_peaks",
+                     "/S3/ms_1/chrom_peaks")
+    res <- .h5_read_chrom_peaks_matrix(
+        "/S2/ms_1/chrom_peaks", xmse_h5@hdf5_file, read_colnames = TRUE,
+        read_rownames = FALSE, sample_index = sidx)
+    expect_true(is.matrix(res))
+    expect_true(is.numeric(res))
+    expect_true(nrow(res) > 0)
+    expect_true(any(colnames(res) %in% "sample"))
+    expect_true(all(res[, "sample"] == 5))
 })
 
 test_that(".h5_read_data_frame works", {
@@ -664,6 +677,13 @@ test_that(".h5_chrom_peaks_colnames works", {
     expect_true(all(c("mz", "mzmin", "mzmax", "rt") %in% res))
 })
 
+test_that(".h5_chrom_peaks_rownames works", {
+    res <- .h5_chrom_peaks_rownames(xmse_h5)
+    expect_true(is.list(res))
+    expect_equal(length(res), 3L)
+    expect_true(is.character(res[[1L]]))
+})
+
 test_that(".h5_filter works", {
     expect_equal(.h5_filter(), "NONE")
 })

diff --git a/tests/testthat/test_XcmsExperimentHdf5.R b/tests/testthat/test_XcmsExperimentHdf5.R
@@ -236,19 +236,19 @@ test_that("featureValues,XcmsExperimentHdf5 etc works", {
     nf <- nrow(b)
     rtmed <- b$rtmed
     ## .h5_feature_values_sample
-    a <- .h5_feature_values_sample(
+    a <- xcms:::.h5_feature_values_sample(
         xmseg_full_h5@hdf5_file, sample_id = "S1", ms_level = 1L,
         n_features = nf, method = "sum", filled = FALSE, col_idx = 9L)
     b <- unname(featureValues(ref, method = "sum", value = "maxo",
                               filled = FALSE)[, 1L])
     expect_equal(a, b)
-    a <- .h5_feature_values_sample(
+    a <- xcms:::.h5_feature_values_sample(
         xmseg_full_h5@hdf5_file, sample_id = "S4", ms_level = 1L,
         n_features = nf, filled = FALSE, method = "maxint", col_idx = c(7L, 9L))
     b <- unname(featureValues(ref, method = "maxint", value = "into",
                               filled = FALSE, intensity = "maxo")[, 4L])
     expect_equal(a, b)
-    a <- .h5_feature_values_sample(
+    a <- xcms:::.h5_feature_values_sample(
         xmseg_full_h5@hdf5_file, sample_id = "S4", ms_level = 1L,
         n_features = nf, filled = FALSE, method = "medret", col_idx = c(8L, 4L),
         rtmed = rtmed)
@@ -257,7 +257,7 @@ test_that("featureValues,XcmsExperimentHdf5 etc works", {
     expect_equal(a, b)
 
     ## .h5_feature_values_ms_level
-    a <- .h5_feature_values_ms_level(1L, xmseg_full_h5, method = "medret",
+    a <- xcms:::.h5_feature_values_ms_level(1L, xmseg_full_h5, method = "medret",
                                      value = "into", filled = FALSE)
     b <- featureValues(ref, method = "medret", value = "into", filled = FALSE)
     expect_equal(unname(a), unname(b))