Skip to content

Commit

Permalink
Performance improvement for chromPeakData,XcmsExperimentHdf5
Browse files Browse the repository at this point in the history
  • Loading branch information
jorainer committed Nov 12, 2024
1 parent 9255c63 commit 4ef9c2b
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 35 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ Imports:
Spectra (>= 1.15.7),
progress,
RColorBrewer,
MetaboCoreUtils (>= 1.11.2)
MetaboCoreUtils (>= 1.11.2),
data.table
Suggests:
BiocStyle,
caTools,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ importFrom("utils", "flush.console", "head", "object.size",

importFrom("MassSpecWavelet", "peakDetectionCWT", "tuneInPeakInfo")

importFrom("data.table", "rbindlist")

## MSnbase:
importClassesFrom("MSnbase", "MSnExp", "pSet", "OnDiskMSnExp", "Chromatogram",
"MChromatograms", "MSpectra")
Expand Down
76 changes: 56 additions & 20 deletions R/XcmsExperimentHdf5-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -302,22 +302,31 @@ NULL
} else idx_columns <- NULL
ids <- rep(x@sample_id, length(msLevel))
msl <- rep(msLevel, each = length(x@sample_id))
if (by_sample) {
sample_idx <- integer()
} else {
## pass the sample index to the import function to add the "sample" col
sample_idx <- match(ids, x@sample_id)
names(sample_idx) <- paste0("/", ids, "/ms_", msl, "/chrom_peaks")
}
res <- .h5_read_data(x@hdf5_file, id = ids, name = "chrom_peaks",
ms_level = msl, read_colnames = TRUE,
read_rownames = TRUE, j = idx_columns,
rt = rt, mz = mz, ppm = ppm, type = type)
rt = rt, mz = mz, ppm = ppm, type = type,
sample_index = sample_idx)
if (by_sample) {
names(res) <- ids
res
} else {
l <- vapply(res, nrow, 1L)
cbind(do.call(rbind, res), sample = rep(match(ids, x@sample_id), l))
do.call(base::rbind, res)
}
}

#' Extract the `chromPeakData` data.frame. Using `peaks` allows to reduce memory
#' demand because only data from the specified chrom peaks is returned. This
#' assumes that `chromPeaks()` was called before to get the IDs of the peaks.
#' We're using the data.table::rbindlist to combine the data.frames because its
#' much faster - but unfortunately drops also the rownames.
#'
#' @param x `XcmsExperimentHdf5`
#'
Expand All @@ -330,24 +339,25 @@ NULL
#' @param by_sample `logical(1)` whether results should be `rbind` or returned
#' as a `list` of `data.frame`.
#'
#' @importFrom data.table rbindlist
#'
#' @noRd
.h5_chrom_peak_data <- function(x, msLevel = integer(), columns = character(),
peaks = character(), by_sample = TRUE) {
ids <- rep(x@sample_id, length(msLevel))
msl <- rep(msLevel, each = length(x@sample_id))
## Eventually pass chrom peak ids along to read only specicic data...
names(msl) <- paste0("/", ids, "/ms_", msl, "/chrom_peak_data")
res <- .h5_read_data(x@hdf5_file, id = ids, name = "chrom_peak_data",
ms_level = msl, read_rownames = TRUE, peaks = peaks)
ms_level = msl, read_rownames = TRUE, peaks = peaks,
ms_levels = msl)
if (by_sample) {
names(res) <- ids
res <- mapply(FUN = function(a, b) {
a$ms_level <- b
a
}, res, msl, SIMPLIFY = FALSE)
res
} else {
l <- vapply(res, nrow, 1L)
cbind(do.call(rbind, res), ms_level = rep(msl, l))
rn <- unlist(lapply(res, rownames), use.names= FALSE, recursive = FALSE)
res <- base::as.data.frame(data.table::rbindlist(res))
attr(res, "row.names") <- rn
res
}
}

Expand All @@ -358,6 +368,15 @@ NULL
drop = TRUE)
}

.h5_chrom_peaks_rownames <- function(x, msLevel = x@chrom_peaks_ms_level) {
ids <- rep(x@sample_id, length(msLevel))
msl <- rep(msLevel, each = length(x@sample_id))
h5 <- rhdf5::H5Fopen(x@hdf5_file)
on.exit(invisible(rhdf5::H5Fclose(h5)))
d <- paste0("/", ids, "/ms_", msl, "/chrom_peaks_rownames")
lapply(d, FUN = rhdf5::h5read, file = h5, drop = TRUE)
}

.h5_chrom_peak_data_colnames <- function(x, msLevel = 1L) {
h5 <- rhdf5::H5Fopen(x@hdf5_file)
on.exit(rhdf5::H5Fclose(h5))
Expand Down Expand Up @@ -594,7 +613,7 @@ NULL
ms_level = ms_level, read_colnames = TRUE, i = idx,
read_rownames = FALSE)[[1L]]
b$ms_level <- rep(ms_level, length(idx))
rownames(b) <- rownames(a)
attr(b, "row.names") <- rownames(a)
tmp <- chr@.Data[j, i][[1L]]
slot(tmp, "chromPeaks", check = FALSE) <- a
slot(tmp, "chromPeakData", check = FALSE) <- as(b, "DataFrame")
Expand Down Expand Up @@ -697,14 +716,19 @@ NULL
read_rownames = FALSE,
rownames = paste0(name, "_rownames"),
rt = numeric(), mz = numeric(),
ppm = 0, type = "any") {
ppm = 0, type = "any",
sample_index = integer()) {
read_colnames <- read_colnames || length(rt) > 0 || length(mz) > 0
d <- .h5_read_matrix2(name, h5, index, read_colnames, read_rownames,
rownames)
d <- .h5_read_matrix2(
name, h5, index, read_colnames, read_rownames, rownames)
if (length(rt) | length(mz))
d[.is_chrom_peaks_within_mz_rt(d, rt = rt, mz = mz,
ppm = ppm, type = type), , drop = FALSE]
else d
d <- d[.is_chrom_peaks_within_mz_rt(d, rt = rt, mz = mz,
ppm = ppm, type = type), ,
drop = FALSE]
## If sample_index is provided add a column "sample" with the index.
if (length(sample_index))
d <- cbind(d, sample = sample_index[name])
d
}

#' Read a single `data.frame` from the HDF5 file. With
Expand Down Expand Up @@ -737,15 +761,25 @@ NULL
d <- lapply(d, as.vector)
d <- as.data.frame(d)
if (read_rownames)
rownames(d) <- rhdf5::h5read(h5, rownames, drop = TRUE)
attr(d, "row.names") <- rhdf5::h5read(h5, rownames, drop = TRUE)
if (is.null(index[[1L]]))
d
else d[index[[1L]], , drop = FALSE]
}

#' Function to read the chromPeakData `data.frame` for one sample.
#'
#' @param peaks optional `character` with the chrom peak IDs for which the
#' data should be extracted.
#'
#' @param ms_levels optional **named** `integer` with the MS levels of all
#' imported data sets. At least one of the `names(ms_levels)` should match
#' param `name`. If provided, a column `$ms_level` is added to the result.
#'
#' @noRd
.h5_read_chrom_peak_data <- function(name, h5, index = list(NULL, NULL),
read_rownames = FALSE, peaks = character(),
...) {
ms_levels = integer(), ...) {
cd <- .h5_read_data_frame(
name, h5, read_rownames = read_rownames || length(peaks) > 0,
index = index, rownames = sub("_data", "s_rownames", name))
Expand All @@ -755,6 +789,8 @@ NULL
if (!read_rownames)
rownames(cd) <- NULL
}
if (length(ms_levels))
cd$ms_level <- rep(unname(ms_levels[name]), nrow(cd))
cd
}

Expand Down
35 changes: 25 additions & 10 deletions R/XcmsExperimentHdf5.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,28 @@
#' chromatographic peak in the chrom peak matrix **of that sample** and
#' MS level.
#'
#' HDF5 does not support parallel processing, thus preprocessing results need
#' to be loaded sequentially.
#'
#' All functionality for `XcmsExperimentHdf5` objects is optimized to reduce
#' memory demand at the cost of eventually lower performance.
#'
#' @section Functionality related to chromatographic peaks:
#'
#' - `chromPeakData()` gains a new parameter `peaks` which allows to specify
#' from which chromatographic peaks data should be returned. For these
#' chromatographic peaks the ID (row name in `chromPeaks()`) should be
#' provided with the `peaks` parameter. This can reduce the memory
#' - `chromPeaks()` gains parameter `bySample = FALSE` that, if set to `TRUE`
#' returns a `list` of `chromPeaks` matrices, one for each sample. Due to
#' the way data is organized in `XcmsExperimentHdf5` objects this is more
#' efficient than `bySample = FALSE`. Thus, in cases where chrom peak data
#' is subsequently evaluated or processed by sample, it is suggested to
#' use `bySample = TRUE`.
#'
#' - `chromPeakData()` gains a new parameter `peaks = character()` which allows
#' to specify from which chromatographic peaks data should be returned.
#' For these chromatographic peaks the ID (row name in `chromPeaks()`)
#' should be provided with the `peaks` parameter. This can reduce the memory
#' requirement for cases in which only data of some selected chromatographic
#' peaks needs to be extracted.
#' peaks needs to be extracted. Also, `chromPeakData()` supports the
#' `bySample` parameter described for `chromPeaks()` above.
#'
#' @section Retention time alignment:
#'
Expand Down Expand Up @@ -133,7 +147,7 @@ setMethod("show", "XcmsExperimentHdf5", function(object) {
##
################################################################################

#' @rdname XcmsExperiment
#' @rdname hidden_aliases
setMethod("[", "XcmsExperimentHdf5", function(x, i, j, ...) {
if (!missing(j))
stop("subsetting by j not supported")
Expand Down Expand Up @@ -241,7 +255,7 @@ setMethod(
"chromPeaks", "XcmsExperimentHdf5",
function(object, rt = numeric(), mz = numeric(), ppm = 0,
msLevel = integer(), type = c("any", "within", "apex_within"),
isFilledColumn = FALSE, columns = character()) {
isFilledColumn = FALSE, columns = character(), bySample = FALSE) {
if (isFilledColumn)
stop("Parameter 'isFilledColumn = TRUE' is not supported")
type <- match.arg(type)
Expand All @@ -255,7 +269,7 @@ setMethod(
if (length(msLevel))
msl <- msl[msl %in% msLevel]
.h5_chrom_peaks(object, msLevel = msl, columns = columns,
by_sample = FALSE, mz = mz, rt = rt, ppm = ppm,
by_sample = bySample, mz = mz, rt = rt, ppm = ppm,
type = type)
})

Expand All @@ -270,8 +284,9 @@ setReplaceMethod(
setMethod(
"chromPeakData", "XcmsExperimentHdf5",
function(object, msLevel = integer(), peaks = character(),
return.type = c("DataFrame", "data.frame")) {
return.type = c("DataFrame", "data.frame"), bySample = FALSE) {
return.type <- match.arg(return.type)
.h5_require_rhdf5()
if (!length(object))
return(as(object@chromPeakData, return.type))
.h5_check_mod_count(object@hdf5_file, object@hdf5_mod_count)
Expand All @@ -283,7 +298,7 @@ setMethod(
as(.h5_chrom_peak_data(object, msLevel, peaks = peaks,
by_sample = FALSE), "DataFrame")
else .h5_chrom_peak_data(object, msLevel, peaks = peaks,
by_sample = FALSE)
by_sample = bySample)
})

## #' @rdname refineChromPeaks
Expand Down
20 changes: 20 additions & 0 deletions tests/testthat/test_XcmsExperimentHdf5-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,19 @@ test_that(".h5_read_chrom_peaks_matrix works", {
read_colnames = TRUE, read_rownames = FALSE,
mz = c(300, 350), type = "within")
expect_true(all(res[, "mz"] > 300 & res[, "mz"] < 350))

## with sample_index
sidx <- c(4L, 5L, 6L)
names(sidx) <- c("/S1/ms_1/chrom_peaks", "/S2/ms_1/chrom_peaks",
"/S3/ms_1/chrom_peaks")
res <- .h5_read_chrom_peaks_matrix(
"/S2/ms_1/chrom_peaks", xmse_h5@hdf5_file, read_colnames = TRUE,
read_rownames = FALSE, sample_index = sidx)
expect_true(is.matrix(res))
expect_true(is.numeric(res))
expect_true(nrow(res) > 0)
expect_true(any(colnames(res) %in% "sample"))
expect_true(all(res[, "sample"] == 5))
})

test_that(".h5_read_data_frame works", {
Expand Down Expand Up @@ -664,6 +677,13 @@ test_that(".h5_chrom_peaks_colnames works", {
expect_true(all(c("mz", "mzmin", "mzmax", "rt") %in% res))
})

test_that(".h5_chrom_peaks_rownames works", {
res <- .h5_chrom_peaks_rownames(xmse_h5)
expect_true(is.list(res))
expect_equal(length(res), 3L)
expect_true(is.character(res[[1L]]))
})

test_that(".h5_filter works", {
expect_equal(.h5_filter(), "NONE")
})
Expand Down
8 changes: 4 additions & 4 deletions tests/testthat/test_XcmsExperimentHdf5.R
Original file line number Diff line number Diff line change
Expand Up @@ -236,19 +236,19 @@ test_that("featureValues,XcmsExperimentHdf5 etc works", {
nf <- nrow(b)
rtmed <- b$rtmed
## .h5_feature_values_sample
a <- .h5_feature_values_sample(
a <- xcms:::.h5_feature_values_sample(
xmseg_full_h5@hdf5_file, sample_id = "S1", ms_level = 1L,
n_features = nf, method = "sum", filled = FALSE, col_idx = 9L)
b <- unname(featureValues(ref, method = "sum", value = "maxo",
filled = FALSE)[, 1L])
expect_equal(a, b)
a <- .h5_feature_values_sample(
a <- xcms:::.h5_feature_values_sample(
xmseg_full_h5@hdf5_file, sample_id = "S4", ms_level = 1L,
n_features = nf, filled = FALSE, method = "maxint", col_idx = c(7L, 9L))
b <- unname(featureValues(ref, method = "maxint", value = "into",
filled = FALSE, intensity = "maxo")[, 4L])
expect_equal(a, b)
a <- .h5_feature_values_sample(
a <- xcms:::.h5_feature_values_sample(
xmseg_full_h5@hdf5_file, sample_id = "S4", ms_level = 1L,
n_features = nf, filled = FALSE, method = "medret", col_idx = c(8L, 4L),
rtmed = rtmed)
Expand All @@ -257,7 +257,7 @@ test_that("featureValues,XcmsExperimentHdf5 etc works", {
expect_equal(a, b)

## .h5_feature_values_ms_level
a <- .h5_feature_values_ms_level(1L, xmseg_full_h5, method = "medret",
a <- xcms:::.h5_feature_values_ms_level(1L, xmseg_full_h5, method = "medret",
value = "into", filled = FALSE)
b <- featureValues(ref, method = "medret", value = "into", filled = FALSE)
expect_equal(unname(a), unname(b))
Expand Down

0 comments on commit 4ef9c2b

Please sign in to comment.