From e72e1c281243fb6680a627773cd92f4baa012c24 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 30 Oct 2024 13:44:52 +0100 Subject: [PATCH] feat: add functionality to get chrom peaks for features --- NAMESPACE | 4 +- R/XcmsExperimentHdf5-functions.R | 127 ++++++++++++++---- R/XcmsExperimentHdf5.R | 51 ++++++- .../test_XcmsExperimentHdf5-functions.R | 88 +++++++++--- tests/testthat/test_XcmsExperimentHdf5.R | 26 ++++ 5 files changed, 251 insertions(+), 45 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 6bb79d50..0b7986e6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,7 +28,7 @@ importClassesFrom("S4Vectors", "Rle", "DataFrame", "Hits") importFrom("S4Vectors", "split", "Rle", "DataFrame", "SimpleList", "List", "as.matrix") importMethodsFrom("S4Vectors", "as.matrix", "mcols", "mcols<-", - "extractROWS", "findMatches") + "extractROWS", "findMatches", "to") importFrom("SummarizedExperiment", "SummarizedExperiment") importFrom("SummarizedExperiment", "rowData") importFrom("SummarizedExperiment", "rowData<-") @@ -605,4 +605,4 @@ export("PercentMissingFilter") export("BlankFlag") ## HDF5 storage mode -exportClasses("XcmsExperimentHdf5") +exportClasses("XcmsExperimentHdf5") \ No newline at end of file diff --git a/R/XcmsExperimentHdf5-functions.R b/R/XcmsExperimentHdf5-functions.R index 5447e902..0fda51b4 100644 --- a/R/XcmsExperimentHdf5-functions.R +++ b/R/XcmsExperimentHdf5-functions.R @@ -293,9 +293,7 @@ NULL type = "any", by_sample = TRUE) { if (length(columns)) { ## Get column names, convert column names to indices. - cn <- rhdf5::h5read(x@hdf5_file, - name = paste0("/", x@sample_id[1L], "/ms_", - msLevel[1L], "/chrom_peaks_colnames")) + cn <- .h5_chrom_peaks_colnames(x, msLevel = msLevel) idx_columns <- match(columns, cn) if (anyNA(idx_columns)) stop("Column(s) ", paste0("\"", columns[is.na(idx_columns)], "\"", @@ -306,7 +304,7 @@ NULL msl <- rep(msLevel, each = length(x@sample_id)) res <- .h5_read_data(x@hdf5_file, index = ids, name = "chrom_peaks", ms_level = msl, read_colnames = TRUE, - read_rownames = TRUE, column = idx_columns) + read_rownames = TRUE, j = idx_columns) if (length(mz) | length(rt)) res <- lapply(res, function(z, rt, mz, ppm, type) { z[.is_chrom_peaks_within_mz_rt( @@ -326,6 +324,12 @@ NULL ## LLLL implement; following .h5_chrom_peaks } +.h5_chrom_peaks_colnames <- function(x, msLevel = 1L) { + rhdf5::h5read(x@hdf5_file, + name = paste0("/", x@sample_id[1L], "/ms_", + msLevel[1L], "/chrom_peaks_colnames"), + drop = TRUE) +} ################################################################################ ## @@ -333,6 +337,40 @@ NULL ## ################################################################################ +#' Get chrom peaks for features from one sample. Allows to define/subset by +#' features (`i`) and select column(s) from the chrom peaks matrix to return. +#' +#' @param sample_id `character(1)` with the ID of the sample from which to +#' return the data. +#' +#' @param hdf5_file `character(1)` with the HDF5 file name +#' +#' @param ms_level `integer(1)` with the MS level of the features/chrom peaks +#' +#' @param i optional `integer` to select the features for which to return the +#' data. +#' +#' @param j optional `integer` defining the index of the column(s) to return. +#' +#' @return `matrix` with the chrom peak data, first column being the feature +#' index. +#' +#' @importFrom S4Vectors findMatches to +#' +#' @noRd +.h5_feature_chrom_peaks_sample <- function(sample_id, hdf5_file, ms_level, + i = integer(), j = NULL) { + fidx <- xcms:::.h5_read_data(hdf5_file, sample_id, name = "feature_to_chrom_peaks", + ms_level = ms_level)[[1L]] + if (length(i)) { + hits <- findMatches(i, fidx[, 1L]) + fidx <- fidx[to(hits), , drop = FALSE] + } + vals <- xcms:::.h5_read_data(hdf5_file, sample_id, name = "chrom_peaks", + ms_level = ms_level, i = fidx[, 2L], j = j)[[1L]] + cbind(fidx[, 1L], vals) +} + #' Extracts feature values for one sample summing intensities for features #' with multiple peaks assigned. #' @@ -370,7 +408,7 @@ NULL res <- rep(NA_real_, n_features) sid <- paste0("/", sample_id, "/ms_", ms_level) vals <- .h5_read_data(hdf5_file, sample_id, name = "chrom_peaks", - ms_level = ms_level, column = col_idx)[[1L]] + ms_level = ms_level, j = col_idx)[[1L]] fidx <- .h5_read_data(hdf5_file, sample_id, name = "feature_to_chrom_peaks", ms_level = ms_level)[[1L]] ## remove gap-filled values @@ -404,8 +442,7 @@ NULL #' @noRd .h5_feature_values_ms_level <- function(ms_level, x, method, value, intensity, filled = TRUE) { - cn <- h5read(x@hdf5_file, paste0("/", x@sample_id[1L], "/ms_", ms_level, - "/chrom_peaks_colnames"), drop = TRUE) + cn <- .h5_chrom_peaks_colnames(x, ms_level) col <- switch(method, sum = value, medret = c(value, "rt"), @@ -430,6 +467,30 @@ NULL res } +################################################################################ +## +## ALIGNMENT RELATED FUNCTIONALITY +## +################################################################################ + +#' .getPeakGroupsRtMatrix in do_adjustRtime-functions.R +#' Need a function that gets a matrix with columns being samples and rows the +#' retention time of a (defined) peak group. Peak groups get defined based +#' on the number of available peaks per feature. +#' +#' define the features based on the features_to_chrom_peaks data set. +#' for those we need to get then their retention times. +#' +#' @noRd +.get_peak_groups_rt_matrix <- function(x, ms_level = 1L) { + ## LLLLLL continue. + i <- 1L + idx <- rhdf5::h5read(x@hdf5_file, paste0("/S", i, "/ms_", ms_level, + "/feature_to_chrom_peaks")) + rts <- xcms:::.h5_read_data(, index = LLLL) +} + + ################################################################################ ## ## HDF5 FUNCTIONALITY @@ -456,8 +517,8 @@ NULL #' #' @param h5 HDF5 file handle #' -#' @param index `integer` with the index (or indices) of the columns to read. -#' By default, with `index = NULL` all columns are read. +#' @param index `list` with `integer` indices of the rows and columns to read +#' only a subset of the data. Is passed directly to `rhdf5::h5read()`. #' #' @param read_colnames `logical(1)` whether column names should also be read #' and set. @@ -467,19 +528,17 @@ NULL #' @return numeric `matrix` #' #' @noRd -.h5_read_matrix <- function(name, h5, index = NULL, +.h5_read_matrix <- function(name, h5, index = list(NULL, NULL), read_colnames = FALSE, read_rownames = FALSE, rownames = paste0(name, "_rownames")) { - d <- rhdf5::h5read(h5, name = name, index = list(NULL, index)) + d <- rhdf5::h5read(h5, name = name, index = index) if (read_rownames) - rownames(d) <- rhdf5::h5read(h5, name = rownames, drop = TRUE) - if (read_colnames) { - cn <- rhdf5::h5read(h5, name = paste0(name, "_colnames"), drop = TRUE) - if (length(index)) - colnames(d) <- cn[index] - else colnames(d) <- cn - } + rownames(d) <- rhdf5::h5read(h5, name = rownames, drop = TRUE, + index = index[1L]) + if (read_colnames) + colnames(d) <- rhdf5::h5read(h5, name = paste0(name, "_colnames"), + drop = TRUE, index = index[2L]) d } @@ -494,13 +553,19 @@ NULL #' #' @param h5 HDF5 file handle #' +#' @param index `list` with integer indices passed to rhdf5::h5read to read +#' only a subset of the data. Since `rhdf5::h5read()` seems to not support +#' parameter `index` for data sets the subsetting is done in R - but only +#' for `index[[1L]]`, i.e. rows. `index[[2L]]` is currently IGNORED. +#' #' @param read_rownames `logical(1)` whether rownames should be read and set. #' #' @param rownames `character(1)` defining the name of the HDF5 array #' containing the rownames. #' #' @noRd -.h5_read_data_frame <- function(name, h5, read_rownames = FALSE, +.h5_read_data_frame <- function(name, h5, index = list(NULL, NULL), + read_rownames = FALSE, rownames = paste0(name, "_rownames"), ...) { d <- rhdf5::h5read(h5, name = name) if (is.list(d)) @@ -508,11 +573,14 @@ NULL d <- as.data.frame(d) if (read_rownames) rownames(d) <- rhdf5::h5read(h5, rownames, drop = TRUE) - d + if (length(index[[1L]])) + d[index[[1L]], , drop = FALSE] + else d } -.h5_read_chrom_peak_data <- function(name, h5, read_rownames = FALSE, ...) { - .h5_read_data_frame(name, h5, read_rownames = read_rownames, +.h5_read_chrom_peak_data <- function(name, h5, index = list(NULL, NULL), + read_rownames = FALSE, ...) { + .h5_read_data_frame(name, h5, read_rownames = read_rownames, index = index, rownames = sub("_data", "s_rownames", name)) } @@ -571,7 +639,9 @@ NULL #' @param rownames `logical(1)` defining the name of the HDF5 array containing #' the row names. #' -#' @param column For `name = "chrom_peak_data"`: `character(1)` allowing to +#' @param i optional index with the rows to read. +#' +#' @param j For `name = "chrom_peak_data"`: `character(1)` allowing to #' select a **single** column to read. For `name = "chrom_peaks"`: `integer` #' with the indices of the column(s) that should be imported. #' @@ -588,7 +658,7 @@ NULL ms_level = integer(), read_colnames = FALSE, read_rownames = FALSE, - column = NULL) { + i = NULL, j = NULL) { if (!length(index)) return(list()) stopifnot(length(ms_level) == length(index)) name <- match.arg(name) @@ -602,10 +672,13 @@ NULL h5 <- rhdf5::H5Fopen(h5_file) on.exit(invisible(rhdf5::H5Fclose(h5))) d <- paste0("/", index, "/ms_", ms_level, "/", name) - if (is.character(column) && length(column) == 1L) - d <- paste0(d, "/", column) + index <- list(i, j) + if (is.character(j) && length(j) == 1L) { + d <- paste0(d, "/", j) + index <- list(i, NULL) + } lapply(d, FUN = FUN, read_colnames = read_colnames, - read_rownames = read_rownames, index = column, h5 = h5) + read_rownames = read_rownames, index = index, h5 = h5) } diff --git a/R/XcmsExperimentHdf5.R b/R/XcmsExperimentHdf5.R index 60ff5a1d..51c1d773 100644 --- a/R/XcmsExperimentHdf5.R +++ b/R/XcmsExperimentHdf5.R @@ -404,6 +404,8 @@ setMethod( "'groupChromPeaks'") peakGroupsMatrix(param) <- adjustRtimePeakGroups( object, param = param) + ## Need to implement an `adjustRtimePeakGroups,XcmsExperimentHdf5`. + } fidx <- as.factor(fromFile(object)) rt_raw <- split(rtime(object), fidx) @@ -604,4 +606,51 @@ setMethod( } } vals - }) \ No newline at end of file + }) + +################################################################################ +## +## OTHER FUNCTIONALIY +## +################################################################################ + +#' While previously we were first extracting the chromatograms and then adding +#' the chrom peaks later for `XcmsExperimentHdf5` it might be more efficient to +#' also extract the chrom peaks in the loop/chunk processing. So, essentially: +#' - process `object` chunk-wise +#' - for each chunk: +#' - extract chromatograms (in parallel?) +#' - get chrom peaks for each sample/chrom peak. +#' +#' @noRd +## #' @rdname hidden_aliases +## setMethod( +## "chromatogram", "XcmsExperimentHdf5", +## function(object, rt = matrix(nrow = 0, ncol = 2), +## mz = matrix(nrow = 0, ncol = 2), aggregationFun = "sum", +## msLevel = 1L, chunkSize = 2L, isolationWindowTargetMz = NULL, +## return.type = c("XChromatograms", "MChromatograms"), +## include = character(), +## chromPeaks = c("apex_within", "any", "none"), +## BPPARAM = bpparam()) { +## if (!is.matrix(rt)) rt <- matrix(rt, ncol = 2L) +## if (!is.matrix(mz)) mz <- matrix(mz, ncol = 2L) +## if (length(include)) { +## warning("Parameter 'include' is deprecated, please use ", +## "'chromPeaks' instead") +## chromPeaks <- include +## } +## if (nrow(mz) && !nrow(rt)) +## rt <- cbind(rep(-Inf, nrow(mz)), rep(Inf, nrow(mz))) +## if (nrow(rt) && !nrow(mz)) +## mz <- cbind(rep(-Inf, nrow(rt)), rep(Inf, nrow(rt))) +## return.type <- match.arg(return.type) +## chromPeaks <- match.arg(chromPeaks) +## if (hasAdjustedRtime(object)) +## object <- applyAdjustedRtime(object) +## .xmse_extract_chromatograms_old( +## object, rt = rt, mz = mz, aggregationFun = aggregationFun, +## msLevel = msLevel, isolationWindow = isolationWindowTargetMz, +## chunkSize = chunkSize, chromPeaks = chromPeaks, +## return.type = return.type, BPPARAM = BPPARAM) +## }) \ No newline at end of file diff --git a/tests/testthat/test_XcmsExperimentHdf5-functions.R b/tests/testthat/test_XcmsExperimentHdf5-functions.R index b7c44dcd..54ab32d2 100644 --- a/tests/testthat/test_XcmsExperimentHdf5-functions.R +++ b/tests/testthat/test_XcmsExperimentHdf5-functions.R @@ -263,12 +263,25 @@ test_that(".h5_read_matrix works", { res <- .h5_read_matrix("/1/ms_1/chrom_peaks", h5, read_colnames = TRUE, read_rownames = TRUE) expect_equal(res, a) - res <- .h5_read_matrix("/1/ms_1/chrom_peaks", h5, index = 3, + res <- .h5_read_matrix("/1/ms_1/chrom_peaks", h5, index = list(NULL, 3), read_colnames = TRUE, read_rownames = TRUE) expect_equal(res, a[, 3, drop = FALSE]) - res <- .h5_read_matrix("/1/ms_1/chrom_peaks", h5, index = c(1, 3), + res <- .h5_read_matrix("/1/ms_1/chrom_peaks", h5, + index = list(NULL, c(1, 3)), read_colnames = FALSE, read_rownames = FALSE) expect_equal(res, unname(a[, c(1, 3)])) + res <- .h5_read_matrix("/1/ms_1/chrom_peaks", h5, index = list(2, c(1, 3)), + read_colnames = TRUE, read_rownames = TRUE) + expect_equal(res, a[2, c(1, 3), drop = FALSE]) + res <- .h5_read_matrix("/1/ms_1/chrom_peaks", h5, + index = list(c(2, 1, 2), c(1, 3)), + read_colnames = TRUE, read_rownames = TRUE) + expect_equal(res, a[c(2, 1, 2), c(1, 3), drop = FALSE]) + res <- .h5_read_matrix("/1/ms_1/chrom_peaks", h5, + index = list(c(2, 1, 2), c(1, 3)), + read_colnames = FALSE, read_rownames = FALSE) + expect_equal(res, unname(a[c(2, 1, 2), c(1, 3), drop = FALSE])) + H5Fclose(h5) file.remove(h5f) }) @@ -277,11 +290,6 @@ test_that(".h5_read_data_frame works", { h5f <- tempfile() .h5_initialize_file(h5f) - a <- data.frame(is_filled = c(TRUE, FALSE), other_col = "c") - b <- data.frame(is_filled = c(FALSE, FALSE, TRUE), other_col = "d") - l <- list(a, b) - names(l) <- 1:2 - .h5_write_data(h5f, l, name = "chrom_peak_data", ms_level = c(2L, 2L)) a <- cbind(a = c(1.2, 1.4), b = c(3.5, 3.6), c = c(5.3, 5.1)) rownames(a) <- c("CP1", "CP2") b <- cbind(a = c(12.4, 13.1, 3.2), b = c(1.3, 1.3, 1.4), c(4.2, 5.1, 4.6)) @@ -289,16 +297,38 @@ test_that(".h5_read_data_frame works", { l <- list(a, b) names(l) <- c(1, 2) .h5_write_data(h5f, l, name = "chrom_peaks", ms_level = c(2L, 2L)) + a <- data.frame(is_filled = c(TRUE, FALSE), other_col = "c") + b <- data.frame(is_filled = c(FALSE, FALSE, TRUE), other_col = "d") + l <- list(a, b) + names(l) <- 1:2 + .h5_write_data(h5f, l, name = "chrom_peak_data", ms_level = c(2L, 2L)) h5 <- H5Fopen(h5f) res <- .h5_read_chrom_peak_data("/1/ms_2/chrom_peak_data", h5, - read_rownames = TRUE) - expect_equal(rownames(res), rownames(a)) + read_rownames = TRUE) + expect_equal(rownames(res), c("CP1", "CP2")) expect_equal(colnames(res), c("is_filled", "other_col")) + rownames(res) <- NULL + expect_equal(res, a) res <- .h5_read_data_frame("/1/ms_2/chrom_peak_data", h5, read_rownames = FALSE) expect_equal(colnames(res), c("is_filled", "other_col")) expect_equal(rownames(res), c("1", "2")) + expect_equal(res, a) + + ## Read selected rows + res <- .h5_read_chrom_peak_data("/1/ms_2/chrom_peak_data", h5, + read_rownames = FALSE, + index = list(2, NULL)) + expect_equal(res, a[2, ]) + res <- .h5_read_chrom_peak_data("/1/ms_2/chrom_peak_data", h5, + read_rownames = FALSE, + index = list(c(2, 1, 2), NULL)) + expect_equal(res, a[c(2, 1, 2), ]) + res <- .h5_read_chrom_peak_data("/1/ms_2/chrom_peak_data", h5, + read_rownames = TRUE, + index = list(c(2, 1, 2), NULL)) + expect_equal(rownames(res), c("CP2", "CP1", "CP2.1")) ## Read single column res <- .h5_dataset_names("/1/ms_2/chrom_peak_data", h5) @@ -306,6 +336,13 @@ test_that(".h5_read_data_frame works", { res <- .h5_read_chrom_peak_data("/1/ms_2/chrom_peak_data/other_col", h5) expect_true(is.data.frame(res)) expect_equal(res[, 1L], c("c", "c")) + + ## Read selected rows + res <- .h5_read_chrom_peak_data("/1/ms_2/chrom_peak_data/is_filled", h5, + index = list(2, NULL)) + expect_true(nrow(res) == 1L) + expect_equal(res[, 1L], FALSE) + H5Fclose(h5) file.remove(h5f) }) @@ -346,24 +383,34 @@ test_that(".h5_read_data works", { expect_true(is.null(colnames(res[[1L]]))) ## single column res <- .h5_read_data(h5f, index = c(2, 1), name = "chrom_peaks", - ms_level = c(2L, 2L), column = 2) + ms_level = c(2L, 2L), j = 2) expect_equal(length(res), 2L) expect_true(ncol(res[[1L]]) == 1L) expect_equal(res[[1L]][, 1], unname(b2[, 2])) res <- .h5_read_data(h5f, index = c(2, 1), name = "chrom_peaks", - ms_level = c(2L, 2L), column = 2, read_colnames = TRUE, + ms_level = c(2L, 2L), j = 2, read_colnames = TRUE, read_rownames = TRUE) expect_equal(length(res), 2L) expect_true(ncol(res[[1L]]) == 1L) expect_equal(res[[1L]][, 1, drop = FALSE], b2[, 2, drop = FALSE]) res <- .h5_read_data(h5f, index = c(1, 2, 1), name = "chrom_peaks", - ms_level = c(2, 2, 2), column = 1L, + ms_level = c(2, 2, 2), j = 1L, read_colnames = TRUE, read_rownames = TRUE) expect_equal(length(res), 3) expect_equal(res[[1]], res[[3]]) expect_equal(res[[2]], b2[, 1, drop = FALSE]) + ## selected rows. + res <- .h5_read_data(h5f, index = c(1, 2, 1), name = "chrom_peaks", + ms_level = c(2, 2, 2), j = 1L, i = c(2, 1, 2), + read_colnames = TRUE, + read_rownames = TRUE) + expect_equal(length(res), 3) + expect_equal(res[[1]], res[[3]]) + expect_equal(res[[2]], b2[c(2, 1, 2), 1, drop = FALSE]) + expect_equal(res[[1]], a2[c(2, 1, 2), 1, drop = FALSE]) + ## chrom peak data res <- .h5_read_data(h5f, index = c(2, 1), name = "chrom_peak_data", ms_level = c(2L, 2L), read_colnames = TRUE, @@ -372,7 +419,7 @@ test_that(".h5_read_data works", { rownames(b) <- c("CP3", "CP4", "CP5") expect_equal(unname(res[[1L]]), unname(b)) res <- .h5_read_data(h5f, index = 1, name = "chrom_peak_data", - ms_level = 2L, column = "is_filled") + ms_level = 2L, j = "is_filled") expect_equal(length(res), 1L) expect_equal(res[[1L]][, 1], a$is_filled) @@ -510,7 +557,18 @@ test_that(".h5_feature_values_sample", { expect_true(TRUE) }) -test_that(".h5_feature_Values_ms_level", { - ## unit test in text_XcmsExperimentHdf5.R +test_that(".h5_feature_values_ms_level", { + ## unit test in test_XcmsExperimentHdf5.R expect_true(TRUE) +}) + +test_that(".h5_feature_values_ms_level", { + ## unit test in test_XcmsExperimentHdf5.R + expect_true(TRUE) +}) + +test_that(".h5_chrom_peaks_colnames works", { + res <- .h5_chrom_peaks_colnames(xmse_h5, 1L) + expect_true(is.character(res)) + expect_true(all(c("mz", "mzmin", "mzmax", "rt") %in% res)) }) \ No newline at end of file diff --git a/tests/testthat/test_XcmsExperimentHdf5.R b/tests/testthat/test_XcmsExperimentHdf5.R index 86172c2e..b1bf60e0 100644 --- a/tests/testthat/test_XcmsExperimentHdf5.R +++ b/tests/testthat/test_XcmsExperimentHdf5.R @@ -269,6 +269,32 @@ test_that("featureValues,XcmsExperimentHdf5 etc works", { expect_equal(res, fv_ref) }) +test_that("adjustRtime,XcmsExperimentHdf5,PeakGroupsParam works", { + object <- xmseg_full_h5 + msLevel <- 1L + param <- PeakGroupsParam(span = 0.4) +}) + +test_that(".h5_feature_chrom_peaks_sample works", { + cn <- .h5_chrom_peaks_colnames(xmseg_full_h5, 1L) + res <- .h5_feature_chrom_peaks_sample("S3", xmseg_full_h5@hdf5_file, + 1L, j = match("into", cn)) + ref <- featureValues(xmseg_full_h5, method = "sum", value = "into") + vals <- split(res[, 2L], factor(res[, 1L], levels = seq_len(nrow(ref)))) + vals <- vapply(vals, function(z) { + if (length(z)) + sum(z) + else NA_real_ + }, 2.2) + expect_equal(unname(vals), unname(ref[, 3L])) + ## With index. + i <- c(1, 4, 2, 3, 2) + res <- .h5_feature_chrom_peaks_sample("S3", xmseg_full_h5@hdf5_file, + 1L, j = match("into", cn), i = i) + expect_equal(res[, 1L], c(4, 2, 2)) + expect_equal(res[, 2L], unname(ref[c(4, 2, 2), 3L])) +}) + unlink(h5f) unlink(h5f_full) unlink(h5f_full_g) \ No newline at end of file