Skip to content

Commit

Permalink
feat: add functionality to get chrom peaks for features
Browse files Browse the repository at this point in the history
  • Loading branch information
jorainer committed Oct 30, 2024
1 parent 71129d6 commit e72e1c2
Show file tree
Hide file tree
Showing 5 changed files with 251 additions and 45 deletions.
4 changes: 2 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ importClassesFrom("S4Vectors", "Rle", "DataFrame", "Hits")
importFrom("S4Vectors", "split", "Rle", "DataFrame", "SimpleList", "List",
"as.matrix")
importMethodsFrom("S4Vectors", "as.matrix", "mcols", "mcols<-",
"extractROWS", "findMatches")
"extractROWS", "findMatches", "to")
importFrom("SummarizedExperiment", "SummarizedExperiment")
importFrom("SummarizedExperiment", "rowData")
importFrom("SummarizedExperiment", "rowData<-")
Expand Down Expand Up @@ -605,4 +605,4 @@ export("PercentMissingFilter")
export("BlankFlag")

## HDF5 storage mode
exportClasses("XcmsExperimentHdf5")
exportClasses("XcmsExperimentHdf5")
127 changes: 100 additions & 27 deletions R/XcmsExperimentHdf5-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,7 @@ NULL
type = "any", by_sample = TRUE) {
if (length(columns)) {
## Get column names, convert column names to indices.
cn <- rhdf5::h5read(x@hdf5_file,
name = paste0("/", x@sample_id[1L], "/ms_",
msLevel[1L], "/chrom_peaks_colnames"))
cn <- .h5_chrom_peaks_colnames(x, msLevel = msLevel)
idx_columns <- match(columns, cn)
if (anyNA(idx_columns))
stop("Column(s) ", paste0("\"", columns[is.na(idx_columns)], "\"",
Expand All @@ -306,7 +304,7 @@ NULL
msl <- rep(msLevel, each = length(x@sample_id))
res <- .h5_read_data(x@hdf5_file, index = ids, name = "chrom_peaks",
ms_level = msl, read_colnames = TRUE,
read_rownames = TRUE, column = idx_columns)
read_rownames = TRUE, j = idx_columns)
if (length(mz) | length(rt))
res <- lapply(res, function(z, rt, mz, ppm, type) {
z[.is_chrom_peaks_within_mz_rt(
Expand All @@ -326,13 +324,53 @@ NULL
## LLLL implement; following .h5_chrom_peaks
}

.h5_chrom_peaks_colnames <- function(x, msLevel = 1L) {
rhdf5::h5read(x@hdf5_file,
name = paste0("/", x@sample_id[1L], "/ms_",
msLevel[1L], "/chrom_peaks_colnames"),
drop = TRUE)
}

################################################################################
##
## FEATURES THINGS
##
################################################################################

#' Get chrom peaks for features from one sample. Allows to define/subset by
#' features (`i`) and select column(s) from the chrom peaks matrix to return.
#'
#' @param sample_id `character(1)` with the ID of the sample from which to
#' return the data.
#'
#' @param hdf5_file `character(1)` with the HDF5 file name
#'
#' @param ms_level `integer(1)` with the MS level of the features/chrom peaks
#'
#' @param i optional `integer` to select the features for which to return the
#' data.
#'
#' @param j optional `integer` defining the index of the column(s) to return.
#'
#' @return `matrix` with the chrom peak data, first column being the feature
#' index.
#'
#' @importFrom S4Vectors findMatches to
#'
#' @noRd
.h5_feature_chrom_peaks_sample <- function(sample_id, hdf5_file, ms_level,
i = integer(), j = NULL) {
fidx <- xcms:::.h5_read_data(hdf5_file, sample_id, name = "feature_to_chrom_peaks",
ms_level = ms_level)[[1L]]
if (length(i)) {
hits <- findMatches(i, fidx[, 1L])
fidx <- fidx[to(hits), , drop = FALSE]
}
vals <- xcms:::.h5_read_data(hdf5_file, sample_id, name = "chrom_peaks",
ms_level = ms_level, i = fidx[, 2L], j = j)[[1L]]
cbind(fidx[, 1L], vals)
}

#' Extracts feature values for one sample summing intensities for features
#' with multiple peaks assigned.
#'
Expand Down Expand Up @@ -370,7 +408,7 @@ NULL
res <- rep(NA_real_, n_features)
sid <- paste0("/", sample_id, "/ms_", ms_level)
vals <- .h5_read_data(hdf5_file, sample_id, name = "chrom_peaks",
ms_level = ms_level, column = col_idx)[[1L]]
ms_level = ms_level, j = col_idx)[[1L]]
fidx <- .h5_read_data(hdf5_file, sample_id, name = "feature_to_chrom_peaks",
ms_level = ms_level)[[1L]]
## remove gap-filled values
Expand Down Expand Up @@ -404,8 +442,7 @@ NULL
#' @noRd
.h5_feature_values_ms_level <- function(ms_level, x, method, value, intensity,
filled = TRUE) {
cn <- h5read(x@hdf5_file, paste0("/", x@sample_id[1L], "/ms_", ms_level,
"/chrom_peaks_colnames"), drop = TRUE)
cn <- .h5_chrom_peaks_colnames(x, ms_level)
col <- switch(method,
sum = value,
medret = c(value, "rt"),
Expand All @@ -430,6 +467,30 @@ NULL
res
}

################################################################################
##
## ALIGNMENT RELATED FUNCTIONALITY
##
################################################################################

#' .getPeakGroupsRtMatrix in do_adjustRtime-functions.R
#' Need a function that gets a matrix with columns being samples and rows the
#' retention time of a (defined) peak group. Peak groups get defined based
#' on the number of available peaks per feature.
#'
#' define the features based on the features_to_chrom_peaks data set.
#' for those we need to get then their retention times.
#'
#' @noRd
.get_peak_groups_rt_matrix <- function(x, ms_level = 1L) {
## LLLLLL continue.
i <- 1L
idx <- rhdf5::h5read(x@hdf5_file, paste0("/S", i, "/ms_", ms_level,
"/feature_to_chrom_peaks"))
rts <- xcms:::.h5_read_data(, index = LLLL)
}


################################################################################
##
## HDF5 FUNCTIONALITY
Expand All @@ -456,8 +517,8 @@ NULL
#'
#' @param h5 HDF5 file handle
#'
#' @param index `integer` with the index (or indices) of the columns to read.
#' By default, with `index = NULL` all columns are read.
#' @param index `list` with `integer` indices of the rows and columns to read
#' only a subset of the data. Is passed directly to `rhdf5::h5read()`.
#'
#' @param read_colnames `logical(1)` whether column names should also be read
#' and set.
Expand All @@ -467,19 +528,17 @@ NULL
#' @return numeric `matrix`
#'
#' @noRd
.h5_read_matrix <- function(name, h5, index = NULL,
.h5_read_matrix <- function(name, h5, index = list(NULL, NULL),
read_colnames = FALSE,
read_rownames = FALSE,
rownames = paste0(name, "_rownames")) {
d <- rhdf5::h5read(h5, name = name, index = list(NULL, index))
d <- rhdf5::h5read(h5, name = name, index = index)
if (read_rownames)
rownames(d) <- rhdf5::h5read(h5, name = rownames, drop = TRUE)
if (read_colnames) {
cn <- rhdf5::h5read(h5, name = paste0(name, "_colnames"), drop = TRUE)
if (length(index))
colnames(d) <- cn[index]
else colnames(d) <- cn
}
rownames(d) <- rhdf5::h5read(h5, name = rownames, drop = TRUE,
index = index[1L])
if (read_colnames)
colnames(d) <- rhdf5::h5read(h5, name = paste0(name, "_colnames"),
drop = TRUE, index = index[2L])
d
}

Expand All @@ -494,25 +553,34 @@ NULL
#'
#' @param h5 HDF5 file handle
#'
#' @param index `list` with integer indices passed to rhdf5::h5read to read
#' only a subset of the data. Since `rhdf5::h5read()` seems to not support
#' parameter `index` for data sets the subsetting is done in R - but only
#' for `index[[1L]]`, i.e. rows. `index[[2L]]` is currently IGNORED.
#'
#' @param read_rownames `logical(1)` whether rownames should be read and set.
#'
#' @param rownames `character(1)` defining the name of the HDF5 array
#' containing the rownames.
#'
#' @noRd
.h5_read_data_frame <- function(name, h5, read_rownames = FALSE,
.h5_read_data_frame <- function(name, h5, index = list(NULL, NULL),
read_rownames = FALSE,
rownames = paste0(name, "_rownames"), ...) {
d <- rhdf5::h5read(h5, name = name)
if (is.list(d))
d <- lapply(d, as.vector)
d <- as.data.frame(d)
if (read_rownames)
rownames(d) <- rhdf5::h5read(h5, rownames, drop = TRUE)
d
if (length(index[[1L]]))
d[index[[1L]], , drop = FALSE]
else d
}

.h5_read_chrom_peak_data <- function(name, h5, read_rownames = FALSE, ...) {
.h5_read_data_frame(name, h5, read_rownames = read_rownames,
.h5_read_chrom_peak_data <- function(name, h5, index = list(NULL, NULL),
read_rownames = FALSE, ...) {
.h5_read_data_frame(name, h5, read_rownames = read_rownames, index = index,
rownames = sub("_data", "s_rownames", name))
}

Expand Down Expand Up @@ -571,7 +639,9 @@ NULL
#' @param rownames `logical(1)` defining the name of the HDF5 array containing
#' the row names.
#'
#' @param column For `name = "chrom_peak_data"`: `character(1)` allowing to
#' @param i optional index with the rows to read.
#'
#' @param j For `name = "chrom_peak_data"`: `character(1)` allowing to
#' select a **single** column to read. For `name = "chrom_peaks"`: `integer`
#' with the indices of the column(s) that should be imported.
#'
Expand All @@ -588,7 +658,7 @@ NULL
ms_level = integer(),
read_colnames = FALSE,
read_rownames = FALSE,
column = NULL) {
i = NULL, j = NULL) {
if (!length(index)) return(list())
stopifnot(length(ms_level) == length(index))
name <- match.arg(name)
Expand All @@ -602,10 +672,13 @@ NULL
h5 <- rhdf5::H5Fopen(h5_file)
on.exit(invisible(rhdf5::H5Fclose(h5)))
d <- paste0("/", index, "/ms_", ms_level, "/", name)
if (is.character(column) && length(column) == 1L)
d <- paste0(d, "/", column)
index <- list(i, j)
if (is.character(j) && length(j) == 1L) {
d <- paste0(d, "/", j)
index <- list(i, NULL)
}
lapply(d, FUN = FUN, read_colnames = read_colnames,
read_rownames = read_rownames, index = column, h5 = h5)
read_rownames = read_rownames, index = index, h5 = h5)
}


Expand Down
51 changes: 50 additions & 1 deletion R/XcmsExperimentHdf5.R
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,8 @@ setMethod(
"'groupChromPeaks'")
peakGroupsMatrix(param) <- adjustRtimePeakGroups(
object, param = param)
## Need to implement an `adjustRtimePeakGroups,XcmsExperimentHdf5`.

}
fidx <- as.factor(fromFile(object))
rt_raw <- split(rtime(object), fidx)
Expand Down Expand Up @@ -604,4 +606,51 @@ setMethod(
}
}
vals
})
})

################################################################################
##
## OTHER FUNCTIONALIY
##
################################################################################

#' While previously we were first extracting the chromatograms and then adding
#' the chrom peaks later for `XcmsExperimentHdf5` it might be more efficient to
#' also extract the chrom peaks in the loop/chunk processing. So, essentially:
#' - process `object` chunk-wise
#' - for each chunk:
#' - extract chromatograms (in parallel?)
#' - get chrom peaks for each sample/chrom peak.
#'
#' @noRd
## #' @rdname hidden_aliases
## setMethod(
## "chromatogram", "XcmsExperimentHdf5",
## function(object, rt = matrix(nrow = 0, ncol = 2),
## mz = matrix(nrow = 0, ncol = 2), aggregationFun = "sum",
## msLevel = 1L, chunkSize = 2L, isolationWindowTargetMz = NULL,
## return.type = c("XChromatograms", "MChromatograms"),
## include = character(),
## chromPeaks = c("apex_within", "any", "none"),
## BPPARAM = bpparam()) {
## if (!is.matrix(rt)) rt <- matrix(rt, ncol = 2L)
## if (!is.matrix(mz)) mz <- matrix(mz, ncol = 2L)
## if (length(include)) {
## warning("Parameter 'include' is deprecated, please use ",
## "'chromPeaks' instead")
## chromPeaks <- include
## }
## if (nrow(mz) && !nrow(rt))
## rt <- cbind(rep(-Inf, nrow(mz)), rep(Inf, nrow(mz)))
## if (nrow(rt) && !nrow(mz))
## mz <- cbind(rep(-Inf, nrow(rt)), rep(Inf, nrow(rt)))
## return.type <- match.arg(return.type)
## chromPeaks <- match.arg(chromPeaks)
## if (hasAdjustedRtime(object))
## object <- applyAdjustedRtime(object)
## .xmse_extract_chromatograms_old(
## object, rt = rt, mz = mz, aggregationFun = aggregationFun,
## msLevel = msLevel, isolationWindow = isolationWindowTargetMz,
## chunkSize = chunkSize, chromPeaks = chromPeaks,
## return.type = return.type, BPPARAM = BPPARAM)
## })
Loading

0 comments on commit e72e1c2

Please sign in to comment.