Skip to content

Commit

Permalink
Revert "fix: Handle zero-sum samples in generate_taxa_test_single"
Browse files Browse the repository at this point in the history
This reverts commit f0dae8d.
  • Loading branch information
cafferychen777 committed Nov 8, 2024
1 parent 3e03292 commit 98f7c0d
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 374 deletions.
7 changes: 0 additions & 7 deletions R/generate_taxa_test_single.R
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,6 @@ generate_taxa_test_single <- function(data.obj,
mStat_filter(prev.filter = prev.filter,
abund.filter = abund.filter)

# Add this check before linda analysis
if (any(colSums(otu_tax_agg_filter) == 0)) {
keep_samples <- colSums(otu_tax_agg_filter) > 0
otu_tax_agg_filter <- otu_tax_agg_filter[, keep_samples]
meta_tab <- meta_tab[keep_samples, ]
}

# Perform LinDA (Linear models for Differential Abundance) analysis
linda.obj <- linda(
feature.dat = otu_tax_agg_filter,
Expand Down
313 changes: 48 additions & 265 deletions R/mStat_import_qiime2_as_data_obj.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,31 +49,14 @@ mStat_import_qiime2_as_data_obj <- function(otu_qza,
message("Loading feature table...")
feature.tab <- read_qza(otu_qza)

if (is.null(feature.tab) || nrow(feature.tab) == 0 || ncol(feature.tab) == 0) {
stop("Failed to read feature table or table is empty")
}

if (!is.null(taxa_qza)) {
message("Loading feature annotations...")
taxa_data <- read_qza_taxonomy(taxa_qza)
if (!"Feature.ID" %in% colnames(taxa_data)) {
stop("Taxonomy file must contain a 'Feature.ID' column. Available columns: ",
paste(colnames(taxa_data), collapse = ", "))
}
if (!"Taxon" %in% colnames(taxa_data)) {
stop("Taxonomy file must contain a 'Taxon' column")
}

common_features <- intersect(rownames(feature.tab), taxa_data$Feature.ID)
if (length(common_features) == 0) {
stop("No matching features between feature table and taxonomy data")
}

feature.tab <- feature.tab[common_features, , drop = FALSE]
taxa_data <- taxa_data[match(common_features, taxa_data$Feature.ID), , drop = FALSE]
feature.ann <- read_qza(file = taxa_qza)
feature.ann <- feature.ann %>% as.data.frame() %>% filter(Feature.ID %in% rownames(feature.tab))
feature.ann <- parse_q2taxonomy(feature.ann)
} else {
message("No feature annotations provided. Skipping...")
taxa_data <- NULL
taxa_tab <- NULL
}

if (!is.null(sam_tab)) {
Expand All @@ -96,7 +79,7 @@ mStat_import_qiime2_as_data_obj <- function(otu_qza,
message("Creating MicrobiomeStat data object...")
data.obj <- list(feature.tab = feature.tab,
meta.dat = sam_tab,
feature.ann = taxa_data,
feature.ann = feature.ann,
tree = tree)


Expand Down Expand Up @@ -126,90 +109,46 @@ mStat_import_qiime2_as_data_obj <- function(otu_qza,
#' # library(yaml)
#' # library(Biostrings)
#' # read_qza('path_to_your_file.qza')
read_qza <- function(file) {
temp <- tempdir()
read_qza <- function(file, temp = tempdir()) {
# Unzip the file
message("Unzipping the .qza file...")
utils::unzip(file, exdir = temp)

# 列出所有文件
all_files <- list.files(temp, recursive = TRUE, full.names = TRUE)
message("Found files: ", paste(all_files, collapse = "\n"))

# 查找 metadata.yaml
metadata_file <- list.files(temp, pattern = "metadata.yaml$",
full.names = TRUE, recursive = TRUE)[1]
if (is.na(metadata_file)) {
stop("Could not find metadata.yaml in qza archive")
}

metadata <- yaml::read_yaml(metadata_file)
message("Format from metadata: ", metadata$format)

# 处理 BIOMV210 格式
if (grepl("BIOMV210", metadata$format)) {
# 查找任意目录下的 biom 文件
biom_file <- list.files(temp, pattern = "\\.biom$",
full.names = TRUE, recursive = TRUE)[1]

if (!is.na(biom_file)) {
message("Found BIOM file: ", biom_file)
message("Reading BIOM file...")
return(read_q2biom(biom_file))
} else {
stop("Could not find BIOM file in qza archive")
}
}

# 处理分类数据
if (grepl("TSVTaxonomyDirectoryFormat", metadata$format)) {
# 查找任意目录下的 taxonomy.tsv 文件
tsv_file <- list.files(temp, pattern = "taxonomy\\.tsv$",
full.names = TRUE, recursive = TRUE)[1]

if (!is.na(tsv_file)) {
message("Found taxonomy file: ", tsv_file)
message("Reading taxonomy TSV file...")
data <- utils::read.table(tsv_file, header = TRUE, sep = "\t",
quote = "", comment.char = "")

# 重命名列
col_mapping <- c(
"#OTU ID" = "Feature.ID",
"OTU ID" = "Feature.ID",
"ID" = "Feature.ID",
"Taxonomy" = "Taxon"
)

for (old_name in names(col_mapping)) {
if (old_name %in% colnames(data)) {
data[[col_mapping[old_name]]] <- data[[old_name]]
data[[old_name]] <- NULL
}
}

if (!"Feature.ID" %in% colnames(data)) {
stop("Could not find Feature.ID column in taxonomy file")
}

return(data)
} else {
stop("Could not find taxonomy.tsv file in qza archive")
}
}

# 处理 Newick 树
if (grepl("NewickDirectoryFormat", metadata$format)) {
tree_file <- list.files(temp, pattern = "tree\\.nwk$",
full.names = TRUE, recursive = TRUE)[1]
if (!is.na(tree_file)) {
message("Found tree file: ", tree_file)
return(ape::read.tree(tree_file))
} else {
stop("Could not find tree.nwk file in qza archive")
}
unzipped_file_paths <- utils::unzip(file, exdir = temp)

# Find metadata.yaml file in the unzipped files
message("Locating the metadata file...")
metadata_file_path <- grep("metadata.yaml", unzipped_file_paths, value = TRUE)

# Read the metadata
message("Reading the metadata...")
metadata <- yaml::read_yaml(metadata_file_path[1])
file_format <- metadata$format
file_uuid <- metadata$uuid

# Check the format of the file and process accordingly
message("Processing the file based on its format...")
if (grepl("BIOMV", file_format)) {
biom_file_path <- file.path(temp, file_uuid, "data/feature-table.biom")
processed_file <- read_q2biom(biom_file_path)
} else if (file_format == "TSVTaxonomyDirectoryFormat") {
taxonomy_file_path <- file.path(temp, file_uuid, "data/taxonomy.tsv")
processed_file <- read_q2taxa(taxonomy_file_path)
} else if (file_format == "NewickDirectoryFormat") {
tree_file_path <- file.path(temp, file_uuid, "data/tree.nwk")
processed_file <- read_tree(tree_file_path)
} else if (file_format == "DNASequencesDirectoryFormat") {
dna_sequences_file_path <- file.path(temp, file_uuid, "data/dna-sequences.fasta")
processed_file <- readDNAStringSet(dna_sequences_file_path)
} else {
stop(
"Only files in format of 'BIOMV210DirFmt' ",
"'TSVTaxonomyDirectoryFormat', ",
"'NewickDirectoryFormat' and 'DNASequencesDirectoryFormat'",
" are supported."
)
}

stop("Unsupported format or file not found: ", metadata$format)

message("File processed successfully!")
return(processed_file)
}


Expand All @@ -222,116 +161,11 @@ read_qza <- function(file) {
#' @examples
#' # Please replace 'path_to_your_file.biom' with your actual file path
#' # feature_table <- read_q2biom('path_to_your_file.biom')
read_q2biom <- function(biom_file) {
message("\n=== BIOM File Reading Debug Info ===")
message("BIOM file path: ", biom_file)
message("File exists: ", file.exists(biom_file))
message("File size: ", file.size(biom_file), " bytes")

# Check file content type
file_header <- readBin(biom_file, "raw", n = 10)
message("File header (hex): ", paste(as.hexmode(file_header), collapse = " "))

# Try biomformat first
biom_data <- tryCatch({
message("\nAttempting to read with biomformat...")
biom <- biomformat::read_biom(biom_file)
message("BIOM object class: ", class(biom))
message("BIOM object structure:")
str(biom)

# Convert to matrix and then data frame
mat <- as.matrix(biom)
message("\nMatrix dimensions: ", nrow(mat), " x ", ncol(mat))
message("Sample of matrix data:")
print(mat[1:min(5, nrow(mat)), 1:min(5, ncol(mat))])

df <- as.data.frame(mat)
message("Successfully converted to data frame")
return(df)
}, error = function(e) {
message("biomformat reading failed with error: ", e$message)
message("Error class: ", class(e))
message("Full error:")
print(e)
return(NULL)
})

if (!is.null(biom_data)) {
message("Successfully read BIOM file with biomformat")
return(biom_data)
}

# Try HDF5
if (requireNamespace("rhdf5", quietly = TRUE)) {
message("\nAttempting to read as HDF5...")
rhdf5::h5closeAll()

# Check if file is HDF5
is_hdf5 <- tryCatch({
rhdf5::H5Fis_hdf5(biom_file)
}, error = function(e) {
message("HDF5 format check failed: ", e$message)
FALSE
})

message("Is HDF5 format: ", is_hdf5)

if (is_hdf5) {
# List HDF5 structure
message("HDF5 file structure:")
print(rhdf5::h5ls(biom_file))

hdf5_data <- tryCatch({
mat <- rhdf5::h5read(biom_file, "observation/matrix")
message("Matrix dimensions: ", nrow(mat), " x ", ncol(mat))

obs_ids <- rhdf5::h5read(biom_file, "observation/ids")
message("Number of observation IDs: ", length(obs_ids))
message("Sample observation IDs: ", paste(head(obs_ids), collapse = ", "))

sam_ids <- rhdf5::h5read(biom_file, "sample/ids")
message("Number of sample IDs: ", length(sam_ids))
message("Sample sample IDs: ", paste(head(sam_ids), collapse = ", "))

df <- as.data.frame(mat)
colnames(df) <- sam_ids
rownames(df) <- obs_ids

message("Successfully created data frame from HDF5")
return(df)
}, error = function(e) {
message("HDF5 reading failed with error: ", e$message)
message("Error class: ", class(e))
message("Full error:")
print(e)
return(NULL)
}, finally = {
rhdf5::h5closeAll()
})

if (!is.null(hdf5_data)) {
return(hdf5_data)
}
}
}

# Try reading as JSON text
message("\nAttempting to read as JSON text...")
json_data <- tryCatch({
json_text <- readLines(biom_file, warn = FALSE)
message("First few lines of file:")
print(head(json_text))

# Add JSON parsing here if needed

}, error = function(e) {
message("JSON reading failed: ", e$message)
return(NULL)
})

message("\n=== End of BIOM File Reading Debug Info ===\n")
stop("Failed to read BIOM file in any format")
read_q2biom <- function(file) {
biomobj <- read_biom(file)
feature_tab <- as(biom_data(biomobj), "matrix")

return(feature_tab)
}

#' Read qiime2 taxa file
Expand Down Expand Up @@ -405,55 +239,4 @@ parse_q2taxonomy <- function(taxa, sep = "; |;", trim_rank_prefix = TRUE) {
return(as.matrix(taxa))
}

read_qza_taxonomy <- function(taxa_qza) {
# Extract taxonomy file
unzip_dir <- tempfile()
dir.create(unzip_dir)
unzip(taxa_qza, exdir = unzip_dir)

# Find taxonomy file
taxa_file <- list.files(unzip_dir,
pattern = "taxonomy.tsv$",
recursive = TRUE,
full.names = TRUE)[1]

if (is.na(taxa_file)) {
stop("Cannot find taxonomy.tsv file in the qza archive")
}

message("Found taxonomy file: ", taxa_file)

# Read taxonomy data
taxa_data <- read.table(taxa_file,
sep = "\t",
header = TRUE,
quote = "",
comment.char = "",
stringsAsFactors = FALSE)

message("Found columns: ", paste(colnames(taxa_data), collapse = ", "))

# Ensure Feature.ID column exists
if (!"Feature.ID" %in% colnames(taxa_data)) {
# Try common alternative names
alt_names <- c("Feature ID", "#OTU ID", "ID", "OTU", "ASV")
for (name in alt_names) {
if (name %in% colnames(taxa_data)) {
taxa_data$Feature.ID <- taxa_data[[name]]
break
}
}

# If still no Feature.ID, use first column
if (!"Feature.ID" %in% colnames(taxa_data)) {
taxa_data$Feature.ID <- taxa_data[[1]]
}
}

# Clean up temporary directory
unlink(unzip_dir, recursive = TRUE)

return(taxa_data)
}


Loading

0 comments on commit 98f7c0d

Please sign in to comment.