diff --git a/R/generate_taxa_test_single.R b/R/generate_taxa_test_single.R index d9bed11..5e6886c 100644 --- a/R/generate_taxa_test_single.R +++ b/R/generate_taxa_test_single.R @@ -150,13 +150,6 @@ generate_taxa_test_single <- function(data.obj, mStat_filter(prev.filter = prev.filter, abund.filter = abund.filter) - # Add this check before linda analysis - if (any(colSums(otu_tax_agg_filter) == 0)) { - keep_samples <- colSums(otu_tax_agg_filter) > 0 - otu_tax_agg_filter <- otu_tax_agg_filter[, keep_samples] - meta_tab <- meta_tab[keep_samples, ] - } - # Perform LinDA (Linear models for Differential Abundance) analysis linda.obj <- linda( feature.dat = otu_tax_agg_filter, diff --git a/R/mStat_import_qiime2_as_data_obj.R b/R/mStat_import_qiime2_as_data_obj.R index e87aae2..75026fe 100644 --- a/R/mStat_import_qiime2_as_data_obj.R +++ b/R/mStat_import_qiime2_as_data_obj.R @@ -49,31 +49,14 @@ mStat_import_qiime2_as_data_obj <- function(otu_qza, message("Loading feature table...") feature.tab <- read_qza(otu_qza) - if (is.null(feature.tab) || nrow(feature.tab) == 0 || ncol(feature.tab) == 0) { - stop("Failed to read feature table or table is empty") - } - if (!is.null(taxa_qza)) { message("Loading feature annotations...") - taxa_data <- read_qza_taxonomy(taxa_qza) - if (!"Feature.ID" %in% colnames(taxa_data)) { - stop("Taxonomy file must contain a 'Feature.ID' column. Available columns: ", - paste(colnames(taxa_data), collapse = ", ")) - } - if (!"Taxon" %in% colnames(taxa_data)) { - stop("Taxonomy file must contain a 'Taxon' column") - } - - common_features <- intersect(rownames(feature.tab), taxa_data$Feature.ID) - if (length(common_features) == 0) { - stop("No matching features between feature table and taxonomy data") - } - - feature.tab <- feature.tab[common_features, , drop = FALSE] - taxa_data <- taxa_data[match(common_features, taxa_data$Feature.ID), , drop = FALSE] + feature.ann <- read_qza(file = taxa_qza) + feature.ann <- feature.ann %>% as.data.frame() %>% filter(Feature.ID %in% rownames(feature.tab)) + feature.ann <- parse_q2taxonomy(feature.ann) } else { message("No feature annotations provided. Skipping...") - taxa_data <- NULL + taxa_tab <- NULL } if (!is.null(sam_tab)) { @@ -96,7 +79,7 @@ mStat_import_qiime2_as_data_obj <- function(otu_qza, message("Creating MicrobiomeStat data object...") data.obj <- list(feature.tab = feature.tab, meta.dat = sam_tab, - feature.ann = taxa_data, + feature.ann = feature.ann, tree = tree) @@ -126,90 +109,46 @@ mStat_import_qiime2_as_data_obj <- function(otu_qza, #' # library(yaml) #' # library(Biostrings) #' # read_qza('path_to_your_file.qza') -read_qza <- function(file) { - temp <- tempdir() +read_qza <- function(file, temp = tempdir()) { + # Unzip the file message("Unzipping the .qza file...") - utils::unzip(file, exdir = temp) - - # 列出所有文件 - all_files <- list.files(temp, recursive = TRUE, full.names = TRUE) - message("Found files: ", paste(all_files, collapse = "\n")) - - # 查找 metadata.yaml - metadata_file <- list.files(temp, pattern = "metadata.yaml$", - full.names = TRUE, recursive = TRUE)[1] - if (is.na(metadata_file)) { - stop("Could not find metadata.yaml in qza archive") - } - - metadata <- yaml::read_yaml(metadata_file) - message("Format from metadata: ", metadata$format) - - # 处理 BIOMV210 格式 - if (grepl("BIOMV210", metadata$format)) { - # 查找任意目录下的 biom 文件 - biom_file <- list.files(temp, pattern = "\\.biom$", - full.names = TRUE, recursive = TRUE)[1] - - if (!is.na(biom_file)) { - message("Found BIOM file: ", biom_file) - message("Reading BIOM file...") - return(read_q2biom(biom_file)) - } else { - stop("Could not find BIOM file in qza archive") - } - } - - # 处理分类数据 - if (grepl("TSVTaxonomyDirectoryFormat", metadata$format)) { - # 查找任意目录下的 taxonomy.tsv 文件 - tsv_file <- list.files(temp, pattern = "taxonomy\\.tsv$", - full.names = TRUE, recursive = TRUE)[1] - - if (!is.na(tsv_file)) { - message("Found taxonomy file: ", tsv_file) - message("Reading taxonomy TSV file...") - data <- utils::read.table(tsv_file, header = TRUE, sep = "\t", - quote = "", comment.char = "") - - # 重命名列 - col_mapping <- c( - "#OTU ID" = "Feature.ID", - "OTU ID" = "Feature.ID", - "ID" = "Feature.ID", - "Taxonomy" = "Taxon" - ) - - for (old_name in names(col_mapping)) { - if (old_name %in% colnames(data)) { - data[[col_mapping[old_name]]] <- data[[old_name]] - data[[old_name]] <- NULL - } - } - - if (!"Feature.ID" %in% colnames(data)) { - stop("Could not find Feature.ID column in taxonomy file") - } - - return(data) - } else { - stop("Could not find taxonomy.tsv file in qza archive") - } - } - - # 处理 Newick 树 - if (grepl("NewickDirectoryFormat", metadata$format)) { - tree_file <- list.files(temp, pattern = "tree\\.nwk$", - full.names = TRUE, recursive = TRUE)[1] - if (!is.na(tree_file)) { - message("Found tree file: ", tree_file) - return(ape::read.tree(tree_file)) - } else { - stop("Could not find tree.nwk file in qza archive") - } + unzipped_file_paths <- utils::unzip(file, exdir = temp) + + # Find metadata.yaml file in the unzipped files + message("Locating the metadata file...") + metadata_file_path <- grep("metadata.yaml", unzipped_file_paths, value = TRUE) + + # Read the metadata + message("Reading the metadata...") + metadata <- yaml::read_yaml(metadata_file_path[1]) + file_format <- metadata$format + file_uuid <- metadata$uuid + + # Check the format of the file and process accordingly + message("Processing the file based on its format...") + if (grepl("BIOMV", file_format)) { + biom_file_path <- file.path(temp, file_uuid, "data/feature-table.biom") + processed_file <- read_q2biom(biom_file_path) + } else if (file_format == "TSVTaxonomyDirectoryFormat") { + taxonomy_file_path <- file.path(temp, file_uuid, "data/taxonomy.tsv") + processed_file <- read_q2taxa(taxonomy_file_path) + } else if (file_format == "NewickDirectoryFormat") { + tree_file_path <- file.path(temp, file_uuid, "data/tree.nwk") + processed_file <- read_tree(tree_file_path) + } else if (file_format == "DNASequencesDirectoryFormat") { + dna_sequences_file_path <- file.path(temp, file_uuid, "data/dna-sequences.fasta") + processed_file <- readDNAStringSet(dna_sequences_file_path) + } else { + stop( + "Only files in format of 'BIOMV210DirFmt' ", + "'TSVTaxonomyDirectoryFormat', ", + "'NewickDirectoryFormat' and 'DNASequencesDirectoryFormat'", + " are supported." + ) } - - stop("Unsupported format or file not found: ", metadata$format) + + message("File processed successfully!") + return(processed_file) } @@ -222,116 +161,11 @@ read_qza <- function(file) { #' @examples #' # Please replace 'path_to_your_file.biom' with your actual file path #' # feature_table <- read_q2biom('path_to_your_file.biom') -read_q2biom <- function(biom_file) { - message("\n=== BIOM File Reading Debug Info ===") - message("BIOM file path: ", biom_file) - message("File exists: ", file.exists(biom_file)) - message("File size: ", file.size(biom_file), " bytes") - - # Check file content type - file_header <- readBin(biom_file, "raw", n = 10) - message("File header (hex): ", paste(as.hexmode(file_header), collapse = " ")) - - # Try biomformat first - biom_data <- tryCatch({ - message("\nAttempting to read with biomformat...") - biom <- biomformat::read_biom(biom_file) - message("BIOM object class: ", class(biom)) - message("BIOM object structure:") - str(biom) - - # Convert to matrix and then data frame - mat <- as.matrix(biom) - message("\nMatrix dimensions: ", nrow(mat), " x ", ncol(mat)) - message("Sample of matrix data:") - print(mat[1:min(5, nrow(mat)), 1:min(5, ncol(mat))]) - - df <- as.data.frame(mat) - message("Successfully converted to data frame") - return(df) - }, error = function(e) { - message("biomformat reading failed with error: ", e$message) - message("Error class: ", class(e)) - message("Full error:") - print(e) - return(NULL) - }) - - if (!is.null(biom_data)) { - message("Successfully read BIOM file with biomformat") - return(biom_data) - } - - # Try HDF5 - if (requireNamespace("rhdf5", quietly = TRUE)) { - message("\nAttempting to read as HDF5...") - rhdf5::h5closeAll() - - # Check if file is HDF5 - is_hdf5 <- tryCatch({ - rhdf5::H5Fis_hdf5(biom_file) - }, error = function(e) { - message("HDF5 format check failed: ", e$message) - FALSE - }) - - message("Is HDF5 format: ", is_hdf5) - - if (is_hdf5) { - # List HDF5 structure - message("HDF5 file structure:") - print(rhdf5::h5ls(biom_file)) - - hdf5_data <- tryCatch({ - mat <- rhdf5::h5read(biom_file, "observation/matrix") - message("Matrix dimensions: ", nrow(mat), " x ", ncol(mat)) - - obs_ids <- rhdf5::h5read(biom_file, "observation/ids") - message("Number of observation IDs: ", length(obs_ids)) - message("Sample observation IDs: ", paste(head(obs_ids), collapse = ", ")) - - sam_ids <- rhdf5::h5read(biom_file, "sample/ids") - message("Number of sample IDs: ", length(sam_ids)) - message("Sample sample IDs: ", paste(head(sam_ids), collapse = ", ")) - - df <- as.data.frame(mat) - colnames(df) <- sam_ids - rownames(df) <- obs_ids - - message("Successfully created data frame from HDF5") - return(df) - }, error = function(e) { - message("HDF5 reading failed with error: ", e$message) - message("Error class: ", class(e)) - message("Full error:") - print(e) - return(NULL) - }, finally = { - rhdf5::h5closeAll() - }) - - if (!is.null(hdf5_data)) { - return(hdf5_data) - } - } - } - - # Try reading as JSON text - message("\nAttempting to read as JSON text...") - json_data <- tryCatch({ - json_text <- readLines(biom_file, warn = FALSE) - message("First few lines of file:") - print(head(json_text)) - - # Add JSON parsing here if needed - - }, error = function(e) { - message("JSON reading failed: ", e$message) - return(NULL) - }) - - message("\n=== End of BIOM File Reading Debug Info ===\n") - stop("Failed to read BIOM file in any format") +read_q2biom <- function(file) { + biomobj <- read_biom(file) + feature_tab <- as(biom_data(biomobj), "matrix") + + return(feature_tab) } #' Read qiime2 taxa file @@ -405,55 +239,4 @@ parse_q2taxonomy <- function(taxa, sep = "; |;", trim_rank_prefix = TRUE) { return(as.matrix(taxa)) } -read_qza_taxonomy <- function(taxa_qza) { - # Extract taxonomy file - unzip_dir <- tempfile() - dir.create(unzip_dir) - unzip(taxa_qza, exdir = unzip_dir) - - # Find taxonomy file - taxa_file <- list.files(unzip_dir, - pattern = "taxonomy.tsv$", - recursive = TRUE, - full.names = TRUE)[1] - - if (is.na(taxa_file)) { - stop("Cannot find taxonomy.tsv file in the qza archive") - } - - message("Found taxonomy file: ", taxa_file) - - # Read taxonomy data - taxa_data <- read.table(taxa_file, - sep = "\t", - header = TRUE, - quote = "", - comment.char = "", - stringsAsFactors = FALSE) - - message("Found columns: ", paste(colnames(taxa_data), collapse = ", ")) - - # Ensure Feature.ID column exists - if (!"Feature.ID" %in% colnames(taxa_data)) { - # Try common alternative names - alt_names <- c("Feature ID", "#OTU ID", "ID", "OTU", "ASV") - for (name in alt_names) { - if (name %in% colnames(taxa_data)) { - taxa_data$Feature.ID <- taxa_data[[name]] - break - } - } - - # If still no Feature.ID, use first column - if (!"Feature.ID" %in% colnames(taxa_data)) { - taxa_data$Feature.ID <- taxa_data[[1]] - } - } - - # Clean up temporary directory - unlink(unzip_dir, recursive = TRUE) - - return(taxa_data) -} - diff --git a/tests/testthat/test-mStat_import_qiime2.R b/tests/testthat/test-mStat_import_qiime2.R deleted file mode 100644 index 938bb09..0000000 --- a/tests/testthat/test-mStat_import_qiime2.R +++ /dev/null @@ -1,102 +0,0 @@ -test_that("mStat_import_qiime2_as_data_obj works correctly", { - # Skip if required packages are not available - skip_if_not_installed("biomformat") - skip_if_not_installed("yaml") - skip_if_not_installed("Biostrings") - - # Load required package - require(biomformat) - - # Download test files if they don't exist - test_dir <- file.path(tempdir(), "qiime2_test_data") - dir.create(test_dir, showWarnings = FALSE) - - # Define URLs - urls <- c( - table = "https://docs.qiime2.org/2023.9/data/tutorials/moving-pictures/table.qza", - taxonomy = "https://docs.qiime2.org/2023.9/data/tutorials/moving-pictures/taxonomy.qza", - tree = "https://docs.qiime2.org/2023.9/data/tutorials/moving-pictures/rooted-tree.qza", - metadata = "https://data.qiime2.org/2024.10/tutorials/moving-pictures/sample_metadata.tsv" - ) - - # Download files - files <- list() - for(name in names(urls)) { - files[[name]] <- file.path(test_dir, basename(urls[[name]])) - if(!file.exists(files[[name]])) { - tryCatch({ - download.file(urls[[name]], files[[name]], mode = "wb") - }, error = function(e) { - skip(paste("Could not download test file:", name)) - }) - } - } - - # Test full import - test_that("Full import works with real qza files", { - skip_if_not_installed("rhdf5") - skip_if_not_installed("biomformat") - - message("\n=== Test Environment Info ===") - message("R version: ", R.version.string) - message("Platform: ", Sys.info()["sysname"]) - message("Package versions:") - message("- biomformat: ", packageVersion("biomformat")) - message("- rhdf5: ", packageVersion("rhdf5")) - - message("\nTest files:") - message("OTU: ", files$table) - message("Taxonomy: ", files$taxonomy) - message("Metadata: ", files$metadata) - message("Tree: ", files$tree) - - message("\nFile existence checks:") - message("OTU file exists: ", file.exists(files$table)) - message("Taxonomy file exists: ", file.exists(files$taxonomy)) - message("Metadata file exists: ", file.exists(files$metadata)) - message("Tree file exists: ", file.exists(files$tree)) - - message("\nFile sizes:") - message("OTU file: ", file.size(files$table), " bytes") - message("Taxonomy file: ", file.size(files$taxonomy), " bytes") - message("Metadata file: ", file.size(files$metadata), " bytes") - message("Tree file: ", file.size(files$tree), " bytes") - - # Import data with error handling - data_obj <- tryCatch({ - mStat_import_qiime2_as_data_obj( - otu_qza = files$table, - taxa_qza = files$taxonomy, - sam_tab = files$metadata, - tree_qza = files$tree - ) - }, error = function(e) { - message("\nError during import:") - message("Error message: ", e$message) - message("Error class: ", class(e)) - message("Full error:") - print(e) - stop(e) - }) - - message("\nValidation results:") - message("data_obj is null: ", is.null(data_obj)) - if (!is.null(data_obj)) { - message("feature.tab dimensions: ", - nrow(data_obj$feature.tab), " x ", ncol(data_obj$feature.tab)) - message("meta.dat dimensions: ", - nrow(data_obj$meta.dat), " x ", ncol(data_obj$meta.dat)) - message("feature.ann dimensions: ", - nrow(data_obj$feature.ann), " x ", ncol(data_obj$feature.ann)) - } - - message("=== End of Test Environment Info ===\n") - - expect_true(!is.null(data_obj)) - expect_true(nrow(data_obj$feature.tab) > 0) - expect_true(ncol(data_obj$feature.tab) > 0) - }) - - # Clean up - unlink(test_dir, recursive = TRUE) -})