Revert "fix: Handle zero-sum samples in generate_taxa_test_single"

This reverts commit f0dae8d.
cafferychen777 · Nov 8, 2024 · 98f7c0d · 98f7c0d
1 parent 3e03292
commit 98f7c0d
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 374 deletions.
diff --git a/R/generate_taxa_test_single.R b/R/generate_taxa_test_single.R
@@ -150,13 +150,6 @@ generate_taxa_test_single <- function(data.obj,
       mStat_filter(prev.filter = prev.filter,
                    abund.filter = abund.filter)
 
-    # Add this check before linda analysis
-    if (any(colSums(otu_tax_agg_filter) == 0)) {
-      keep_samples <- colSums(otu_tax_agg_filter) > 0
-      otu_tax_agg_filter <- otu_tax_agg_filter[, keep_samples]
-      meta_tab <- meta_tab[keep_samples, ]
-    }
-
     # Perform LinDA (Linear models for Differential Abundance) analysis
     linda.obj <- linda(
       feature.dat = otu_tax_agg_filter,

diff --git a/R/mStat_import_qiime2_as_data_obj.R b/R/mStat_import_qiime2_as_data_obj.R
@@ -49,31 +49,14 @@ mStat_import_qiime2_as_data_obj <- function(otu_qza,
   message("Loading feature table...")
   feature.tab <- read_qza(otu_qza)
 
-  if (is.null(feature.tab) || nrow(feature.tab) == 0 || ncol(feature.tab) == 0) {
-    stop("Failed to read feature table or table is empty")
-  }
-
   if (!is.null(taxa_qza)) {
     message("Loading feature annotations...")
-    taxa_data <- read_qza_taxonomy(taxa_qza)
-    if (!"Feature.ID" %in% colnames(taxa_data)) {
-      stop("Taxonomy file must contain a 'Feature.ID' column. Available columns: ",
-           paste(colnames(taxa_data), collapse = ", "))
-    }
-    if (!"Taxon" %in% colnames(taxa_data)) {
-      stop("Taxonomy file must contain a 'Taxon' column")
-    }
-
-    common_features <- intersect(rownames(feature.tab), taxa_data$Feature.ID)
-    if (length(common_features) == 0) {
-      stop("No matching features between feature table and taxonomy data")
-    }
-
-    feature.tab <- feature.tab[common_features, , drop = FALSE]
-    taxa_data <- taxa_data[match(common_features, taxa_data$Feature.ID), , drop = FALSE]
+    feature.ann <- read_qza(file = taxa_qza)
+    feature.ann <- feature.ann %>% as.data.frame() %>% filter(Feature.ID %in% rownames(feature.tab))
+    feature.ann <- parse_q2taxonomy(feature.ann)
   } else {
     message("No feature annotations provided. Skipping...")
-    taxa_data <- NULL
+    taxa_tab <- NULL
   }
 
   if (!is.null(sam_tab)) {
@@ -96,7 +79,7 @@ mStat_import_qiime2_as_data_obj <- function(otu_qza,
   message("Creating MicrobiomeStat data object...")
   data.obj <- list(feature.tab = feature.tab,
                    meta.dat = sam_tab,
-                   feature.ann = taxa_data,
+                   feature.ann = feature.ann,
                    tree = tree)
 
 
@@ -126,90 +109,46 @@ mStat_import_qiime2_as_data_obj <- function(otu_qza,
 #' # library(yaml)
 #' # library(Biostrings)
 #' # read_qza('path_to_your_file.qza')
-read_qza <- function(file) {
-  temp <- tempdir()
+read_qza <- function(file, temp = tempdir()) {
+  # Unzip the file
   message("Unzipping the .qza file...")
-  utils::unzip(file, exdir = temp)
-
-  # 列出所有文件
-  all_files <- list.files(temp, recursive = TRUE, full.names = TRUE)
-  message("Found files: ", paste(all_files, collapse = "\n"))
-
-  # 查找 metadata.yaml
-  metadata_file <- list.files(temp, pattern = "metadata.yaml$", 
-                             full.names = TRUE, recursive = TRUE)[1]
-  if (is.na(metadata_file)) {
-    stop("Could not find metadata.yaml in qza archive")
-  }
-
-  metadata <- yaml::read_yaml(metadata_file)
-  message("Format from metadata: ", metadata$format)
-
-  # 处理 BIOMV210 格式
-  if (grepl("BIOMV210", metadata$format)) {
-    # 查找任意目录下的 biom 文件
-    biom_file <- list.files(temp, pattern = "\\.biom$", 
-                           full.names = TRUE, recursive = TRUE)[1]
-
-    if (!is.na(biom_file)) {
-      message("Found BIOM file: ", biom_file)
-      message("Reading BIOM file...")
-      return(read_q2biom(biom_file))
-    } else {
-      stop("Could not find BIOM file in qza archive")
-    }
-  }
-
-  # 处理分类数据
-  if (grepl("TSVTaxonomyDirectoryFormat", metadata$format)) {
-    # 查找任意目录下的 taxonomy.tsv 文件
-    tsv_file <- list.files(temp, pattern = "taxonomy\\.tsv$", 
-                          full.names = TRUE, recursive = TRUE)[1]
-
-    if (!is.na(tsv_file)) {
-      message("Found taxonomy file: ", tsv_file)
-      message("Reading taxonomy TSV file...")
-      data <- utils::read.table(tsv_file, header = TRUE, sep = "\t", 
-                               quote = "", comment.char = "")
-
-      # 重命名列
-      col_mapping <- c(
-        "#OTU ID" = "Feature.ID",
-        "OTU ID" = "Feature.ID",
-        "ID" = "Feature.ID",
-        "Taxonomy" = "Taxon"
-      )
-
-      for (old_name in names(col_mapping)) {
-        if (old_name %in% colnames(data)) {
-          data[[col_mapping[old_name]]] <- data[[old_name]]
-          data[[old_name]] <- NULL
-        }
-      }
-
-      if (!"Feature.ID" %in% colnames(data)) {
-        stop("Could not find Feature.ID column in taxonomy file")
-      }
-
-      return(data)
-    } else {
-      stop("Could not find taxonomy.tsv file in qza archive")
-    }
-  }
-
-  # 处理 Newick 树
-  if (grepl("NewickDirectoryFormat", metadata$format)) {
-    tree_file <- list.files(temp, pattern = "tree\\.nwk$", 
-                           full.names = TRUE, recursive = TRUE)[1]
-    if (!is.na(tree_file)) {
-      message("Found tree file: ", tree_file)
-      return(ape::read.tree(tree_file))
-    } else {
-      stop("Could not find tree.nwk file in qza archive")
-    }
+  unzipped_file_paths <- utils::unzip(file, exdir = temp)
+
+  # Find metadata.yaml file in the unzipped files
+  message("Locating the metadata file...")
+  metadata_file_path <- grep("metadata.yaml", unzipped_file_paths, value = TRUE)
+
+  # Read the metadata
+  message("Reading the metadata...")
+  metadata <- yaml::read_yaml(metadata_file_path[1])
+  file_format <- metadata$format
+  file_uuid <- metadata$uuid
+
+  # Check the format of the file and process accordingly
+  message("Processing the file based on its format...")
+  if (grepl("BIOMV", file_format)) {
+    biom_file_path <- file.path(temp, file_uuid, "data/feature-table.biom")
+    processed_file <- read_q2biom(biom_file_path)
+  } else if (file_format == "TSVTaxonomyDirectoryFormat") {
+    taxonomy_file_path <- file.path(temp, file_uuid, "data/taxonomy.tsv")
+    processed_file <- read_q2taxa(taxonomy_file_path)
+  } else if (file_format == "NewickDirectoryFormat") {
+    tree_file_path <- file.path(temp, file_uuid, "data/tree.nwk")
+    processed_file <- read_tree(tree_file_path)
+  } else if (file_format == "DNASequencesDirectoryFormat") {
+    dna_sequences_file_path <- file.path(temp, file_uuid, "data/dna-sequences.fasta")
+    processed_file <- readDNAStringSet(dna_sequences_file_path)
+  } else {
+    stop(
+      "Only files in format of 'BIOMV210DirFmt' ",
+      "'TSVTaxonomyDirectoryFormat', ",
+      "'NewickDirectoryFormat' and 'DNASequencesDirectoryFormat'",
+      " are supported."
+    )
   }
-
-  stop("Unsupported format or file not found: ", metadata$format)
+
+  message("File processed successfully!")
+  return(processed_file)
 }
 
 
@@ -222,116 +161,11 @@ read_qza <- function(file) {
 #' @examples
 #' # Please replace 'path_to_your_file.biom' with your actual file path
 #' # feature_table <- read_q2biom('path_to_your_file.biom')
-read_q2biom <- function(biom_file) {
-  message("\n=== BIOM File Reading Debug Info ===")
-  message("BIOM file path: ", biom_file)
-  message("File exists: ", file.exists(biom_file))
-  message("File size: ", file.size(biom_file), " bytes")
-
-  # Check file content type
-  file_header <- readBin(biom_file, "raw", n = 10)
-  message("File header (hex): ", paste(as.hexmode(file_header), collapse = " "))
-
-  # Try biomformat first
-  biom_data <- tryCatch({
-    message("\nAttempting to read with biomformat...")
-    biom <- biomformat::read_biom(biom_file)
-    message("BIOM object class: ", class(biom))
-    message("BIOM object structure:")
-    str(biom)
-
-    # Convert to matrix and then data frame
-    mat <- as.matrix(biom)
-    message("\nMatrix dimensions: ", nrow(mat), " x ", ncol(mat))
-    message("Sample of matrix data:")
-    print(mat[1:min(5, nrow(mat)), 1:min(5, ncol(mat))])
-
-    df <- as.data.frame(mat)
-    message("Successfully converted to data frame")
-    return(df)
-  }, error = function(e) {
-    message("biomformat reading failed with error: ", e$message)
-    message("Error class: ", class(e))
-    message("Full error:")
-    print(e)
-    return(NULL)
-  })
-
-  if (!is.null(biom_data)) {
-    message("Successfully read BIOM file with biomformat")
-    return(biom_data)
-  }
-
-  # Try HDF5
-  if (requireNamespace("rhdf5", quietly = TRUE)) {
-    message("\nAttempting to read as HDF5...")
-    rhdf5::h5closeAll()
-
-    # Check if file is HDF5
-    is_hdf5 <- tryCatch({
-      rhdf5::H5Fis_hdf5(biom_file)
-    }, error = function(e) {
-      message("HDF5 format check failed: ", e$message)
-      FALSE
-    })
-
-    message("Is HDF5 format: ", is_hdf5)
-
-    if (is_hdf5) {
-      # List HDF5 structure
-      message("HDF5 file structure:")
-      print(rhdf5::h5ls(biom_file))
-
-      hdf5_data <- tryCatch({
-        mat <- rhdf5::h5read(biom_file, "observation/matrix")
-        message("Matrix dimensions: ", nrow(mat), " x ", ncol(mat))
-
-        obs_ids <- rhdf5::h5read(biom_file, "observation/ids")
-        message("Number of observation IDs: ", length(obs_ids))
-        message("Sample observation IDs: ", paste(head(obs_ids), collapse = ", "))
-
-        sam_ids <- rhdf5::h5read(biom_file, "sample/ids")
-        message("Number of sample IDs: ", length(sam_ids))
-        message("Sample sample IDs: ", paste(head(sam_ids), collapse = ", "))
-
-        df <- as.data.frame(mat)
-        colnames(df) <- sam_ids
-        rownames(df) <- obs_ids
-
-        message("Successfully created data frame from HDF5")
-        return(df)
-      }, error = function(e) {
-        message("HDF5 reading failed with error: ", e$message)
-        message("Error class: ", class(e))
-        message("Full error:")
-        print(e)
-        return(NULL)
-      }, finally = {
-        rhdf5::h5closeAll()
-      })
-
-      if (!is.null(hdf5_data)) {
-        return(hdf5_data)
-      }
-    }
-  }
-
-  # Try reading as JSON text
-  message("\nAttempting to read as JSON text...")
-  json_data <- tryCatch({
-    json_text <- readLines(biom_file, warn = FALSE)
-    message("First few lines of file:")
-    print(head(json_text))
-
-    # Add JSON parsing here if needed
-
-  }, error = function(e) {
-    message("JSON reading failed: ", e$message)
-    return(NULL)
-  })
-
-  message("\n=== End of BIOM File Reading Debug Info ===\n")
-  stop("Failed to read BIOM file in any format")
+read_q2biom <- function(file) {
+  biomobj <- read_biom(file)
+  feature_tab <- as(biom_data(biomobj), "matrix")
+
+  return(feature_tab)
 }
 
 #' Read qiime2 taxa file
@@ -405,55 +239,4 @@ parse_q2taxonomy <- function(taxa, sep = "; |;", trim_rank_prefix = TRUE) {
   return(as.matrix(taxa))
 }
 
-read_qza_taxonomy <- function(taxa_qza) {
-  # Extract taxonomy file
-  unzip_dir <- tempfile()
-  dir.create(unzip_dir)
-  unzip(taxa_qza, exdir = unzip_dir)
-
-  # Find taxonomy file
-  taxa_file <- list.files(unzip_dir, 
-                         pattern = "taxonomy.tsv$", 
-                         recursive = TRUE, 
-                         full.names = TRUE)[1]
-
-  if (is.na(taxa_file)) {
-    stop("Cannot find taxonomy.tsv file in the qza archive")
-  }
-
-  message("Found taxonomy file: ", taxa_file)
-
-  # Read taxonomy data
-  taxa_data <- read.table(taxa_file, 
-                         sep = "\t", 
-                         header = TRUE, 
-                         quote = "", 
-                         comment.char = "",
-                         stringsAsFactors = FALSE)
-
-  message("Found columns: ", paste(colnames(taxa_data), collapse = ", "))
-
-  # Ensure Feature.ID column exists
-  if (!"Feature.ID" %in% colnames(taxa_data)) {
-    # Try common alternative names
-    alt_names <- c("Feature ID", "#OTU ID", "ID", "OTU", "ASV")
-    for (name in alt_names) {
-      if (name %in% colnames(taxa_data)) {
-        taxa_data$Feature.ID <- taxa_data[[name]]
-        break
-      }
-    }
-
-    # If still no Feature.ID, use first column
-    if (!"Feature.ID" %in% colnames(taxa_data)) {
-      taxa_data$Feature.ID <- taxa_data[[1]]
-    }
-  }
-
-  # Clean up temporary directory
-  unlink(unzip_dir, recursive = TRUE)
-
-  return(taxa_data)
-}
-