Add feature selection to barplot and areaplot functions with post-selection renormalization

This update enhances the barplot and areaplot functions by introducing a 'features.plot' parameter, allowing users to visualize specific features of interest.

Key changes:
- Added 'features.plot' parameter to both barplot and areaplot functions
- Implemented feature selection logic based on 'features.plot' input
- Integrated renormalization step after feature selection

Important note: When specific features are selected using 'features.plot', the resulting subset of data undergoes renormalization. This ensures that the visualizations accurately represent the proportions within the selected feature set.
cafferychen777 · Aug 30, 2024 · 45c573f · 45c573f
1 parent 3d041d1
commit 45c573f
Show file tree

Hide file tree

Showing 4 changed files with 172 additions and 22 deletions.
diff --git a/R/generate_taxa_areaplot_long.R b/R/generate_taxa_areaplot_long.R
@@ -18,6 +18,8 @@
 #'                     which will use the original taxon identifiers. Multiple levels can be specified
 #'                     and data will be plotted separately for each. **Cannot be NULL, as NULL value
 #'                     will lead to errors.** Default is "original".
+#' @param features.plot A character vector of feature names to plot. The names must match the
+#' row names of the feature table at the chosen `feature.level` (e.g. genus names when
+#' `feature.level = "Genus"`). Default is NULL, in which case no subsetting by feature name is applied.
 #' @param feature.dat.type The type of the feature data, which determines how the data is handled in downstream analyses.
 #' Should be one of:
 #' - "count": Raw count data, will be normalized by the function.
@@ -111,6 +113,24 @@
 #'   pdf = TRUE,
 #'   file.ann = NULL
 #' )
+#' generate_taxa_areaplot_long(
+#'   data.obj = ecam.obj,
+#'   subject.var = "studyid",
+#'   time.var = "month_num",
+#'   group.var = "delivery",
+#'   strata.var = "diet",
+#'   feature.level = c("Genus"),
+#'   feature.dat.type = "proportion",
+#'   feature.number = 20,
+#'   features.plot = unique(ecam.obj$feature.ann[,"Genus"])[1:15],
+#'   t0.level = NULL,
+#'   ts.levels = NULL,
+#'   base.size = 10,
+#'   theme.choice = "bw",
+#'   palette = NULL,
+#'   pdf = TRUE,
+#'   file.ann = NULL
+#' )
 #' data(subset_T2D.obj)
 #' generate_taxa_areaplot_long(
 #'   data.obj = subset_T2D.obj,
@@ -162,6 +182,7 @@ generate_taxa_areaplot_long <-
            feature.level = "original",
            feature.dat.type = c("count", "proportion", "other"),
            feature.number = 20,
+           features.plot = NULL,
            t0.level = NULL,
            ts.levels = NULL,
            base.size = 10,
@@ -249,6 +270,10 @@ generate_taxa_areaplot_long <-
         otu_tax_agg <- data.obj$feature.tab
       }
 
+      if (!is.null(features.plot)){
+        otu_tax_agg <- otu_tax_agg[features.plot,]
+      }
+
       otu_tax_agg <-  otu_tax_agg %>%
         as.data.frame() %>%
         rownames_to_column(feature.level)

diff --git a/R/generate_taxa_barplot_long.R b/R/generate_taxa_barplot_long.R
@@ -16,6 +16,8 @@
 #' column names in feature.ann. Multiple columns can be provided, and data will be plotted separately
 #' for each column. Default is NULL, which defaults to all columns in feature.ann if `features.plot`
 #' is also NULL.
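+#' @param features.plot A character vector of feature names to plot. The names must match the
+#' row names of the feature table at the chosen `feature.level` (e.g. genus names when
+#' `feature.level = "Genus"`); `NA` entries are dropped. After selection, each sample is
+#' renormalized so that the selected features sum to 1. Default is NULL, in which case no
+#' subsetting by feature name is applied.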
 #' @param feature.dat.type The type of the feature data, which determines how the data is handled in downstream analyses.
 #' Should be one of:
 #' - "count": Raw count data, will be normalized by the function.
@@ -97,6 +99,41 @@
 #'   file.ann = NULL
 #' )
 #'
+#' generate_taxa_barplot_long(
+#'   data.obj = ecam.obj,
+#'   subject.var = "studyid",
+#'   time.var = "month_num",
+#'   group.var = "delivery",
+#'   strata.var = "diet",
+#'   feature.level = "Genus",
+#'   feature.dat.type = "proportion",
+#'   feature.number = 30,
+#'   t0.level = NULL,
+#'   ts.levels = NULL,
+#'   theme.choice = "bw",
+#'   palette = NULL,
+#'   pdf = TRUE,
+#'   file.ann = NULL
+#' )
+#'
+#' generate_taxa_barplot_long(
+#'   data.obj = ecam.obj,
+#'   subject.var = "studyid",
+#'   time.var = "month_num",
+#'   group.var = "delivery",
+#'   strata.var = "diet",
+#'   feature.level = "Genus",
+#'   feature.dat.type = "proportion",
+#'   features.plot = unique(ecam.obj$feature.ann[,"Genus"])[1:10],
+#'   feature.number = 30,
+#'   t0.level = NULL,
+#'   ts.levels = NULL,
+#'   theme.choice = "bw",
+#'   palette = NULL,
+#'   pdf = TRUE,
+#'   file.ann = NULL
+#' )
+#'
 #' data(subset_T2D.obj)
 #' generate_taxa_barplot_long(
 #'   data.obj = subset_T2D.obj,
@@ -116,6 +153,26 @@
 #'   pdf.wid = 49,
 #'   file.ann = NULL
 #' )
+#'
+#' generate_taxa_barplot_long(
+#'   data.obj = subset_T2D.obj,
+#'   subject.var = "subject_id",
+#'   time.var = "visit_number_num",
+#'   group.var = "subject_race",
+#'   strata.var = "subject_gender",
+#'   feature.level = c("Genus"),
+#'   feature.dat.type = "count",
+#'   features.plot = unique(subset_T2D.obj$feature.ann[,"Genus"][1:6]),
+#'   feature.number = 40,
+#'   t0.level = NULL,
+#'   ts.levels = NULL,
+#'   base.size = 10,
+#'   theme.choice = "bw",
+#'   palette = NULL,
+#'   pdf = TRUE,
+#'   pdf.wid = 49,
+#'   file.ann = NULL
+#' )
 #' }
 #' @export
 generate_taxa_barplot_long <-
@@ -127,6 +184,7 @@ generate_taxa_barplot_long <-
            feature.level = "original",
            feature.dat.type = c("count", "proportion", "other"),
            feature.number = 20,
+           features.plot = NULL,
            t0.level = NULL,
            ts.levels = NULL,
            base.size = 10,
@@ -216,6 +274,11 @@ generate_taxa_barplot_long <-
         otu_tax_agg <- data.obj$feature.tab
       }
 
+      if (!is.null(features.plot)){
+        otu_tax_agg <- otu_tax_agg[na.omit(features.plot),]
+        otu_tax_agg <- apply(otu_tax_agg, 2, function(x) x / sum(x))
+      }
+
       otu_tax_agg <-  otu_tax_agg %>%
         as.data.frame() %>%
         rownames_to_column(feature.level)
@@ -261,26 +324,26 @@ generate_taxa_barplot_long <-
 
       sorted_merged_long_df <- sorted_merged_long_df %>% dplyr::mutate(!!sym(feature.level) := as.factor(!!sym(feature.level)))
 
-      # 计算每个特征的平均值并排序
+      # Calculate the average value of each feature and sort them.
       df_sorted <- sorted_merged_long_df %>%
         dplyr::group_by(!!sym(feature.level)) %>%
         dplyr::summarise(overall_mean = mean(value, na.rm = TRUE)) %>%
         dplyr::mutate(is_other = ifelse(!!sym(feature.level) == "Other", TRUE, FALSE)) %>%
         dplyr::arrange(is_other, overall_mean) %>%
         dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))
 
-      # 更新 new_levels
+      # Update new_levels
       if (!is.na(other.abund.cutoff)) {
         new_levels <- c("Other", setdiff(levels(df_sorted[[feature.level]]), "Other"))
       } else {
         new_levels <- levels(df_sorted[[feature.level]])
       }
 
-      # 应用新的排序
+      # Apply new sorting
       sorted_merged_long_df <- sorted_merged_long_df %>%
         dplyr::mutate(!!sym(feature.level) := factor(!!sym(feature.level), levels = new_levels))
 
-      # 修改 df 的创建
+      # Modify the creation of df
       df <- sorted_merged_long_df %>%
         dplyr::group_by(sample) %>%
         dplyr::mutate(!!sym(feature.level) := factor(!!sym(feature.level), levels = new_levels)) %>%
@@ -291,7 +354,7 @@ generate_taxa_barplot_long <-
         dplyr::mutate(next_cumulative_value = dplyr::if_else(sample %in% last_sample_ids$last_sample_id, NA_real_, dplyr::lead(cumulative_value))) %>%
         dplyr::ungroup()
 
-      # 更新颜色调色板
+      # Update color palette
       color_pal <- setNames(pal[1:length(new_levels)], new_levels)
 
       bar_width <- 0.6

diff --git a/R/generate_taxa_barplot_pair.R b/R/generate_taxa_barplot_pair.R
@@ -14,6 +14,8 @@
 #' column names in feature.ann. Multiple columns can be provided, and data will be plotted separately
 #' for each column. Default is NULL, which defaults to all columns in feature.ann if `features.plot`
 #' is also NULL.
+#' @param features.plot A character vector specifying which feature IDs (e.g. OTU IDs) to plot.
+#' Default is NULL, in which case features will be selected based on `top.k.plot` and `top.k.func`.
 #' @param feature.dat.type The type of the feature data, which determines how the data is handled in downstream analyses.
 #' Should be one of:
 #' - "count": Raw count data, will be normalized by the function.
@@ -121,6 +123,7 @@ generate_taxa_barplot_pair <-
            feature.level = "original",
            feature.dat.type = c("count", "proportion", "other"),
            feature.number = 20,
+           features.plot = NULL,
            base.size = 10,
            theme.choice = "bw",
            custom.theme = NULL,

diff --git a/R/generate_taxa_barplot_single.R b/R/generate_taxa_barplot_single.R
@@ -16,6 +16,8 @@
 #' column names in feature.ann. Multiple columns can be provided, and data will be plotted separately
 #' for each column. Default is NULL, which defaults to all columns in feature.ann if `features.plot`
 #' is also NULL.
+#' @param features.plot A character vector of feature names to plot. The names must match the
+#' row names of the feature table at the chosen `feature.level` (e.g. family names when
+#' `feature.level = "Family"`). Default is NULL, in which case no subsetting by feature name is applied.
 #' @param feature.dat.type The type of the feature data, which determines how the data is handled in downstream analyses.
 #' Should be one of:
 #' - "count": Raw count data, will be normalized by the function.
@@ -111,6 +113,7 @@
 #'   pdf.wid = 11,
 #'   pdf.hei = 8.5
 #' )
+#'
 #' data("subset_T2D.obj")
 #' generate_taxa_barplot_single(
 #'   data.obj = subset_T2D.obj,
@@ -150,6 +153,60 @@
 #'   pdf.wid = 11,
 #'   pdf.hei = 8.5
 #' )
+#' data(ecam.obj)
+#' generate_taxa_barplot_single(
+#'   data.obj = ecam.obj,
+#'   time.var = "month",
+#'   group.var = "antiexposedall",
+#'   strata.var = NULL,
+#'   feature.level = c("Phylum", "Family", "Genus"),
+#'   feature.dat.type = "proportion",
+#'   feature.number = 10,
+#'   base.size = 10,
+#'   theme.choice = "bw",
+#'   custom.theme = NULL,
+#'   palette = NULL,
+#'   pdf = TRUE,
+#'   file.ann = NULL,
+#'   pdf.wid = 11,
+#'   pdf.hei = 8.5
+#' )
+#' generate_taxa_barplot_single(
+#'   data.obj = ecam.obj,
+#'   time.var = "month",
+#'   t.level = "0",
+#'   group.var = "antiexposedall",
+#'   strata.var = NULL,
+#'   feature.level = c("Phylum", "Family", "Genus"),
+#'   feature.dat.type = "proportion",
+#'   feature.number = 10,
+#'   base.size = 10,
+#'   theme.choice = "bw",
+#'   custom.theme = NULL,
+#'   palette = NULL,
+#'   pdf = TRUE,
+#'   file.ann = NULL,
+#'   pdf.wid = 11,
+#'   pdf.hei = 8.5
+#' )
+#' generate_taxa_barplot_single(
+#'   data.obj = ecam.obj,
+#'   time.var = "month",
+#'   t.level = "0",
+#'   group.var = "antiexposedall",
+#'   strata.var = NULL,
+#'   feature.level = c("Family"),
+#'   feature.dat.type = "proportion",
+#'   features.plot = unique(ecam.obj$feature.ann[,"Family"])[1:12],
+#'   base.size = 10,
+#'   theme.choice = "bw",
+#'   custom.theme = NULL,
+#'   palette = NULL,
+#'   pdf = TRUE,
+#'   file.ann = NULL,
+#'   pdf.wid = 11,
+#'   pdf.hei = 8.5
+#' )
 #' }
 #' @export
 generate_taxa_barplot_single <-
@@ -161,6 +218,7 @@ generate_taxa_barplot_single <-
            strata.var = NULL,
            feature.level = "original",
            feature.dat.type = c("count", "proportion", "other"),
+           features.plot = NULL,
            feature.number = 20,
            base.size = 10,
            theme.choice = "bw",
@@ -242,13 +300,14 @@ generate_taxa_barplot_single <-
       )
       data.obj <- mStat_normalize_data(data.obj, method = "TSS")$data.obj.norm
     } else if (feature.dat.type == "proportion"){
-
+      data.obj <- mStat_normalize_data(data.obj, method = "TSS")$data.obj.norm
     } else if (feature.dat.type == "other"){
       stop("The 'other' type is suitable for situations where the user has analyzed the data using a method not provided in 'mStat_normalize_data' method, and the 'areaplot' is only applicable to raw data that has not undergone any processing or proportion data that adds up to 1. If you believe your data falls into these two categories, please modify 'feature.dat.type'.")
     }
 
     plot_list_all <- lapply(feature.level,function(feature.level){
 
+
       if (is.null(data.obj$feature.agg.list[[feature.level]]) & feature.level != "original"){
         data.obj <- mStat_aggregate_by_taxonomy(data.obj = data.obj, feature.level = feature.level)
       }
@@ -259,6 +318,10 @@ generate_taxa_barplot_single <-
         otu_tax_agg <- data.obj$feature.tab
       }
 
+      if (!is.null(features.plot)){
+        otu_tax_agg <- otu_tax_agg[features.plot,]
+      }
+
       otu_tax_agg <-  otu_tax_agg %>%
         as.data.frame() %>%
         rownames_to_column(feature.level)
@@ -351,15 +414,15 @@ generate_taxa_barplot_single <-
       result <- midpoints
 
       df_sorted <- df %>%
-        group_by(!!sym(feature.level)) %>%
-        summarise(overall_mean = mean(value, na.rm = TRUE)) %>%
-        mutate(is_other = ifelse(!!sym(feature.level) == "Other", FALSE, TRUE)) %>%
-        arrange(is_other, overall_mean) %>%
-        mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))
+        dplyr::group_by(!!sym(feature.level)) %>%
+        dplyr::summarise(overall_mean = mean(value, na.rm = TRUE)) %>%
+        dplyr::mutate(is_other = ifelse(!!sym(feature.level) == "Other", FALSE, TRUE)) %>%
+        dplyr::arrange(is_other, overall_mean) %>%
+        dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))
 
       # Apply sorted factor levels to the original data frame
       df <- df %>%
-        mutate(!!feature.level := factor(!!sym(feature.level), levels = levels(df_sorted[[feature.level]])))
+        dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = levels(df_sorted[[feature.level]])))
 
       stack_barplot_indiv  <- # Main plot code
         df %>%
@@ -441,15 +504,15 @@ generate_taxa_barplot_single <-
       }
 
       df_average_sorted <- df_average %>%
-        group_by(!!sym(feature.level)) %>%
-        summarise(overall_mean = mean(mean_value, na.rm = TRUE)) %>%
-        mutate(is_other = ifelse(!!sym(feature.level) == "Other", FALSE, TRUE)) %>%
-        arrange(is_other, overall_mean) %>%
-        mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))
+        dplyr::group_by(!!sym(feature.level)) %>%
+        dplyr::summarise(overall_mean = mean(mean_value, na.rm = TRUE)) %>%
+        dplyr::mutate(is_other = ifelse(!!sym(feature.level) == "Other", FALSE, TRUE)) %>%
+        dplyr::arrange(is_other, overall_mean) %>%
+        dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))
 
       # Apply the sorted factor levels to the original data frame.
       df_average <- df_average %>%
-        mutate(!!feature.level := factor(!!sym(feature.level), levels = levels(df_average_sorted[[feature.level]])))
+        dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = levels(df_average_sorted[[feature.level]])))
 
       stack_barplot_average  <- # Main plot code
         df_average %>%
@@ -492,8 +555,6 @@ generate_taxa_barplot_single <-
 
       if (pdf) {
         pdf_name <- paste0("taxa_barplot_single",
-                           "_",
-                           "subject_", subject.var,
                            "_",
                            "time_", time.var,
                            "_",
@@ -516,8 +577,6 @@ generate_taxa_barplot_single <-
       # Save the stacked barplots as a PDF file
       if (pdf) {
         pdf_name <- paste0("taxa_barplot_single",
-                           "_",
-                           "subject_", subject.var,
                            "_",
                            "time_", time.var,
                            "_",