Skip to content

Commit

Permalink
Add feature selection to barplot and areaplot functions with post-sel…
Browse files Browse the repository at this point in the history
…ection renormalization

This update enhances the barplot and areaplot functions by introducing a 'features.plot' parameter, allowing users to visualize specific features of interest.

Key changes:

- Added 'features.plot' parameter to both barplot and areaplot functions

- Implemented feature selection logic based on 'features.plot' input

- Integrated renormalization step after feature selection

Important note: When specific features are selected using 'features.plot', the resulting subset of data undergoes renormalization. This ensures that the visualizations accurately represent the proportions within the selected feature set.
  • Loading branch information
cafferychen777 committed Aug 30, 2024
1 parent 3d041d1 commit 45c573f
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 22 deletions.
25 changes: 25 additions & 0 deletions R/generate_taxa_areaplot_long.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#' which will use the original taxon identifiers. Multiple levels can be specified
#' and data will be plotted separately for each. **Cannot be NULL, as NULL value
#' will lead to errors.** Default is "original".
#' @param features.plot A character vector of feature names to plot. Names must match the row names
#' of the feature table at the requested `feature.level` (e.g. genus names when
#' `feature.level = "Genus"`). Default is NULL, in which case no feature subsetting is applied.
#' @param feature.dat.type The type of the feature data, which determines how the data is handled in downstream analyses.
#' Should be one of:
#' - "count": Raw count data, will be normalized by the function.
Expand Down Expand Up @@ -111,6 +113,24 @@
#' pdf = TRUE,
#' file.ann = NULL
#' )
#' generate_taxa_areaplot_long(
#' data.obj = ecam.obj,
#' subject.var = "studyid",
#' time.var = "month_num",
#' group.var = "delivery",
#' strata.var = "diet",
#' feature.level = c("Genus"),
#' feature.dat.type = "proportion",
#' feature.number = 20,
#' features.plot = unique(ecam.obj$feature.ann[,"Genus"])[1:15],
#' t0.level = NULL,
#' ts.levels = NULL,
#' base.size = 10,
#' theme.choice = "bw",
#' palette = NULL,
#' pdf = TRUE,
#' file.ann = NULL
#' )
#' data(subset_T2D.obj)
#' generate_taxa_areaplot_long(
#' data.obj = subset_T2D.obj,
Expand Down Expand Up @@ -162,6 +182,7 @@ generate_taxa_areaplot_long <-
feature.level = "original",
feature.dat.type = c("count", "proportion", "other"),
feature.number = 20,
features.plot = NULL,
t0.level = NULL,
ts.levels = NULL,
base.size = 10,
Expand Down Expand Up @@ -249,6 +270,10 @@ generate_taxa_areaplot_long <-
otu_tax_agg <- data.obj$feature.tab
}

if (!is.null(features.plot)){
otu_tax_agg <- otu_tax_agg[features.plot,]
}

otu_tax_agg <- otu_tax_agg %>%
as.data.frame() %>%
rownames_to_column(feature.level)
Expand Down
73 changes: 68 additions & 5 deletions R/generate_taxa_barplot_long.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#' column names in feature.ann. Multiple columns can be provided, and data will be plotted separately
#' for each column. Default is "original", which uses the original feature identifiers.
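#' @param features.plot A character vector of feature names to plot. Names must match the row names
#' of the feature table at the requested `feature.level` (e.g. genus names when
#' `feature.level = "Genus"`). NA entries are dropped, and after subsetting each column of the
#' table is rescaled to sum to 1. Default is NULL, in which case no feature subsetting is applied.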
#' @param feature.dat.type The type of the feature data, which determines how the data is handled in downstream analyses.
#' Should be one of:
#' - "count": Raw count data, will be normalized by the function.
Expand Down Expand Up @@ -97,6 +99,41 @@
#' file.ann = NULL
#' )
#'
#' generate_taxa_barplot_long(
#' data.obj = ecam.obj,
#' subject.var = "studyid",
#' time.var = "month_num",
#' group.var = "delivery",
#' strata.var = "diet",
#' feature.level = "Genus",
#' feature.dat.type = "proportion",
#' feature.number = 30,
#' t0.level = NULL,
#' ts.levels = NULL,
#' theme.choice = "bw",
#' palette = NULL,
#' pdf = TRUE,
#' file.ann = NULL
#' )
#'
#' generate_taxa_barplot_long(
#' data.obj = ecam.obj,
#' subject.var = "studyid",
#' time.var = "month_num",
#' group.var = "delivery",
#' strata.var = "diet",
#' feature.level = "Genus",
#' feature.dat.type = "proportion",
#' features.plot = unique(ecam.obj$feature.ann[,"Genus"])[1:10],
#' feature.number = 30,
#' t0.level = NULL,
#' ts.levels = NULL,
#' theme.choice = "bw",
#' palette = NULL,
#' pdf = TRUE,
#' file.ann = NULL
#' )
#'
#' data(subset_T2D.obj)
#' generate_taxa_barplot_long(
#' data.obj = subset_T2D.obj,
Expand All @@ -116,6 +153,26 @@
#' pdf.wid = 49,
#' file.ann = NULL
#' )
#'
#' generate_taxa_barplot_long(
#' data.obj = subset_T2D.obj,
#' subject.var = "subject_id",
#' time.var = "visit_number_num",
#' group.var = "subject_race",
#' strata.var = "subject_gender",
#' feature.level = c("Genus"),
#' feature.dat.type = "count",
#' features.plot = unique(subset_T2D.obj$feature.ann[,"Genus"])[1:6],
#' feature.number = 40,
#' t0.level = NULL,
#' ts.levels = NULL,
#' base.size = 10,
#' theme.choice = "bw",
#' palette = NULL,
#' pdf = TRUE,
#' pdf.wid = 49,
#' file.ann = NULL
#' )
#' }
#' @export
generate_taxa_barplot_long <-
Expand All @@ -127,6 +184,7 @@ generate_taxa_barplot_long <-
feature.level = "original",
feature.dat.type = c("count", "proportion", "other"),
feature.number = 20,
features.plot = NULL,
t0.level = NULL,
ts.levels = NULL,
base.size = 10,
Expand Down Expand Up @@ -216,6 +274,11 @@ generate_taxa_barplot_long <-
otu_tax_agg <- data.obj$feature.tab
}

if (!is.null(features.plot)){
otu_tax_agg <- otu_tax_agg[na.omit(features.plot),]
otu_tax_agg <- apply(otu_tax_agg, 2, function(x) x / sum(x))
}

otu_tax_agg <- otu_tax_agg %>%
as.data.frame() %>%
rownames_to_column(feature.level)
Expand Down Expand Up @@ -261,26 +324,26 @@ generate_taxa_barplot_long <-

sorted_merged_long_df <- sorted_merged_long_df %>% dplyr::mutate(!!sym(feature.level) := as.factor(!!sym(feature.level)))

# 计算每个特征的平均值并排序
# Calculate the average value of each feature and sort them.
df_sorted <- sorted_merged_long_df %>%
dplyr::group_by(!!sym(feature.level)) %>%
dplyr::summarise(overall_mean = mean(value, na.rm = TRUE)) %>%
dplyr::mutate(is_other = ifelse(!!sym(feature.level) == "Other", TRUE, FALSE)) %>%
dplyr::arrange(is_other, overall_mean) %>%
dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))

# 更新 new_levels
# Update new_levels
if (!is.na(other.abund.cutoff)) {
new_levels <- c("Other", setdiff(levels(df_sorted[[feature.level]]), "Other"))
} else {
new_levels <- levels(df_sorted[[feature.level]])
}

# 应用新的排序
# Apply new sorting
sorted_merged_long_df <- sorted_merged_long_df %>%
dplyr::mutate(!!sym(feature.level) := factor(!!sym(feature.level), levels = new_levels))

# 修改 df 的创建
# Modify the creation of df
df <- sorted_merged_long_df %>%
dplyr::group_by(sample) %>%
dplyr::mutate(!!sym(feature.level) := factor(!!sym(feature.level), levels = new_levels)) %>%
Expand All @@ -291,7 +354,7 @@ generate_taxa_barplot_long <-
dplyr::mutate(next_cumulative_value = dplyr::if_else(sample %in% last_sample_ids$last_sample_id, NA_real_, dplyr::lead(cumulative_value))) %>%
dplyr::ungroup()

# 更新颜色调色板
# Update color palette
color_pal <- setNames(pal[1:length(new_levels)], new_levels)

bar_width <- 0.6
Expand Down
3 changes: 3 additions & 0 deletions R/generate_taxa_barplot_pair.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#' column names in feature.ann. Multiple columns can be provided, and data will be plotted separately
#' for each column. Default is "original".
#' @param features.plot A character vector specifying which feature IDs (e.g. OTU IDs) to plot.
#' Default is NULL, in which case no feature subset is requested.
#' @param feature.dat.type The type of the feature data, which determines how the data is handled in downstream analyses.
#' Should be one of:
#' - "count": Raw count data, will be normalized by the function.
Expand Down Expand Up @@ -121,6 +123,7 @@ generate_taxa_barplot_pair <-
feature.level = "original",
feature.dat.type = c("count", "proportion", "other"),
feature.number = 20,
features.plot = NULL,
base.size = 10,
theme.choice = "bw",
custom.theme = NULL,
Expand Down
93 changes: 76 additions & 17 deletions R/generate_taxa_barplot_single.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#' column names in feature.ann. Multiple columns can be provided, and data will be plotted separately
#' for each column. Default is "original", in which case no taxonomic aggregation is performed.
#' @param features.plot A character vector of feature names to plot. Names must match the row names
#' of the feature table at the requested `feature.level` (e.g. family names when
#' `feature.level = "Family"`). Default is NULL, in which case no feature subsetting is applied.
#' @param feature.dat.type The type of the feature data, which determines how the data is handled in downstream analyses.
#' Should be one of:
#' - "count": Raw count data, will be normalized by the function.
Expand Down Expand Up @@ -111,6 +113,7 @@
#' pdf.wid = 11,
#' pdf.hei = 8.5
#' )
#'
#' data("subset_T2D.obj")
#' generate_taxa_barplot_single(
#' data.obj = subset_T2D.obj,
Expand Down Expand Up @@ -150,6 +153,60 @@
#' pdf.wid = 11,
#' pdf.hei = 8.5
#' )
#' data(ecam.obj)
#' generate_taxa_barplot_single(
#' data.obj = ecam.obj,
#' time.var = "month",
#' group.var = "antiexposedall",
#' strata.var = NULL,
#' feature.level = c("Phylum", "Family", "Genus"),
#' feature.dat.type = "proportion",
#' feature.number = 10,
#' base.size = 10,
#' theme.choice = "bw",
#' custom.theme = NULL,
#' palette = NULL,
#' pdf = TRUE,
#' file.ann = NULL,
#' pdf.wid = 11,
#' pdf.hei = 8.5
#' )
#' generate_taxa_barplot_single(
#' data.obj = ecam.obj,
#' time.var = "month",
#' t.level = "0",
#' group.var = "antiexposedall",
#' strata.var = NULL,
#' feature.level = c("Phylum", "Family", "Genus"),
#' feature.dat.type = "proportion",
#' feature.number = 10,
#' base.size = 10,
#' theme.choice = "bw",
#' custom.theme = NULL,
#' palette = NULL,
#' pdf = TRUE,
#' file.ann = NULL,
#' pdf.wid = 11,
#' pdf.hei = 8.5
#' )
#' generate_taxa_barplot_single(
#' data.obj = ecam.obj,
#' time.var = "month",
#' t.level = "0",
#' group.var = "antiexposedall",
#' strata.var = NULL,
#' feature.level = c("Family"),
#' feature.dat.type = "proportion",
#' features.plot = unique(ecam.obj$feature.ann[,"Family"])[1:12],
#' base.size = 10,
#' theme.choice = "bw",
#' custom.theme = NULL,
#' palette = NULL,
#' pdf = TRUE,
#' file.ann = NULL,
#' pdf.wid = 11,
#' pdf.hei = 8.5
#' )
#' }
#' @export
generate_taxa_barplot_single <-
Expand All @@ -161,6 +218,7 @@ generate_taxa_barplot_single <-
strata.var = NULL,
feature.level = "original",
feature.dat.type = c("count", "proportion", "other"),
features.plot = NULL,
feature.number = 20,
base.size = 10,
theme.choice = "bw",
Expand Down Expand Up @@ -242,13 +300,14 @@ generate_taxa_barplot_single <-
)
data.obj <- mStat_normalize_data(data.obj, method = "TSS")$data.obj.norm
} else if (feature.dat.type == "proportion"){

data.obj <- mStat_normalize_data(data.obj, method = "TSS")$data.obj.norm
} else if (feature.dat.type == "other"){
stop("The 'other' type is suitable for situations where the user has analyzed the data using a method not provided in 'mStat_normalize_data' method, and the 'areaplot' is only applicable to raw data that has not undergone any processing or proportion data that adds up to 1. If you believe your data falls into these two categories, please modify 'feature.dat.type'.")
}

plot_list_all <- lapply(feature.level,function(feature.level){


if (is.null(data.obj$feature.agg.list[[feature.level]]) & feature.level != "original"){
data.obj <- mStat_aggregate_by_taxonomy(data.obj = data.obj, feature.level = feature.level)
}
Expand All @@ -259,6 +318,10 @@ generate_taxa_barplot_single <-
otu_tax_agg <- data.obj$feature.tab
}

if (!is.null(features.plot)){
otu_tax_agg <- otu_tax_agg[features.plot,]
}

otu_tax_agg <- otu_tax_agg %>%
as.data.frame() %>%
rownames_to_column(feature.level)
Expand Down Expand Up @@ -351,15 +414,15 @@ generate_taxa_barplot_single <-
result <- midpoints

df_sorted <- df %>%
group_by(!!sym(feature.level)) %>%
summarise(overall_mean = mean(value, na.rm = TRUE)) %>%
mutate(is_other = ifelse(!!sym(feature.level) == "Other", FALSE, TRUE)) %>%
arrange(is_other, overall_mean) %>%
mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))
dplyr::group_by(!!sym(feature.level)) %>%
dplyr::summarise(overall_mean = mean(value, na.rm = TRUE)) %>%
dplyr::mutate(is_other = ifelse(!!sym(feature.level) == "Other", FALSE, TRUE)) %>%
dplyr::arrange(is_other, overall_mean) %>%
dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))

# Apply sorted factor levels to the original data frame
df <- df %>%
mutate(!!feature.level := factor(!!sym(feature.level), levels = levels(df_sorted[[feature.level]])))
dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = levels(df_sorted[[feature.level]])))

stack_barplot_indiv <- # Main plot code
df %>%
Expand Down Expand Up @@ -441,15 +504,15 @@ generate_taxa_barplot_single <-
}

df_average_sorted <- df_average %>%
group_by(!!sym(feature.level)) %>%
summarise(overall_mean = mean(mean_value, na.rm = TRUE)) %>%
mutate(is_other = ifelse(!!sym(feature.level) == "Other", FALSE, TRUE)) %>%
arrange(is_other, overall_mean) %>%
mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))
dplyr::group_by(!!sym(feature.level)) %>%
dplyr::summarise(overall_mean = mean(mean_value, na.rm = TRUE)) %>%
dplyr::mutate(is_other = ifelse(!!sym(feature.level) == "Other", FALSE, TRUE)) %>%
dplyr::arrange(is_other, overall_mean) %>%
dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = !!sym(feature.level)))

# Apply the sorted factor levels to the original data frame.
df_average <- df_average %>%
mutate(!!feature.level := factor(!!sym(feature.level), levels = levels(df_average_sorted[[feature.level]])))
dplyr::mutate(!!feature.level := factor(!!sym(feature.level), levels = levels(df_average_sorted[[feature.level]])))

stack_barplot_average <- # Main plot code
df_average %>%
Expand Down Expand Up @@ -492,8 +555,6 @@ generate_taxa_barplot_single <-

if (pdf) {
pdf_name <- paste0("taxa_barplot_single",
"_",
"subject_", subject.var,
"_",
"time_", time.var,
"_",
Expand All @@ -516,8 +577,6 @@ generate_taxa_barplot_single <-
# Save the stacked barplots as a PDF file
if (pdf) {
pdf_name <- paste0("taxa_barplot_single",
"_",
"subject_", subject.var,
"_",
"time_", time.var,
"_",
Expand Down

0 comments on commit 45c573f

Please sign in to comment.