From 522461603bd2536bea8e4cfbcb139a6cfa47e8f4 Mon Sep 17 00:00:00 2001 From: Lisa Malins <49386619+lisakmalins@users.noreply.github.com> Date: Tue, 16 Jan 2024 11:00:47 -0800 Subject: [PATCH 1/3] fix: Handle filenames more robustly in run_DESeq_stats.R Fix for issue #81. In a nutshell, sample names are now extracted more precisely from filenames so that file naming can be more flexible. --- R_scripts/run_DESeq_stats.R | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/R_scripts/run_DESeq_stats.R b/R_scripts/run_DESeq_stats.R index 93637f7..4ef610f 100755 --- a/R_scripts/run_DESeq_stats.R +++ b/R_scripts/run_DESeq_stats.R @@ -128,21 +128,18 @@ complete_table <- complete_table[,-1] # OPTIONAL: importing the raw counts if (is.null(opt$raw_counts) == FALSE) { - raw_counts_table <- read.table(counts_file, header=FALSE, sep = "\t", quote = "") - raw_counts_table <- data.frame(raw_counts_table, - do.call(rbind, strsplit(as.character(raw_counts_table$V1),'_'))) - raw_counts_table$X2 <- as.numeric(as.character(raw_counts_table$X2)) - raw_counts_table <- t(raw_counts_table[,c("X2", "V2")]) - row.names(raw_counts_table) <- c("SAMPLE","RAW TOTAL") - colnames(raw_counts_table) <- raw_counts_table[1,] - raw_counts_table <- as.data.frame(raw_counts_table) - raw_counts_table <- raw_counts_table[-1,] + raw_counts_table <- read.delim(counts_file, header=FALSE, col.names=c("FILESTEM", "RAW_TOTAL"), quote = "") - # Need to subtract off the total number of annotations - raw_counts_table["ANNOTATION COUNT",] <- colSums(complete_table) - raw_counts_table["OTHER",] <- raw_counts_table[1,] - raw_counts_table[2,] + # Extract sample names from filestems + raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned", "", raw_counts_table$FILESTEM) - complete_table <- rbind(complete_table, raw_counts_table["OTHER",]) + # Subtract the annotation count from total to obtain unannotated count + raw_counts_table[, "ANNOTATION_COUNT"] <- colSums(complete_table) + raw_counts_table[, "OTHER"] <- raw_counts_table["RAW_TOTAL"] - raw_counts_table["ANNOTATION_COUNT"] + + # Append unannotated counts to complete table + other_counts_table <- t(data.frame("OTHER"=raw_counts_table[, "OTHER"], row.names=raw_counts_table[, "SAMPLE"])) + complete_table <- rbind(complete_table, other_counts_table) } # DESeq statistical calculations From 04de463ab7e231280bd1980e211b53b6c60dde59 Mon Sep 17 00:00:00 2001 From: Lisa Malins <49386619+lisakmalins@users.noreply.github.com> Date: Tue, 16 Jan 2024 12:00:31 -0800 Subject: [PATCH 2/3] fix: Handle filenames more robustly in Subsystems_DESeq_stats.R --- R_scripts/Subsystems_DESeq_stats.R | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/R_scripts/Subsystems_DESeq_stats.R b/R_scripts/Subsystems_DESeq_stats.R index e78f1c3..8f98297 100644 --- a/R_scripts/Subsystems_DESeq_stats.R +++ b/R_scripts/Subsystems_DESeq_stats.R @@ -185,23 +185,18 @@ l1_table[is.na(l1_table)] <- 0 # OPTIONAL: importing the raw counts cat ("Now importing raw counts, if provided.\n") if (is.null(opt$raw_counts) == FALSE) { - raw_counts_table <- read.table(counts_file, header=FALSE, sep = "\t", quote = "") - raw_counts_table <- data.frame(raw_counts_table, - do.call(rbind, strsplit(as.character(raw_counts_table$V1),'_'))) - raw_counts_table$X2 <- as.numeric(as.character(raw_counts_table$X2)) - raw_counts_table <- t(raw_counts_table[,c("X2", "V2")]) - row.names(raw_counts_table) <- c("SAMPLE","RAW TOTAL") - colnames(raw_counts_table) <- raw_counts_table[1,] - raw_counts_table <- as.data.frame(raw_counts_table) - raw_counts_table <- raw_counts_table[-1,] + raw_counts_table <- read.delim(counts_file, header=FALSE, col.names=c("FILESTEM", "RAW_TOTAL"), quote = "") - # Need to subtract off the total number of annotations - raw_counts_table["ANNOTATION COUNT",] <- colSums(l1_table) - raw_counts_table["OTHER",] <- raw_counts_table[1,] - raw_counts_table[2,] - - l1_table <- rbind(l1_table, raw_counts_table["OTHER",]) - l1_names <- c(l1_names, "OTHER") - rownames(l1_table) <- l1_names + # Extract sample names from filestems + raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned", "", raw_counts_table$FILESTEM) + + # Subtract the annotation count from total to obtain unannotated count + raw_counts_table[, "ANNOTATION_COUNT"] <- colSums(l1_table) + raw_counts_table[, "OTHER"] <- raw_counts_table["RAW_TOTAL"] - raw_counts_table["ANNOTATION_COUNT"] + + # Append unannotated counts to complete table + other_counts_table <- t(data.frame("OTHER"=raw_counts_table[, "OTHER"], row.names=raw_counts_table[, "SAMPLE"])) + l1_table <- rbind(l1_table, other_counts_table) } # now the DESeq stuff From ffb10a48142d255c7fa5606fbefbd266cf26785e Mon Sep 17 00:00:00 2001 From: transcript Date: Sat, 27 Jan 2024 10:38:52 -0600 Subject: [PATCH 3/3] modified: R_scripts/Subsystems_DESeq_stats.R modified: R_scripts/run_DESeq_stats.R modified: sample_files_paired-end/.DS_Store --- R_scripts/Subsystems_DESeq_stats.R | 14 ++++++++------ R_scripts/run_DESeq_stats.R | 12 +++++++----- sample_files_paired-end/.DS_Store | Bin 10244 -> 10244 bytes 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/R_scripts/Subsystems_DESeq_stats.R b/R_scripts/Subsystems_DESeq_stats.R index 8f98297..91e729a 100644 --- a/R_scripts/Subsystems_DESeq_stats.R +++ b/R_scripts/Subsystems_DESeq_stats.R @@ -55,22 +55,24 @@ control_files <- list.files( pattern = "control_*", full.names = T, recursive = FALSE) control_names = "" for (name in control_files) { - control_names <- c(control_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])} + control_names <- c(control_names, unlist(strsplit(name, split='.', fixed=TRUE))[2])} control_names <- control_names[-1] control_names_trimmed = "" for (name in control_names) { - control_names_trimmed <- c(control_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])} + spl <- regexpr("_",name) + control_names_trimmed <- c(control_names_trimmed, substring(name,c(1,spl+1),c(spl-1,nchar(name)))[2])} control_names_trimmed <- control_names_trimmed[-1] exp_files <- list.files( - pattern = "experiment_*", full.names = T, recursive = FALSE) + pattern = "experimental_*", full.names = T, recursive = FALSE) exp_names = "" for (name in exp_files) { - exp_names <- c(exp_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])} + exp_names <- c(exp_names, unlist(strsplit(name, split='.', fixed=TRUE))[2])} exp_names <- exp_names[-1] exp_names_trimmed = "" for (name in exp_names) { - exp_names_trimmed <- c(exp_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])} + spl <- regexpr("_",name) + exp_names_trimmed <- c(exp_names_trimmed, substring(name,c(1,spl+1),c(spl-1,nchar(name)))[2])} exp_names_trimmed <- exp_names_trimmed[-1] # sanity check @@ -188,7 +190,7 @@ if (is.null(opt$raw_counts) == FALSE) { raw_counts_table <- read.delim(counts_file, header=FALSE, col.names=c("FILESTEM", "RAW_TOTAL"), quote = "") # Extract sample names from filestems - raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned", "", raw_counts_table$FILESTEM) + raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned.fastq", "", raw_counts_table$FILESTEM) # Subtract the annotation count from total to obtain unannotated count raw_counts_table[, "ANNOTATION_COUNT"] <- colSums(l1_table) diff --git a/R_scripts/run_DESeq_stats.R b/R_scripts/run_DESeq_stats.R index 4ef610f..a7463b7 100755 --- a/R_scripts/run_DESeq_stats.R +++ b/R_scripts/run_DESeq_stats.R @@ -48,22 +48,24 @@ control_files <- list.files( pattern = "control_*", full.names = T, recursive = FALSE) control_names = "" for (name in control_files) { - control_names <- c(control_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])} + control_names <- c(control_names, unlist(strsplit(name, split='.', fixed=TRUE))[2])} control_names <- control_names[-1] control_names_trimmed = "" for (name in control_names) { - control_names_trimmed <- c(control_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])} + spl <- regexpr("_",name) + control_names_trimmed <- c(control_names_trimmed, substring(name,c(1,spl+1),c(spl-1,nchar(name)))[2])} control_names_trimmed <- control_names_trimmed[-1] exp_files <- list.files( pattern = "experimental_*", full.names = T, recursive = FALSE) exp_names = "" for (name in exp_files) { - exp_names <- c(exp_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])} + exp_names <- c(exp_names, unlist(strsplit(name, split='.', fixed=TRUE))[2])} exp_names <- exp_names[-1] exp_names_trimmed = "" for (name in exp_names) { - exp_names_trimmed <- c(exp_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])} + spl <- regexpr("_",name) + exp_names_trimmed <- c(exp_names_trimmed, substring(name,c(1,spl+1),c(spl-1,nchar(name)))[2])} exp_names_trimmed <- exp_names_trimmed[-1] # sanity check @@ -131,7 +133,7 @@ if (is.null(opt$raw_counts) == FALSE) { raw_counts_table <- read.delim(counts_file, header=FALSE, col.names=c("FILESTEM", "RAW_TOTAL"), quote = "") # Extract sample names from filestems - raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned", "", raw_counts_table$FILESTEM) + raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned.fastq", "", raw_counts_table$FILESTEM) # Subtract the annotation count from total to obtain unannotated count raw_counts_table[, "ANNOTATION_COUNT"] <- colSums(complete_table) diff --git a/sample_files_paired-end/.DS_Store b/sample_files_paired-end/.DS_Store index f65874317ca1c14c495aec9a4d556cbcdf73d131..a2b9985a00f54da6846bbe4819ee55f6176ebbe9 100644 GIT binary patch delta 135 zcmZn(XbIS$BFc2sZt@2a_07JbQLG%t(mZAuxgT}}il|NgFU^A@u$flobB-_NafMN0iIjPMP MYIH