From 522461603bd2536bea8e4cfbcb139a6cfa47e8f4 Mon Sep 17 00:00:00 2001
From: Lisa Malins <49386619+lisakmalins@users.noreply.github.com>
Date: Tue, 16 Jan 2024 11:00:47 -0800
Subject: [PATCH 1/3] fix: Handle filenames more robustly in run_DESeq_stats.R

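Fix for issue #81. When importing the optional raw counts file, sample
names are now obtained by stripping the "experimental_"/"control_" prefix
and the ".cleaned" suffix from each filestem. Previously the filestem was
split on underscores and its second field was coerced to a number, which
only worked for one specific file naming scheme.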
---
 R_scripts/run_DESeq_stats.R | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/R_scripts/run_DESeq_stats.R b/R_scripts/run_DESeq_stats.R
index 93637f7..4ef610f 100755
--- a/R_scripts/run_DESeq_stats.R
+++ b/R_scripts/run_DESeq_stats.R
@@ -128,21 +128,18 @@ complete_table <- complete_table[,-1]
 
 # OPTIONAL: importing the raw counts
 if (is.null(opt$raw_counts) == FALSE) {
-  raw_counts_table <- read.table(counts_file, header=FALSE, sep = "\t", quote = "")
-  raw_counts_table <- data.frame(raw_counts_table, 
-        do.call(rbind, strsplit(as.character(raw_counts_table$V1),'_')))
-  raw_counts_table$X2 <- as.numeric(as.character(raw_counts_table$X2))
-  raw_counts_table <- t(raw_counts_table[,c("X2", "V2")])
-  row.names(raw_counts_table) <- c("SAMPLE","RAW TOTAL")
-  colnames(raw_counts_table) <- raw_counts_table[1,]
-  raw_counts_table <- as.data.frame(raw_counts_table)
-  raw_counts_table <- raw_counts_table[-1,]
+  raw_counts_table <- read.delim(counts_file, header=FALSE, col.names=c("FILESTEM", "RAW_TOTAL"), quote = "")
   
-  # Need to subtract off the total number of annotations
-  raw_counts_table["ANNOTATION COUNT",] <- colSums(complete_table)
-  raw_counts_table["OTHER",] <- raw_counts_table[1,] - raw_counts_table[2,]
+  # Extract sample names from filestems
+  raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned", "", raw_counts_table$FILESTEM)
 
-  complete_table <- rbind(complete_table, raw_counts_table["OTHER",])
+  # Subtract the annotation count from total to obtain unannotated count
+  raw_counts_table[, "ANNOTATION_COUNT"] <- colSums(complete_table)
+  raw_counts_table[, "OTHER"] <- raw_counts_table["RAW_TOTAL"] - raw_counts_table["ANNOTATION_COUNT"]
+
+  # Append unannotated counts to complete table
+  other_counts_table <- t(data.frame("OTHER"=raw_counts_table[, "OTHER"], row.names=raw_counts_table[, "SAMPLE"]))
+  complete_table <- rbind(complete_table, other_counts_table)
 }
 
 # DESeq statistical calculations

From 04de463ab7e231280bd1980e211b53b6c60dde59 Mon Sep 17 00:00:00 2001
From: Lisa Malins <49386619+lisakmalins@users.noreply.github.com>
Date: Tue, 16 Jan 2024 12:00:31 -0800
Subject: [PATCH 2/3] fix: Handle filenames more robustly in
 Subsystems_DESeq_stats.R

---
 R_scripts/Subsystems_DESeq_stats.R | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/R_scripts/Subsystems_DESeq_stats.R b/R_scripts/Subsystems_DESeq_stats.R
index e78f1c3..8f98297 100644
--- a/R_scripts/Subsystems_DESeq_stats.R
+++ b/R_scripts/Subsystems_DESeq_stats.R
@@ -185,23 +185,18 @@ l1_table[is.na(l1_table)] <- 0
 # OPTIONAL: importing the raw counts
 cat ("Now importing raw counts, if provided.\n")
 if (is.null(opt$raw_counts) == FALSE) {
-  raw_counts_table <- read.table(counts_file, header=FALSE, sep = "\t", quote = "")
-  raw_counts_table <- data.frame(raw_counts_table, 
-        do.call(rbind, strsplit(as.character(raw_counts_table$V1),'_')))
-  raw_counts_table$X2 <- as.numeric(as.character(raw_counts_table$X2))
-  raw_counts_table <- t(raw_counts_table[,c("X2", "V2")])
-  row.names(raw_counts_table) <- c("SAMPLE","RAW TOTAL")
-  colnames(raw_counts_table) <- raw_counts_table[1,]
-  raw_counts_table <- as.data.frame(raw_counts_table)
-  raw_counts_table <- raw_counts_table[-1,]
+  raw_counts_table <- read.delim(counts_file, header=FALSE, col.names=c("FILESTEM", "RAW_TOTAL"), quote = "")
   
-  # Need to subtract off the total number of annotations
-  raw_counts_table["ANNOTATION COUNT",] <- colSums(l1_table)
-  raw_counts_table["OTHER",] <- raw_counts_table[1,] - raw_counts_table[2,]
-  
-  l1_table <- rbind(l1_table, raw_counts_table["OTHER",])
-  l1_names <- c(l1_names, "OTHER")
-  rownames(l1_table) <- l1_names
+  # Extract sample names from filestems
+  raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned", "", raw_counts_table$FILESTEM)
+
+  # Subtract the annotation count from total to obtain unannotated count
+  raw_counts_table[, "ANNOTATION_COUNT"] <- colSums(l1_table)
+  raw_counts_table[, "OTHER"] <- raw_counts_table["RAW_TOTAL"] - raw_counts_table["ANNOTATION_COUNT"]
+
+  # Append unannotated counts to complete table
+  other_counts_table <- t(data.frame("OTHER"=raw_counts_table[, "OTHER"], row.names=raw_counts_table[, "SAMPLE"]))
+  l1_table <- rbind(l1_table, other_counts_table)
 }
 
 # now the DESeq stuff

From ffb10a48142d255c7fa5606fbefbd266cf26785e Mon Sep 17 00:00:00 2001
From: transcript <swestreich@gmail.com>
Date: Sat, 27 Jan 2024 10:38:52 -0600
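Subject: [PATCH 3/3] fix: Keep underscores in sample names parsed from
 filenames

Sample names for control and experimental files are now taken as the text
after the first underscore in the second "."-delimited field of each file
path, instead of the second "_"-delimited field. The experimental file
pattern in Subsystems_DESeq_stats.R is changed from "experiment_*" to
"experimental_*", and the suffix stripped from raw-count filestems is
changed from ".cleaned" to ".cleaned.fastq".

Modified: R_scripts/Subsystems_DESeq_stats.R,
R_scripts/run_DESeq_stats.R, sample_files_paired-end/.DS_Store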

---
 R_scripts/Subsystems_DESeq_stats.R |  14 ++++++++------
 R_scripts/run_DESeq_stats.R        |  12 +++++++-----
 sample_files_paired-end/.DS_Store  | Bin 10244 -> 10244 bytes
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/R_scripts/Subsystems_DESeq_stats.R b/R_scripts/Subsystems_DESeq_stats.R
index 8f98297..91e729a 100644
--- a/R_scripts/Subsystems_DESeq_stats.R
+++ b/R_scripts/Subsystems_DESeq_stats.R
@@ -55,22 +55,24 @@ control_files <- list.files(
   pattern = "control_*", full.names = T, recursive = FALSE)
 control_names = ""
 for (name in control_files) {
-  control_names <- c(control_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])} 
+  control_names <- c(control_names, unlist(strsplit(name, split='.', fixed=TRUE))[2])}
 control_names <- control_names[-1]
 control_names_trimmed = ""
 for (name in control_names) {
-  control_names_trimmed <- c(control_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])}
+  spl <- regexpr("_",name)
+  control_names_trimmed <- c(control_names_trimmed, substring(name,c(1,spl+1),c(spl-1,nchar(name)))[2])}
 control_names_trimmed <- control_names_trimmed[-1]
 
 exp_files <- list.files(
-  pattern = "experiment_*", full.names = T, recursive = FALSE)
+  pattern = "experimental_*", full.names = T, recursive = FALSE)
 exp_names = ""
 for (name in exp_files) {
-  exp_names <- c(exp_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])}
+  exp_names <- c(exp_names, unlist(strsplit(name, split='.', fixed=TRUE))[2])}
 exp_names <- exp_names[-1]
 exp_names_trimmed = ""
 for (name in exp_names) {
-  exp_names_trimmed <- c(exp_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])}
+  spl <- regexpr("_",name)
+  exp_names_trimmed <- c(exp_names_trimmed, substring(name,c(1,spl+1),c(spl-1,nchar(name)))[2])}
 exp_names_trimmed <- exp_names_trimmed[-1]
 
 # sanity check
@@ -188,7 +190,7 @@ if (is.null(opt$raw_counts) == FALSE) {
   raw_counts_table <- read.delim(counts_file, header=FALSE, col.names=c("FILESTEM", "RAW_TOTAL"), quote = "")
   
   # Extract sample names from filestems
-  raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned", "", raw_counts_table$FILESTEM)
+  raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned.fastq", "", raw_counts_table$FILESTEM)
 
   # Subtract the annotation count from total to obtain unannotated count
   raw_counts_table[, "ANNOTATION_COUNT"] <- colSums(l1_table)
diff --git a/R_scripts/run_DESeq_stats.R b/R_scripts/run_DESeq_stats.R
index 4ef610f..a7463b7 100755
--- a/R_scripts/run_DESeq_stats.R
+++ b/R_scripts/run_DESeq_stats.R
@@ -48,22 +48,24 @@ control_files <- list.files(
   pattern = "control_*", full.names = T, recursive = FALSE)
 control_names = ""
 for (name in control_files) {
-  control_names <- c(control_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])}
+  control_names <- c(control_names, unlist(strsplit(name, split='.', fixed=TRUE))[2])}
 control_names <- control_names[-1]
 control_names_trimmed = ""
 for (name in control_names) {
-  control_names_trimmed <- c(control_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])}
+  spl <- regexpr("_",name)
+  control_names_trimmed <- c(control_names_trimmed, substring(name,c(1,spl+1),c(spl-1,nchar(name)))[2])}
 control_names_trimmed <- control_names_trimmed[-1]
 
 exp_files <- list.files(
   pattern = "experimental_*", full.names = T, recursive = FALSE)
 exp_names = ""
 for (name in exp_files) {
-  exp_names <- c(exp_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])}
+  exp_names <- c(exp_names, unlist(strsplit(name, split='.', fixed=TRUE))[2])}
 exp_names <- exp_names[-1]
 exp_names_trimmed = ""
 for (name in exp_names) {
-  exp_names_trimmed <- c(exp_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])}
+  spl <- regexpr("_",name)
+  exp_names_trimmed <- c(exp_names_trimmed, substring(name,c(1,spl+1),c(spl-1,nchar(name)))[2])}
 exp_names_trimmed <- exp_names_trimmed[-1]
 
 # sanity check
@@ -131,7 +133,7 @@ if (is.null(opt$raw_counts) == FALSE) {
   raw_counts_table <- read.delim(counts_file, header=FALSE, col.names=c("FILESTEM", "RAW_TOTAL"), quote = "")
   
   # Extract sample names from filestems
-  raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned", "", raw_counts_table$FILESTEM)
+  raw_counts_table[, "SAMPLE"] <- gsub("experimental_|control_|.cleaned.fastq", "", raw_counts_table$FILESTEM)
 
   # Subtract the annotation count from total to obtain unannotated count
   raw_counts_table[, "ANNOTATION_COUNT"] <- colSums(complete_table)
diff --git a/sample_files_paired-end/.DS_Store b/sample_files_paired-end/.DS_Store
index f65874317ca1c14c495aec9a4d556cbcdf73d131..a2b9985a00f54da6846bbe4819ee55f6176ebbe9 100644
GIT binary patch
delta 135
zcmZn(XbIS$BFc2sZt@2a_07JbQLG%t(mZAuxgT}}il|NgFU^A@u$f<aBO}uzfysd~
zI+NeY@UZ_{bMM=)iAN?g%38wNj<Qxvj0ux#Wv!WfnI_lCf>lobB-_NafMN0iIjPMP
M<fgD8?A*=-0B}Gy2LJ#7

delta 135
zcmZn(XbIS$BFc1R&*Tpx>YIH<qgXk(1u_E?_8xEqil|NgFU^A@u$f<aBO_Ck(d0lG
zoyqTHc-V`6h%Zv+KRB6D))LNkl(k}F*f6<P)|$zna&nz4SmoqTvQ11443ihgNo}4W
LH-!yh=XNFl<h?ME