stjude · robinpaul85 · Dec 17, 2024 · Dec 17, 2024 · Dec 18, 2024 · Dec 19, 2024
diff --git a/client/plots/DEanalysis.js b/client/plots/DEanalysis.js
@@ -131,32 +131,18 @@ class DEanalysis {
 					{ label: 'wilcoxon', value: 'wilcoxon' }
 				]
 			})
+			if (this.settings.method == 'edgeR') {
+				inputs.push({
+					type: 'term',
+					configKey: 'term',
+					chartType: 'DEanalysis',
+					usecase: { target: 'DEanalysis', detail: 'term' },
+					label: 'Confounding factors',
+					vocabApi: this.app.vocabApi
+				})
+			}
 		}
 
-		//if (
-		//	(output.mid_sample_size_cutoff >= output.sample_size1 &&
-		//		output.mid_sample_size_cutoff < output.sample_size2 &&
-		//		output.sample_size2 < output.high_sample_size_cutoff) ||
-		//	(output.mid_sample_size_cutoff >= output.sample_size2 &&
-		//		output.mid_sample_size_cutoff < output.sample_size1 &&
-		//		output.sample_size1 < output.high_sample_size_cutoff)
-		//) {
-		//	// Invoked only when one sample size is low than the mid_sample_size_cutoff and the other one is higher but the higher sample size is lower than the high cutoff so that the DE computation does not take a lot of time on the server
-		//	inputs.push({
-		//		label: 'Method',
-		//		type: 'radio',
-		//		chartType: 'DEanalysis',
-		//		settingsKey: 'method',
-		//		title: 'Toggle between edgeR and wilcoxon test',
-		//		options: [
-		//			{ label: 'edgeR', value: 'edgeR' },
-		//			{ label: 'wilcoxon', value: 'wilcoxon' }
-		//		]
-		//	})
-		//	this.settings.method = output.method
-		//	this.state.config = output.method
-		//}
-
 		if (this.app.opts.genome.termdbs) {
 			// Check if genome build contains termdbs, only then enable gene ora
 			inputs.push({
@@ -748,15 +734,21 @@ export async function openHiercluster(term, samplelstTW, app, id, newId) {
 }
 
 async function runDEanalysis(self) {
+	const input = {
+		genome: self.app.vocabApi.vocab.genome,
+		dslabel: self.app.vocabApi.vocab.dslabel,
+		samplelst: self.config.samplelst,
+		min_count: self.settings.min_count,
+		min_total_count: self.settings.min_total_count,
+		method: self.settings.method
+	}
+	if (self.config.term) {
+		input.tw = self.config.term
+		self.settings.method = 'edgeR' // When adjustment of confounding variables is selected, the method should always be a parmetric method such as edgeR
+		input.method = 'edgeR'
+	}
 	const output = await dofetch3('DEanalysis', {
-		body: {
-			genome: self.app.vocabApi.vocab.genome,
-			dslabel: self.app.vocabApi.vocab.dslabel,
-			samplelst: self.config.samplelst,
-			min_count: self.settings.min_count,
-			min_total_count: self.settings.min_total_count,
-			method: self.settings.method
-		}
+		body: input
 	})
 	if (output.error) console.log('server side error:', output.error)
 	return output

diff --git a/server/routes/termdb.DE.ts b/server/routes/termdb.DE.ts
@@ -1,8 +1,9 @@
-//import fs from 'fs'
+import fs from 'fs'
 import path from 'path'
 import type { DERequest, DEResponse, ExpressionInput, RouteApi } from '#types'
 import { diffExpPayload } from '#types/checkers'
 import { run_rust } from '@sjcrh/proteinpaint-rust'
+import { getData } from '../src/termdb.matrix.js'
 import { get_ds_tdb } from '../src/termdb.js'
 import run_R from '../src/run_R.js'
 import serverconfig from '../src/serverconfig.js'
@@ -28,7 +29,21 @@ function init({ genomes }) {
 			const genome = genomes[q.genome]
 			if (!genome) throw 'invalid genome'
 			const [ds] = get_ds_tdb(genome, q)
-			const results = await run_DE(req.query as DERequest, ds)
+			let term_results: any = []
+			if (q.tw) {
+				const terms = [q.tw]
+				term_results = await getData(
+					{
+						filter: q.filter,
+						filter0: q.filter0,
+						terms
+					},
+					ds,
+					genome
+				)
+				if (term_results.error) throw term_results.error
+			}
+			const results = await run_DE(req.query as DERequest, ds, term_results)
 			res.send(results)
 		} catch (e: any) {
 			res.send({ status: 'error', error: e.message || e })
@@ -37,7 +52,7 @@ function init({ genomes }) {
 	}
 }
 
-async function run_DE(param: DERequest, ds: any) {
+async function run_DE(param: DERequest, ds: any, term_results: any) {
 	/*
 param{}
     samplelst{}
@@ -57,12 +72,16 @@ param{}
 	const group1names = [] as string[]
 	//let group1names_not_found = 0
 	//const group1names_not_found_list = []
+	const conf1_group1: (string | number)[] = []
 	for (const s of param.samplelst.groups[0].values) {
 		if (!Number.isInteger(s.sampleId)) continue
 		const n = ds.cohort.termdb.q.id2sampleName(s.sampleId)
 		if (!n) continue
 		if (q.allSampleSet.has(n)) {
 			group1names.push(n)
+			if (param.tw) {
+				conf1_group1.push(term_results.samples[s.sampleId][param.tw.$id]['value'])
+			}
 		} else {
 			//group1names_not_found += 1
 			//group1names_not_found_list.push(n)
@@ -71,12 +90,16 @@ param{}
 	const group2names = [] as string[]
 	//let group2names_not_found = 0
 	//const group2names_not_found_list = []
+	const conf1_group2: (string | number)[] = []
 	for (const s of param.samplelst.groups[1].values) {
 		if (!Number.isInteger(s.sampleId)) continue
 		const n = ds.cohort.termdb.q.id2sampleName(s.sampleId)
 		if (!n) continue
 		if (q.allSampleSet.has(n)) {
 			group2names.push(n)
+			if (param.tw) {
+				conf1_group2.push(term_results.samples[s.sampleId][param.tw.$id]['value'])
+			}
 		} else {
 			//group2names_not_found += 1
 			//group2names_not_found_list.push(n)
@@ -108,11 +131,17 @@ param{}
 		storage_type: param.storage_type
 	} as ExpressionInput
 
+	if (param.tw) {
+		expression_input.conf1 = [...conf1_group1, ...conf1_group2]
+		expression_input.conf1_type = param.tw.term.type
+	}
+
 	//console.log('expression_input:', expression_input)
-	//fs.writeFile('test.txt', JSON.stringify(expression_input), function(err) {
-	//    // For catching input to rust pipeline, in case of an error
-	//    if (err) return console.log(err)
-	//})
+	//console.log("param.method:",param.method)
+	fs.writeFile('test.txt', JSON.stringify(expression_input), function (err) {
+		// For catching input to rust pipeline, in case of an error
+		if (err) return console.log(err)
+	})
 
 	const sample_size_limit = 8 // Cutoff to determine if parametric estimation using edgeR should be used or non-parametric estimation using wilcoxon test
 	let result
@@ -124,7 +153,6 @@ param{}
 		console.log('Time taken to run edgeR:', time2 - time1, 'ms')
 		for (const line of r_output.split('\n')) {
 			if (line.startsWith('adjusted_p_values:')) {
-				//console.log("line1:", line.replace('adjusted_p_values:', ''))
 				result = JSON.parse(line.replace('adjusted_p_values:', ''))
 			} else {
 				//console.log("line:", line)

diff --git a/server/utils/edge.R b/server/utils/edge.R
@@ -34,7 +34,7 @@ suppressWarnings({
     suppressPackageStartupMessages(library(edgeR))
     suppressPackageStartupMessages(library(dplyr))
 })
-
+read_json_start <- Sys.time()
 con <- file("stdin", "r")
 json <- readLines(con, warn=FALSE)
 close(con)
@@ -47,6 +47,11 @@ controls <- unlist(strsplit(input$control, ","))
 combined <- c("geneID","geneSymbol",cases,controls)
 #data %>% select(all_of(combined))
 #read_file_time_start <- Sys.time()
+read_json_stop <- Sys.time()
+print (paste0("Time to read json:", read_json_stop - read_json_start))
+
+case_sample_list <- c()
+control_sample_list <- c()
 if (exists(input$storage_type)==FALSE) {
     if (input$storage_type == "HDF5") {
         #print(h5ls(input$input_file))
@@ -59,6 +64,7 @@ if (exists(input$storage_type)==FALSE) {
             sample_index <- which(samples == sample)
             if (length(sample_index) == 1) {
                 samples_indicies <- c(samples_indicies,sample_index)
+                case_sample_list <- c(case_sample_list,sample)
             } else {
                 print (paste(sample,"not found"))
                 quit(status = 1)
@@ -69,6 +75,7 @@ if (exists(input$storage_type)==FALSE) {
             sample_index <- which(samples == sample)
             if (length(sample_index) == 1) {
                 samples_indicies <- c(samples_indicies,sample_index)
+                control_sample_list <- c(control_sample_list,sample)
             } else {
                 print (paste(sample,"not found"))
                 quit(status = 1)
@@ -100,7 +107,10 @@ if (exists(input$storage_type)==FALSE) {
     read_counts <- select(read_counts, -geneID)
     read_counts <- select(read_counts, -geneSymbol)
 }
-
+print ("case_sample_list:")
+print (case_sample_list)
+print ("control_sample_list:")
+print (control_sample_list)
 #read_file_time_stop <- Sys.time()
 #print (read_file_time_stop - read_file_time_start)
 
@@ -114,20 +124,46 @@ keep <- filterByExpr(y, min.count = input$min_count, min.total.count = input$min
 y <- y[keep, keep.lib.sizes = FALSE]
 y <- calcNormFactors(y, method = "TMM")
 #print (y)
-calculate_dispersion_time_start <- Sys.time()
-suppressWarnings({
-  suppressMessages({
-      dge <- estimateDisp(y = y)
-  })
-})
-calculate_dispersion_time_stop <- Sys.time()
-print("Dispersion Time")
-print (calculate_dispersion_time_stop - calculate_dispersion_time_start)
-calculate_exact_test_time_start <- Sys.time()
-et <- exactTest(object = dge)
-calculate_exact_test_time_stop <- Sys.time()
-print("Exact Time")
-print(calculate_exact_test_time_stop - calculate_exact_test_time_start)
+
+if (length(input$conf1) == 0) { # No adjustment of confounding factors
+      calculate_dispersion_time_start <- Sys.time()
+      suppressWarnings({
+        suppressMessages({
+            dge <- estimateDisp(y = y)
+        })
+      })
+      calculate_dispersion_time_stop <- Sys.time()
+      print (paste0("Dispersion Time:",calculate_dispersion_time_stop - calculate_dispersion_time_start))
+      calculate_exact_test_time_start <- Sys.time()
+      et <- exactTest(object = dge)
+      calculate_exact_test_time_stop <- Sys.time()
+      print (paste0("Exact Time:",calculate_exact_test_time_stop - calculate_exact_test_time_start))
+} else { # Adjusting for confounding factors. This has been adapted based on the protocol described here: http://larionov.co.uk/deg_ebi_tutorial_2020/edger-analysis-1.html#calculate-degs
+    y$samples$conditions <- conditions
+    y$samples$conf1 <- input$conf1
+    print ("y$samples")
+    print (y$samples)
+    calculate_model_start <- Sys.time()
+    design <- model.matrix(~ conf1 + conditions, data = y$samples)
+    calculate_model_stop <- Sys.time()
+    print (paste0("Time for making design matrix:", calculate_model_stop - calculate_model_start))
+    calculate_dispersion_time_start <- Sys.time()
+    y <- estimateDisp(y, design)
+    calculate_dispersion_time_stop <- Sys.time()
+    print (paste0("Dispersion Time:",calculate_dispersion_time_stop - calculate_dispersion_time_start))
+    # Fit the model
+    calculate_fit_time_start <- Sys.time()
+    #fit <- glmQLFit(y, design)
+    fit <- glmFit(y, design)
+    calculate_fit_time_stop <- Sys.time()
+    print (paste0("Fit time:",calculate_fit_time_stop - calculate_fit_time_start))
+    # Calculate the test statistics
+    calculate_test_statistics_start <- Sys.time()
+    #et <- glmQLFTest(fit, coef = ncol(design))
+    et <- glmLRT(fit, coef = 2)  # coef = 2 corresponds to the 'treatment' vs 'control' comparison
+    calculate_test_statistics_stop <- Sys.time()
+    print (paste0("Test statistics time:",calculate_test_statistics_stop - calculate_test_statistics_start))
+}
 #print ("Time to calculate DE")
 #print (calculate_DE_time_stop - calculate_DE_time_start)
 #print (et)
@@ -147,6 +183,7 @@ names(output)[4] <- "original_p_value"
 names(output)[5] <- "adjusted_p_value"
 #write_csv(output,"DE_output.txt")
 cat(paste0("adjusted_p_values:",toJSON(output)))
+
 #output_json <- toJSON(output)
 #print ("output_json")
 #output_file <- paste0(input$output_path,"/r_output.txt")

diff --git a/shared/types/src/routes/termdb.DE.ts b/shared/types/src/routes/termdb.DE.ts
@@ -15,6 +15,8 @@ export type DERequest = {
 	storage_type: 'text' | 'HDF5'
 	/** Method of DE used wilcoxon/edgeR */
 	method?: string
+	/** Term for confounding variable (if present) */
+	tw?: any
 }
 
 export type ExpressionInput = {
@@ -32,6 +34,10 @@ export type ExpressionInput = {
 	min_total_count: number
 	/** Type of storage file: HDF5 or text. Text will be deprecated in the future */
 	storage_type: 'HDF5' | 'text'
+	/** Confounding variable for DE analysis. Maybe array of string (Gender: Male/female) or number (Age). For now supporting 1 confounding variable. Later will add support for multiple confounding variables */
+	conf1?: any[]
+	/** Type of the confounding variable (categorical/float) */
+	conf1_type?: 'categorical' | 'float'
 }
 
 export type DEResponse = {