Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tentative implementation for adjusting of confounding factors using edgeR #2539

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
Draft
56 changes: 24 additions & 32 deletions client/plots/DEanalysis.js
Original file line number Diff line number Diff line change
Expand Up @@ -131,32 +131,18 @@ class DEanalysis {
{ label: 'wilcoxon', value: 'wilcoxon' }
]
})
if (this.settings.method == 'edgeR') {
inputs.push({
type: 'term',
configKey: 'term',
chartType: 'DEanalysis',
usecase: { target: 'DEanalysis', detail: 'term' },
label: 'Confounding factors',
vocabApi: this.app.vocabApi
})
}
}

//if (
// (output.mid_sample_size_cutoff >= output.sample_size1 &&
// output.mid_sample_size_cutoff < output.sample_size2 &&
// output.sample_size2 < output.high_sample_size_cutoff) ||
// (output.mid_sample_size_cutoff >= output.sample_size2 &&
// output.mid_sample_size_cutoff < output.sample_size1 &&
// output.sample_size1 < output.high_sample_size_cutoff)
//) {
// // Invoked only when one sample size is low than the mid_sample_size_cutoff and the other one is higher but the higher sample size is lower than the high cutoff so that the DE computation does not take a lot of time on the server
// inputs.push({
// label: 'Method',
// type: 'radio',
// chartType: 'DEanalysis',
// settingsKey: 'method',
// title: 'Toggle between edgeR and wilcoxon test',
// options: [
// { label: 'edgeR', value: 'edgeR' },
// { label: 'wilcoxon', value: 'wilcoxon' }
// ]
// })
// this.settings.method = output.method
// this.state.config = output.method
//}

if (this.app.opts.genome.termdbs) {
// Check if genome build contains termdbs, only then enable gene ora
inputs.push({
Expand Down Expand Up @@ -748,15 +734,21 @@ export async function openHiercluster(term, samplelstTW, app, id, newId) {
}

async function runDEanalysis(self) {
const input = {
genome: self.app.vocabApi.vocab.genome,
dslabel: self.app.vocabApi.vocab.dslabel,
samplelst: self.config.samplelst,
min_count: self.settings.min_count,
min_total_count: self.settings.min_total_count,
method: self.settings.method
}
if (self.config.term) {
input.tw = self.config.term
self.settings.method = 'edgeR' // When adjustment of confounding variables is selected, the method should always be a parmetric method such as edgeR
input.method = 'edgeR'
}
const output = await dofetch3('DEanalysis', {
body: {
genome: self.app.vocabApi.vocab.genome,
dslabel: self.app.vocabApi.vocab.dslabel,
samplelst: self.config.samplelst,
min_count: self.settings.min_count,
min_total_count: self.settings.min_total_count,
method: self.settings.method
}
body: input
})
if (output.error) console.log('server side error:', output.error)
return output
Expand Down
44 changes: 36 additions & 8 deletions server/routes/termdb.DE.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
//import fs from 'fs'
import fs from 'fs'
import path from 'path'
import type { DERequest, DEResponse, ExpressionInput, RouteApi } from '#types'
import { diffExpPayload } from '#types/checkers'
import { run_rust } from '@sjcrh/proteinpaint-rust'
import { getData } from '../src/termdb.matrix.js'
import { get_ds_tdb } from '../src/termdb.js'
import run_R from '../src/run_R.js'
import serverconfig from '../src/serverconfig.js'
Expand All @@ -28,7 +29,21 @@ function init({ genomes }) {
const genome = genomes[q.genome]
if (!genome) throw 'invalid genome'
const [ds] = get_ds_tdb(genome, q)
const results = await run_DE(req.query as DERequest, ds)
let term_results: any = []
if (q.tw) {
const terms = [q.tw]
term_results = await getData(
{
filter: q.filter,
filter0: q.filter0,
terms
},
ds,
genome
)
if (term_results.error) throw term_results.error
}
const results = await run_DE(req.query as DERequest, ds, term_results)
res.send(results)
} catch (e: any) {
res.send({ status: 'error', error: e.message || e })
Expand All @@ -37,7 +52,7 @@ function init({ genomes }) {
}
}

async function run_DE(param: DERequest, ds: any) {
async function run_DE(param: DERequest, ds: any, term_results: any) {
/*
param{}
samplelst{}
Expand All @@ -57,12 +72,16 @@ param{}
const group1names = [] as string[]
//let group1names_not_found = 0
//const group1names_not_found_list = []
const conf1_group1: (string | number)[] = []
for (const s of param.samplelst.groups[0].values) {
if (!Number.isInteger(s.sampleId)) continue
const n = ds.cohort.termdb.q.id2sampleName(s.sampleId)
if (!n) continue
if (q.allSampleSet.has(n)) {
group1names.push(n)
if (param.tw) {
conf1_group1.push(term_results.samples[s.sampleId][param.tw.$id]['value'])
}
} else {
//group1names_not_found += 1
//group1names_not_found_list.push(n)
Expand All @@ -71,12 +90,16 @@ param{}
const group2names = [] as string[]
//let group2names_not_found = 0
//const group2names_not_found_list = []
const conf1_group2: (string | number)[] = []
for (const s of param.samplelst.groups[1].values) {
if (!Number.isInteger(s.sampleId)) continue
const n = ds.cohort.termdb.q.id2sampleName(s.sampleId)
if (!n) continue
if (q.allSampleSet.has(n)) {
group2names.push(n)
if (param.tw) {
conf1_group2.push(term_results.samples[s.sampleId][param.tw.$id]['value'])
}
} else {
//group2names_not_found += 1
//group2names_not_found_list.push(n)
Expand Down Expand Up @@ -108,11 +131,17 @@ param{}
storage_type: param.storage_type
} as ExpressionInput

if (param.tw) {
expression_input.conf1 = [...conf1_group1, ...conf1_group2]
expression_input.conf1_type = param.tw.term.type
}

//console.log('expression_input:', expression_input)
//fs.writeFile('test.txt', JSON.stringify(expression_input), function(err) {
// // For catching input to rust pipeline, in case of an error
// if (err) return console.log(err)
//})
//console.log("param.method:",param.method)
fs.writeFile('test.txt', JSON.stringify(expression_input), function (err) {
// For catching input to rust pipeline, in case of an error
if (err) return console.log(err)
})

const sample_size_limit = 8 // Cutoff to determine if parametric estimation using edgeR should be used or non-parametric estimation using wilcoxon test
let result
Expand All @@ -124,7 +153,6 @@ param{}
console.log('Time taken to run edgeR:', time2 - time1, 'ms')
for (const line of r_output.split('\n')) {
if (line.startsWith('adjusted_p_values:')) {
//console.log("line1:", line.replace('adjusted_p_values:', ''))
result = JSON.parse(line.replace('adjusted_p_values:', ''))
} else {
//console.log("line:", line)
Expand Down
69 changes: 53 additions & 16 deletions server/utils/edge.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ suppressWarnings({
suppressPackageStartupMessages(library(edgeR))
suppressPackageStartupMessages(library(dplyr))
})

read_json_start <- Sys.time()
con <- file("stdin", "r")
json <- readLines(con, warn=FALSE)
close(con)
Expand All @@ -47,6 +47,11 @@ controls <- unlist(strsplit(input$control, ","))
combined <- c("geneID","geneSymbol",cases,controls)
#data %>% select(all_of(combined))
#read_file_time_start <- Sys.time()
read_json_stop <- Sys.time()
print (paste0("Time to read json:", read_json_stop - read_json_start))

case_sample_list <- c()
control_sample_list <- c()
if (exists(input$storage_type)==FALSE) {
if (input$storage_type == "HDF5") {
#print(h5ls(input$input_file))
Expand All @@ -59,6 +64,7 @@ if (exists(input$storage_type)==FALSE) {
sample_index <- which(samples == sample)
if (length(sample_index) == 1) {
samples_indicies <- c(samples_indicies,sample_index)
case_sample_list <- c(case_sample_list,sample)
} else {
print (paste(sample,"not found"))
quit(status = 1)
Expand All @@ -69,6 +75,7 @@ if (exists(input$storage_type)==FALSE) {
sample_index <- which(samples == sample)
if (length(sample_index) == 1) {
samples_indicies <- c(samples_indicies,sample_index)
control_sample_list <- c(control_sample_list,sample)
} else {
print (paste(sample,"not found"))
quit(status = 1)
Expand Down Expand Up @@ -100,7 +107,10 @@ if (exists(input$storage_type)==FALSE) {
read_counts <- select(read_counts, -geneID)
read_counts <- select(read_counts, -geneSymbol)
}

print ("case_sample_list:")
robinpaul85 marked this conversation as resolved.
Show resolved Hide resolved
print (case_sample_list)
print ("control_sample_list:")
print (control_sample_list)
#read_file_time_stop <- Sys.time()
#print (read_file_time_stop - read_file_time_start)

Expand All @@ -114,20 +124,46 @@ keep <- filterByExpr(y, min.count = input$min_count, min.total.count = input$min
y <- y[keep, keep.lib.sizes = FALSE]
y <- calcNormFactors(y, method = "TMM")
#print (y)
calculate_dispersion_time_start <- Sys.time()
suppressWarnings({
suppressMessages({
dge <- estimateDisp(y = y)
})
})
calculate_dispersion_time_stop <- Sys.time()
print("Dispersion Time")
print (calculate_dispersion_time_stop - calculate_dispersion_time_start)
calculate_exact_test_time_start <- Sys.time()
et <- exactTest(object = dge)
calculate_exact_test_time_stop <- Sys.time()
print("Exact Time")
print(calculate_exact_test_time_stop - calculate_exact_test_time_start)

if (length(input$conf1) == 0) { # No adjustment of confounding factors
calculate_dispersion_time_start <- Sys.time()
suppressWarnings({
suppressMessages({
dge <- estimateDisp(y = y)
})
})
calculate_dispersion_time_stop <- Sys.time()
print (paste0("Dispersion Time:",calculate_dispersion_time_stop - calculate_dispersion_time_start))
calculate_exact_test_time_start <- Sys.time()
et <- exactTest(object = dge)
calculate_exact_test_time_stop <- Sys.time()
print (paste0("Exact Time:",calculate_exact_test_time_stop - calculate_exact_test_time_start))
} else { # Adjusting for confounding factors. This has been adapted based on the protocol described here: http://larionov.co.uk/deg_ebi_tutorial_2020/edger-analysis-1.html#calculate-degs
y$samples$conditions <- conditions
y$samples$conf1 <- input$conf1
print ("y$samples")
print (y$samples)
calculate_model_start <- Sys.time()
design <- model.matrix(~ conf1 + conditions, data = y$samples)
calculate_model_stop <- Sys.time()
print (paste0("Time for making design matrix:", calculate_model_stop - calculate_model_start))
calculate_dispersion_time_start <- Sys.time()
y <- estimateDisp(y, design)
calculate_dispersion_time_stop <- Sys.time()
print (paste0("Dispersion Time:",calculate_dispersion_time_stop - calculate_dispersion_time_start))
# Fit the model
calculate_fit_time_start <- Sys.time()
#fit <- glmQLFit(y, design)
fit <- glmFit(y, design)
calculate_fit_time_stop <- Sys.time()
print (paste0("Fit time:",calculate_fit_time_stop - calculate_fit_time_start))
# Calculate the test statistics
calculate_test_statistics_start <- Sys.time()
#et <- glmQLFTest(fit, coef = ncol(design))
et <- glmLRT(fit, coef = 2) # coef = 2 corresponds to the 'treatment' vs 'control' comparison
calculate_test_statistics_stop <- Sys.time()
print (paste0("Test statistics time:",calculate_test_statistics_stop - calculate_test_statistics_start))
}
#print ("Time to calculate DE")
#print (calculate_DE_time_stop - calculate_DE_time_start)
#print (et)
Expand All @@ -147,6 +183,7 @@ names(output)[4] <- "original_p_value"
names(output)[5] <- "adjusted_p_value"
#write_csv(output,"DE_output.txt")
cat(paste0("adjusted_p_values:",toJSON(output)))

#output_json <- toJSON(output)
#print ("output_json")
#output_file <- paste0(input$output_path,"/r_output.txt")
Expand Down
6 changes: 6 additions & 0 deletions shared/types/src/routes/termdb.DE.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ export type DERequest = {
storage_type: 'text' | 'HDF5'
/** Method of DE used wilcoxon/edgeR */
method?: string
/** Term for confounding variable (if present) */
tw?: any
}

export type ExpressionInput = {
Expand All @@ -32,6 +34,10 @@ export type ExpressionInput = {
min_total_count: number
/** Type of storage file: HDF5 or text. Text will be deprecated in the future */
storage_type: 'HDF5' | 'text'
/** Confounding variable for DE analysis. Maybe array of string (Gender: Male/female) or number (Age). For now supporting 1 confounding variable. Later will add support for multiple confounding variables */
conf1?: any[]
/** Type of the confounding variable (categorical/float) */
conf1_type?: 'categorical' | 'float'
}

export type DEResponse = {
Expand Down
Loading