Skip to content

Commit

Permalink
likModel option and fixes for multisample
Browse files Browse the repository at this point in the history
1. Exposed likModel for snakemake
2. Exposed option to specify the normal initialization value to ignore subclonal analysis in the solution
3. Hyperparameter settings for variance prior model when using likModel=gaussian
4. Do not generate individual solution plots for multi-sample analysis. "all_sols" plot is sufficient
  • Loading branch information
gavinha committed Mar 13, 2020
1 parent 975ac42 commit 54189dd
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 46 deletions.
17 changes: 8 additions & 9 deletions R/EM.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
rho <- matrix(0, KS, N)
lambdas <- array(0, dim = c(K, S, maxiter)) # State Variances
#covars <- rep(list(NULL), maxiter) #State covariance matrices
vars <- array(0, dim = c(K, S, maxiter))
vars <- array(0, dim = c(KS, S, maxiter)) #array(0, dim = c(K, S, maxiter))
phi <- matrix(NA, length(param$phi_0), maxiter) # Tumour ploidy
n <- matrix(NA, S, maxiter) # Normal contamination
sp <- matrix(NA, S, maxiter) # cellular prevalence (tumor does not contain event)
Expand Down Expand Up @@ -64,11 +64,11 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
sp[, i] <- param$sp_0
phi[, i] <- param$phi_0
lambdas[, , i] <- param$lambda #matrix(param$lambda, nrow = K, ncol = S, byrow = TRUE)
lambdasKS <- as.matrix(expand.grid(as.data.frame(lambdas[, , i]))) #as.matrix(expand.grid(rep(list(param$lambda), S)))
lambdasKS <- as.matrix(na.omit(expand.grid(as.data.frame(lambdas[, , i])))) #as.matrix(expand.grid(rep(list(param$lambda), S)))
#covars[[i]] <- param$covar
if (param$likModel == "Gaussian"){
vars[, , i] <- param$var
varsKS <- as.matrix(expand.grid(as.data.frame(vars[, , i])))
vars[, , i] <- as.matrix(na.omit(expand.grid(as.data.frame(param$var))))
varsKS <- vars[, , i]
}
mus[, , i] <- as.matrix(get2and3ComponentMixture(param$jointCNstates, param$jointSCstatus, n[, i], sp[, i], phi[, i]))

Expand Down Expand Up @@ -131,8 +131,7 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
estimateTransition = estimateTransition,
estimateInitDist = estimateInitDist,
estimateSubclone = estimateSubclone)
#vars[, , i] <- output$var
varsKS <- output$var
vars[, , i] <- output$var
if (verbose == TRUE) {
for (s in 1:S){
message("Sample", s, " n=", signif(output$n[s], digits = 4), ", sp=", signif(output$sp[s], digits = 4),
Expand Down Expand Up @@ -170,7 +169,7 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
estF <- output$F

# Recalculate the likelihood
varsKS <- as.matrix(expand.grid(as.data.frame(vars[, , i])))
varsKS <- vars[, , i]
mus[, , i] <- as.matrix(get2and3ComponentMixture(param$jointCNstates, param$jointSCstatus, n[, i], sp[, i], phi[, i]))
if (param$likModel == "t"){
for (ks in 1:KS) {
Expand Down Expand Up @@ -373,7 +372,7 @@ getNormLik <- function(copy, mus, varsKS, sw){
S <- param$numberSamples
K <- length(param$ct)
Z <- sum(param$ct.sc) #number of subclonal states (# repeated copy number states)
KS <- K ^ S
KS <- nrow(varsKS)
py <- t(sapply(1:KS, function(ks) {
y <- NULL
for (s in 1:S){
Expand Down Expand Up @@ -817,7 +816,7 @@ priorProbs <- function(n, sp, phi, lambda, var, piG, A, params,
estimateSubclone = TRUE){
S <- params$numberSamples
K <- length(params$ct)
KS <- K ^ S
KS <- nrow(param$jointStates) #K ^ S
## prior terms ##
priorA <- rep(0, KS)
if (estimateTransition){
Expand Down
27 changes: 25 additions & 2 deletions R/segmentation.R
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ getTransitionMatrix <- function(K, e, strength){
}

getDefaultParameters <- function(x, maxCN = 5, ct.sc = c(1,3), n_0 = 0.5, ploidy_0 = 2,
normalIgnoreSC = 0.9,
e = 0.9999999, e.sameState = 10, strength = 10000000,
includeHOMD = FALSE, likModel = "t"){
if (includeHOMD){
Expand Down Expand Up @@ -197,7 +198,7 @@ getDefaultParameters <- function(x, maxCN = 5, ct.sc = c(1,3), n_0 = 0.5, ploidy
logR.var <- 1 / ((apply(x, 2, sd, na.rm = TRUE) / sqrt(K)) ^ 2)
if (!is.null(dim(x))){ # multiple samples (columns)
param$lambda <- matrix(logR.var, nrow=K, ncol=S, byrow=T, dimnames=list(c(),colnames(x)))
}else{ # only 1 sample
}else{ # only 1 sample and using student's-t likelihood model
#logR.var <- 1 / ((sd(x, na.rm = TRUE) / sqrt(length(param$ct))) ^ 2)
param$lambda <- matrix(logR.var, length(param$ct))
param$lambda[param$ct %in% c(2)] <- logR.var
Expand All @@ -210,7 +211,7 @@ getDefaultParameters <- function(x, maxCN = 5, ct.sc = c(1,3), n_0 = 0.5, ploidy
param$jointStates <- expand.grid(rep(list(1:length(param$ct)), S))
param$jointCNstates <- expand.grid(rep(list(param$ct), S))
param$jointSCstatus <- expand.grid(rep(list(param$ct.sc.status), S))
KS <- nrow(param$jointCNstates)
#KS <- nrow(param$jointCNstates)
#param$covar <- replicate(KS, covar$covar)

colnames(param$jointCNstates) <- paste0("Sample.", 1:param$numberSamples)
Expand Down Expand Up @@ -251,6 +252,28 @@ getDefaultParameters <- function(x, maxCN = 5, ct.sc = c(1,3), n_0 = 0.5, ploidy
param$kappa[cnStateDiff == 0] <- param$kappa[cnStateDiff == 0] + 125
param$kappa[cnStateDiff >=3] <- param$kappa[cnStateDiff >=3] - 50
param$kappa[which(rowSums(param$jointCNstates==2) == S)] <- 800

# ignore subclones for sample with n >= normalIgnoreSC
indKS <- 1:nrow(param$jointSCstatus) # use all joint states
indK <- 1:length(param$ct.sc.status)
if (sum(n_0 >= normalIgnoreSC) > 0){
for (i in which(n_0 >= normalIgnoreSC)){
# keep states excluding subclones for samples with n >= normalIgnoreSC
indKS <- intersect(indKS, which(!param$jointSCstatus[, i]))
indK <- intersect(indK, which(param$ct.sc.status))
param$lambda[indK, i] <- NaN
param$betaLambda[indK, i] <- NaN
param$var[indK, i] <- NaN
}

param$kappa <- param$kappa[indKS]
param$alphaVar <- param$alphaVar[indKS, ]
param$jointStates <- param$jointStates[indKS, ]
param$jointCNstates <- param$jointCNstates[indKS, ]
param$jointSCstatus <- param$jointSCstatus[indKS, ]
param$A <- param$A[indKS, indKS]
param$dirPrior <- param$dirPrior[indKS, indKS]
}

return(param)
}
Expand Down
81 changes: 47 additions & 34 deletions scripts/runIchorCNA.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ option_list <- list(
make_option(c("--rmCentromereFlankLength"), type="numeric", default=1e5, help="Length of region flanking centromere to remove. Default: [%default]"),
make_option(c("--normal"), type="character", default="0.5", help = "Initial normal contamination; can be more than one value if additional normal initializations are desired. Default: [%default]"),
make_option(c("--scStates"), type="character", default="NULL", help = "Subclonal states to consider. Default: [%default]"),
make_option(c("--normal2IgnoreSC"), type="numeric", default=1.0, help="Ignore subclonal analysis when normal proportion is >= this value. Default: [%default]"),
make_option(c("--coverage"), type="numeric", default=NULL, help = "PICARD sequencing coverage. Default: [%default]"),
make_option(c("--likModel"), type="character", default="t", help="Likelihood model to use. \"t\" or \"gaussian\". Use \"gaussian\" for faster runtimes. Default: [%default]"),
make_option(c("--lambda"), type="character", default="NULL", help="Initial Student's t precision; must contain 4 values (e.g. c(1500,1500,1500,1500)); if not provided then will automatically use based on variance of data. Default: [%default]"),
Expand Down Expand Up @@ -102,6 +103,7 @@ normalizeMaleX <- as.logical(opt$normalizeMaleX)
includeHOMD <- as.logical(opt$includeHOMD)
fracReadsInChrYForMale <- opt$fracReadsInChrYForMale
chrXMedianForMale <- -0.1
normalIgnoreSC <- opt$normal2IgnoreSC
outDir <- opt$outDir
libdir <- opt$libdir
plotFileType <- opt$plotFileType
Expand Down Expand Up @@ -195,7 +197,7 @@ for (i in 1:numSamples) {
chrNormalize = chrNormalize, mapScoreThres = minMapScore)
gender <- counts[[id]]$gender

if ((!is.null(sex) || sex != "None" || sex != "NULL") && gender$gender != sex ){ #compare with user-defined sex
if ((!is.null(sex) && sex != "None" && sex != "NULL") && gender$gender != sex ){ #compare with user-defined sex
message("Estimated gender (", gender$gender, ") doesn't match to user-defined gender (", sex, "). Use ", sex, " instead.")
gender$gender <- sex
}
Expand Down Expand Up @@ -261,15 +263,17 @@ save.image(outImage)
## store the results for different normal and ploidy solutions ##
ptmTotalSolutions <- proc.time() # start total timer
results <- list()
loglik <- as.data.frame(matrix(NA, nrow = length(normal) * length(ploidy), ncol = 8,
dimnames = list(c(), c("n_0", "phi_0", "n_est", "phi_est", "BIC",
numCombinations <- (length(normal) * length(ploidy)) ^ S
loglik <- as.data.frame(matrix(NA, nrow = numCombinations, ncol = 7,
dimnames = list(c(), c("n_0", "phi_0", "n_est", "phi_est",
"Frac_genome_subclonal", "Frac_CNA_subclonal", "loglik"))))

fracGenomeSub <- as.data.frame(matrix(NA, nrow = numCombinations, ncol = 2))
fracAltSub <- as.data.frame(matrix(NA, nrow = numCombinations, ncol = 2))
normal.restarts <- expand.grid(rep(list(normal), S))
#ploidy.restarts <- expand.grid(rep(list(ploidy), S))
counter <- 1
compNames <- rep(NA, nrow(loglik))
mainName <- NULL #rep(NA, nrow(normal.restarts) * nrow(ploidy.restarts))
mainName <- vector('list', S) #rep(NA, nrow(normal.restarts) * nrow(ploidy.restarts))
#### restart for purity and ploidy values ####
for (i in 1:length(ploidy)){
p <- rep(ploidy[i], S)
Expand All @@ -279,29 +283,33 @@ for (i in 1:length(ploidy)){
if (sum(n == 0.95 & p != 2) > 0) {
next
}
scStates.toUse <- scStates
## ignore subclones when normal=0.95
if (n == 0.95){
scStates.toUse <- c()
}
message("Running EM for normal=", n, ", ploidy=", p)

logR <- as.data.frame(lapply(tumour_copy, function(x) { x$copy })) # NEED TO EXCLUDE CHR X #
param <- getDefaultParameters(logR[valid & chrInd, , drop=F], maxCN = maxCN, includeHOMD = includeHOMD,
ct.sc=scStates.toUse, ploidy_0 = floor(p), e=txnE, e.same = 50, strength=txnStrength, likModel = likModel)
param$n_0 <- as.numeric(n)
param$phi_0 <- as.numeric(p)
param <- getDefaultParameters(logR[valid & chrInd, , drop=F], n_0 = n, maxCN = maxCN, includeHOMD = includeHOMD,
ct.sc=scStates, normalIgnoreSC = normalIgnoreSC, ploidy_0 = floor(p),
e=txnE, e.same = 50, strength=txnStrength, likModel = likModel)
param$sw <- rep(1, S)
############################################
######## CUSTOM PARAMETER SETTINGS #########
############################################
# 0.1x cfDNA #
logR.var <- param$var[1, ]
param$var[param$ct >= 4, ] <- logR.var * 5
param$var[param$ct == max(param$ct), ] <- logR.var * 15
param$var[param$ct.sc.status, ] <- logR.var * 10
# param$var <- param$var / 10
# param$var[param$ct >= 4, ] <- param$var[param$ct >= 4, ] * 1
# param$var[param$ct == max(param$ct), ] <- param$var[param$ct == max(param$ct), ] * 1
# param$var[param$ct.sc.status, ] <- param$var[param$ct.sc.status, ] * 1

param$betaVar <- rep(2, length(param$ct))

# param$alphaVar <- param$alphaVar * 10
# param$alphaVar[param$jointCNstates >= 4, ] <- param$alphaVar[param$jointCNstates >= 4, ] / 5
# param$alphaVar[param$jointCNstates == max(param$jointCNstates), ] <- param$alphaVar[param$jointCNstates == max(param$jointCNstates), ] / 15
if (numSamples > 1){ # for multi-sample analysis
for (m in 1:S){
param$alphaVar[param$jointSCstatus[, m], m] <- param$alphaVar[param$jointSCstatus[, m], m] / 10
}
}else{
param$betaLambda[param$ct.sc.status] <- param$betaLambda[param$ct.sc.status] * 10
}
#############################################
################ RUN HMM ####################
#############################################
Expand Down Expand Up @@ -344,22 +352,26 @@ for (i in 1:length(ploidy)){
if ((maxBinLength <= minSegmentBins) & (altFrac <= altFracThreshold)){
hmmResults.cor$results$n[s, iter] <- 1.0
}
## plot solution ##
outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_", "n", n[s], "-p", p[s])
mainName <- c(mainName, paste0(id, ", n: ", n[s], ", p: ", p[s], ", log likelihood: ", signif(hmmResults.cor$results$loglik[hmmResults.cor$results$iter], digits = 4)))
plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType=plotFileType,
logR.column = "logR", call.column = "event",
plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, seqinfo=seqinfo, main=mainName[counter])
## plot solution ##
mainName[[s]] <- c(mainName[[s]], paste0("Solution: ", counter, ", Sample: ", id, ", n: ", n[s], ", p: ", p[s],
", log likelihood: ", signif(hmmResults.cor$results$loglik[hmmResults.cor$results$iter], digits = 4)))
## plot individual samples if only single-sample analysis
if (numSamples == 1){
outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_", "n", n[s], "-p", p[s])
plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType=plotFileType,
logR.column = "logR", call.column = "event",
plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, seqinfo=seqinfo, main=mainName[[s]][counter])
}
}
iter <- hmmResults.cor$results$iter
results[[counter]] <- hmmResults.cor
loglik[counter, "loglik"] <- signif(hmmResults.cor$results$loglik[iter], digits = 4)
subClonalBinCount <- unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$subclone.status) }))
fracGenomeSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ nrow(x) }))
fracAltSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$copy.number != 2) }))
fracAltSub <- lapply(fracAltSub, function(x){if (is.na(x)){0}else{x}})
loglik[counter, "Frac_genome_subclonal"] <- paste0(signif(fracGenomeSub, digits=2), collapse=",")
loglik[counter, "Frac_CNA_subclonal"] <- paste0(signif(as.numeric(fracAltSub), digits=2), collapse=",")
fracGenomeSub[counter, ] <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ nrow(x) }))
fracAltSub[counter, ] <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$copy.number != 2) }))
fracAltSubVect <- lapply(fracAltSub[counter, ], function(x){if (is.na(x)){0}else{x}})
loglik[counter, "Frac_genome_subclonal"] <- paste0(signif(fracGenomeSub[counter, ], digits=2), collapse=",")
loglik[counter, "Frac_CNA_subclonal"] <- paste0(signif(as.numeric(fracAltSubVect), digits=2), collapse=",")
loglik[counter, "n_0"] <- paste0(n, collapse = ",")
loglik[counter, "phi_0"] <- paste0(p, collapse = ",")
loglik[counter, "n_est"] <- paste(signif(hmmResults.cor$results$n[, iter], digits = 2), collapse = ",")
Expand All @@ -378,10 +390,10 @@ message("Total ULP-WGS HMM Runtime: ", format(elapsedTimeSolutions[3] / 60, digi
save.image(outImage)
#save(tumour_copy, results, loglik, file=paste0(outDir,"/",id,".RData"))

### SELECT SOLUTION WITH LARGEST LIKELIHOOD ###
### SORT SOLUTIONS BY DECREASING LIKELIHOOD ###
if (estimateScPrevalence){ ## sort but excluding solutions with too large % subclonal
fracInd <- which(loglik[, "Frac_CNA_subclonal"] <= maxFracCNASubclone &
loglik[, "Frac_genome_subclonal"] <= maxFracGenomeSubclone)
fracInd <- which(rowSums(fracAltSub <= maxFracCNASubclone) == S &
rowSums(fracGenomeSub <= maxFracGenomeSubclone) == S)
if (length(fracInd) > 0){ ## if there is a solution satisfying % subclonal
ind <- fracInd[order(loglik[fracInd, "loglik"], decreasing=TRUE)]
}else{ # otherwise just take largest likelihood
Expand All @@ -394,6 +406,7 @@ if (estimateScPrevalence){ ## sort but excluding solutions with too large % subc
## print combined PDF of all solutions
#new loop by order of solutions (ind)
for (s in 1:numSamples){
id <- names(hmmResults.cor$cna)[s]
outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_all_sols")
for (i in 1:length(ind)) {
hmmResults.cor <- results[[ind[i]]]
Expand All @@ -409,7 +422,7 @@ for (s in 1:numSamples){
logR.column = "logR", call.column = "event",
plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence,
seqinfo = seqinfo,
turnDevOn = turnDevOn, turnDevOff = turnDevOff, main=mainName[ind[i]])
turnDevOn = turnDevOn, turnDevOff = turnDevOff, main=mainName[[s]][ind[i]])
}
}

Expand Down
2 changes: 2 additions & 0 deletions scripts/snakemake/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ ichorCNA_includeHOMD: FALSE
ichorCNA_maxFracGenomeSubclone: 0.5
# Exclude solutions if total length of subclonal CNAs > this fraction of total CNA length
ichorCNA_maxFracCNASubclone: 0.7
# Ignore subclonal analysis when initial normal setting >= this value
ichorCNA_normal2IgnoreSC: 1.0
# control segmentation - higher (e.g. 0.9999999) leads to higher specificity and fewer segments
# lower (e.g. 0.99) leads to higher sensitivity and more segments
ichorCNA_txnE: 0.9999
Expand Down
3 changes: 2 additions & 1 deletion scripts/snakemake/ichorCNA.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ rule ichorCNA:
minMapScore=config["ichorCNA_minMapScore"],
maxFracGenomeSubclone=config["ichorCNA_maxFracGenomeSubclone"],
maxFracCNASubclone=config["ichorCNA_maxFracCNASubclone"],
normal2IgnoreSC=config["ichorCNA_normal2IgnoreSC"],
exons=config["ichorCNA_exons"],
txnE=config["ichorCNA_txnE"],
txnStrength=config["ichorCNA_txnStrength"],
Expand All @@ -72,5 +73,5 @@ rule ichorCNA:
log:
"logs/ichorCNA/{tumor}.log"
shell:
"Rscript {params.rscript} --id {params.id} --libdir {params.libdir} --WIG {input.tum} --gcWig {params.gcwig} --mapWig {params.mapwig} --repTimeWig {params.repTimeWig} --sex {params.sex} --normalPanel {params.normalpanel} --ploidy \"{params.ploidy}\" --normal \"{params.normal}\" --maxCN {params.maxCN} --includeHOMD {params.includeHOMD} --chrs \"{params.chrs}\" --chrTrain \"{params.chrTrain}\" --genomeStyle {params.genomeStyle} --genomeBuild {params.genomeBuild} --estimateNormal {params.estimateNormal} --estimatePloidy {params.estimatePloidy} --estimateScPrevalence {params.estimateClonality} --scStates \"{params.scStates}\" --likModel {params.likModel} --centromere {params.centromere} --exons.bed {params.exons} --txnE {params.txnE} --txnStrength {params.txnStrength} --minMapScore {params.minMapScore} --fracReadsInChrYForMale {params.fracReadsChrYMale} --maxFracGenomeSubclone {params.maxFracGenomeSubclone} --maxFracCNASubclone {params.maxFracCNASubclone} --plotFileType {params.plotFileType} --plotYLim \"{params.plotYlim}\" --outDir {params.outDir} > {log} 2> {log}"
"Rscript {params.rscript} --id {params.id} --libdir {params.libdir} --WIG {input.tum} --gcWig {params.gcwig} --mapWig {params.mapwig} --repTimeWig {params.repTimeWig} --sex {params.sex} --normalPanel {params.normalpanel} --ploidy \"{params.ploidy}\" --normal \"{params.normal}\" --maxCN {params.maxCN} --includeHOMD {params.includeHOMD} --chrs \"{params.chrs}\" --chrTrain \"{params.chrTrain}\" --genomeStyle {params.genomeStyle} --genomeBuild {params.genomeBuild} --estimateNormal {params.estimateNormal} --estimatePloidy {params.estimatePloidy} --estimateScPrevalence {params.estimateClonality} --scStates \"{params.scStates}\" --likModel {params.likModel} --centromere {params.centromere} --exons.bed {params.exons} --txnE {params.txnE} --txnStrength {params.txnStrength} --minMapScore {params.minMapScore} --fracReadsInChrYForMale {params.fracReadsChrYMale} --maxFracGenomeSubclone {params.maxFracGenomeSubclone} --maxFracCNASubclone {params.maxFracCNASubclone} --normal2IgnoreSC {params.normal2IgnoreSC} --plotFileType {params.plotFileType} --plotYLim \"{params.plotYlim}\" --outDir {params.outDir} > {log} 2> {log}"

0 comments on commit 54189dd

Please sign in to comment.