Commit

added subclone penalty; tuned Gaussian likelihood
gavinha committed May 8, 2020
1 parent 08692df commit 3102297
Showing 6 changed files with 140 additions and 98 deletions.
51 changes: 30 additions & 21 deletions R/EM.R
@@ -31,7 +31,7 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
KS <- nrow(param$jointStates)#K ^ S
N <- nrow(copy)
rho <- matrix(0, KS, N)
lambdas <- array(0, dim = c(K, S, maxiter)) # State Variances
lambdas <- array(0, dim = c(nrow(na.omit(param$lambda)), S, maxiter)) # State Variances
#covars <- rep(list(NULL), maxiter) #State covariance matrices
vars <- array(0, dim = c(KS, S, maxiter)) #array(0, dim = c(K, S, maxiter))
phi <- matrix(NA, length(param$phi_0), maxiter) # Tumour ploidy
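The array dimension above now uses nrow(na.omit(param$lambda)) because lambda rows for ignored subclone states are masked with NaN elsewhere in this commit (see getDefaultParameters in R/segmentation.R below). A minimal sketch, not part of the diff, of what na.omit does to such a matrix:
lambda <- matrix(c(1, 2, NaN, 3, 4, NaN), ncol = 2)  # toy values; last row masked like an ignored subclone state
nrow(lambda)            # 3 rows in total
nrow(na.omit(lambda))   # 2 rows actually used to size the lambdas array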
@@ -63,7 +63,7 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
n[, i] <- param$n_0
sp[, i] <- param$sp_0
phi[, i] <- param$phi_0
lambdas[, , i] <- param$lambda #matrix(param$lambda, nrow = K, ncol = S, byrow = TRUE)
lambdas[, , i] <- na.omit(param$lambda) #matrix(param$lambda, nrow = K, ncol = S, byrow = TRUE)
lambdasKS <- as.matrix(na.omit(expand.grid(as.data.frame(lambdas[, , i])))) #as.matrix(expand.grid(rep(list(param$lambda), S)))
#covars[[i]] <- param$covar
if (param$likModel == "Gaussian"){
@@ -73,6 +73,10 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
mus[, , i] <- as.matrix(get2and3ComponentMixture(param$jointCNstates, param$jointSCstatus, n[, i], sp[, i], phi[, i]))

# Likelihood #
# if (!grepl("gauss", param$likModel, ignore.case = TRUE)){
# param$likModel <- "Gaussian"
# message("Switching to Gaussian likelihood model.")
# }
if (param$likModel == "t"){
for (ks in 1:KS) {
probs <- tdistPDF(copy, mus[ks, , i], lambdasKS[ks, ], param$nu)
@@ -103,7 +107,7 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
for (j in 1:length(chrsI)) {
I <- intersect(chrsI[[j]], which(chrInd))
if (length(I) > 0){
output <- .Call("forward_backward", piG[, i - 1], param$A, py[, I],PACKAGE = "HMMcopy")
output <- .Call("forward_backward", piG[, i - 1], A, py[, I],PACKAGE = "HMMcopy")
rho[, I] <- output$rho
loglik[i] <- loglik[i] + output$loglik
Zcounts <- Zcounts + t(colSums(aperm(output$xi, c(3, 2, 1))))
@@ -143,18 +147,19 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
ptm.em <- proc.time()
output <- estimateTParamsMap(copy[chrInd, ], n[, i - 1], sp[, i - 1], phi[, i - 1],
lambdas[, , i - 1], piG[, i - 1], A, param,
rho[, chrInd], Zcounts,
rho[, chrInd], Zcounts, loglik[i],
estimateNormal = estimateNormal, estimatePloidy = estimatePloidy,
estimatePrecision = estimatePrecision, estimateTransition = estimateTransition,
estimateInitDist = estimateInitDist, estimateSubclone = estimateSubclone)

lambdas[, , i] <- output$lambda
lambdasKS <- as.matrix(na.omit(expand.grid(as.data.frame(lambdas[, , i]))))
if (verbose == TRUE) {

for (s in 1:S){
message("Sample", s, " n=", signif(output$n[s], digits = 4), ", sp=", signif(output$sp[s], digits = 4),
", phi=", signif(output$phi[s], digits = 4),
", lambda=", paste0(signif(output$lambda[, s], digits=5), collapse = ","),
", lambda=", paste0(signif(output$lambda, digits=5), collapse = ","),
", F=", signif(output$F, digits=4))
}
}
@@ -203,7 +208,7 @@ runEM <- function(copy, chr, chrInd, param, maxiter, verbose = TRUE,
}
if ((abs(loglik[i] - loglik[i - 1]) / abs(loglik[i])) < likChangeConvergence){#} && loglik[i] > loglik[i - 1]) {
message("runEM iter", i-1, " EM Converged")
converged = 0
converged = 1
}
if (loglik[i] < loglik[i - 1]){
#message("LIKELIHOOD DECREASED!!!")
@@ -381,6 +386,7 @@ getNormLik <- function(copy, mus, varsKS, sw){
}
probs <- apply(y, 1, prod)
}))
py[is.na(py)] <- 1
return(py)
}
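The added py[is.na(py)] <- 1 above makes missing bins uninformative before the forward-backward pass: an emission probability of 1 for every state neither favours nor penalizes any state. A minimal sketch, not part of the diff, with hypothetical values:
emission <- c(0.2, NA, 0.7)      # hypothetical per-bin emission probabilities for one state
emission[is.na(emission)] <- 1   # same idea as py[is.na(py)] <- 1
sum(log(emission))               # the missing bin contributes log(1) = 0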

@@ -424,7 +430,7 @@ invWishartpdflog <- function(covar, psi, nu){
}

estimateTParamsMap <- function(D, n_prev, sp_prev, phi_prev, lambda_prev, pi_prev, A_prev,
params, rho, Zcounts,
params, rho, Zcounts, loglik,
estimateNormal = TRUE, estimatePloidy = TRUE,
estimatePrecision = TRUE, estimateInitDist = TRUE,
estimateTransition = TRUE, estimateSubclone = TRUE,
@@ -443,6 +449,7 @@ estimateTParamsMap <- function(D, n_prev, sp_prev, phi_prev, lambda_prev, pi_pre
n_hat <- n_prev
sp_hat <- sp_prev
phi_hat <- phi_prev
lambda_prev <- na.omit(lambda_prev)
lambda_hat <- lambda_prev
pi_hat <- pi_prev
A_hat <- A_prev
@@ -464,7 +471,7 @@ estimateTParamsMap <- function(D, n_prev, sp_prev, phi_prev, lambda_prev, pi_pre
suppressWarnings(
estNorm <- optim(n_prev, fn = completeLikelihoodFun, pType = rep("n", S), n = n_prev, sp = sp_prev, phi = phi_prev,
lambda = lambda_prev, piG = pi_hat, A = A_hat,
params = params, D = D, rho = rho, Zcounts = Zcounts,
params = params, D = D, rho = rho, Zcounts = Zcounts, lik = loglik,
estimateNormal = estimateNormal, estimatePloidy = estimatePloidy,
estimatePrecision = estimatePrecision, estimateInitDist = estimateInitDist,
estimateTransition = estimateTransition, estimateSubclone = estimateSubclone,
@@ -479,7 +486,7 @@ estimateTParamsMap <- function(D, n_prev, sp_prev, phi_prev, lambda_prev, pi_pre
suppressWarnings(
estSubclone <- optim(sp_prev, fn = completeLikelihoodFun, pType = rep("sp", S), n = n_hat, sp = sp_prev,
phi = phi_prev, lambda = lambda_prev, piG = pi_hat, A = A_hat,
params = params, D = D, rho = rho, Zcounts = Zcounts,
params = params, D = D, rho = rho, Zcounts = Zcounts, lik = loglik,
estimateNormal = estimateNormal, estimatePloidy = estimatePloidy,
estimatePrecision = estimatePrecision, estimateInitDist = estimateInitDist,
estimateTransition = estimateTransition, estimateSubclone = estimateSubclone,
@@ -494,7 +501,7 @@ estimateTParamsMap <- function(D, n_prev, sp_prev, phi_prev, lambda_prev, pi_pre
suppressWarnings(
estPhi <- optim(phi_prev, fn = completeLikelihoodFun, pType = rep("phi", length(phi_prev)), n = n_hat, sp = sp_hat,
phi = phi_prev, lambda = lambda_prev, piG = pi_hat, A = A_hat,
params = params, D = D, rho = rho, Zcounts = Zcounts,
params = params, D = D, rho = rho, Zcounts = Zcounts, lik = loglik,
estimateNormal = estimateNormal, estimatePloidy = estimatePloidy,
estimatePrecision = estimatePrecision, estimateInitDist = estimateInitDist,
estimateTransition = estimateTransition, estimateSubclone = estimateSubclone,
@@ -509,7 +516,7 @@ estimateTParamsMap <- function(D, n_prev, sp_prev, phi_prev, lambda_prev, pi_pre
suppressWarnings(
estLambda <- optim(c(lambda_prev), fn = completeLikelihoodFun, pType = rep("lambda", K*S), n = n_hat, sp = sp_hat,
phi = phi_hat, lambda = lambda_prev, piG = pi_hat, A = A_hat,
params = params, D = D, rho = rho, Zcounts = Zcounts,
params = params, D = D, rho = rho, Zcounts = Zcounts, lik = loglik,
estimateNormal = estimateNormal, estimatePloidy = estimatePloidy,
estimatePrecision = estimatePrecision, estimateInitDist = estimateInitDist,
estimateTransition = estimateTransition, estimateSubclone = estimateSubclone,
@@ -710,7 +717,9 @@ varUpdateEqn <- function(n, sp, phi, jointStates, jointCNstates, jointSCstatus,
# }
# term1 <- term1 - 2 * betaVar
# term2 <- term2 - 2 * (alphaVar + 1)
term1 <- c - 2*b*mus + a*mus^2 + 2*betaVar
#term1 <- c - 2*b*mus + a*mus^2 + 2*betaVar
d <- b/a
term1 <- c - 2*b*d + a*d^2 + 2*betaVar
term2 <- a + 2*(alphaVar + 1)
var_hat <- term1 / term2
return(var_hat)
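One reading of the change above (a sketch under assumptions, not the package function): with responsibility-weighted sufficient statistics a = sum(rho), b = sum(rho * x), c = sum(rho * x^2), the new term1 equals sum(rho * (x - d)^2) + 2*betaVar with d = b/a the weighted mean, i.e. the usual MAP update of a variance under an inverse-gamma(alphaVar, betaVar) prior:
map_var <- function(x, r, alphaVar, betaVar) {
  # x: observations for one state/sample; r: responsibilities (rho); illustrative helper only
  a <- sum(r); b <- sum(r * x); c <- sum(r * x^2)
  d <- b / a                                       # responsibility-weighted mean
  (c - 2 * b * d + a * d^2 + 2 * betaVar) /        # = sum(r * (x - d)^2) + 2 * betaVar
    (a + 2 * (alphaVar + 1))
}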
@@ -765,7 +774,7 @@ stLikelihood <- function(n, sp, phi, lambda, params, D, rho){
## length of x will depend on which parameters are being estimated
## x values should be same as n, phi, lambda, pi, but may not necessarily
## include all of those variables
completeLikelihoodFun <- function(x, pType, n, sp, phi, lambda, piG, A, params, D, rho, Zcounts,
completeLikelihoodFun <- function(x, pType, n, sp, phi, lambda, piG, A, params, D, rho, Zcounts, lik,
estimateNormal = TRUE, estimatePloidy = TRUE,
estimatePrecision = TRUE, estimateTransition = FALSE,
estimateInitDist = TRUE, estimateSubclone = TRUE){
@@ -774,7 +783,7 @@ completeLikelihoodFun <- function(x, pType, n, sp, phi, lambda, piG, A, params,
S <- params$numberSamples
K <- length(params$ct)

lambda <- matrix(lambda, ncol = S, byrow = FALSE)
lambda <- na.omit(matrix(lambda, ncol = S, byrow = FALSE))

if (estimatePrecision && sum(pType == "lambda") > 0){
lambda <- matrix(x[pType == "lambda"], ncol = S, byrow = FALSE)
@@ -822,6 +831,7 @@ priorProbs <- function(n, sp, phi, lambda, var, piG, A, params,
if (estimateTransition){
priorA <- sapply(1:KS, function(ks){
dirichletpdflog(A[ks, ], params$dirPrior[ks, ])
#dirichletpdflog(params$A_prior[ks, ], A[ks, ])
})
}
priorA <- sum(priorA)
@@ -843,14 +853,14 @@ priorProbs <- function(n, sp, phi, lambda, var, piG, A, params,
if (estimateVar){
for (s in 1:S){
priorVar <- priorVar +
sum(invgammapdflog(as.data.frame(var[, s]), params$alphaVar[, s], params$betaVar[s]))
sum(invgammapdflog(as.data.frame(var[, s]), params$alphaVar[, s], params$betaVar[s]), na.rm = TRUE)
}
}
}else{ # param$likModel == "t"
if (estimatePrecision){
for (s in 1:S){
priorLambda <- priorLambda +
sum(gammapdflog(as.data.frame(lambda)[, s], params$alphaLambda, params$betaLambda[, s]))
sum(gammapdflog(as.data.frame(lambda)[, s], params$alphaLambda[s], params$betaLambda[, s]), na.rm = TRUE)
}
priorLambda <- as.numeric(priorLambda)
}
@@ -879,11 +889,10 @@ estimatePrecisionParamMap <- function(lambda, n, phi, params, D, rho){

estimateMixWeightsParamMap <- function(rho, kappa) {
K <- nrow(rho)
#pi <- (rho[, 1] + kappa - 1) / (sum(rho[, 1]) + sum(kappa) - K)
pi <- (rowSums(rho, na.rm = TRUE) + kappa - 1) /
# (ncol(rho) + sum(kappa) - K)
(sum(rowSums(rho, na.rm = TRUE)) + sum(kappa) - K)

pi <- (rho[, 1] + kappa - 1) #/ (sum(rho[, 1]) + sum(kappa) - K)
pi <- pi / sum(pi)
#pi <- (rowSums(rho, na.rm = TRUE) + kappa - 1) /
# (sum(rowSums(rho, na.rm = TRUE)) + sum(kappa) - K)
return(pi)
}
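The rewritten update above is a Dirichlet-MAP estimate built from the first-bin responsibilities, renormalized explicitly instead of divided by the pooled denominator that is now commented out. A minimal sketch with toy values (not the package defaults):
rho1   <- c(0.10, 0.65, 0.25)   # responsibilities of each joint state at the first bin
kappa  <- c(75, 800, 75)        # Dirichlet pseudo-counts on the initial state distribution
pi_hat <- rho1 + kappa - 1      # unnormalized MAP numerator, as in the updated code
pi_hat <- pi_hat / sum(pi_hat)  # renormalize so the initial distribution sums to 1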

22 changes: 12 additions & 10 deletions R/plotting.R
@@ -109,16 +109,16 @@ plotSolutions <- function(hmmResults.cor, tumour_copy, chrs, outDir, counts,
try(plotCovarBias(counts[[s]], covar = "gc", before = "reads",
after = "cor.gc", fit = "gc.fit", xlab = "GC Content",
pch = 20, cex = 0.5, mfrow = NULL),
silent=TRUE)
silent = TRUE)
try(plotCovarBias(counts[[s]], covar = "map", before = "cor.gc",
after = "cor.map", fit = NULL, xlab = "Mappability Score",
pch = 20, cex = 0.5, mfrow = NULL),
silent=TRUE)
silent = TRUE)

try(plotCovarBias(counts[[s]], covar = "repTime", before = "cor.map",
after = "cor.rep", fit = "rep.fit", xlab = "Replication Timing",
pch = 20, cex = 0.5, mfrow = NULL),
silent=TRUE)

silent = TRUE)
dev.off()

### PLOT TPDF ##
Expand Down Expand Up @@ -555,12 +555,14 @@ plotCovarBias <- function(correctOutput, covar = "gc",
ylab = "Uncorrected", xlab = xlab,
...)
# plot curve fit line
if (!is.null(fit)){
domain <- c(min(counts[[covar]], na.rm = TRUE), max(counts[[covar]], na.rm = TRUE))
fit.covar <- correctOutput[[fit]]
i <- seq(domain[1], domain[2], by = 0.001)
y <- predict(fit.covar, i)
lines(i, y, type="l", lwd=1, col="red")
if (!is.null(fit)){ # want to look at fit
if (!is.null(correctOutput[[fit]])){ # actually have the fit object/model as list element
domain <- c(min(counts[[covar]], na.rm = TRUE), max(counts[[covar]], na.rm = TRUE))
fit.covar <- correctOutput[[fit]]
i <- seq(domain[1], domain[2], by = 0.001)
y <- predict(fit.covar, i)
lines(i, y, type="l", lwd=1, col="red")
}
}

# select points to show (after)
Expand Down
79 changes: 41 additions & 38 deletions R/segmentation.R
@@ -125,8 +125,8 @@ getTransitionMatrix <- function(K, e, strength){
}

getDefaultParameters <- function(x, maxCN = 5, ct.sc = c(1,3), n_0 = 0.5, ploidy_0 = 2,
normalIgnoreSC = 0.9,
e = 0.9999999, e.sameState = 10, strength = 10000000,
normal2IgnoreSC = 0.9, e = 0.9999999, e.subclone = 0.1,
e.sameState = 50, strength = 10000000,
includeHOMD = FALSE, likModel = "t"){
if (includeHOMD){
ct <- 0:maxCN
@@ -144,7 +144,7 @@ getDefaultParameters <- function(x, maxCN = 5, ct.sc = c(1,3), n_0 = 0.5, ploidy
nu = 2.1,
kappa = rep(75, length(ct)),
alphaLambda = 5,
betaVar = 1,
betaVar = 2,
likModel = likModel
)
K <- length(param$ct)
@@ -183,7 +183,7 @@ getDefaultParameters <- function(x, maxCN = 5, ct.sc = c(1,3), n_0 = 0.5, ploidy
#param$var <- t(replicate(K, diag(covar$covar)))
param$var <- matrix(apply(x, 2, var, na.rm=T), ncol = S, nrow = K, byrow = TRUE)
param$betaVar <- rep(param$betaVar, S)
alphaVar <- param$betaVar / (apply(x, 2, var, na.rm = TRUE) / sqrt(K))
alphaVar <- 1 / (apply(x, 2, var, na.rm = TRUE) / sqrt(K) ^ 2)
param$alphaVar <- matrix(alphaVar, ncol = S, nrow = KS, byrow = TRUE)
}
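The retuned alphaVar above is the shape of the inverse-gamma prior placed on the state variances and evaluated by invgammapdflog in priorProbs. For reference, a minimal sketch of the standard inverse-gamma log-density, assuming that is the form invgammapdflog implements:
invgamma_logpdf <- function(sigma2, alpha, beta) {
  # log p(sigma2 | alpha, beta) for an inverse-gamma prior on a variance
  alpha * log(beta) - lgamma(alpha) - (alpha + 1) * log(sigma2) - beta / sigma2
}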

@@ -220,59 +220,62 @@ getDefaultParameters <- function(x, maxCN = 5, ct.sc = c(1,3), n_0 = 0.5, ploidy
# Initialize transition matrix to the prior
txn <- getTransitionMatrix(K ^ S, e, strength)
## set higher transition probs for same CN states across samples ##
# joint states where at least "tol" fraction of samples with the same CN state
# joint states where at least "e.sameState" fraction of samples share the same CN state
#apply(param$jointCNstates, 1, function(x){ sum(duplicated(as.numeric(x))) > 0 })
cnStateDiff <- apply(param$jointCNstates, 1, function(x){ (abs(max(x) - min(x)))})
if (e.sameState > 0 & S > 1){
txn$A[, cnStateDiff == 0] <- txn$A[, cnStateDiff == 0] * e.sameState * K
txn$A[, cnStateDiff >= 3] <- txn$A[, cnStateDiff >=3] / e.sameState / K
}
for (i in 1:nrow(txn$A)){
for (j in 1:ncol(txn$A)){
if (i == j){
txn$A[i, j] <- e
}
}
}
txn$A <- normalize(txn$A)
param$A <- txn$A
param$dirPrior <- txn$A * strength[1]
param$A[, param$ct.sc.status] <- param$A[, param$ct.sc.status] / 10
param$A <- normalize(param$A)
param$dirPrior[, param$ct.sc.status] <- param$dirPrior[, param$ct.sc.status] / 10

if (includeHOMD){
param$kappa <- rep(75, K ^ S)
param$kappa[cnStateDiff == 0] <- param$kappa[cnStateDiff == 0] + 125
param$kappa[cnStateDiff >=3] <- param$kappa[cnStateDiff >=3] - 50
param$kappa[which(rowSums(param$jointCNstates==2) == S)] <- 800

# penalize homozygous deletion state
if (includeHOMD){
K <- length(param$ct)
param$A[1, 2:K] <- param$A[1, 2:K] * 1e-5; param$A[2:K, 1] <- param$A[2:K, 1] * 1e-5;
param$A[1, 1] <- param$A[1, 1] * 1e-5
param$A <- normalize(param$A); param$dirPrior <- param$A * param$strength
txn$A[1, 2:K] <- txn$A[1, 2:K] * 1e-5;
txn$A[2:K, 1] <- txn$A[2:K, 1] * 1e-5;
txn$A[1, 1] <- txn$A[1, 1] * 1e-5
}

param$kappa <- rep(75, K ^ S)
param$kappa[cnStateDiff == 0] <- param$kappa[cnStateDiff == 0] + 125
param$kappa[cnStateDiff >=3] <- param$kappa[cnStateDiff >=3] - 50
param$kappa[which(rowSums(param$jointCNstates==2) == S)] <- 800
# for (i in 1:nrow(txn$A)){
# txn$A[i, i] <- e
# }

# penalize transitions into subclonal states
ind.bothSC <- rowSums(param$jointSCstatus) == S
txn$A[, ind.bothSC] <- txn$A[, ind.bothSC] / (e.subclone * K)

txn$A <- normalize(txn$A)
param$A <- txn$A
param$A_prior <- param$A
param$dirPrior <- param$A * strength[1]
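This block is the subclone penalty named in the commit message: columns of the transition matrix leading into joint states that are subclonal in every sample are divided by e.subclone * K before the rows are renormalized and copied into param$A, param$A_prior and param$dirPrior. A toy sketch of the same operation, with illustrative sizes and values only:
A <- matrix(1, nrow = 4, ncol = 4)                 # toy transition weights
ind.bothSC <- c(FALSE, FALSE, TRUE, TRUE)          # joint states subclonal in every sample
A[, ind.bothSC] <- A[, ind.bothSC] / (0.1 * 49)    # e.subclone * K in the package code
A <- A / rowSums(A)                                # renormalize rows, as normalize(txn$A) does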

# ignore subclones for sample with n >= normalIgnoreSC
# ignore subclones for sample with n >= normal2IgnoreSC
indKS <- 1:nrow(param$jointSCstatus) # use all joint states
indK <- 1:length(param$ct.sc.status)
if (sum(n_0 >= normalIgnoreSC) > 0){
for (i in which(n_0 >= normalIgnoreSC)){
# keep states excluding subclones for samples with n >= normalIgnoreSC
if (sum(n_0 >= normal2IgnoreSC) > 0){
for (i in which(n_0 >= normal2IgnoreSC)){
# keep states excluding subclones for samples with n >= normal2IgnoreSC
indKS <- intersect(indKS, which(!param$jointSCstatus[, i]))
indK <- intersect(indK, which(param$ct.sc.status))
param$lambda[indK, i] <- NaN
param$betaLambda[indK, i] <- NaN
param$var[indK, i] <- NaN
if (likModel == "Gaussian"){
param$var[indK, i] <- NaN
}
}

param$kappa <- param$kappa[indKS]
param$alphaVar <- param$alphaVar[indKS, ]
param$jointStates <- param$jointStates[indKS, ]
param$jointCNstates <- param$jointCNstates[indKS, ]
param$jointSCstatus <- param$jointSCstatus[indKS, ]
param$A <- param$A[indKS, indKS]
param$dirPrior <- param$dirPrior[indKS, indKS]
param$alphaVar <- param$alphaVar[indKS, , drop = FALSE]
param$jointStates <- param$jointStates[indKS, , drop = FALSE]
param$jointCNstates <- param$jointCNstates[indKS, , drop = FALSE]
param$jointSCstatus <- param$jointSCstatus[indKS, , drop = FALSE]
param$A <- normalize(param$A[indKS, indKS])
param$A_prior <- param$A
param$dirPrior <- param$A * strength[1]
}

return(param)
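The subsetting in this block now uses drop = FALSE, so keeping a single joint state still returns a one-row matrix rather than a bare vector. A minimal illustration of why that matters:
m <- matrix(1:6, nrow = 3)
dim(m[1, ])                  # NULL: a single row is silently dropped to a vector
dim(m[1, , drop = FALSE])    # c(1, 2): the matrix shape is preserved for downstream code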