diff --git a/.travis.yml b/.travis.yml
index 4c0d348..a7bd536 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,7 +11,7 @@ branches:
 # This is the minimal set of R packages needed to run "R CMD check" on
 # the package.
 install:
-  - R -e 'install.packages(c("assertthat","devtools","etrunct","truncnorm","Rcpp","foreach","doParallel","pscl","SQUAREM","testthat","rmarkdown","knitr","ggplot2"))'
+  - R -e 'install.packages(c("assertthat","devtools","etrunct","expint","truncnorm","Rcpp","foreach","doParallel","pscl","SQUAREM","testthat","rmarkdown","knitr","ggplot2"))'
   - R -e 'devtools::install_github("stephenslab/mixsqp")'

 env:
diff --git a/DESCRIPTION b/DESCRIPTION
index 43273c1..a9e14a4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -43,7 +43,8 @@ Imports:
     pscl,
     Rcpp (>= 0.10.5),
     foreach,
-    etrunct
+    etrunct,
+    expint
 LinkingTo: Rcpp
 Suggests:
     testthat,
diff --git a/NAMESPACE b/NAMESPACE
index 0bfec3b..1068e13 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -137,6 +137,7 @@ import(foreach)
 import(parallel)
 import(pscl)
 import(truncnorm)
+importFrom(expint,gammainc)
 importFrom(graphics,abline)
 importFrom(graphics,legend)
 importFrom(graphics,lines)
diff --git a/R/ash.R b/R/ash.R
index ee3eb96..7e2ed7f 100644
--- a/R/ash.R
+++ b/R/ash.R
@@ -301,12 +301,16 @@ ash.workhorse <-
   # set range to search the mode
   if (lik$name=="pois"){
+    lam = lik$data$y / lik$data$scale
     if (lik$data$link=="identity"){
-      args$modemin = min(mode, min(lik$data$y/lik$data$scale),na.rm = TRUE)
-      args$modemax = max(mode, max(lik$data$y/lik$data$scale),na.rm = TRUE)
-    }else if (lik$data$link=="log"){
-      args$modemin = min(log(lik$data$y/lik$data$scale+0.01))
-      args$modemax = max(log(lik$data$y/lik$data$scale+0.01))
+      args$modemin = min(mode, min(lam), na.rm = TRUE)
+      args$modemax = max(mode, max(lam), na.rm = TRUE)
+    }
+    else if (lik$data$link=="log"){
+      eps = 1 / mean(lik$data$scale)
+      log_lam = log(lam + eps)
+      args$modemin = min(log_lam)
+      args$modemax = max(log_lam)
     }
   }else if(lik$name=="binom"){
     if (lik$data$link=="identity"){
@@ -753,16 +757,23 @@ qval.from.lfdr = function(lfdr){
   return(qvalue)
 }

-# try to select a default range for the sigmaa values
+# try to select a default range for the sigma values
 # that should be used, based on the values of betahat and sebetahat
 # mode is the location about which inference is going to be centered
 # mult is the multiplier by which the sds differ across the grid
 # grange is the user-specified range of mixsd
 autoselect.mixsd = function(data,mult,mode,grange,mixcompdist){
   if (data$lik$name %in% c("pois")){
-    data$x = data$lik$data$y/data$lik$data$scale #estimate of lambda
-    data$s = sqrt(data$x)/data$lik$data$scale #standard error of estimate
-    # if the link is log we probably want to take the log of this?
+    lam = data$lik$data$y / data$lik$data$scale
+    if (data$lik$data$link == "identity"){
+      data$x = lam
+      data$s = sqrt(data$x) / data$lik$data$scale
+    }
+    else {
+      eps = 1 / mean(data$lik$data$scale)
+      data$x = log(lam + eps)
+      data$s = sqrt(var(lam) / (lam + eps)^2)
+    }
   }
   if (data$lik$name %in% c("binom")){
     data$x = data$lik$data$y
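The log-link branches above (in the mode-search range and in autoselect.mixsd) work on the log scale, using the pseudocount eps = 1/mean(scale) so that log(lam + eps) stays finite when y = 0, and a delta-method style standard error for log(lam + eps). The snippet below is not part of the patch; it is a quick sanity check of that approximation on simulated counts, with all parameter values chosen arbitrarily.

set.seed(1)
scale  <- rep(1e5, 1000)                 # sequencing-depth-like scale factors
lambda <- 2e-4
y      <- rpois(length(scale), scale * lambda)
lam    <- y / scale                      # per-observation estimate of lambda
eps    <- 1 / mean(scale)                # pseudocount used in the patch
# Delta method: sd(log(X + eps)) is approximately sd(X) / (lambda + eps)
sd(log(lam + eps))                       # empirical spread on the log scale
sd(lam) / (lambda + eps)                 # delta-method approximation
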
diff --git a/R/lik.R b/R/lik.R
index 5087e86..d111d2b 100644
--- a/R/lik.R
+++ b/R/lik.R
@@ -58,7 +58,7 @@ lik_logF = function(df1,df2){
 #' @details Suppose we have Poisson observations \code{y} where \eqn{y_i\sim Poisson(c_i\lambda_i)}.
 #' We either put an unimodal prior g on the (scaled) intensities \eqn{\lambda_i\sim g}
 #' (by specifying \code{link="identity"}) or on the log intensities
-#' \eqn{logit(\lambda_i)\sim g} (by specifying \code{link="log"}). Either way,
+#' \eqn{log(\lambda_i)\sim g} (by specifying \code{link="log"}). Either way,
 #' ASH with this Poisson likelihood function will compute the posterior mean of the
 #' intensities \eqn{\lambda_i}.
 #' @param y Poisson observations.
@@ -73,6 +73,7 @@ lik_logF = function(df1,df2){
 #'
 #' @importFrom stats pgamma
 #' @importFrom stats dgamma
+#' @importFrom expint gammainc
 #'
 #' @export
 #'
@@ -81,19 +82,30 @@ lik_pois = function(y, scale=1, link=c("identity","log")){
   if (link=="identity"){
     list(name = "pois",
          const = TRUE,
+         ## log_comp_dens_conv.unimix calls this like
+         ##
+         ##   lcdfFUN((data$x - a) / data$s)
+         ##
+         ## where data is an ash data object with data$x = 0 and data$s = 1, so
+         ## we need to take absolute values
          lcdfFUN = function(x){pgamma(abs(x),shape=y+1,rate=scale,log.p=TRUE)-log(scale)},
          lpdfFUN = function(x){dgamma(abs(x),shape=y+1,rate=scale,log=TRUE)-log(scale)},
+         ## comp_postmean.unimix calls this like
+         ##
+         ##   x - s * do.call(lik$etruncFUN, list(alpha, beta))
+         ##
+         ## so we need to return the negative
          etruncFUN = function(a,b){-my_etruncgamma(-b,-a,y+1,scale)},
          e2truncFUN = function(a,b){my_e2truncgamma(-b,-a,y+1,scale)},
          data=list(y=y, scale=scale, link=link))
-  }else if (link=="log"){
-    y1 = y+1e-5 # add pseudocount
+  }
+  else if (link=="log"){
     list(name = "pois",
          const = TRUE,
-         lcdfFUN = function(x){pgamma(exp(-x),shape=y1,rate=scale,log.p=TRUE)-log(y1)},
-         lpdfFUN = function(x){dgamma(exp(-x),shape=y1,rate=scale,log=TRUE)-log(y1)},
-         etruncFUN = function(a,b){-my_etruncgamma(exp(-b),exp(-a),y1,scale)},
-         e2truncFUN = function(a,b){my_e2truncgamma(exp(-b),exp(-a),y1,scale)},
+         lcdfFUN = function(x) {log(expint::gammainc(y, scale * exp(-x))) - lgamma(y + 1)},
+         lpdfFUN = function(x){dpois(y, scale * exp(-x), log=TRUE)},
+         etruncFUN = function(a,b){-my_etruncgamma(exp(-b),exp(-a),y,scale)},
+         e2truncFUN = function(a,b){my_e2truncgamma(exp(-b),exp(-a),y,scale)},
          data=list(y=y, scale=scale, link=link))
   }
 }
diff --git a/R/unimix.R b/R/unimix.R
index 0e142e3..e81cf1c 100644
--- a/R/unimix.R
+++ b/R/unimix.R
@@ -57,6 +57,7 @@ comp_dens_conv.unimix = function(m,data){
 }

 #' log density of convolution of each component of a unif mixture
+#'
 #' @inheritParams comp_dens_conv.unimix
 #' @return a k by n matrix of densities
 log_comp_dens_conv.unimix = function(m,data){
@@ -74,8 +75,12 @@ log_comp_dens_conv.unimix = function(m,data){
   }
   lcomp_dens = t(lpa + log(1-exp(lpb-lpa))) - log(b-a)
+  # Handle the case lpa = -Inf, lpb = -Inf, lpb - lpa = NaN. This can happen
+  # for Poisson likelihood with log link
+  lcomp_dens[lpa == -Inf & lpb == -Inf] = -Inf
   lcomp_dens[a==b,] = t(do.call(lik$lpdfFUN, list(outer(data$x,b,FUN="-")/data$s))
                         -log(data$s))[a==b,]
+  return(lcomp_dens)
 }
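The new lcdfFUN for link="log" is built on the unregularized upper incomplete gamma function, provided as gammainc(a, x) by the expint package: if u = log(lambda) is uniform on [a, b] and the count y has scale factor s, the marginal PMF is (gammainc(y, s*exp(a)) - gammainc(y, s*exp(b))) / (factorial(y) * (b - a)), which is what log_comp_dens_conv.unimix assembles from differences of exp(lcdfFUN(.)). The check below is not part of the patch; it just confirms that identity numerically for arbitrary values of y, s, a and b.

library(expint)
y <- 3; s <- 100; a <- log(1e-3); b <- log(2e-3)
# Marginal PMF by direct numerical integration over u = log(lambda) ~ U(a, b)
num <- integrate(function(u) dpois(y, s * exp(u)), a, b)$value / (b - a)
# The same quantity via the upper incomplete gamma function used by lcdfFUN
ana <- (gammainc(y, s * exp(a)) - gammainc(y, s * exp(b))) / (factorial(y) * (b - a))
all.equal(num, ana)   # TRUE, up to integrate()'s tolerance
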
diff --git a/appveyor.yml b/appveyor.yml
index 40c5c62..d12b23f 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -22,7 +22,7 @@ environment:
 # This is the minimal set of R packages needed to run "R CMD check" on
 # the package.
 build_script:
-  - R -e install.packages(c('assertthat','devtools','etrunct','truncnorm','Rcpp','foreach','doParallel','pscl','SQUAREM','testthat','rmarkdown','knitr','ggplot2'),head(.libPaths(),1),'http://cran.wustl.edu')
+  - R -e install.packages(c('assertthat','devtools','etrunct','expint','truncnorm','Rcpp','foreach','doParallel','pscl','SQUAREM','testthat','rmarkdown','knitr','ggplot2'),head(.libPaths(),1),'http://cran.wustl.edu')
   - R -e devtools::install_github('stephenslab/mixsqp')

 test_script:
diff --git a/man/lik_pois.Rd b/man/lik_pois.Rd
index b4e3688..6a478e2 100644
--- a/man/lik_pois.Rd
+++ b/man/lik_pois.Rd
@@ -21,7 +21,7 @@ Creates a likelihood object for ash for use with Poisson error distribution
 Suppose we have Poisson observations \code{y} where \eqn{y_i\sim Poisson(c_i\lambda_i)}.
 We either put an unimodal prior g on the (scaled) intensities \eqn{\lambda_i\sim g}
 (by specifying \code{link="identity"}) or on the log intensities
-\eqn{logit(\lambda_i)\sim g} (by specifying \code{link="log"}). Either way,
+\eqn{log(\lambda_i)\sim g} (by specifying \code{link="log"}). Either way,
 ASH with this Poisson likelihood function will compute the posterior mean of the
 intensities \eqn{\lambda_i}.
 }
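As a reference for the documentation above, here is a minimal usage sketch (not part of the patch) that mirrors the tests which follow; the ash_pois() calls in those tests drive the same lik_pois() likelihood through the ash_pois() wrapper.

set.seed(1)
lambda <- c(rep(1, 500), runif(500, 1, 5))   # intensities from a unimodal prior (arbitrary values)
x <- rpois(1000, lambda)
# Unimodal prior on the intensity scale (link = "identity"), mode estimated from the data
fit <- ash(rep(0, length(x)), 1, lik = lik_pois(x, link = "identity"), mode = "estimate")
head(fit$result$PosteriorMean)               # posterior means of the lambda_i
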
diff --git a/tests/testthat/test_lik.R b/tests/testthat/test_lik.R
index 0f3a4dd..089deb1 100644
--- a/tests/testthat/test_lik.R
+++ b/tests/testthat/test_lik.R
@@ -30,6 +30,100 @@ test_that("general likelihood with multiple df works", {
   data =set_data(betahat,s,lik = lik_t(df),alpha=0)
   expect_equal(calc_null_loglik(data),sum(dt(betahat/s,df=df,log=TRUE)-log(s)))
-
+})
+
+test_that("Poisson (identity link) marginal PMF is correct", {
+  y = seq(0, 100)
+  g = unimix(1, .1, .2)
+  lik = lik_pois(y=y, link="identity")
+  data = set_data(rep(0, length(y)), 1, lik)
+  py = drop(comp_dens_conv(g, data))
+  true_py = unlist(lapply(y, function (z) {integrate(function(lam) {dpois(z, lam)}, g$a[1], g$b[1])$value / (g$b[1] - g$a[1])}))
+  expect_equal(py, true_py)
+})
+
+test_that("Poisson (identity link) marginal PMF for point mass is correct", {
+  y = seq(0, 100)
+  g = unimix(1, .1, .1)
+  lik = lik_pois(y=y, link="identity")
+  data = set_data(rep(0, length(y)), 1, lik)
+  py = drop(comp_dens_conv(g, data))
+  true_py = unlist(lapply(y, function(x) {dpois(x, g$a[1])}))
+  expect_equal(py, true_py)
+})
+
+test_that("Poisson (identity link) marginal PMF is correct with scale factors", {
+  y = seq(0, 100)
+  s = 100
+  g = unimix(1, 1e-3, 2e-3)
+  lik = lik_pois(y=y, scale=s, link="identity")
+  data = set_data(rep(0, length(y)), 1, lik)
+  py = drop(comp_dens_conv(g, data))
+  true_py = unlist(lapply(y, function (z) {integrate(function(lam) {dpois(z, s * lam)}, g$a[1], g$b[1])$value / (g$b[1] - g$a[1])}))
+  expect_equal(py, true_py, tolerance=1e-5, scale=1)
+})
+
+test_that("Poisson (log link) marginal PMF is correct", {
+  y = seq(0, 100)
+  g = unimix(1, log(.1), log(.2))
+  lik = lik_pois(y=y, link="log")
+  data = set_data(rep(0, length(y)), 1, lik)
+  py = drop(comp_dens_conv(g, data))
+  true_py = unlist(lapply(y, function (z) {integrate(function(log_lam) {dpois(z, exp(log_lam))}, g$a[1], g$b[1])$value / (g$b[1] - g$a[1])}))
+  expect_equal(py, true_py, tolerance=1e-5, scale=1)
+})
+
+test_that("Poisson (log link) marginal PMF for point mass is correct", {
+  y = seq(0, 100)
+  g = unimix(1, log(.1), log(.1))
+  lik = lik_pois(y=y, link="log")
+  data = set_data(rep(0, length(y)), 1, lik)
+  py = drop(comp_dens_conv(g, data))
+  true_py = unlist(lapply(y, function(x) {dpois(x, exp(g$a[1]))}))
+  expect_equal(py, true_py)
+})
+
+test_that("Poisson (log link) marginal PMF is correct with scale factors", {
+  y = seq(0, 100)
+  s = 100
+  g = unimix(1, log(1e-3), log(2e-3))
+  lik = lik_pois(y=y, scale=s, link="log")
+  data = set_data(rep(0, length(y)), 1, lik)
+  py = drop(comp_dens_conv(g, data))
+  true_py = unlist(lapply(y, function (z) {integrate(function(log_lam) {dpois(z, s * exp(log_lam))}, g$a[1], g$b[1])$value / (g$b[1] - g$a[1])}))
+  expect_equal(py, true_py, tolerance=1e-5, scale=1)
+})
+
+test_that("ln p(x = 0 | s) is correct for Poisson likelihood (log link)", {
+  s = 10 ^ seq(log10(1e3), log10(1e6), .1)
+  y = rep(0, length(s))
+  g = unimix(1, log(1e-3), log(2e-3))
+  lik = lik_pois(y=y, scale=s, link="log")
+  data = set_data(rep(0, length(y)), 1, lik)
+  log_py = drop(log(comp_dens_conv(g, data)))
+  true_log_py = log(unlist(lapply(s, function (z) {integrate(function(log_lam) {dpois(0, z * exp(log_lam))}, g$a[1], g$b[1])$value / (g$b[1] - g$a[1])})))
+  ## Work around a recent bug comparing Inf (called from expect_equal)
+  ##
+  ## https://github.com/r-lib/testthat/issues/789
+  ## https://github.com/r-lib/testthat/commit/992ddd82fd7b6f1fdc5bb66c31db94277f3df126
+  expect_true(all((true_log_py == log_py | abs(true_log_py - log_py) < 1e-2)))
+})
+
+test_that("Poisson (identity link) returns sensible marginal PMF on real data", {
+  dat = readRDS("test_pois_data.Rds")
+  fit <- ash_pois(dat$x, dat$scale, link="identity", mixcompdist="halfuniform")
+  F = comp_dens_conv(fit$fitted_g, fit$data)
+  expect_true(all(F < 1))
+})
+test_that("Poisson (log link) returns sensible marginal PMF on real data", {
+  dat = readRDS("test_pois_data.Rds")
+  lam <- dat$x / dat$scale
+  eps = 1 / mean(dat$scale)
+  log_lam <- log(lam + eps)
+  se_log_lam <- sqrt(var(lam) / (lam + eps)^2)
+  mixsd <- seq(.1 * min(se_log_lam), max(2 * sqrt(log_lam^2 - se_log_lam^2)), by=.5 * log(2))
+  fit <- ash_pois(dat$x, dat$scale, link="log", mixcompdist="halfuniform", mixsd=mixsd)
+  F = comp_dens_conv(fit$fitted_g, fit$data)
+  expect_true(all(F < 1))
 })
diff --git a/tests/testthat/test_pois.R b/tests/testthat/test_pois.R
index e06b572..5bcd493 100644
--- a/tests/testthat/test_pois.R
+++ b/tests/testthat/test_pois.R
@@ -1,88 +1,78 @@
 context("ashr with Poisson likelihoods")

 test_that("lik_pois (identity link) fitted g is close to true g",{
-  # Simulate a Poisson dataset
   set.seed(1)
-  trueg = unimix(c(0.5,0.5),c(1,1),c(1,5)) # true prior g: 0.5*U(1,5)+0.5*delta(1)
-  lambda = c(rep(1,500), runif(500,1,5))   # generate lambda from g
-  x = rpois(1000,lambda)                   # Poisson observations
-  out <- capture.output(
-    ash.pois.out <- ash(rep(0,length(x)),1,lik=lik_pois(x),g=trueg,
-                        control = list(verbose = TRUE)))
-
-  # Check if the estimated mixture proportion for components delta(0.5) and U(0.1,0.9)
-  # is close to the true mixture proportion (0.5,0.5)
-  expect_equal(ash.pois.out$fitted_g$pi, c(0.5,0.5), tolerance = 0.05)
+  trueg = unimix(c(0.5,0.5),c(1,1),c(1,5))
+  lambda = c(rep(1,500), runif(500,1,5))
+  x = rpois(1000,lambda)
+  ash.pois.out = ash_pois(x, link="identity", g=trueg)
+  expect_equal(ash.pois.out$fitted_g$pi, trueg$pi, tolerance = 0.05)
 })

 test_that("lik_pois (identity link) fitted mode is close to true mode",{
-  # Simulate a Poisson dataset
   set.seed(1)
-  truemode = 50                      # set mode of prior g
-  lambda = c(rnorm(1000,truemode,5)) # generate lambda from g
-  x = rpois(1000,lambda)             # Poisson observations
-  ash.pois.out = ash(rep(0,length(x)),1,lik=lik_pois(x),mode="estimate")
-
-  # Check if the estimated mode is close to the true mode 50
+  truemode = 50
+  lambda = c(rnorm(1000,truemode,5))
+  x = rpois(1000,lambda)
+  ash.pois.out = ash_pois(x, mode="estimate")
   expect_equal(ash.pois.out$fitted_g$a[1], truemode, tolerance = 0.05, scale=truemode)
 })

 test_that("lik_pois (identity link) with high intensities gives similar answers to normal likelihood",{
-  # Simulate a Poisson data set with relatively high intensities
   set.seed(1)
-  lambda = c(rnorm(1000,200,5)) # simulate intensities around 200
+  lambda = c(rnorm(1000,200,5))
   x = rpois(1000,lambda)
-
-  # Fit the ash model with two different likelihood densities: (1) the
-  # normal distribution with (s.e.) to be match the standard deviations of
-  # Poisson distribution sqrt(lambda), and (2) the Poisson distribution
-  ash.pois.out = ash(rep(0,length(x)),1,lik=lik_pois(x))
+  ash.pois.out = ash_pois(x)
+  # For large lambda, Poisson(lambda) is approximately N(lambda, lambda)
   ash.norm.out = ash(x, sqrt(lambda), mode="estimate", prior="uniform")
-
-  # Compare the posterior mean estimates from ash using the two
-  # different likelihood densities. We expect that the difference
-  # between the two estimates should always be small (relative error
-  # at most 5%).
   expect_equal(ash.norm.out$result$PosteriorMean, ash.pois.out$result$PosteriorMean,
                tolerance = 0.05)
 })

 test_that("lik_pois (log link) fitted g is close to true g",{
-  # Simulate a Poisson dataset
   set.seed(1)
   trueg = unimix(c(0.8,0.2),c(0,-3),c(0,3))
   loglambda = c(rep(0,800), runif(200,-3,3))
-  lambda = exp(loglambda)
-  x = rpois(1000,lambda) # Poisson observations
-  out <- capture.output(
-    ash.pois.out <- ash(rep(0,length(x)),1,lik = lik_pois(x,link="log"),
-                        g = trueg,control = list(verbose = TRUE)))
-
-  # Check if the estimated mixture proportion for components delta(0)
-  # and U(-3,3) is close to the true mixture proportion (0.8,0.2)
-  expect_equal(ash.pois.out$fitted_g$pi, c(0.8,0.2), tolerance = 0.05)
+  x = rpois(1000, exp(loglambda))
+  ash.pois.out = ash_pois(x, link="log", g=trueg)
+  expect_equal(ash.pois.out$fitted_g$pi, trueg$pi, tolerance = 0.05)
 })

 test_that("lik_pois (log link) fitted mode is close to true mode",{
-  # Simulate a Poisson dataset
   set.seed(1)
   truemode = 4
-  loglambda = c(rep(4,500), rnorm(500,4,1)) # simulate log(lambda) from distn w/ mode at 4
-  lambda = exp(loglambda)
-  x = rpois(1000,lambda) # Poisson observations
-  ash.pois.out = ash(rep(0,length(x)),1,lik=lik_pois(x,link="log"),mode="estimate")
-
-  # Check if the estimated mode is close to the true mode 50
-  expect_equal(ash.pois.out$fitted_g$a[1], truemode, tolerance = 0.05, scale=truemode)
+  N = 500
+  # Set the SD in such a way that exp(loglambda) does not become too big.
+  # Otherwise, expint::gammainc will fail
+  loglambda = rnorm(N, 4, .1)
+  x = rpois(N, exp(loglambda))
+  ash.pois.out = ash_pois(x, link="log", mode="estimate")
+  expect_equal(ash.pois.out$fitted_g$a[1], truemode, tolerance = 1e-2, scale=1)
 })

-test_that("Mode estimation for pois_lik finds an acceptable solution", {
-  set.seed(1)
-  # Load example 10X Genomics data
-  dat = readRDS("test_pois_data.Rds")
-  m0 = ashr::ash(rep(0, nrow(dat)), 1, lik=ashr::lik_pois(dat$x, scale=dat$scale, link="identity"), mode="estimate")
-  lam = dat$x / dat$scale
-  m1 = ashr::ash(rep(0, nrow(dat)), 1, lik=ashr::lik_pois(dat$x, scale=dat$scale, link="identity"), mode=c(min(lam), max(lam)))
-  expect_equal(m0$loglik, m1$loglik, tolerance=1, scale=1)
+test_that("Mode estimation for lik_pois finds an acceptable solution", {
+  set.seed(1)
+  ## Load example 10X Genomics data
+  dat = readRDS("test_pois_data.Rds")
+  m0 = ash_pois(dat$x, dat$scale, mode="estimate")
+  lam = dat$x / dat$scale
+  m1 = ash_pois(dat$x, dat$scale, mode=c(min(lam), max(lam)))
+  expect_equal(m0$loglik, m1$loglik, tolerance=1, scale=1)
+})
+
+test_that("Mode estimation for lik_pois gives same answer under identity and log link", {
+  set.seed(1)
+  ## Typical values for scRNA-seq data from Sarkar et al. 2019
+  s = 1e5
+  log_mu = runif(n=1, min=-12, max=-8)
+  log_phi = runif(n=1, min=-6, max=0)
+  N = 1000
+  lam = rgamma(n=N, shape=exp(-log_phi), scale=exp(log_mu + log_phi))
+  x = rpois(n=N, lambda=s * lam)
+  dat = data.frame(cbind(x, s))
+  fit0 = ash_pois(dat$x, dat$s, link="identity", mixcompdist="halfuniform")
+  fit1 = ash_pois(dat$x, dat$s, link="log", mixcompdist="halfuniform")
+  expect_equal(fit0$loglik, fit1$loglik, tolerance=1e-2, scale=1)
+  expect_equal(fit0$fitted_g$a[1], exp(fit1$fitted_g$a[1]), tolerance=1e-6, scale=1)
 })
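Not part of the patch: an end-to-end sketch of the two user-facing calls that the final test compares, on freshly simulated scaled counts (the Gamma-Poisson parameter values below are arbitrary, chosen only to mimic that test's setup).

set.seed(1)
s   <- 1e5                                   # sequencing-depth-like scale factor
lam <- rgamma(1000, shape = 4, rate = 2e4)   # unimodal intensities, mean 2e-4
x   <- rpois(1000, s * lam)
fit_id  <- ash_pois(x, s, link = "identity", mixcompdist = "halfuniform")
fit_log <- ash_pois(x, s, link = "log", mixcompdist = "halfuniform")
c(fit_id$loglik, fit_log$loglik)                      # should roughly agree, as in the test
c(fit_id$fitted_g$a[1], exp(fit_log$fitted_g$a[1]))   # fitted modes, both on the lambda scale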