From dc895efaec96a2c15f76d518be61b06214583d83 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Tue, 19 Mar 2024 15:28:10 -0700 Subject: [PATCH 01/13] Add helper function to filter small pchisq p-values Add a helper function that calls pchisq and then filters any p-values that are smaller than Machine$double.xmin to the double.xmin value. --- R/utils.R | 7 +++++++ tests/testthat/test_utils.R | 39 ++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/R/utils.R b/R/utils.R index f10baa0..c9c1f92 100644 --- a/R/utils.R +++ b/R/utils.R @@ -352,3 +352,10 @@ setMethod(".annotateAssoc", put.attr.gdsn(geno.node, "sample.order") closefn.gds(gds) } + +.pchisq_filter_extreme <- function(...) { + # Note: stat is the squared test statistic. + pval = pchisq(...) + pval[pval < .Machine$double.xmin] = .Machine$double.xmin + return(pval) +} diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index bc11e77..df977d7 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -128,12 +128,12 @@ test_that("MAC - sex chrs", { sample.id <- seqGetData(gds, "sample.id") df <- data.frame(sample.id, sex="M", stringsAsFactors=FALSE) svd <- SeqVarData(gds, sampleData=AnnotatedDataFrame(df)) - + freq <- alleleFrequency(svd) geno <- refDosage(svd) chk <- .alleleFreq(svd, geno, male.diploid=FALSE) expect_equal(chk$freq, freq) - + mac <- minorAlleleCount(svd) expect_equal(chk$MAC, round(mac)) seqClose(gds) @@ -145,24 +145,24 @@ test_that("meanImpute", { m <- 1000 set.seed(123) geno <- matrix(rbinom(n*m, size = 2, prob = 0.1), nrow = n, ncol = m) - + miss <- sample(n*m, size = 0.1*n*m, replace = FALSE) geno[miss] <- NA - + freq <- 0.5*colMeans(geno, na.rm = TRUE) - + # original function x <- .meanImputeFn(geno, freq) - + # new function with matrix y <- .meanImpute(geno, freq) expect_equal(x, y) - + # new function with Matrix (one block) Geno <- Matrix(geno) y <- .meanImpute(Geno, freq) expect_equivalent(x, as.matrix(y)) - + # new function with Matrix (multiple blocks) #n*m/2^25 # 3 blocks if m=100000 y <- .meanImpute(Geno, freq, maxelem = 4e5) @@ -182,32 +182,32 @@ test_that("prepGenoBlock", { n1 <- colSums(geno == 1, na.rm=TRUE) n2 <- colSums(geno == 2, na.rm=TRUE) mono <- (n0 == n | n1 == n | n2 == n) - + vi <- data.frame(a=1:m) x <- list(var.info=vi, geno=geno, chr=rep("1",m)) - + g <- .prepGenoBlock(x) expect_equal(vi[!mono,,drop=FALSE], g$var.info) expect_equal(colSums(!is.na(geno[,!mono])), g$n.obs) expect_equal(0.5*colMeans(geno[,!mono], na.rm=TRUE), g$freq$freq) expect_equal(geno[,!mono], g$geno) expect_true(is.null(g$weight)) - + g2 <- .prepGenoBlock(x, AF.max = 0.1) inc <- g$freq$freq <= 0.1 expect_equal(g2$freq, g$freq[inc,]) expect_equal(g2$geno, g$geno[,inc]) - + gr <- .prepGenoBlock(x, geno.coding="recessive") rec.mono <- n2 == 0 | n2 == n expect_equal(colSums(gr$geno == 1, na.rm=TRUE), n2[!rec.mono]) expect_equal(names(gr$freq), c("freq", "MAC", "n.hom.eff")) - + gd <- .prepGenoBlock(x, geno.coding="dominant") dom.mono <- n0 == 0 | n0 == n expect_equal(colSums(gd$geno != 0, na.rm=TRUE), (n1+n2)[!dom.mono]) expect_equal(names(gd$freq), c("freq", "MAC", "n.any.eff")) - + x$weight <- c(rep(1,900), rep(0,100)) gw <- .prepGenoBlock(x) expect_equal(gw$weight, x$weight[!mono & as.logical(x$weight)]) @@ -223,13 +223,13 @@ test_that("prepGenoBlock - male haploid", { sex <- sample(c("M", "F"), n, replace=TRUE) vi <- data.frame(a=1:m) x <- list(var.info=vi, geno=geno, chr=rep("X",m)) - + male <- sex == "M" female <- sex == "F" nm1 <- colSums(geno[male,] == 1) nm2 <- colSums(geno[male,] == 2) nf2 <- colSums(geno[female,] == 2) - + gd <- .prepGenoBlock(x, geno.coding="recessive", male.diploid=TRUE, sex=sex) expect_equal(colSums(gd$geno[male,]), nm2) expect_equal(colSums(gd$geno[female,]), nf2) @@ -237,3 +237,10 @@ test_that("prepGenoBlock - male haploid", { expect_equal(colSums(gh$geno[male,]), nm1 + nm2) expect_equal(colSums(gh$geno[female,]), nf2) }) + +test_that(".pchisq_filter_extreme", { + stat <- c(0.1, 0.5, 1, 10) + expect_equal(.pchisq_filter_extreme(stat^2, lower.tail=FALSE, df=1), pchisq(stat^2, lower.tail=FALSE, df=1)) + # Extreme p-value + expect_equal(.pchisq_filter_extreme(100^2, lower.tail=FALSE, df=1), .Machine$double.xmin) +}) From 7d38afe985a6a3396def419587abaa888365d3c9 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Tue, 19 Mar 2024 15:31:23 -0700 Subject: [PATCH 02/13] Add devel branch to CI --- .github/workflows/check-bioc.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index bc9f5e7..5a2fbe2 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -25,10 +25,12 @@ on: branches: - master - main + - devel pull_request: branches: - master - main + - devel name: R-CMD-check-bioc From 503ba358c6178b074631d3119272a741bc7786ad Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Tue, 19 Mar 2024 15:48:11 -0700 Subject: [PATCH 03/13] Try using devel Bioconductor in github CI --- .github/workflows/check-bioc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 5a2fbe2..d05e495 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -60,7 +60,7 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: 'devel', bioc: '3.17', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: ubuntu-latest, r: 'devel', bioc: 'devel', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } ## - { os: macOS-latest, r: 'devel', bioc: '3.17'} ## - { os: windows-latest, r: 'devel', bioc: '3.17'} env: From c4ad85982903f8692c7b86b81b2b04f6d347a26e Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Wed, 20 Mar 2024 09:59:19 -0700 Subject: [PATCH 04/13] Switch any call to pchisq to .pchisq_filter_extreme --- R/admixMap.R | 4 ++-- R/jointScoreTest.R | 4 ++-- R/prepareOutputForNullModel.R | 4 ++-- R/testGeno.R | 10 +++++----- R/testSingleVarianceComponent.R | 2 +- R/variantSetTests.R | 12 ++++++------ 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/R/admixMap.R b/R/admixMap.R index 5e88266..65ccf16 100644 --- a/R/admixMap.R +++ b/R/admixMap.R @@ -131,7 +131,7 @@ admixMap <- function(admixDataList, res[,"Est"] <- beta res[,"SE"] <- sqrt(Vbeta) res[,"Stat"] <- Stat - res[,"pval"] <- pchisq(Stat, df=1, lower.tail=FALSE) + res[,"pval"] <- .pchisq_filter_extreme(Stat, df=1, lower.tail=FALSE) }else{ Joint.Stat <- rep(NA, ncol(local)) @@ -167,7 +167,7 @@ admixMap <- function(admixDataList, res[,paste(names(admixDataList)[i],".SE", sep="")] <- SE[,i] } res[,"Joint.Stat"] <- Joint.Stat - res[,"Joint.pval"] <- pchisq(Joint.Stat, df=v, lower.tail=FALSE) + res[,"Joint.pval"] <- .pchisq_filter_extreme(Joint.Stat, df=v, lower.tail=FALSE) } # else # results data frame diff --git a/R/jointScoreTest.R b/R/jointScoreTest.R index dd91002..7ef984f 100644 --- a/R/jointScoreTest.R +++ b/R/jointScoreTest.R @@ -51,7 +51,7 @@ jointScoreTest <- function(null.model, G) { # # Check rownames/colnames match. # Joint test statistic. Note this is the equivalent of the stat being squared, # by convention. Stat.joint <- as.numeric(crossprod(GY, beta)) - pval.joint <- pchisq(Stat.joint, lower.tail = FALSE, df = ncol(G)) + pval.joint <- .pchisq_filter_extreme(Stat.joint, lower.tail = FALSE, df = ncol(G)) # Percentage of variance explained jointly by these variants. pve.joint <- as.numeric(Stat.joint / null.model$RSS0) @@ -63,7 +63,7 @@ jointScoreTest <- function(null.model, G) { # # Check rownames/colnames match. stringsAsFactors = FALSE ) fixef$Stat <- fixef$Est / fixef$SE - fixef$pval <- pchisq(fixef$Stat^2, lower.tail = FALSE, df = 1) + fixef$pval <- .pchisq_filter_extreme(fixef$Stat^2, lower.tail = FALSE, df = 1) fixef$PVE <- fixef$Stat^2 / null.model$RSS0 rownames(fixef) <- colnames(G) diff --git a/R/prepareOutputForNullModel.R b/R/prepareOutputForNullModel.R index 62837b0..e4caad4 100644 --- a/R/prepareOutputForNullModel.R +++ b/R/prepareOutputForNullModel.R @@ -80,7 +80,7 @@ SE <- sqrt(diag(betaCov)) Stat <- (vc.mod$beta/SE)^2 - pval <- pchisq(Stat, df = 1, lower.tail = FALSE) + pval <- .pchisq_filter_extreme(Stat, df = 1, lower.tail = FALSE) fixef <- data.frame(Est = vc.mod$beta, SE = SE, Stat = Stat, pval = pval) rownames(fixef) <- varNames @@ -181,7 +181,7 @@ SE <- sqrt(diag(betaCov)) Stat <- (vc.mod$beta/SE)^2 - pval <- pchisq(Stat, df = 1, lower.tail = FALSE) + pval <- .pchisq_filter_extreme(Stat, df = 1, lower.tail = FALSE) fixef <- data.frame(Est = vc.mod$beta, SE = SE, Stat = Stat, pval = pval) rownames(fixef) <- varNames diff --git a/R/testGeno.R b/R/testGeno.R index 9570b27..e93a19b 100644 --- a/R/testGeno.R +++ b/R/testGeno.R @@ -117,7 +117,7 @@ testGenoSingleVar <- function(nullmod, G, E = NULL, test = c("Score", "Score.SPA Stat <- score/score.SE res <- data.frame(Score = score, Score.SE = score.SE, Score.Stat = Stat, - Score.pval = pchisq(Stat^2, df = 1, lower.tail = FALSE), + Score.pval = .pchisq_filter_extreme(Stat^2, df = 1, lower.tail = FALSE), Est = score/GPG, Est.SE = 1/score.SE, PVE = (Stat^2)/RSS0) # RSS0 = (n-k) when gaussian; not when binary @@ -135,7 +135,7 @@ testGenoSingleVar <- function(nullmod, G, E = NULL, test = c("Score", "Score.SPA # Vbeta <- RSS/GPG # Stat <- beta/sqrt(Vbeta) # res <- data.frame(Est = beta, Est.SE = sqrt(Vbeta), Wald.Stat = Stat, -# Wald.pval = pchisq(Stat^2, df = 1, lower.tail = FALSE)) +# Wald.pval = .pchisq_filter_extreme(Stat^2, df = 1, lower.tail = FALSE)) # return(res) # } @@ -193,8 +193,8 @@ testGenoSingleVar <- function(nullmod, G, E = NULL, test = c("Score", "Score.SPA } res <- as.data.frame(res) - res$GxE.pval <- pchisq((res$GxE.Stat)^2, df = (v - 1), lower.tail = FALSE) - res$Joint.pval <- pchisq((res$Joint.Stat)^2, df = v, lower.tail = FALSE) + res$GxE.pval <- .pchisq_filter_extreme((res$GxE.Stat)^2, df = (v - 1), lower.tail = FALSE) + res$Joint.pval <- .pchisq_filter_extreme((res$Joint.Stat)^2, df = v, lower.tail = FALSE) if (GxE.return.cov.mat) { return(list(res = res, GxEcovMatList = res.Vbetas)) @@ -286,7 +286,7 @@ testGenoSingleVar <- function(nullmod, G, E = NULL, test = c("Score", "Score.SPA crossprod(GPG, betas))/RSS, error = function(e) { NA }) - res[,"Joint.pval"] <- pchisq(res[,"Joint.Stat"], df = v, lower.tail = FALSE) + res[,"Joint.pval"] <- .pchisq_filter_extreme(res[,"Joint.Stat"], df = v, lower.tail = FALSE) res <- as.data.frame(res) return(list(res = res, allelesCovMat = Vbetas)) diff --git a/R/testSingleVarianceComponent.R b/R/testSingleVarianceComponent.R index 537d4fd..d0d28cd 100644 --- a/R/testSingleVarianceComponent.R +++ b/R/testSingleVarianceComponent.R @@ -25,7 +25,7 @@ testSingleVarianceComponent <- function(nullmod, varCompName, covMatList, group. test.stat <- -2*(nullmod.noVarComp$logLik - nullmod$logLik) - pval <- 0.5*pchisq(test.stat, df = length(nullmod$varComp), lower.tail = FALSE) + 0.5*pchisq(test.stat, df = length(nullmod$varComp) - 1, lower.tail = F) + pval <- 0.5*.pchisq_filter_extreme(test.stat, df = length(nullmod$varComp), lower.tail = FALSE) + 0.5*.pchisq_filter_extreme(test.stat, df = length(nullmod$varComp) - 1, lower.tail = F) return(pval) } diff --git a/R/variantSetTests.R b/R/variantSetTests.R index 58f51d3..e1a6f6e 100644 --- a/R/variantSetTests.R +++ b/R/variantSetTests.R @@ -137,7 +137,7 @@ testVariantSet <- function( nullmod, G, weights, GG1 <- crossprod(G, G.rowSums) # WGPGW1 # O(mn) V.sum <- sum(GG1) # 1WGPGW1 burden.stat = U.sum / sqrt(V.sum) - burden.pval <- pchisq(burden.stat^2, df=1, lower.tail=FALSE) + burden.pval <- .pchisq_filter_extreme(burden.stat^2, df=1, lower.tail=FALSE) # adjust U and G for burden U <- U - GG1*U.sum/V.sum # WGPY - WGPGW1 * 1WGPY/(1WGPGW1) @@ -152,7 +152,7 @@ testVariantSet <- function( nullmod, G, weights, # # denominator for burden # V.sum <- sum(GG1) # 1WGPGW1 # # burden p-value - # burden.pval <- pchisq(U.sum^2/V.sum, df=1, lower.tail=FALSE) + # burden.pval <- .pchisq_filter_extreme(U.sum^2/V.sum, df=1, lower.tail=FALSE) # # adjust for burden # U <- U - GG1*U.sum/V.sum # V <- V - tcrossprod(GG1)/V.sum # O(m^2) @@ -163,7 +163,7 @@ testVariantSet <- function( nullmod, G, weights, err <- out$err # Fisher's method to combine p-values - smmat.pval <- tryCatch(pchisq(-2*log(burden.pval)-2*log(theta.pval), df=4, lower.tail = FALSE), error = function(e) { NA }) + smmat.pval <- tryCatch(.pchisq_filter_extreme(-2*log(burden.pval)-2*log(theta.pval), df=4, lower.tail = FALSE), error = function(e) { NA }) if(is.na(smmat.pval)) { err <- 1 smmat.pval <- burden.pval @@ -187,7 +187,7 @@ testVariantSet <- function( nullmod, G, weights, .regular <- function(Q, V, ncolG) { if(ncolG == 1){ - pv <- list(pval = pchisq(as.numeric(Q/V), df=1, lower.tail=FALSE), method = "integration") + pv <- list(pval = .pchisq_filter_extreme(as.numeric(Q/V), df=1, lower.tail=FALSE), method = "integration") }else{ lambda <- eigen(V, only.values = TRUE, symmetric=TRUE)$values @@ -381,7 +381,7 @@ testVariantSet <- function( nullmod, G, weights, # p value calculation if(length(scores) == 1){ lambdas[[i]] <- as.numeric(distMat) - pval <- pchisq(as.numeric(Q/distMat), df=1, lower.tail=FALSE) + pval <- .pchisq_filter_extreme(as.numeric(Q/distMat), df=1, lower.tail=FALSE) err <- ifelse(is.na(pval), 1, 0) }else{ lambda <- eigen(distMat, only.values = TRUE, symmetric=TRUE)$values @@ -493,7 +493,7 @@ integrateFxn <- function(x, qmin, otherParams, tau, rho){ temp.q<-(minval - mu)/sqrt(varia)*sqrt(2*degf) + degf - re<-pchisq(temp.q ,df=degf) * dchisq(x,df=1) + re<-.pchisq_filter_extreme(temp.q ,df=degf) * dchisq(x,df=1) return(re) } From 7a0be7a8eae5743c5ac6e50ab910e896adffb45a Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Wed, 20 Mar 2024 11:09:33 -0700 Subject: [PATCH 05/13] Update man pages for extreme p-value filtering --- man/admixMap.Rd | 6 ++++-- man/assocTestAggregate.Rd | 5 ++++- man/assocTestSingle.Rd | 14 ++++++++------ man/fitNullModel.Rd | 3 +++ man/jointScoreTest.Rd | 4 ++++ 5 files changed, 23 insertions(+), 9 deletions(-) diff --git a/man/admixMap.Rd b/man/admixMap.Rd index a7b70b7..5e35184 100644 --- a/man/admixMap.Rd +++ b/man/admixMap.Rd @@ -7,7 +7,7 @@ Run admixture analyses } \usage{ admixMap(admixDataList, null.model, male.diploid=TRUE, - genome.build=c("hg19", "hg38"), + genome.build=c("hg19", "hg38"), BPPARAM=bpparam(), verbose=TRUE) } @@ -42,6 +42,8 @@ See the example for how one might set up the \code{admixDataList} object. List n \item{Joint.pval}{The Wald p-value for the joint test of all local ancestry terms} } +p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. + \author{Matthew P. Conomos, Lisa Brown, Stephanie M. Gogarten, Tamar Sofer, Ken Rice, Chaoyu Yu} \seealso{\code{\link{GenotypeIterator}}, \code{\link{fitNullModel}}, \code{\link{assocTestSingle}}} @@ -77,7 +79,7 @@ genoIterators <- lapply(genoDataList[1:2], GenotypeBlockIterator) null.model <- fitNullModel(scanAnnot, outcome="pheno", covars="covar") # run the association test -myassoc <- admixMap(genoIterators, null.model, +myassoc <- admixMap(genoIterators, null.model, BPPARAM=BiocParallel::SerialParam()) head(myassoc) diff --git a/man/assocTestAggregate.Rd b/man/assocTestAggregate.Rd index ceaf503..9975156 100644 --- a/man/assocTestAggregate.Rd +++ b/man/assocTestAggregate.Rd @@ -117,7 +117,7 @@ \item{Score_burden}{The value of the score function for the burden test} \item{Score.SE_burden}{The estimated standard error of the Score for the burden test} \item{Stat_burden}{The score Z test statistic for the burden test} - \item{pval_burden}{The burden test p-value} + \item{pval_burden}{The burden test p-value.} \item{Q_theta}{The test statistic for the adjusted SKAT test (which is asymptotically independent of the burden test)} \item{pval_theta}{The p-value of the adjusted SKAT test (which is asymptotically independent of the burden test)} \item{pval_SMMAT}{The SMMAT p-value after combining pval_burden and pval_theta using Fisher's method.} @@ -147,6 +147,9 @@ \item{freq}{The estimated effect allele frequency} \item{MAC}{The minor allele count. For multiallelic variants, "minor" is determined by comparing the count of the allele specified by \code{allele.index} with the sum of all other alleles.} \item{weight}{The weight assigned to the variant in the analysis.} + +p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. + } \author{Matthew P. Conomos, Stephanie M. Gogarten, Thomas Lumley, Tamar Sofer, Ken Rice, Chaoyu Yu, Han Chen} diff --git a/man/assocTestSingle.Rd b/man/assocTestSingle.Rd index dd76145..1bbe410 100644 --- a/man/assocTestSingle.Rd +++ b/man/assocTestSingle.Rd @@ -13,7 +13,7 @@ \S4method{assocTestSingle}{SeqVarIterator}(gdsobj, null.model, test=c("Score", "Score.SPA", "BinomiRare", "CMP"), recalc.pval.thresh=0.05, fast.score.SE=FALSE, - GxE=NULL, + GxE=NULL, geno.coding=c("additive", "dominant", "recessive"), sparse=TRUE, imputed=FALSE, male.diploid=TRUE, genome.build=c("hg19", "hg38"), @@ -47,14 +47,14 @@ \code{assocTestSingle} uses the \code{\link{BiocParallel}} package to process iterator chunks in parallel. See the \code{\link{BiocParallel}} documentation for more information on the default behaviour of \code{\link{bpparam}} and how to register different parallel backends. If serial execution is desired, set \code{BPPARAM=BiocParallel::SerialParam()}. Note that parallel execution requires more RAM than serial execution. All samples included in \code{null model} will be included in the association test, even if a different set of samples is present in the current filter for \code{gdsobj}. - + The effect size estimate is for each copy of the alternate allele (when \code{gdsobj} is a \code{\link{SeqVarIterator}} object) or the "A" allele (when \code{gdsobj} is a \code{\link{GenotypeIterator}} object). We refer to this as the "effect allele" in the rest of the documentation. For multiallelic variants in \code{\link{SeqVarIterator}} objects, each alternate (or "A") allele is tested separately. %When \code{impute.geno} is TRUE, sporadic missing genotype values are mean imputed using the minor allele frequency (MAF) calculated on all other samples at that SNP. When \code{impute.geno} is FALSE, samples with missing values for all of the SNP genotypes in the current SNP block are removed from the analysis for the block; this may significantly slow down computation time because many pre-computed matrices need to be re-computed each time the sample set changes. Also note: when \code{impute.geno} is FALSE, sporadic missingness for a sample inside of a SNP block will lead to an error. Sporadic missing genotype values are mean imputed using the allele frequency calculated on all other samples at that variant. Monomorphic variants (including variants where every sample is a heterozygote) are omitted from the results. - + The input \code{GxE} can be used to perform GxE tests. Multiple interaction variables may be specified, but all interaction variables specified must have been included as covariates in fitting the null model with \code{fitNullModel}. When performing GxE analyses, \code{assocTestSingle} will report two tests: (1) the joint Wald test of all genotype interaction terms in the model (this is the test for any genotype interaction effect), and (2) the joint Wald test of the genotype term along with all of the genotype interaction terms (this is the test for any genetic effect). Individual genotype interaction terms can be tested by creating test statistics from the reported effect size estimates and their standard errors (Note: when \code{GxE} contains a single continuous or binary covariate, this test is the same as the test for any genotype interaction effect mentioned above). %In order to test more complex hypotheses regarding subsets of multiple genotype interaction terms, \code{ivar.return.betaCov} can be used to retrieve the estimated covariance matrix of the effect size estimates. The saddle point approximation (SPA), run by using \code{test = "Score.SPA"}, implements the method described by Dey et al. (2017), which was extended to mixed models by Zhou et al. (2018) in their SAIGE software. SPA provides better calibration of p-values when fitting mixed models with a binomial family for a sample with an imbalanced case to control ratio. @@ -89,7 +89,7 @@ \item{PVE}{An approximation of the proportion of phenotype variance explained} % If \code{test} is \code{"Wald"} and \code{GxE} is \code{NULL}: % \item{Est}{The effect size estimate for each additional copy of the effect allele} - % \item{Est.SE}{The estimated standard error of the effect size estimate} + % \item{Est.SE}{The estimated standard error of the effect size estimate} % \item{Wald.Stat}{The Wald Z test statistic} % \item{Wald.pval}{The Wald p-value} If \code{test} is \code{"Score.SPA"}: @@ -109,10 +109,12 @@ \item{n.D.carrier}{Number of cases with at least one copy of the effect allele} \item{pval}{p-value} \item{mid.pval}{mid-p-value} - + %When \code{GxE} is not \code{NULL}, if \code{ivar.return.betaCov} is \code{TRUE}, then the output is a list with two elements. The first, "results", is the data.frame described above. The second, "betaCov", is a list with length equal to the number of rows of "results", where each element of the list is the covariance matrix of the effect size estimates (betas) for the genotype and genotype interaction terms. } +p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. + \references{ Dey, R., Schmidt, E. M., Abecasis, G. R., & Lee, S. (2017). A fast and accurate algorithm to test for binary phenotypes and its application to PheWAS. The American Journal of Human Genetics, 101(1), 37-49. @@ -157,7 +159,7 @@ nullmod <- fitNullModel(iterator, outcome="outcome", covars="sex") # run the association test assoc <- assocTestSingle(iterator, nullmod, BPPARAM=BiocParallel::SerialParam()) - + # use fast score SE for a null model with a covariance matrix seqResetFilter(seqData) grm <- SNPRelate::snpgdsGRM(seqData, verbose=FALSE) diff --git a/man/fitNullModel.Rd b/man/fitNullModel.Rd index ae81254..6a18cd3 100644 --- a/man/fitNullModel.Rd +++ b/man/fitNullModel.Rd @@ -197,6 +197,9 @@ The \code{score.table} data frame contains the following columns: \item{Score.SE.fast}{The estimated fast standard error of the Score (before scalar correction)} \item{se.ratio}{The ratio of Score.SE to Score.SE.fast; these values are averaged across varaints to estimate \code{se.correction} in \code{nullModelFastScore}.} } + +p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. + } \references{ diff --git a/man/jointScoreTest.Rd b/man/jointScoreTest.Rd index 9f31e7a..3149be1 100644 --- a/man/jointScoreTest.Rd +++ b/man/jointScoreTest.Rd @@ -31,6 +31,10 @@ jointScoreTest(null.model, G) \item{fixef}{A data.frame with joint effect size estimates (Est), standard errors (SE), chi-squared test statistics (Stat), p-values (pval), and estimated proportion of variance explained (PVE) for each of the variants specified in \code{G}.} \item{betaCov}{Estimated covariance matrix for the variants in \code{G}.} } + +p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. + + %\references{ %} From 0ad4333e12a5b1d0b68986cf9e93eeac34bc20c7 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Wed, 20 Mar 2024 14:30:59 -0700 Subject: [PATCH 06/13] Move p-value info into details section --- man/admixMap.Rd | 4 ++-- man/assocTestAggregate.Rd | 5 +++-- man/assocTestSingle.Rd | 5 +++-- man/fitNullModel.Rd | 5 ++--- man/jointScoreTest.Rd | 4 ++-- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/man/admixMap.Rd b/man/admixMap.Rd index 5e35184..aa177f0 100644 --- a/man/admixMap.Rd +++ b/man/admixMap.Rd @@ -28,6 +28,8 @@ This function is used with local ancestry results such as those obtained from RF See the example for how one might set up the \code{admixDataList} object. List names will propagate to the output file. \code{admixMap} uses the \code{\link{BiocParallel}} package to process iterator chunks in parallel. See the \code{\link{BiocParallel}} documentation for more information on the default behaviour of \code{\link{bpparam}} and how to register different parallel backends. If serial execution is desired, set \code{BPPARAM=BiocParallel::SerialParam()}. Note that parallel execution requires more RAM than serial execution. + +p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. } \value{A data.frame where each row refers to a different variant with the columns: @@ -42,8 +44,6 @@ See the example for how one might set up the \code{admixDataList} object. List n \item{Joint.pval}{The Wald p-value for the joint test of all local ancestry terms} } -p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. - \author{Matthew P. Conomos, Lisa Brown, Stephanie M. Gogarten, Tamar Sofer, Ken Rice, Chaoyu Yu} \seealso{\code{\link{GenotypeIterator}}, \code{\link{fitNullModel}}, \code{\link{assocTestSingle}}} diff --git a/man/assocTestAggregate.Rd b/man/assocTestAggregate.Rd index 9975156..d54fb37 100644 --- a/man/assocTestAggregate.Rd +++ b/man/assocTestAggregate.Rd @@ -73,6 +73,9 @@ The fastSMMAT method uses the same random matrix theory as fastSKAT to speed up the computation of the p-value for the adjusted SKAT component of the test. When \code{test = "fastSMMAT"}, the function uses the same logic as for fastSKAT to determine which p-value calculation approach to use for each aggregation unit. The BinomiRare test, run by using \code{test = "BinomiRare"}, and the CMP test, run by using \code{test = "CMP"} are carrier-only, robust tests. Only variants where the effect allele is minor will be tested. Both tests focuse on carriers of the rare variant allele ("carriers"), and use the estimated probabilities of the binary outcome within the carriers, estimated under the null of not association, and the actual number of observed outcomes, to compute p-values. BinomiRare uses the Poisson-Binomial distribution, and the CMP uses the Conway-Maxwell-Poisson distribution, and is specifically designed for mixed models. (If \code{test = "CMP"} but \code{null.model$family$mixedmodel = FALSE}, the BinomiRare test will be run instead.) These tests provide both a traditional p-value (\code{"pval"}) and a mid-p-value (\code{"midp"}), which is less conservative/more liberal, with the difference being more pronounced for small number of carriers. The BinomiRare test is described in Sofer (2017) and compared to the Score and SPA in various settings in Sofer and Guo (2020). + + p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. + } \value{A list with the following items: @@ -148,8 +151,6 @@ \item{MAC}{The minor allele count. For multiallelic variants, "minor" is determined by comparing the count of the allele specified by \code{allele.index} with the sum of all other alleles.} \item{weight}{The weight assigned to the variant in the analysis.} -p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. - } \author{Matthew P. Conomos, Stephanie M. Gogarten, Thomas Lumley, Tamar Sofer, Ken Rice, Chaoyu Yu, Han Chen} diff --git a/man/assocTestSingle.Rd b/man/assocTestSingle.Rd index 1bbe410..223b6d4 100644 --- a/man/assocTestSingle.Rd +++ b/man/assocTestSingle.Rd @@ -65,6 +65,9 @@ %A reference to the CMP test and the BinomiRare for mixed models will be provided once a preprint is available. For the \code{\link{GenotypeIterator}} method, objects created with \code{\link{GdsGenotypeReader}} or \code{\link{MatrixGenotypeReader}} are supported. \code{\link{NcdfGenotypeReader}} objects are not supported. + +p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. + } \value{A data.frame where each row refers to a different variant with the columns: @@ -113,8 +116,6 @@ %When \code{GxE} is not \code{NULL}, if \code{ivar.return.betaCov} is \code{TRUE}, then the output is a list with two elements. The first, "results", is the data.frame described above. The second, "betaCov", is a list with length equal to the number of rows of "results", where each element of the list is the covariance matrix of the effect size estimates (betas) for the genotype and genotype interaction terms. } -p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. - \references{ Dey, R., Schmidt, E. M., Abecasis, G. R., & Lee, S. (2017). A fast and accurate algorithm to test for binary phenotypes and its application to PheWAS. The American Journal of Human Genetics, 101(1), 37-49. diff --git a/man/fitNullModel.Rd b/man/fitNullModel.Rd index 6a18cd3..08c5c66 100644 --- a/man/fitNullModel.Rd +++ b/man/fitNullModel.Rd @@ -134,6 +134,8 @@ isNullModelFastScore(null.model) When \code{two.stage = TRUE}, the fully-adjusted two-stage rank normalization procedure from Sofer et. al. (2019) is used. With this procedure: the stage 1 model is fit as usual, with the specified \code{outcome}, \code{covars}, \code{cov.mat}, and \code{group.var}; the marginal residuals from the stage 1 model are rank normalized as specified by \code{norm.option} and then rescaled as specified by \code{rescale}; the stage 2 model is then fit using the rank normalized and rescaled residuals as the new outcome, with the same \code{covars}, \code{cov.mat}, and \code{group.var} as the stage 1 model. The output of this stage 2 model is saved and should be used for association testing. This procedure is only applicable when \code{family = "gaussian"}. The \code{nullModelInvNorm} function takes a previously fit null model as input and does the rank normalization, rescaling, and stage 2 model fitting. The function \code{fitNullModelFastScore} fits a null model that can be used for testing variant association with the fast approximation to the score standard error (SE) implemented by Zhou et al. (2018) in their SAIGE software. This approximation may be much faster than computing the true score SE in large samples, as it replaces the full covariance matrix in the calculation with the product of a diagonal matrix and a scalar correction factor (\code{se.correction} in the null model output); see the option \code{fast.Score.SE} in \code{\link{assocTestSingle}} for further details. A null model previously fit with \code{fitNullModel} can be updated to this format by using \code{calcScore} to compute the necessary statistics on a set of variants, followed by \code{nullModelFastScore} to calculate the \code{se.correction} factor and update the null model format. + +p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. } \value{An object of class '\code{GENESIS.nullModel}' or '\code{GENESIS.nullMixedModel}'. A list including: @@ -197,9 +199,6 @@ The \code{score.table} data frame contains the following columns: \item{Score.SE.fast}{The estimated fast standard error of the Score (before scalar correction)} \item{se.ratio}{The ratio of Score.SE to Score.SE.fast; these values are averaged across varaints to estimate \code{se.correction} in \code{nullModelFastScore}.} } - -p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. - } \references{ diff --git a/man/jointScoreTest.Rd b/man/jointScoreTest.Rd index 3149be1..b391b2c 100644 --- a/man/jointScoreTest.Rd +++ b/man/jointScoreTest.Rd @@ -23,6 +23,8 @@ jointScoreTest(null.model, G) \code{fixef} and \code{betaCov} will be in the same order as the columns in \code{G}. Missing data in \code{G} are not allowed. + + p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. } \value{\code{jointScoreTest} returns a list with the following elements: \item{Joint.Stat}{Squared joint score test statistic for all variants in \code{G}.} @@ -32,8 +34,6 @@ jointScoreTest(null.model, G) \item{betaCov}{Estimated covariance matrix for the variants in \code{G}.} } -p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}. - %\references{ %} From d53872828acb89c0ef8de522fcbb041ee0c4b5b7 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Wed, 20 Mar 2024 15:14:37 -0700 Subject: [PATCH 07/13] Handle df=0 case for .pchisq_filter_extreme pchisq behaves a little differently for df=0 than for df>0, and you can legitimately get pval=0 when df=0. We don't want to set those zeros to the double.xmin value. Also update the test to use expect_identical instead of expect_equal, which checks within a tolerance. --- R/utils.R | 6 ++++-- tests/testthat/test_utils.R | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/R/utils.R b/R/utils.R index c9c1f92..c3d73e4 100644 --- a/R/utils.R +++ b/R/utils.R @@ -354,8 +354,10 @@ setMethod(".annotateAssoc", } .pchisq_filter_extreme <- function(...) { - # Note: stat is the squared test statistic. + args <- list(...) pval = pchisq(...) - pval[pval < .Machine$double.xmin] = .Machine$double.xmin + if ("df" %in% names(args) && args$df > 0) { + pval[pval < .Machine$double.xmin] = .Machine$double.xmin + } return(pval) } diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index df977d7..905e865 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -240,7 +240,11 @@ test_that("prepGenoBlock - male haploid", { test_that(".pchisq_filter_extreme", { stat <- c(0.1, 0.5, 1, 10) - expect_equal(.pchisq_filter_extreme(stat^2, lower.tail=FALSE, df=1), pchisq(stat^2, lower.tail=FALSE, df=1)) + expect_identical(.pchisq_filter_extreme(stat^2, lower.tail=FALSE, df=1), pchisq(stat^2, lower.tail=FALSE, df=1)) # Extreme p-value - expect_equal(.pchisq_filter_extreme(100^2, lower.tail=FALSE, df=1), .Machine$double.xmin) + expect_identical(.pchisq_filter_extreme(100^2, lower.tail=FALSE, df=1), .Machine$double.xmin) + # df=0 + expect_identical(.pchisq_filter_extreme(1, lower.tail=FALSE, df=0), 0) + expect_identical(.pchisq_filter_extreme(10, lower.tail=FALSE, df=0), 0) + expect_identical(.pchisq_filter_extreme(100, lower.tail=FALSE, df=0), 0) }) From 7724b0fe4661bb5b2611adf8c550ccccffb3c581 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Wed, 20 Mar 2024 16:01:29 -0700 Subject: [PATCH 08/13] Add more df=0 tests Include testing with lower.tail=TRUE --- tests/testthat/test_utils.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index 905e865..2d20f16 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -244,7 +244,12 @@ test_that(".pchisq_filter_extreme", { # Extreme p-value expect_identical(.pchisq_filter_extreme(100^2, lower.tail=FALSE, df=1), .Machine$double.xmin) # df=0 + expect_identical(.pchisq_filter_extreme(0, lower.tail=FALSE, df=0), 1) + expect_identical(.pchisq_filter_extreme(0, lower.tail=TRUE, df=0), 0) expect_identical(.pchisq_filter_extreme(1, lower.tail=FALSE, df=0), 0) + expect_identical(.pchisq_filter_extreme(1, lower.tail=TRUE, df=0), 1) expect_identical(.pchisq_filter_extreme(10, lower.tail=FALSE, df=0), 0) + expect_identical(.pchisq_filter_extreme(10, lower.tail=TRUE, df=0), 1) expect_identical(.pchisq_filter_extreme(100, lower.tail=FALSE, df=0), 0) + expect_identical(.pchisq_filter_extreme(100, lower.tail=TRUE, df=0), 1) }) From b1a0cf3c9f0d31c7f8b513f0c9b1ab87fe750ba1 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Wed, 20 Mar 2024 16:03:15 -0700 Subject: [PATCH 09/13] Update NEWS --- inst/NEWS.Rd | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index 7b16071..a2b7064 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -2,6 +2,12 @@ \title{NEWS for GENESIS} +\section{Version 2.33.1}{ + \itemize{ + \item Set extremely small p-values calculated with pchisq to \code{Machine$double.xmin}. + } + +} \section{Version 2.25.5}{ \itemize{ \item Add new test option "fastSMMAT" to assocTestAggregate. From a732627575a9c35dd5cf1ff6a6bc58c841348996 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Thu, 21 Mar 2024 16:15:51 -0700 Subject: [PATCH 10/13] Change version in NEWS --- inst/NEWS.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index a2b7064..e63c864 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -2,7 +2,7 @@ \title{NEWS for GENESIS} -\section{Version 2.33.1}{ +\section{Version 2.33.2}{ \itemize{ \item Set extremely small p-values calculated with pchisq to \code{Machine$double.xmin}. } From 90038d64d69014dfb4e48626ec122195decc7e41 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Thu, 21 Mar 2024 16:22:07 -0700 Subject: [PATCH 11/13] Remove df=0 check in .pval_filter_extreme Not needed; the previous call to pchisq should handle it if the user doesn't include df as an argument. --- R/utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index c3d73e4..10e8e4e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -356,7 +356,7 @@ setMethod(".annotateAssoc", .pchisq_filter_extreme <- function(...) { args <- list(...) pval = pchisq(...) - if ("df" %in% names(args) && args$df > 0) { + if (args$df > 0) { pval[pval < .Machine$double.xmin] = .Machine$double.xmin } return(pval) From df20cb5eac5ebf02426ec8db9251d36739289079 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Fri, 22 Mar 2024 15:23:57 -0700 Subject: [PATCH 12/13] Update NEWS item --- inst/NEWS.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index e63c864..61bc7c3 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -4,7 +4,7 @@ \section{Version 2.33.2}{ \itemize{ - \item Set extremely small p-values calculated with pchisq to \code{Machine$double.xmin}. + \item Set extremely small p-values (\code{< Machine$double.xmin}) calculated with pchisq to \code{Machine$double.xmin}. This change prevents GENESIS from returning p-values equal to 0. } } From fd371277fd61688e4cd049358e287adc27d1fb72 Mon Sep 17 00:00:00 2001 From: Adrienne Stilp Date: Tue, 26 Mar 2024 10:40:14 -0700 Subject: [PATCH 13/13] Bump version number --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6921dd3..9b96e25 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Type: Package Title: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness -Version: 2.33.1 +Version: 2.33.2 Date: 2023-11-07 Author: Matthew P. Conomos, Stephanie M. Gogarten, Lisa Brown, Han Chen, Thomas Lumley, Kenneth Rice, Tamar Sofer, Adrienne Stilp, Timothy Thornton, Chaoyu Yu