Skip to content

Commit

Permalink
Merge pull request #112 from UW-GAC/feature/filter-extreme-p-values
Browse files Browse the repository at this point in the history
Filter extremely small p-values by setting them to Machine$double.xmin
  • Loading branch information
amstilp authored Mar 26, 2024
2 parents 25933a8 + fd37127 commit a66c918
Show file tree
Hide file tree
Showing 16 changed files with 93 additions and 45 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/check-bioc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@ on:
branches:
- master
- main
- devel
pull_request:
branches:
- master
- main
- devel

name: R-CMD-check-bioc

Expand Down Expand Up @@ -58,7 +60,7 @@ jobs:
fail-fast: false
matrix:
config:
- { os: ubuntu-latest, r: 'devel', bioc: '3.17', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- { os: ubuntu-latest, r: 'devel', bioc: 'devel', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
## - { os: macOS-latest, r: 'devel', bioc: '3.17'}
## - { os: windows-latest, r: 'devel', bioc: '3.17'}
env:
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Type: Package
Title: GENetic EStimation and Inference in Structured samples
(GENESIS): Statistical methods for analyzing genetic data from
samples with population structure and/or relatedness
Version: 2.33.1
Version: 2.33.2
Date: 2023-11-07
Author: Matthew P. Conomos, Stephanie M. Gogarten,
Lisa Brown, Han Chen, Thomas Lumley, Kenneth Rice, Tamar Sofer, Adrienne Stilp, Timothy Thornton, Chaoyu Yu
Expand Down
4 changes: 2 additions & 2 deletions R/admixMap.R
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ admixMap <- function(admixDataList,
res[,"Est"] <- beta
res[,"SE"] <- sqrt(Vbeta)
res[,"Stat"] <- Stat
res[,"pval"] <- pchisq(Stat, df=1, lower.tail=FALSE)
res[,"pval"] <- .pchisq_filter_extreme(Stat, df=1, lower.tail=FALSE)

}else{
Joint.Stat <- rep(NA, ncol(local))
Expand Down Expand Up @@ -167,7 +167,7 @@ admixMap <- function(admixDataList,
res[,paste(names(admixDataList)[i],".SE", sep="")] <- SE[,i]
}
res[,"Joint.Stat"] <- Joint.Stat
res[,"Joint.pval"] <- pchisq(Joint.Stat, df=v, lower.tail=FALSE)
res[,"Joint.pval"] <- .pchisq_filter_extreme(Joint.Stat, df=v, lower.tail=FALSE)
} # else

# results data frame
Expand Down
4 changes: 2 additions & 2 deletions R/jointScoreTest.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jointScoreTest <- function(null.model, G) { # # Check rownames/colnames match.
# Joint test statistic. Note this is the equivalent of the stat being squared,
# by convention.
Stat.joint <- as.numeric(crossprod(GY, beta))
pval.joint <- pchisq(Stat.joint, lower.tail = FALSE, df = ncol(G))
pval.joint <- .pchisq_filter_extreme(Stat.joint, lower.tail = FALSE, df = ncol(G))

# Percentage of variance explained jointly by these variants.
pve.joint <- as.numeric(Stat.joint / null.model$RSS0)
Expand All @@ -63,7 +63,7 @@ jointScoreTest <- function(null.model, G) { # # Check rownames/colnames match.
stringsAsFactors = FALSE
)
fixef$Stat <- fixef$Est / fixef$SE
fixef$pval <- pchisq(fixef$Stat^2, lower.tail = FALSE, df = 1)
fixef$pval <- .pchisq_filter_extreme(fixef$Stat^2, lower.tail = FALSE, df = 1)
fixef$PVE <- fixef$Stat^2 / null.model$RSS0

rownames(fixef) <- colnames(G)
Expand Down
4 changes: 2 additions & 2 deletions R/prepareOutputForNullModel.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@

SE <- sqrt(diag(betaCov))
Stat <- (vc.mod$beta/SE)^2
pval <- pchisq(Stat, df = 1, lower.tail = FALSE)
pval <- .pchisq_filter_extreme(Stat, df = 1, lower.tail = FALSE)

fixef <- data.frame(Est = vc.mod$beta, SE = SE, Stat = Stat, pval = pval)
rownames(fixef) <- varNames
Expand Down Expand Up @@ -181,7 +181,7 @@

SE <- sqrt(diag(betaCov))
Stat <- (vc.mod$beta/SE)^2
pval <- pchisq(Stat, df = 1, lower.tail = FALSE)
pval <- .pchisq_filter_extreme(Stat, df = 1, lower.tail = FALSE)

fixef <- data.frame(Est = vc.mod$beta, SE = SE, Stat = Stat, pval = pval)
rownames(fixef) <- varNames
Expand Down
10 changes: 5 additions & 5 deletions R/testGeno.R
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ testGenoSingleVar <- function(nullmod, G, E = NULL, test = c("Score", "Score.SPA
Stat <- score/score.SE

res <- data.frame(Score = score, Score.SE = score.SE, Score.Stat = Stat,
Score.pval = pchisq(Stat^2, df = 1, lower.tail = FALSE),
Score.pval = .pchisq_filter_extreme(Stat^2, df = 1, lower.tail = FALSE),
Est = score/GPG, Est.SE = 1/score.SE,
PVE = (Stat^2)/RSS0) # RSS0 = (n-k) when gaussian; not when binary

Expand All @@ -135,7 +135,7 @@ testGenoSingleVar <- function(nullmod, G, E = NULL, test = c("Score", "Score.SPA
# Vbeta <- RSS/GPG
# Stat <- beta/sqrt(Vbeta)
# res <- data.frame(Est = beta, Est.SE = sqrt(Vbeta), Wald.Stat = Stat,
# Wald.pval = pchisq(Stat^2, df = 1, lower.tail = FALSE))
# Wald.pval = .pchisq_filter_extreme(Stat^2, df = 1, lower.tail = FALSE))
# return(res)
# }

Expand Down Expand Up @@ -193,8 +193,8 @@ testGenoSingleVar <- function(nullmod, G, E = NULL, test = c("Score", "Score.SPA
}

res <- as.data.frame(res)
res$GxE.pval <- pchisq((res$GxE.Stat)^2, df = (v - 1), lower.tail = FALSE)
res$Joint.pval <- pchisq((res$Joint.Stat)^2, df = v, lower.tail = FALSE)
res$GxE.pval <- .pchisq_filter_extreme((res$GxE.Stat)^2, df = (v - 1), lower.tail = FALSE)
res$Joint.pval <- .pchisq_filter_extreme((res$Joint.Stat)^2, df = v, lower.tail = FALSE)

if (GxE.return.cov.mat) {
return(list(res = res, GxEcovMatList = res.Vbetas))
Expand Down Expand Up @@ -286,7 +286,7 @@ testGenoSingleVar <- function(nullmod, G, E = NULL, test = c("Score", "Score.SPA
crossprod(GPG, betas))/RSS,
error = function(e) { NA })

res[,"Joint.pval"] <- pchisq(res[,"Joint.Stat"], df = v, lower.tail = FALSE)
res[,"Joint.pval"] <- .pchisq_filter_extreme(res[,"Joint.Stat"], df = v, lower.tail = FALSE)

res <- as.data.frame(res)
return(list(res = res, allelesCovMat = Vbetas))
Expand Down
2 changes: 1 addition & 1 deletion R/testSingleVarianceComponent.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ testSingleVarianceComponent <- function(nullmod, varCompName, covMatList, group.

test.stat <- -2*(nullmod.noVarComp$logLik - nullmod$logLik)

pval <- 0.5*pchisq(test.stat, df = length(nullmod$varComp), lower.tail = FALSE) + 0.5*pchisq(test.stat, df = length(nullmod$varComp) - 1, lower.tail = F)
pval <- 0.5*.pchisq_filter_extreme(test.stat, df = length(nullmod$varComp), lower.tail = FALSE) + 0.5*.pchisq_filter_extreme(test.stat, df = length(nullmod$varComp) - 1, lower.tail = F)

return(pval)
}
9 changes: 9 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -352,3 +352,12 @@ setMethod(".annotateAssoc",
put.attr.gdsn(geno.node, "sample.order")
closefn.gds(gds)
}

.pchisq_filter_extreme <- function(...) {
args <- list(...)
pval = pchisq(...)
if (args$df > 0) {
pval[pval < .Machine$double.xmin] = .Machine$double.xmin
}
return(pval)
}
12 changes: 6 additions & 6 deletions R/variantSetTests.R
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ testVariantSet <- function( nullmod, G, weights,
GG1 <- crossprod(G, G.rowSums) # WGPGW1 # O(mn)
V.sum <- sum(GG1) # 1WGPGW1
burden.stat = U.sum / sqrt(V.sum)
burden.pval <- pchisq(burden.stat^2, df=1, lower.tail=FALSE)
burden.pval <- .pchisq_filter_extreme(burden.stat^2, df=1, lower.tail=FALSE)

# adjust U and G for burden
U <- U - GG1*U.sum/V.sum # WGPY - WGPGW1 * 1WGPY/(1WGPGW1)
Expand All @@ -152,7 +152,7 @@ testVariantSet <- function( nullmod, G, weights,
# # denominator for burden
# V.sum <- sum(GG1) # 1WGPGW1
# # burden p-value
# burden.pval <- pchisq(U.sum^2/V.sum, df=1, lower.tail=FALSE)
# burden.pval <- .pchisq_filter_extreme(U.sum^2/V.sum, df=1, lower.tail=FALSE)
# # adjust for burden
# U <- U - GG1*U.sum/V.sum
# V <- V - tcrossprod(GG1)/V.sum # O(m^2)
Expand All @@ -163,7 +163,7 @@ testVariantSet <- function( nullmod, G, weights,
err <- out$err

# Fisher's method to combine p-values
smmat.pval <- tryCatch(pchisq(-2*log(burden.pval)-2*log(theta.pval), df=4, lower.tail = FALSE), error = function(e) { NA })
smmat.pval <- tryCatch(.pchisq_filter_extreme(-2*log(burden.pval)-2*log(theta.pval), df=4, lower.tail = FALSE), error = function(e) { NA })
if(is.na(smmat.pval)) {
err <- 1
smmat.pval <- burden.pval
Expand All @@ -187,7 +187,7 @@ testVariantSet <- function( nullmod, G, weights,

.regular <- function(Q, V, ncolG) {
if(ncolG == 1){
pv <- list(pval = pchisq(as.numeric(Q/V), df=1, lower.tail=FALSE), method = "integration")
pv <- list(pval = .pchisq_filter_extreme(as.numeric(Q/V), df=1, lower.tail=FALSE), method = "integration")

}else{
lambda <- eigen(V, only.values = TRUE, symmetric=TRUE)$values
Expand Down Expand Up @@ -381,7 +381,7 @@ testVariantSet <- function( nullmod, G, weights,
# p value calculation
if(length(scores) == 1){
lambdas[[i]] <- as.numeric(distMat)
pval <- pchisq(as.numeric(Q/distMat), df=1, lower.tail=FALSE)
pval <- .pchisq_filter_extreme(as.numeric(Q/distMat), df=1, lower.tail=FALSE)
err <- ifelse(is.na(pval), 1, 0)
}else{
lambda <- eigen(distMat, only.values = TRUE, symmetric=TRUE)$values
Expand Down Expand Up @@ -493,7 +493,7 @@ integrateFxn <- function(x, qmin, otherParams, tau, rho){

temp.q<-(minval - mu)/sqrt(varia)*sqrt(2*degf) + degf

re<-pchisq(temp.q ,df=degf) * dchisq(x,df=1)
re<-.pchisq_filter_extreme(temp.q ,df=degf) * dchisq(x,df=1)

return(re)
}
6 changes: 6 additions & 0 deletions inst/NEWS.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

\title{NEWS for GENESIS}

\section{Version 2.33.2}{
\itemize{
\item Set extremely small p-values (\code{< Machine$double.xmin}) calculated with pchisq to \code{Machine$double.xmin}. This change prevents GENESIS from returning p-values equal to 0.
}

}
\section{Version 2.25.5}{
\itemize{
\item Add new test option "fastSMMAT" to assocTestAggregate.
Expand Down
6 changes: 4 additions & 2 deletions man/admixMap.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Run admixture analyses
}
\usage{
admixMap(admixDataList, null.model, male.diploid=TRUE,
genome.build=c("hg19", "hg38"),
genome.build=c("hg19", "hg38"),
BPPARAM=bpparam(), verbose=TRUE)
}

Expand All @@ -28,6 +28,8 @@ This function is used with local ancestry results such as those obtained from RF
See the example for how one might set up the \code{admixDataList} object. List names will propagate to the output file.
\code{admixMap} uses the \code{\link{BiocParallel}} package to process iterator chunks in parallel. See the \code{\link{BiocParallel}} documentation for more information on the default behaviour of \code{\link{bpparam}} and how to register different parallel backends. If serial execution is desired, set \code{BPPARAM=BiocParallel::SerialParam()}. Note that parallel execution requires more RAM than serial execution.
p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}.
}
\value{A data.frame where each row refers to a different variant with the columns:
Expand Down Expand Up @@ -77,7 +79,7 @@ genoIterators <- lapply(genoDataList[1:2], GenotypeBlockIterator)
null.model <- fitNullModel(scanAnnot, outcome="pheno", covars="covar")
# run the association test
myassoc <- admixMap(genoIterators, null.model,
myassoc <- admixMap(genoIterators, null.model,
BPPARAM=BiocParallel::SerialParam())
head(myassoc)
Expand Down
6 changes: 5 additions & 1 deletion man/assocTestAggregate.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@
The fastSMMAT method uses the same random matrix theory as fastSKAT to speed up the computation of the p-value for the adjusted SKAT component of the test. When \code{test = "fastSMMAT"}, the function uses the same logic as for fastSKAT to determine which p-value calculation approach to use for each aggregation unit.

The BinomiRare test, run by using \code{test = "BinomiRare"}, and the CMP test, run by using \code{test = "CMP"} are carrier-only, robust tests. Only variants where the effect allele is minor will be tested. Both tests focuse on carriers of the rare variant allele ("carriers"), and use the estimated probabilities of the binary outcome within the carriers, estimated under the null of not association, and the actual number of observed outcomes, to compute p-values. BinomiRare uses the Poisson-Binomial distribution, and the CMP uses the Conway-Maxwell-Poisson distribution, and is specifically designed for mixed models. (If \code{test = "CMP"} but \code{null.model$family$mixedmodel = FALSE}, the BinomiRare test will be run instead.) These tests provide both a traditional p-value (\code{"pval"}) and a mid-p-value (\code{"midp"}), which is less conservative/more liberal, with the difference being more pronounced for small number of carriers. The BinomiRare test is described in Sofer (2017) and compared to the Score and SPA in various settings in Sofer and Guo (2020).

p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}.

}

\value{A list with the following items:
Expand Down Expand Up @@ -117,7 +120,7 @@
\item{Score_burden}{The value of the score function for the burden test}
\item{Score.SE_burden}{The estimated standard error of the Score for the burden test}
\item{Stat_burden}{The score Z test statistic for the burden test}
\item{pval_burden}{The burden test p-value}
\item{pval_burden}{The burden test p-value.}
\item{Q_theta}{The test statistic for the adjusted SKAT test (which is asymptotically independent of the burden test)}
\item{pval_theta}{The p-value of the adjusted SKAT test (which is asymptotically independent of the burden test)}
\item{pval_SMMAT}{The SMMAT p-value after combining pval_burden and pval_theta using Fisher's method.}
Expand Down Expand Up @@ -147,6 +150,7 @@
\item{freq}{The estimated effect allele frequency}
\item{MAC}{The minor allele count. For multiallelic variants, "minor" is determined by comparing the count of the allele specified by \code{allele.index} with the sum of all other alleles.}
\item{weight}{The weight assigned to the variant in the analysis.}
}
\author{Matthew P. Conomos, Stephanie M. Gogarten, Thomas Lumley, Tamar Sofer, Ken Rice, Chaoyu Yu, Han Chen}
Expand Down
15 changes: 9 additions & 6 deletions man/assocTestSingle.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
\S4method{assocTestSingle}{SeqVarIterator}(gdsobj, null.model,
test=c("Score", "Score.SPA", "BinomiRare", "CMP"),
recalc.pval.thresh=0.05, fast.score.SE=FALSE,
GxE=NULL,
GxE=NULL,
geno.coding=c("additive", "dominant", "recessive"),
sparse=TRUE, imputed=FALSE,
male.diploid=TRUE, genome.build=c("hg19", "hg38"),
Expand Down Expand Up @@ -47,14 +47,14 @@
\code{assocTestSingle} uses the \code{\link{BiocParallel}} package to process iterator chunks in parallel. See the \code{\link{BiocParallel}} documentation for more information on the default behaviour of \code{\link{bpparam}} and how to register different parallel backends. If serial execution is desired, set \code{BPPARAM=BiocParallel::SerialParam()}. Note that parallel execution requires more RAM than serial execution.

All samples included in \code{null model} will be included in the association test, even if a different set of samples is present in the current filter for \code{gdsobj}.

The effect size estimate is for each copy of the alternate allele (when \code{gdsobj} is a \code{\link{SeqVarIterator}} object) or the "A" allele (when \code{gdsobj} is a \code{\link{GenotypeIterator}} object). We refer to this as the "effect allele" in the rest of the documentation. For multiallelic variants in \code{\link{SeqVarIterator}} objects, each alternate (or "A") allele is tested separately.
%When \code{impute.geno} is TRUE, sporadic missing genotype values are mean imputed using the minor allele frequency (MAF) calculated on all other samples at that SNP. When \code{impute.geno} is FALSE, samples with missing values for all of the SNP genotypes in the current SNP block are removed from the analysis for the block; this may significantly slow down computation time because many pre-computed matrices need to be re-computed each time the sample set changes. Also note: when \code{impute.geno} is FALSE, sporadic missingness for a sample inside of a SNP block will lead to an error.

Sporadic missing genotype values are mean imputed using the allele frequency calculated on all other samples at that variant.

Monomorphic variants (including variants where every sample is a heterozygote) are omitted from the results.

The input \code{GxE} can be used to perform GxE tests. Multiple interaction variables may be specified, but all interaction variables specified must have been included as covariates in fitting the null model with \code{fitNullModel}. When performing GxE analyses, \code{assocTestSingle} will report two tests: (1) the joint Wald test of all genotype interaction terms in the model (this is the test for any genotype interaction effect), and (2) the joint Wald test of the genotype term along with all of the genotype interaction terms (this is the test for any genetic effect). Individual genotype interaction terms can be tested by creating test statistics from the reported effect size estimates and their standard errors (Note: when \code{GxE} contains a single continuous or binary covariate, this test is the same as the test for any genotype interaction effect mentioned above). %In order to test more complex hypotheses regarding subsets of multiple genotype interaction terms, \code{ivar.return.betaCov} can be used to retrieve the estimated covariance matrix of the effect size estimates.

The saddle point approximation (SPA), run by using \code{test = "Score.SPA"}, implements the method described by Dey et al. (2017), which was extended to mixed models by Zhou et al. (2018) in their SAIGE software. SPA provides better calibration of p-values when fitting mixed models with a binomial family for a sample with an imbalanced case to control ratio.
Expand All @@ -65,6 +65,9 @@
%A reference to the CMP test and the BinomiRare for mixed models will be provided once a preprint is available.

For the \code{\link{GenotypeIterator}} method, objects created with \code{\link{GdsGenotypeReader}} or \code{\link{MatrixGenotypeReader}} are supported. \code{\link{NcdfGenotypeReader}} objects are not supported.

p-values that are calculated using \code{pchisq} and are smaller than \code{.Machine\$double.xmin} are set to \code{.Machine\$double.xmin}.

}

\value{A data.frame where each row refers to a different variant with the columns:
Expand All @@ -89,7 +92,7 @@
\item{PVE}{An approximation of the proportion of phenotype variance explained}
% If \code{test} is \code{"Wald"} and \code{GxE} is \code{NULL}:
% \item{Est}{The effect size estimate for each additional copy of the effect allele}
% \item{Est.SE}{The estimated standard error of the effect size estimate}
% \item{Est.SE}{The estimated standard error of the effect size estimate}
% \item{Wald.Stat}{The Wald Z test statistic}
% \item{Wald.pval}{The Wald p-value}
If \code{test} is \code{"Score.SPA"}:
Expand All @@ -109,7 +112,7 @@
\item{n.D.carrier}{Number of cases with at least one copy of the effect allele}
\item{pval}{p-value}
\item{mid.pval}{mid-p-value}

%When \code{GxE} is not \code{NULL}, if \code{ivar.return.betaCov} is \code{TRUE}, then the output is a list with two elements. The first, "results", is the data.frame described above. The second, "betaCov", is a list with length equal to the number of rows of "results", where each element of the list is the covariance matrix of the effect size estimates (betas) for the genotype and genotype interaction terms.
}

Expand Down Expand Up @@ -157,7 +160,7 @@ nullmod <- fitNullModel(iterator, outcome="outcome", covars="sex")
# run the association test
assoc <- assocTestSingle(iterator, nullmod,
BPPARAM=BiocParallel::SerialParam())

# use fast score SE for a null model with a covariance matrix
seqResetFilter(seqData)
grm <- SNPRelate::snpgdsGRM(seqData, verbose=FALSE)
Expand Down
Loading

0 comments on commit a66c918

Please sign in to comment.