forked from AnalysisCommons/genesis_wdl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipelineFunctions.R
108 lines (91 loc) · 3.55 KB
/
pipelineFunctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
library(plyr)
library(gap)
# This function provides a basic check that the phenotype file is a suitable
# for seqMeta. It checks that the required column names are in the phenotype
# data frame.
#
checkPhenotype <- function(p, outcome, covariates, id.col=NULL, gender.col=NULL) {
#if (!is.null(id.col)) {
# if (anyDuplicated(p[ , id.col])) {
# stop("Duplicated phenotype ids.")
# }
#}
missing.covariates <- !(covariates %in% colnames(p))
if (any(missing.covariates)) {
msg <- paste("Covariates:", covariates[missing.covariates], "not found in phenotype file.\n", sep=" ")
print(colnames(p))
print(covariates %in% colnames(p))
print(covariates[covariates %in% colnames(p)])
stop(msg)
}
return(invisible(NULL))
}
# This function reduces a data set to only the variables used in a model
# removing subjects with missing data. Also, it makes the row names of
# the resulting data fram the subject identifier
#
# JAB addition: subsets to complete cases (i.e. no NAs in outcome or covariates)
#
# p: a data frame containing the variables in the model
#
# formula: a character vector which can be coered to an object of class
# "formula" (with as.formula): a symbolic description of the model to
# be fitted. The details of model specification are given under
# 'Details' in the "lm" help file.
#
# id: (optional) colunm name identifier of the subjects
#
# gender: (optional) colunm name identifier for the gender classification of
# the subjects.
#
# returns: data frame with only the columns specified in the formula and with
# the (optional) row names as the subject identifier.
#
reducePheno <- function(pheno.data,
outcome,
covariates = NULL,
hetvars = NULL,
id=NULL,
gender=NULL) {
checkPhenotype(pheno.data, outcome, covariates, id.col=id, gender.col=gender)
if (!is.null(id)) {
rownames(pheno.data) <- pheno.data[ ,id]
}
all.terms <- unique(c(outcome, covariates, hetvars, gender))
cat('all terms',print(all.terms),'\n')
pheno.data <- as.data.frame(pheno.data)
pheno <- na.omit(pheno.data[, all.terms, drop=F])
return(pheno)
}
# Calculate MAF
#
# dose: matrix with dosages rows individuals, columns are variants
Maf <- function(dose){
aaf <- colMeans(dose,na.rm=T)/2
return(min(1-aaf, aaf))
}
split.by.comma <- function(cur.string){
cur.string <- gsub('"', '', cur.string)
out <- unlist(strsplit(cur.string, ","))
if (length(out) == 0){
out = NULL
}
return(out)
}
filterByMAF <- function(gds, sample.id=NULL, mac.min=NA, maf.min=NA, verbose=TRUE) {
if ((!is.na(mac.min) & mac.min > 1) |
(!is.na(maf.min) & maf.min > 0)) {
if (is.null(sample.id)) sample.id <- seqGetData(gds, "sample.id")
seqSetFilter(gds, sample.id=sample.id, verbose=FALSE)
ref.freq <- seqAlleleFreq(gds)
maf <- pmin(ref.freq, 1-ref.freq)
if (!is.na(mac.min)) {
maf.filt <- 2 * maf * (1-maf) * length(sample.id) >= mac.min
if (verbose) message(paste("Running on", sum(maf.filt), "variants with MAC >=", mac.min))
} else {
maf.filt <- maf >= maf.min
if (verbose) message(paste("Running on", sum(maf.filt), "variants with MAF >=", maf.min))
}
seqSetFilter(gds, variant.sel=maf.filt, action="intersect", verbose=verbose)
}
}