-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscPSM_main.R
175 lines (154 loc) · 7.19 KB
/
scPSM_main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#' @include utils.R
#'
NULL
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Main Functions
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#' scPSM integration
#'
#' Correct for batch effects, impute dropouts, and denoise data
#' using the propensity score matching method for single-cell RNA-sequencing data.
#'
#' @param batches One or more log-expression matrices where genes correspond to rows and cells correspond to columns.
#' Each matrix should contain the same number of rows, corresponding to the same genes in the same order.
#' Each matrix represents a batch.
#' @param markers A vector specifying which features used as marker genes to compute propensity scores.
#' @param hvg A vector specifying which features used as HVGs for identifying MNN group.
#' @param k.self An integer scalar specifying the number of nearest neighbors in searching KNNs.
#' @param k.mnn An integer scalar specifying the number of nearest neighbors in matching MNN pairs.
#' @param correct.all A logical scalar specifying whether correction should be applied to all genes, even if only a subset is used for the MNN group identification.
#' @param merge.order An integer vector containing the linear merge order of batches.
#'
#' @importFrom Matrix t
#' @importFrom stats predict glm formula binomial
#' @importFrom BiocNeighbors queryKNN KmknnParam
#' @importFrom BiocParallel SerialParam
#' @importFrom S4Vectors DataFrame
#' @export
#'
#' @examples
#' #Please see the tutorial for the preprocessing and then run the following code
#' #psm.data <- psm_integrate(batches = batches, markers = markers, hvg = hvg, merge.order = 1:4)
psm_integrate <- function(batches, markers, hvg, k.self=10, k.mnn=10,
correct.all=TRUE, merge.order=1:4)
{
prep.out <- .prepare_input_data(batches=batches, cos.norm.in=T, cos.norm.out=T,
subset.row=hvg, correct.all=T)
in.batches <- prep.out$In # matrices of hvgs
out.batches <- prep.out$Out # matrices of all genes
subset.row <- prep.out$Subset # position of hvgs
same.set <- prep.out$Same
in.batches <- lapply(in.batches, t)
if (!same.set) {
out.batches <- lapply(out.batches, t)
}
restrict <- NULL
merge.tree <- .create_tree_predefined(in.batches, restrict, merge.order)
NEXT <- .get_next_merge
UPDATE <- .update_tree
# Putting the out.batches information in the 'extras'.
merge.tree <- .add_out_batches_to_tree(merge.tree, if (same.set) NULL else out.batches)
nbatches <- length(unlist(merge.tree))
nmerges <- nbatches - 1L
mnn.pairings <- left.set <- right.set <- vector("list", nmerges)
for (mdx in seq_len(nmerges)) {
# Traversing the merge tree to find the next two batches to merge.
next.step <- NEXT(merge.tree) # list of nmerges
left <- next.step$left # Large MNN_treenode
right <- next.step$right # Large MNN_treenode
left.data <- .get_node_data(left) # dgCMatrix of cell * hvgs
left.restrict <- .get_node_restrict(left) # NULL
left.index <- .get_node_index(left) # 1L
left.origin <- .get_node_origin(left) # seq of 1
left.extras <- .get_node_extras(left)[[1]] # dgCMatrix of cell * genes
right.data <- .get_node_data(right)
right.restrict <- .get_node_restrict(right)
right.index <- .get_node_index(right)
right.origin <- .get_node_origin(right)
right.extras <- .get_node_extras(right)[[1]]
# Really no point being too cute here with other matrix representations,
# there are coercions for the NN search _and_ for the dense per-gene output.
left.data <- as.matrix(left.data)
right.data <- as.matrix(right.data)
# Find the k-nearest neighbors in data for each point in query
W_right <- queryKNN(right.data, query=right.data, k=k.self,
BNPARAM=KmknnParam(), BPPARAM=SerialParam(), get.distance=FALSE)
# Find mnn pairs using hvgs
mnn.sets <- .restricted_mnn(left.data, left.restrict, right.data, right.restrict,
k=k.mnn, prop.k=NULL, BNPARAM=KmknnParam(), BPPARAM=SerialParam())
s1 <- mnn.sets$first
s2 <- mnn.sets$second
mnn.pairings[[mdx]] <- DataFrame(left=s1, right=s2) # DF of mnn pairs
left.set[[mdx]] <- left.index
right.set[[mdx]] <- right.index
# calculate corrected right. data and extras
# to project right to left, for each cell in right, find mnn pairs in left to cells of knn pairs in right
n_left <- nrow(left.data)
n_right <- nrow(right.data)
parings <- lapply(1:n_right, function(x){
subset(as.data.frame(mnn.pairings[[mdx]]), right %in% W_right[["index"]][x,])})
# calculate propensity scores and weights
# left - comparison, right - treatment, project treat to comparison
batch.id <- rep(c(0,1), c(n_left, n_right))
if (correct.all) {
raw <- rbind(left.extras, right.extras)
} else {
raw <- rbind(left.data, right.data)
}
psm.data <- as.matrix(raw)[, markers]
psm.data <- cbind(as.vector(batch.id), psm.data)
colnames(psm.data)[1] <- "treat"
# for all cells in right batch
psm.datas <- lapply(1:n_right, function(x){
tmp0=as.data.frame(psm.data[c(parings[[x]]$left, parings[[x]]$right+n_left),])
#tmp0$treat=factor(tmp0$treat)
return(tmp0)
})
psm.logit <- lapply(1:n_right, function(x){
if (length(parings[[x]]$left) > 1) {
glm(formula(psm.datas[[x]]),
data = psm.datas[[x]],
family = binomial)
} else NULL
})
w.logit <- lapply(1:n_right, function(x){
if (length(parings[[x]]$left) > 1) {
#tmp0=psm.datas[[x]]
exp(predict(psm.logit[[x]],subset(psm.datas[[x]],treat==0)))
} else NULL
})
w.logit <- lapply(w.logit, function(x) replace(x, !is.finite(x), 1))
#save(psm.logit,psm.datas,n_right,w.logit,parings,left.extras,left.data,right.data,file="~/Desktop/test_00.RData")
# Applying the correction and expanding the reference batch.
right.extras.cor <- sapply(1:n_right, function(x){
if (length(parings[[x]]$left) > 1) {
Matrix::colSums(w.logit[[x]]*left.extras[parings[[x]]$left, ])/sum(w.logit[[x]])
} else {
right.extras[x,]
}
})
right.data.cor <- sapply(1:n_right, function(x){
if (length(parings[[x]]$left) > 1) {
Matrix::colSums(w.logit[[x]]*left.data[parings[[x]]$left, ])/sum(w.logit[[x]])
} else {
right.data[x,]
}
})
right.extras.cor[is.na(right.extras.cor)] <- 0 # matrix of hvgs*cells
right.data.cor[is.na(right.data.cor)] <- 0 # matrix of genes*cells
colnames(right.extras.cor) <- row.names(right.extras)
colnames(right.data.cor) <- row.names(right.data)
merge.tree <- UPDATE(merge.tree, next.step$chosen,
data=rbind(left.data, t(right.data.cor)),
index=c(left.index, right.index),
restrict=.combine_restrict(left.data, left.restrict, t(right.data.cor), right.restrict),
origin=c(left.origin, right.origin),
extras=list(rbind(left.extras, t(right.extras.cor))))
}
if (!correct.all) {
full.data <- .get_node_data(merge.tree)
} else {
full.data <- .get_node_extras(merge.tree)[[1]]
}
full.data
}