-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_protein_match.R
186 lines (139 loc) · 5.57 KB
/
get_protein_match.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
## load blast of base peptides against ensembl+mutationes, and
## find best matching protein for each unique BP
library(segmenTools)
options(stringsAsFactors=FALSE)
## DECODE DATA
proj.path <- file.path(Sys.getenv("DECODE"))
if ( proj.path=="" ) # author's local path
proj.path <- "/home/raim/data/decode"
dat.path <- file.path(proj.path,"originalData")
bp.file <- file.path(proj.path,"processedData","unique_bp_blast.tsv")
out.file <- file.path(proj.path,"processedData","bp_mapped.tsv")
## DATA FROM genomeBrowser, project folder data/mammary,
## run steps in data/mammary/setup.sh to create all data
## required here!
##mam.path <- "/home/raim/data/mammary"
##feature.file <- file.path(mam.path,"features_GRCh38.110.tsv")
##tpmap.file <- file.path(mam.path,"originalData","protein_transcript_map.tsv")
## USING LOCAL COPIES INSTEAD (for publication git)!
feature.file <- file.path(proj.path,"additionalData",
"features_GRCh38.110.tsv.gz")
tpmap.file <- file.path(proj.path,"additionalData",
"protein_transcript_map.tsv.gz")
### LOAD & ANNOTATE BLAST RESULTS
if ( !file.exists(bp.file) )
stop("input file is not available; make sure that either DECODE is set in",
"the calling environment or that the variable proj.path in this",
"script")
bp <- read.delim(bp.file, header=FALSE)
colnames(bp) <- c("BP","protein","identity", "mismatches",
"alen", "qlen", "slen", "sstart", "send", "e", "bitscore")
bp$ensembl <- sub("_.*", "", bp$protein)
## load feature file
features <- read.delim(feature.file)
## use number of GO annotations as a filter criterium: best annotated
gol <- unlist(lapply(strsplit(features$GO, ";"),length))
## load protein-transcript map
tpmap <- read.delim(file=tpmap.file, header=FALSE, row.names=2)
## reverse map transcript-protein
pampt <- matrix(rownames(tpmap), ncol=1)
rownames(pampt) <- tpmap[,1]
## map proteins to feature table!
plst <- strsplit(features$proteins, ";")
gidx <- rep(1:nrow(features), lengths(plst))
names(gidx) <- unlist(plst)
## genes and transcripts for each protein
bp$gene <- features$ID[gidx[bp$ensembl]]
bp$transcript <- tpmap[bp$ensembl,1]
## ADD MANE transcript/protein
## MANE: Matched Annotation from NCBI and EBI
## see ensembl MANE project: consistently annotated
tmp <- features$MANE[gidx[bp$ensembl]]
tmp[tmp==""] <- NA
bp$MANE <- features$MANE[gidx[bp$ensembl]] ##unlist(tmp)
bp$MANE.protein <- pampt[match(bp$MANE,rownames(pampt)),1]
bp$numGO <- gol[gidx[bp$ensembl]]
### SORT & FILTER BLAST HITS
## sort by quality - such that the best option is always
## selected (e.g. protein==MANE) when filtering duplicates
## in this order:
## identity > mismatch>
## MANE present > initial match==MANE >
## shortest> number of annotations
bp <- bp[order(bp$identity,
-bp$mismatches,
!is.na(bp$MANE.protein),
bp$protein==bp$MANE.protein,
bp$slen,
bp$numGO, decreasing=TRUE),]
## FIRST HANDLE PERFECT HITS
all <- bp ## store to analyze missing below
## filter - only those where we found a gene, others
## w/o gene are encoded on scaffolds and not included in the Ensembl fasta
ina <- is.na(bp$gene)
cat(paste("removing", sum(ina), "blast hits without a mapped gene!\n"))
bp <- bp[!ina,] # proteins annotated to a scaffold
mm <- bp$mismatches>0 | bp$identity <100
cat(paste("removing", sum(mm), "hits with mismatches or <100% identity\n"))
bp <- bp[!mm,]
## THE WINNER TAKES IT ALL/RICH GET RICHER APPROACH.
## TODO: check this approach, perhaps there is a better way?
## number of AAS per MANE protein
## this is used to select the "winner" protein match below.
app <- table(bp$MANE.protein)
## split into lists by BP
bpl <- split(bp, paste(bp$BP))
## for each BP select the MANE protein with the
## the most global matches.
winner <- list()
for ( i in seq_along(bpl) ) {
x <- bpl[[i]]
mane <- x$MANE.protein
if ( length(unique(mane))>1 ) {
## only keep the mane protein with most hits globally
cat(paste(i, x$BP[1],
"selecting MANE protein from multiple hits",
nrow(x)))
manemax <- which(mane==mane[which.max(app[mane])])
x <- x[manemax,,drop=FALSE]
cat(paste("->", nrow(x), "\n"))
}
winner[[i]] <- x[1,,drop=FALSE] ## ONLY TAKE FIRST
}
best <- do.call(rbind, winner)
## HANDLE REST
rest <- all[!all$BP%in%c(best$BP),]
## SIMPLY REMOVE DUPLICATE BP and rely on above sorting
dupb <- duplicated(paste0(rest$BP))
cat(paste("removing", sum(dupb), "multiple matches from rest\n"))
rest <- rest[!dupb,]
## combine again and tag quality
all <- rbind(cbind(best, match="good"),
cbind(rest, match="bad"))
## tag really bad hits
all$match[all$mismatch>=3] <- "wrong"
## tag no gene
all$nogene <- FALSE
all$nogene[is.na(all$gene)] <- TRUE
### TAG CATEGORIES
type <- features$type[gidx[all$ensembl]]
all$IG <- FALSE
all$IG[grep("^IG",type)] <- TRUE
desc <- features$description[gidx[all$ensembl]]
all$albumin <- FALSE
all$albumin[grep("albumin",desc)] <- TRUE
all$globin <- FALSE
all$globin[grep("globin",desc)] <- TRUE
## load GO terms
## extracellular region: GO:0005576
## |- extracellular space: GO:0005615
gots <- features$GO[gidx[all$ensembl]]
all$extracellular <- FALSE
all$extracellular[grep("GO:0005576", gots)] <- TRUE
## exclude tag - TODO: any K/R
all$exclude <- all$globin|all$albumin|all$IG
## ADD GENE NAME
all$name <- features$name[gidx[all$ensembl]]
### WRITE OUT BP:protein mapping and annotation
write.table(all, file=out.file, sep="\t",
quote=FALSE, na="", row.names=FALSE)