-
Notifications
You must be signed in to change notification settings - Fork 4
/
Extract_gene_lengths.R
38 lines (26 loc) · 1.19 KB
/
Extract_gene_lengths.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
setwd("./") #~/Desktop/ISB/COVID19/
file.readcounts <- "data/severity_study/GSE147507_RawReadCounts_Human.tsv"
#file.annotations <- "data/severity_study/Biomart.annotations.hg38.txt"
# Import the read count matrix data into R.
counts <- read.csv(file.readcounts, sep="\t",row.names = 1)
# Import feature annotations.
# Assign feature lenght into a numeric vector.
#annotations <- read.table(file.annotations, sep="\t", header=TRUE)
library(DESeq2)
# Convert from gene.symbol to ensembl.gene
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("EnsDb.Hsapiens.v79")
library(EnsDb.Hsapiens.v79)
geneSymbols <- rownames(counts)
geneIDs2 <- ensembldb::select(EnsDb.Hsapiens.v79, keys= geneSymbols, keytype = "SYMBOL", columns = c("SYMBOL","GENEID"))
# Get the length for all genes
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("EDASeq")
library (EDASeq)
length = getGeneLengthAndGCContent(geneIDs2$GENEID, "hsa")
geneIDs2['len'] = length[,1]
geneIDs2<- geneIDs2[complete.cases(geneIDs2),]
geneIDs2 = remove_empty(geneIDs2,"rows")
write.csv(geneIDs2,'data/severity_study/genes_length.csv')