-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathextract_rRNA.py
executable file
·33 lines (27 loc) · 1.05 KB
/
extract_rRNA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env python3
"""
Extract rRNA coordinates from GTF
"""
import sys
import GTF
import numpy as np
import pandas as pd
def _write_rrna_bed(gc, feature, type_column, id_column, out_path):
    """Write the rRNA records of one feature kind to a headerless TSV file.

    Args:
        gc: Annotation table with (at least) the columns ``feature``,
            ``seqname``, ``start``, ``end``, ``gene_name``, ``strand``,
            plus ``type_column`` and ``id_column``.
        feature: Value of the ``feature`` column to keep
            (``"transcript"`` or ``"gene"``).
        type_column: Column searched for the substring ``"rRNA"``.
        id_column: Identifier column written as the fourth output field.
        out_path: Destination file path.
    """
    # na=False: rows that lack the type attribute count as non-matches
    # instead of propagating NaN into the boolean mask.
    is_rrna = (gc["feature"] == feature) & gc[type_column].str.contains(
        "rRNA", na=False
    )
    columns = ["seqname", "start", "end", id_column, "gene_name", "strand"]
    # .loc replaces the .ix indexer (removed in pandas 1.0); .copy() makes
    # the selection an independent frame so the casts below are not chained
    # assignments on a slice of ``gc``.
    rrna = gc.loc[is_rrna, columns].copy()
    rrna["start"] = rrna["start"].astype(int)
    rrna["end"] = rrna["end"].astype(int)
    rrna = rrna.sort_values(by=["seqname", "start", "end"])
    # NOTE(review): coordinates are written exactly as read. GTF is 1-based
    # while BED is conventionally 0-based half-open -- confirm that
    # downstream consumers expect the unshifted start.
    rrna.to_csv(out_path, sep="\t", header=False, index=False)


def main(GENCODE):
    """Extract rRNA transcript and gene coordinates from a GTF annotation.

    Writes two tab-separated, headerless files into the current directory:

    * ``rRNA_transcripts.bed`` -- rows whose feature is ``transcript`` and
      whose ``transcript_type`` contains ``rRNA``
    * ``rRNA_genes.bed`` -- rows whose feature is ``gene`` and whose
      ``gene_type`` contains ``rRNA``

    Each file has the columns seqname, start, end, identifier, gene_name,
    strand, sorted by (seqname, start, end).

    Args:
        GENCODE: Passed straight to ``GTF.dataframe``; presumably the path
            to a GENCODE GTF file -- verify against the GTF module.
    """
    gc = GTF.dataframe(GENCODE)
    # Strip ".<digits>" version suffixes from gene ids.
    gc["gene_id"] = gc["gene_id"].replace(
        to_replace=r"\.[0-9]+", value="", regex=True
    )
    _write_rrna_bed(
        gc, "transcript", "transcript_type", "transcript_id",
        "rRNA_transcripts.bed",
    )
    _write_rrna_bed(gc, "gene", "gene_type", "gene_id", "rRNA_genes.bed")
if __name__ == "__main__":
    # The first command-line argument is handed to main(); without it,
    # exit with a usage message instead of an IndexError traceback.
    # Any extra arguments are ignored, as before.
    if len(sys.argv) < 2:
        sys.exit("usage: extract_rRNA.py <annotation.gtf>")
    main(sys.argv[1])