-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_chunks.py
executable file
·138 lines (94 loc) · 5.39 KB
/
create_chunks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python3
import sys, os, re
import pyranges as pr
import pandas as pd
import logging
import argparse
import math
import subprocess
import csv
logging.basicConfig(level=logging.INFO,
format='%(asctime)s : %(levelname)s : %(message)s',
datefmt='%H:%M:%S')
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser(description="chunk_genome",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--genome_fa", type=str, required=True,
help="genome in fasta format")
parser.add_argument("--out_prefix", "-o", type=str, help="prefix for output files", required=True)
parser.add_argument("--gene_annot_gtf", type=str, help="annotation gtf file", required=True)
parser.add_argument("--chunks", type=str, help="chunks tsv file", required=True)
parser.add_argument("--gtf_only", action='store_true', default=False, help="only write chunked annotation, not the genome")
args = parser.parse_args()
genome_fa = args.genome_fa
out_prefix = args.out_prefix
gene_annot_gtf_filename = args.gene_annot_gtf
chunks_file = args.chunks
write_genome_flag = not args.gtf_only
logger.info("-parsing chromosome lengths")
genome_fai_file = genome_fa + ".fai"
genome_contig_info = pd.read_csv(genome_fai_file, sep="\t", names=["Chromosome", "Length", "Offset", "Linebases", "Linewidth"])
chromosomes = genome_contig_info['Chromosome'].values.tolist()
chr_lengths = dict()
for x in genome_contig_info.itertuples():
chr_lengths[ x.Chromosome] = x.Length
logger.info("-reading in chunks")
chunks = pd.read_csv(chunks_file, sep="\t")
chromosomes_need_chunking = chunks['Chromosome'].unique().tolist()
logger.info("-chromosomes needing chunking: {}".format(chromosomes_need_chunking))
chromosomes_no_chunking_needed = [chrom for chrom in chromosomes if chrom not in chromosomes_need_chunking]
logger.info("-parsing genome annotations in gtf")
annotation_gtf = pd.read_csv(gene_annot_gtf_filename, sep="\t", names=[
"Chromosome", "Source", "ev_type", "Start", "End", "Score", "Strand", "dot", "info"])
# beginning chunking
logger.info("Starting to chunk data.\n")
chunked_genome_filename = out_prefix + ".chunked.genome.fa"
if write_genome_flag:
genome_ofh = open(chunked_genome_filename, "wt")
chunked_annotation_df = None
for chromosome_to_chunk in chromosomes_need_chunking:
logger.info("-processing chromosome: {}".format(chromosome_to_chunk))
chr_gtf = annotation_gtf[ annotation_gtf.Chromosome == chromosome_to_chunk ]
breakpoints = chunks[chunks.Chromosome == chromosome_to_chunk ]['brkpt'].values
breakpoints = sorted(breakpoints)
logger.info(breakpoints)
breakpoints.insert(0, 1)
breakpoints.append(chr_lengths[chromosome_to_chunk]+1)
for i in range(len(breakpoints)-1):
lend_brkpt = breakpoints[i]
rend_brkpt = breakpoints[i+1]
rend_brkpt -= 1
print(lend_brkpt, rend_brkpt)
chrom_partition_name = chromosome_to_chunk + f"^c{i}^o{lend_brkpt}"
chr_gtf_partition = chr_gtf[ (chr_gtf.Start >= lend_brkpt) & (chr_gtf.End <= rend_brkpt) ].copy()
print(chr_gtf_partition.shape)
# make adjustment to the gtf partition
chr_gtf_partition['Chromosome'] = chrom_partition_name
if lend_brkpt != 1:
chr_gtf_partition['Start'] = chr_gtf_partition.Start - lend_brkpt + 1 # lend_brkpt becomes position 1
chr_gtf_partition['End'] = chr_gtf_partition.End - lend_brkpt + 1
chunked_annotation_df = pd.concat([chunked_annotation_df, chr_gtf_partition])
# get genome chunk
if write_genome_flag:
genome_fa_chunk = "\n".join(subprocess.check_output(f"samtools faidx {genome_fa} {chromosome_to_chunk}:{lend_brkpt}-{rend_brkpt}", shell=True).decode().split("\n")[1:])
genome_fa_chunk = f">{chrom_partition_name}\n" + genome_fa_chunk
genome_ofh.write(genome_fa_chunk)
## get rest of them, no chunking required.
logger.info("-Done working on chunks. Now adding in the rest that didn't need chunking....")
if write_genome_flag:
for chrom in chromosomes_no_chunking_needed:
# get chrom seq
logger.info("-adding unchunked {}".format(chrom))
chrom_fa = subprocess.check_output(f"samtools faidx {genome_fa} {chrom}", shell=True).decode()
genome_ofh.write(chrom_fa)
# get gtf annotations
logger.info("-appending the rest of the unchunked annotations.")
rest_gtf = annotation_gtf[ annotation_gtf.Chromosome.isin(chromosomes_no_chunking_needed) ]
chunked_annotation_df = pd.concat([chunked_annotation_df, rest_gtf])
chunked_genome_annotation_filename = out_prefix + ".chunked.gtf"
chunked_annotation_df.to_csv(chunked_genome_annotation_filename, sep="\t", header=False, index=False, quoting=csv.QUOTE_NONE, escapechar='\\')
logger.info("Done. See outputs: {}.chunked.*".format(out_prefix))
sys.exit(0)
if __name__=='__main__':
main()