-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbioinformatic_formats_tool.py
272 lines (238 loc) · 15.1 KB
/
bioinformatic_formats_tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 29 15:12:32 2019
@author: dnx
"""
#Bioinformatic software tool for filtering and manipulating sam/bam alignment, bcf/vcf variant calling and fastq files; version 0.2
#Author: Dani (Vadim) Dubinsky ([email protected])
import argparse
import os
import sys
import numpy as np
import re
try:
import pysam as ps
except:
sys.exit("This program requires Python3 Pysam module, please install it")
#
#--------------------------Main function--------------------------#
#
def main():
args = args_setup()
path_exists (args.input)
path_dir (args.output)
check_ext (args.input)
if args.mapread_stats:
mapping_stats (args.input)
if (args.bam_filter):
bam_filter (args.input, args.ref_match, args.map_qual,
args.read_qual, args.pident, args.read_len)
#Check the case if --bam-filter was not selected but default arguments were changed:
if (args.map_qual or args.read_qual
or args.pident or args.read_len != 0 or args.ref_match != '.*'): #if this statment is True, default arguments were changed, next is to check if --bam-filter was selected:
if (not args.bam_filter): #if this statement is Flase, --bam-filter was selected & run with default arguments
sys.exit("Error: -b/--bam_filter argument is missing") #if True, --bam-filter was not selected but default arguments were changed
if (args.bamtofastq):
bam_to_fastq (args.input)
if (args.mapped_reads):
extract_mapped_reads (args.input)
if (args.fastq_concat):
fastq_concat (args.input)
if (args.fastq_fasta):
fastq_to_fasta (args.input)
if (args.fastq_filter):
fastq_filter (args.input, args.fastq_filter)
if (args.vcf_filter):
vcf_filter (args.input, args.vcf_filter)
#several samtools commands (samtools functionallity with pysam)
file_name = os.path.split(args.input[0])[1] #obtain only the file name withoout the path
if args.sort:
pos_args_len (args.input, 1)
ps.sort("-o", file_name + ".sorted.bam", args.input[0])
print("Sorted bam file <" + file_name + ".sorted.bam" + "> was created")
if args.index:
pos_args_len (args.input, 1)
ps.index(args.input[0])
print("Index file <" + file_name + ".bai" + "> was created")
if args.flagstat:
pos_args_len (args.input, 1)
print("Reads mapping summary for %s" % (file_name))
print(ps.flagstat(args.input[0]))
#
#--------------------------Utility functions--------------------------#
#
def args_setup():
'''Command line arguments parsing function'''
parser = argparse.ArgumentParser(description="Bioinformatic software tool for filtering and manipulating sam/bam alignment, bcf/vcf variant calling and fastq files; version 0.2. Author: Vadim (Dani) Dubinsky ([email protected])")
parser.add_argument("input", metavar = '<input file>', nargs = '+', help = "input file (sam/bam/vcf/bcf/fastq) or files (paired fastq)") #type = check_ext_args,
parser.add_argument("-o", "--output", metavar = '<path-to-output>', required = True, help = "path to output directory")
parser.add_argument("-b", "--bam-filter", action = "store_true", required = False, help = "apply bam filter, select from the next 5 parameters (default: retain all reads)")
parser.add_argument("-r", "--ref-match", type = str, default = '.*', required = False, help = "bam filter, reads mapped to specific reference, wildcards (*) and regexp can be used")
parser.add_argument("-mq", "--map-qual", metavar = "<1-43>", type = int, default = 0, required = False, help = "bam filter, reads above a mapping quality (MAPQ) threshold")
parser.add_argument("-rq", "--read-qual", metavar = "<1-40>", type = int, default = 0, required = False, help = "bam filter, reads above a mean Phred score threshold")
parser.add_argument("-p", "--pident", metavar = "<0-100>", type = float, default = 0, required = False, help = "bam filter, reads above percent identity between the reads and reference")
parser.add_argument("-l","--read-len", type = int, default = 0, required = False, help = "bam filter, reads above a length threshold")
parser.add_argument("-s", "--mapread-stats", action = "store_true", required = False, help = "Calculate reads mapping per each reference contig/gene")
parser.add_argument("-m", "--mapped-reads", action = "store_true", required = False, help = "extract only mapped reads (for sam/bam files)")
parser.add_argument("-so", "--sort", action = "store_true", required = False, help = "samtools sort command (for sam/bam files)")
parser.add_argument("-ix", "--index", action = "store_true", required = False, help = "samtools index command (for sam/bam files)")
parser.add_argument("-fs", "--flagstat", action = "store_true", required = False, help = "samtools flagstat command (for sam/bam files, output reads mapping summary stats)")
parser.add_argument("-bf", "--bamtofastq", action = "store_true", required = False, help = "convert a bam to fastq file (paired reads interleaved in a single file)")
parser.add_argument("--fastq-filter", metavar = "<min-length min-quality>", nargs = 2, type = int, help = "filter a single fastq or two paired fastq file/s based on read length and mean Phred score quality")
parser.add_argument("-fc", "--fastq-concat", action = "store_true", required = False, help = "convert 2 paired-end fastq files to a single interleaved file")
parser.add_argument("-ff", "--fastq-fasta", action = "store_true", required = False, help = "convert fastq file to fasta format")
parser.add_argument("-v", "--vcf-filter", metavar = "<ref-name read-depth map-qual>", nargs = "+", help = "Filter vcf/bcf variant calling file based on reference name, read depth and mapping quality")
return (parser.parse_args())
def pos_args_len(file, n): #n=1 or n=2
if (n==1):
if (len(file) > n):
sys.exit("Input error, only one file is required for this argument")
elif (n==2):
if (len(file) > n): #if 3 or more arguments
sys.exit("Input error, only two files are required for this argument")
elif (len(file) < n): #if 1 argument
sys.exit("Input error, two files are required for this argument")
def path_exists(file):
for f in file: #file is a list now because of nargs parameter
if (not os.path.exists(f)):
#raise argparse.ArgumentTypeError("{0} does not exist".format(file))
#sys.stderr.write("Error!\n")
sys.exit("File path error: " + f + " file does not exist!")
def path_dir(path):
if os.path.isdir(path):
os.chdir(path)
else:
#raise (NotADirectoryError(path))
sys.exit("Path to output directory: " + path + " is not valid!")
def check_ext(file):
'''Check if input file extension is correct, by calling this func in main()'''
file_name = os.path.split(file[0])[1]
exts = ("sam","bam","vcf","bcf","fastq")
if not (file_name.endswith (exts)):
sys.exit("Input error, not supported extension in the following file: " + file_name)
def check_ext_args(file):
'''Check if input file extension is correct, directly via "type" argument in parser.add_argument(...) method'''
file_name = os.path.split(file[0])[1]
exts = ("sam","bam","vcf","bcf","fastq")
if not (file_name.endswith (exts)):
raise argparse.ArgumentTypeError("input error, not supported input file extension")
return file
def mapping_stats(file):
'''Calculate reads mapping for each reference in the sam/bam alignment file'''
pos_args_len (file, 1)
stats = []
with ps.AlignmentFile(file[0], "r") as bamfile:
stats.append(bamfile.get_index_statistics())
stats_arr = np.empty(shape=(0,3)) #define array shape for get_index_statistics() output. #0 - ref contig name, 1 - mapped reads, 2 - singletons mapped.
for rec in stats[0]:
stats_arr = np.append(stats_arr, [[rec[0], rec[1], rec[2]]], axis=0)
#Save stats_arr into file and add headers to columns
file_name = os.path.split(file[0])[1] #obtain only the file name withoout the path
np.savetxt(file_name + "_ref_mapping_stats.txt", stats_arr,
fmt= "%s\t%s\t%s", delimiter='\n',
header="Ref_contig\tMapped_reads\tSingletons_reads", comments='')
print ("Reads mapping file <" + file_name + "_ref_mapping_stats.txt" + "> was created")
def bam_filter(file, ref, mapqual, readqual, pident, readlen):
'''Filter bam file based on a match to specific contigs sets (with Regex), mapping quality (MAPQ), mean read quality (Phred score) and %-identity'''
pos_args_len (file, 1)
print("Selected parameters for bam filter:\nreference: %s, mapping quality: %i, read quality: %i, read length: %i, %%-identity: %.1f"
% (ref, mapqual, readqual, readlen, pident))
file_name = os.path.split(file[0])[1] #obtain only the file name withoout the path
with ps.AlignmentFile(file_name + "_filtered.bam", "wb", template=ps.AlignmentFile(file[0], "r")) as bamfile_out:
for read in ps.AlignmentFile(file[0], "r").fetch():
if (read.has_tag('NM')): #tag 'NM' present only on the mapped reads, will not work with an unmapped bam file, thus must to check this first.
align_len = len(read.query_alignment_sequence)
read_pident = (align_len - read.get_tag("NM")) / align_len * 100
if (re.match(ref, read.reference_name) and
read.mapping_quality >= mapqual and
np.mean(read.query_qualities) >= readqual and
read_pident >= pident and
read.query_length >= readlen):
#print("Reference name: %s, Mapping quality: %i, Mean read quality: %.1f, %%-identity: %i" %
# (read.reference_name, read.mapping_quality, np.mean(read.query_qualities), read_pident))
bamfile_out.write(read)
ps.index(file_name + "_filtered.bam")
print("Finished\nFile <" + file_name + "_filtered.bam" + "> was created")
def bam_to_fastq(file):
'''Convert a bam to fastq file, paired reads are interleaved in a single file with _1/_2 suffix'''
pos_args_len (file, 1)
file_name = os.path.split(file[0])[1] #obtain only the file name withoout the path
with open (file_name + ".fastq", "w") as fastq:
for read in ps.AlignmentFile(file[0], "r").fetch():
if (read.is_read1):
fastq.write(str(read.qname) + "_1" + "\n" + str(read.query_sequence) +
"\n" + "+" + "\n" + str(read.qual) + "\n")
elif (read.is_read2):
fastq.write(str(read.qname) + "_2" + "\n" + str(read.query_sequence) +
"\n" + "+" + "\n" + str(read.qual) + "\n")
print("Finished\nFile <" + file_name + ".fastq" + "> was created")
def extract_mapped_reads(file):
'''Extract only mapped reads (=samtools view -F 4) from a sam/bam file'''
pos_args_len (file, 1)
file_name = os.path.split(file[0])[1] #obtain only the file name withoout the path
with ps.AlignmentFile(file_name + "_mapped.bam", "wb", template=ps.AlignmentFile(file[0], "r")) as bamfile_out:
for read in ps.AlignmentFile(file[0], "r").fetch():
if (not read.is_unmapped):
bamfile_out.write(read)
ps.index(file_name + "_mapped.bam")
print("Finished\nFile <" + file_name + "_mapped.bam" + "> was created")
def fastq_concat(file):
'''Convert 2 paired-end fastq files to a single interleaved fastq file'''
pos_args_len (file, 2)
with ps.FastxFile(file[0]) as r1_fastq_in, ps.FastxFile(file[1]) as r2_fastq_in, open("output_interleaved_R1_R2.fastq", 'w') as fastq_out:
for entry1, entry2 in zip(r1_fastq_in, r2_fastq_in):
fastq_out.write(str(entry1) + "\n" + str(entry2) + "\n")
print ("Finished\nFile <output_interleaved_R1_R2.fastq> was created")
def fastq_to_fasta(file):
'''Convert fastq to fasta format'''
pos_args_len (file, 1)
file_name = os.path.split(file[0])[1]
with ps.FastxFile(file[0]) as fastq_in, open(file_name + ".fasta", "w") as fasta_out:
for entry in fastq_in:
fasta_out.write(">" + entry.name + "\n" + entry.sequence + "\n")
print ("Finished\nFile <" + file_name + ".fasta" + "> was created")
def fastq_filter(file, filter_args):
'''Filter a fastq file based on read length (arg1 - fastq_filter[0]) and mean Phred quality score (arg2 - fastq_filter[1])'''
seq_pas, seq_dis = 0, 0
if (len (file) == 1): #check if only 1 fastq file was input
file_name = os.path.split(file[0])[1]
with ps.FastxFile(file[0]) as fastq_in, open(file_name + "_filtered.fastq", 'w') as fastq_out:
for entry in fastq_in:
if (len(entry.sequence) >= filter_args[0] and
np.mean(entry.get_quality_array()) >= filter_args[1]):
fastq_out.write(str(entry) + "\n")
seq_pas += 1
else:
seq_dis += 1
print("%i reads passed filtration, %i reads were discarded" % (seq_pas, seq_dis))
print("Finished\nFile <" + file_name + "_filtered.fastq" + "> was created")
elif (len (file) == 2): #check if 2 fastq files were input
file_name1 = os.path.split(file[0])[1]
file_name2 = os.path.split(file[1])[1]
with ps.FastxFile(file[0]) as r1_fastq_in, ps.FastxFile(file[1]) as r2_fastq_in, open(file_name1 + "_filtered.fastq", 'w') as r1_fastq_out, open(file_name2 + "_filtered.fastq", 'w') as r2_fastq_out:
for entry1, entry2 in zip (r1_fastq_in, r2_fastq_in):
if (len(entry1.sequence) >= filter_args[0] and len(entry2.sequence) >= filter_args[0]
and np.mean(entry1.get_quality_array()) >= filter_args[1] and np.mean(entry2.get_quality_array()) >= filter_args[1]):
r1_fastq_out.write(str(entry1) + "\n")
r2_fastq_out.write(str(entry2) + "\n")
seq_pas += 2
else:
seq_dis += 2
print("%i reads passed filtration, %i reads were discarded" % (seq_pas, seq_dis))
print("Finished\nFiles <" + file_name1 + "_filtered.fastq" + "," + file_name2 + "_filtered.fastq" + "> were created")
elif (len (file) > 2): #if more then 2 fastq file were input
pos_args_len (file, 2)
def vcf_filter(file, filter_args):
'''Filter vcf/bcf file based on reference name (filter_args[0]), read depth (filter_args[1]) and mapping quality (filter_args[2])'''
pos_args_len (file, 1)
file_name = os.path.split(file[0])[1]
with ps.VariantFile(file_name + "_filtered.vcf", 'w', header = ps.VariantFile(file[0], 'r').header) as vcf_out:
for rec in ps.VariantFile(file[0], 'r').fetch():
if (re.match(filter_args[0], rec.chrom) and
int(rec.info["DP"]) >= int(filter_args[1]) and int(rec.info["MQ"]) >= int(filter_args[2])):
vcf_out.write(rec)
print("Finished\nFile <" + file_name + "_filtered.vcf" + "> was created")
#--------------------------END--------------------------------------#
if __name__ == "__main__":
main()