-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathread_length_identity.py
63 lines (52 loc) · 2.62 KB
/
read_length_identity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python3
"""
Copyright 2017 Ryan Wick ([email protected])
https://github.com/rrwick/Filtlong
This script was used to get read length/identity stats from a minimap2 PAF alignment file. I used
it to make the scatterplot/histogram figures in the Filtlong README. It uses a particularly strict
definition of read identity: all bases are considered and unaligned bases are assigned an identity
of 0. So if a read had half of its bases align with an identity of 90% and the other half is
unaligned, then the read's final identity would be 45%.
Example commands:
filtlong {OPTIONS} input.fastq.gz | gzip > filtered.fastq.gz
minimap2 -x map10k -t 16 -c reference.fasta filtered.fastq.gz > alignments.paf
read_length_identity.py alignments.paf > read_stats.tsv
This file is part of Filtlong. Filtlong is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by the Free Software Foundation,
either version 3 of the License, or (at your option) any later version. Filtlong is distributed in
the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details. You should have received a copy of the GNU General Public License along with Filtlong. If
not, see <http://www.gnu.org/licenses/>.
"""
import sys
import collections
import statistics
def main():
read_lengths = {}
read_alignments = collections.defaultdict(list)
paf_filename = sys.argv[1]
with open(paf_filename, 'rt') as paf:
for line in paf:
paf_parts = line.strip().split('\t')
if len(paf_parts) < 11:
continue
read_name = paf_parts[0]
read_length = int(paf_parts[1])
read_lengths[read_name] = read_length
start = int(paf_parts[2])
end = int(paf_parts[3])
identity = 100.0 * int(paf_parts[9]) / int(paf_parts[10])
read_alignments[read_name].append((start, end, identity))
print('\t'.join(['Name', 'Length', 'Identity']))
for read_name, read_length in read_lengths.items():
alignments = read_alignments[read_name]
identity_by_base = [0.0] * read_length
for start, end, identity in alignments:
for i in range(start, end):
if identity > identity_by_base[i]:
identity_by_base[i] = identity
whole_read_identity = statistics.mean(identity_by_base)
print('\t'.join([read_name, str(read_length), str(whole_read_identity)]))
if __name__ == '__main__':
main()