forked from Leslie-C/Deduper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
borchers_deduper.py
151 lines (143 loc) · 6.38 KB
/
borchers_deduper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#Sam file to be deduped MUST be sorted by chromosome
#import packages to be used later
import re
import argparse
import gzip
#Implement argparse so that users can specify the file to be deduped and the list of umis.
def get_args():
parser = argparse.ArgumentParser(description="DemultiplexingProgram")
parser.add_argument("-f", "--file", help="Type -f followed by the path to the SAM file to be deduped.", required=True)
parser.add_argument("-u", "--umi", help="Type -u followed by the path to a file containing the UMIs. Each UMI should be on a new line.", required=False)
parser.add_argument("-p", "--paired", help="paired-end reads are not supported in this program.")
return parser.parse_args()
args = get_args()
sam_file = args.file
umi_file = args.umi
paired_end = args.paired
#Tell user that paired end functionality is not supported
if paired_end:
print("Paired end reads are not supported.")
quit()
#take provided filename and add deduped to it, this will be ouput file title
fname = sam_file
new_filename = fname + "_" + "deduped"
#make an empty dictionary for each umi and a lower digit code to be assigned later
UMI_dic = {}
#This is a function to parse a CIGAR string, provided as a variable called CIGARstring, and return the 5' mapping position for a reverse strand
def get_reverse_pos(CIGARstring):
CIGARlist = re.findall(r'[0-9]+[A-Z]', CIGARstring)
length_spanned = 0
for index in range(len(CIGARlist)):
if "M" in CIGARlist[index]:
variable1 = str(CIGARlist[index])
variable2 = variable1.strip("M")
length_spanned += int(variable2)
elif "N" in CIGARlist[index]:
variable1 = str(CIGARlist[index])
variable2 = variable1.strip("N")
length_spanned += int(variable2)
elif "D" in CIGARlist[index]:
variable1 = str(CIGARlist[index])
variable2 = variable1.strip("D")
length_spanned += int(variable2)
elif "S" in CIGARlist[index]:
if index >> 1:
variable1 = str(CIGARlist[index])
variable2 = variable1.strip("S")
length_spanned += int(variable2)
mapping_pos = length_spanned + int(align_pos)
return(mapping_pos)
#open the provided list of umis and assign each one a numerical identifier, to lighten memory requirements for the program
with open (umi_file, "r") as fh:
count = 0
for line in fh:
count += 1
umi = line.strip()
UMI_dic[umi] = count
#Begin looping through sam file to be deduped
with open (sam_file, "r") as f1:
new_sam = open(new_filename, "a+")
align_dic = {}
chr = 0
#set each regex variable to None so that if any regex matches nothing, the variable still exists and the loop can continue on the next line
for line in f1:
a=None
b=None
c=None
d=None
e=None
info = line
#ignore the header lines
header = re.search(r'^[@][\S]+', info)
if header:
continue
#extract the chromosome
a = re.search(r'^[\S]+\t[\S]+\t([0-9]+)', info)
if a is None:
continue
chrm = a.group(1)
if chrm == chr:
pass
#if the chromosome doesn't match the last seen chromosome, empty the dictionary of unduplicated read info. This improves memory demand.
else:
chr = chrm
align_dic = {}
#extract UMI in sam file
b = re.search(r'^[\S]+[:]([A-Z]+)\t', info)
if b is None:
continue
umi = b.group(1)
#check if observed UMI is in the provided list of umis
if umi in UMI_dic:
#extract alignment position according to SAM file
c = re.search(r'^[\S]+\t[\S]+\t[\S]+\t([0-9]+)\t', info)
if c is None:
continue
align_pos = c.group(1)
#extract flag from SAM file
d = re.search(r'^[\S]+\t([0-9]+)\t', info)
if d is None:
continue
flag = d.group(1)
#check if the flag indicates forward or reverse strand
if int(flag) == 16:
strnd = "rev"
#extract CIGAR string
e = re.search(r'^[\S]+\t[\S]+\t[\S]+\t[\S]+\t[\S]+\t([0-9,A-Z]+)', info)
if e is None:
continue
CIGARstring = e.group(1)
#parse CIGAR string using get_reverse_pos function to determine the adjusted mapping position
get_reverse_pos(CIGARstring)
store_umi = UMI_dic[umi]
#If the umi, adjusted mapping position, and strand value observed in the sam file line are already in the dictionary,the line(read) is a PCR duplicate. If a duplicated, move to the next line and keep looping. If not, store in dictionary
if store_umi in align_dic:
if align_dic[store_umi][0] == mapping_pos:
if align_dic[store_umi][1] == strnd:
continue
else:
align_dic[store_umi] = mapping_pos, strnd
new_sam.write(info)
else:
strnd = "fwd"
#extract CIGAR string
e = re.search(r'^[\S]+\t[\S]+\t[\S]+\t[\S]+\t[\S]+\t([0-9,A-Z]+)', info)
CIGARstring = e.group(1)
CIGARlist = re.findall(r'[0-9]+[A-Z]', CIGARstring)
#Parse CIGAR string for front clipping only, return adusted mapping position
if "S" in CIGARlist[0]:
clipping_string = str(CIGARlist[0])
clipping_int = clipping_string.strip("S")
mapping_pos = int(align_pos) - int(clipping_int)
align_pos = mapping_pos
store_umi = UMI_dic[umi]
#If the umi, adjusted mapping position, and strand value observed in the sam file line are already in the dictionary,the line(read) is a PCR duplicate. If a duplicated, move to the next line and keep looping. If not, store in dictionary
if store_umi in align_dic:
if align_dic[store_umi][0] == align_pos:
if align_dic[store_umi][1] == strnd:
continue
else:
align_dic[store_umi] = align_pos, strnd
new_sam.write(info)
else:
continue