From 26f71dc10d40fc0218730101a5bc0462a6337b99 Mon Sep 17 00:00:00 2001 From: Christian Otto Date: Fri, 5 Jul 2019 13:48:16 +0200 Subject: [PATCH 1/2] Fix for incorrect output of dedup with --paired (#347) --- umi_tools/sam_methods.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/umi_tools/sam_methods.py b/umi_tools/sam_methods.py index 724e1f96..31f2d108 100644 --- a/umi_tools/sam_methods.py +++ b/umi_tools/sam_methods.py @@ -574,6 +574,7 @@ def __init__(self, infile, outfile, tags=False): self.infile = infile self.outfile = outfile self.read1s = set() + self.read1s_done = set() self.chrom = None def write(self, read, unique_id=None, umi=None, unmapped=False): @@ -590,7 +591,8 @@ def write(self, read, unique_id=None, umi=None, unmapped=False): self.chrom = read.reference_name key = read.query_name, read.next_reference_name, read.next_reference_start - self.read1s.add(key) + if key not in self.read1s_done: + self.read1s.add(key) self.outfile.write(read) @@ -609,6 +611,7 @@ def write_mates(self): if key in self.read1s: self.outfile.write(read) self.read1s.remove(key) + self.read1s_done.add(key) U.debug("%i mates remaining" % len(self.read1s)) @@ -623,7 +626,7 @@ def close(self): found = 0 for read in self.infile.fetch(until_eof=True, multiple_iterators=True): - if read.is_unmapped: + if any((read.is_unmapped, read.mate_is_unmapped, read.is_read1)): continue key = read.query_name, read.reference_name, read.reference_start From 78f457bbffb3c2068d4fb19d96a34e4d6aca2995 Mon Sep 17 00:00:00 2001 From: Christian Otto Date: Mon, 8 Jul 2019 09:27:35 +0200 Subject: [PATCH 2/2] Improvement of consistency in selection of read2 alignments for output with dedup and --paired as well as avoiding large memory footprint (#347). --- umi_tools/sam_methods.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/umi_tools/sam_methods.py b/umi_tools/sam_methods.py index 31f2d108..e040d00d 100644 --- a/umi_tools/sam_methods.py +++ b/umi_tools/sam_methods.py @@ -574,7 +574,6 @@ def __init__(self, infile, outfile, tags=False): self.infile = infile self.outfile = outfile self.read1s = set() - self.read1s_done = set() self.chrom = None def write(self, read, unique_id=None, umi=None, unmapped=False): @@ -590,9 +589,8 @@ def write(self, read, unique_id=None, umi=None, unmapped=False): self.write_mates() self.chrom = read.reference_name - key = read.query_name, read.next_reference_name, read.next_reference_start - if key not in self.read1s_done: - self.read1s.add(key) + key = read.query_name, read.next_reference_name, read.next_reference_start, read.reference_name, read.reference_start + self.read1s.add(key) self.outfile.write(read) @@ -607,11 +605,10 @@ def write_mates(self): if any((read.is_unmapped, read.mate_is_unmapped, read.is_read1)): continue - key = read.query_name, read.reference_name, read.reference_start + key = read.query_name, read.reference_name, read.reference_start, read.next_reference_name, read.next_reference_start if key in self.read1s: self.outfile.write(read) self.read1s.remove(key) - self.read1s_done.add(key) U.debug("%i mates remaining" % len(self.read1s)) @@ -629,7 +626,7 @@ def close(self): if any((read.is_unmapped, read.mate_is_unmapped, read.is_read1)): continue - key = read.query_name, read.reference_name, read.reference_start + key = read.query_name, read.reference_name, read.reference_start, read.next_reference_name, read.next_reference_start if key in self.read1s: self.outfile.write(read) self.read1s.remove(key)