Skip to content

Commit

Permalink
Merge branch 'development'
Browse files Browse the repository at this point in the history
  • Loading branch information
rrwick committed Oct 19, 2018
2 parents eba89a9 + 92c0b65 commit a9ba622
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 67 deletions.
57 changes: 42 additions & 15 deletions porechop/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def get_barcode_name(self):
Gets the barcode name for the output files. We want a concise name, so it looks at all
options and chooses the shortest.
"""
possible_names = [self.name, self.start_sequence[0]]
possible_names = [self.name]
if self.start_sequence:
possible_names.append(self.start_sequence[0])
if self.end_sequence:
possible_names.append(self.end_sequence[0])
barcode_name = sorted(possible_names, key=lambda x: len(x))[0]
Expand Down Expand Up @@ -76,28 +78,36 @@ def get_barcode_name(self):
start_sequence=('SQK-NSK007_Y_Top', 'AATGTACTTCGTTCAGTTACGTATTGCT'),
end_sequence=('SQK-NSK007_Y_Bottom', 'GCAATACGTAACTGAACGAAGT')),


Adapter('Rapid',
start_sequence=('Rapid_adapter', 'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA')),
start_sequence=('Rapid_adapter',
'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA')),

Adapter('RBK004_upstream',
start_sequence=('RBK004_upstream', 'AATGTACTTCGTTCAGTTACGGCTTGGGTGTTTAACC')),


Adapter('SQK-MAP006',
start_sequence=('SQK-MAP006_Y_Top_SK63', 'GGTTGTTTCTGTTGGTGCTGATATTGCT'),
end_sequence= ('SQK-MAP006_Y_Bottom_SK64', 'GCAATATCAGCACCAACAGAAA')),
start_sequence=('SQK-MAP006_Y_Top_SK63', 'GGTTGTTTCTGTTGGTGCTGATATTGCT'),
end_sequence= ('SQK-MAP006_Y_Bottom_SK64', 'GCAATATCAGCACCAACAGAAA')),

Adapter('SQK-MAP006 Short',
Adapter('SQK-MAP006 short',
start_sequence=('SQK-MAP006_Short_Y_Top_LI32', 'CGGCGTCTGCTTGGGTGTTTAACCT'),
end_sequence= ('SQK-MAP006_Short_Y_Bottom_LI33', 'GGTTAAACACCCAAGCAGACGCCG')),


# The PCR adapters are used both in PCR DNA kits and some cDNA kits.
Adapter('PCR adapters 1',
start_sequence=('PCR_1_start', 'ACTTGCCTGTCGCTCTATCTTC'),
end_sequence= ('PCR_1_end', 'GAAGATAGAGCGACAGGCAAGT')),
end_sequence= ('PCR_1_end', 'GAAGATAGAGCGACAGGCAAGT')),

Adapter('PCR tail 1',
start_sequence=('PCR_tail_1_start', 'TTAACCTTTCTGTTGGTGCTGATATTGC'),
end_sequence= ('PCR_tail_1_end', 'GCAATATCAGCACCAACAGAAAGGTTAA')),
Adapter('PCR adapters 2',
start_sequence=('PCR_2_start', 'TTTCTGTTGGTGCTGATATTGC'),
end_sequence= ('PCR_2_end', 'GCAATATCAGCACCAACAGAAA')),

Adapter('PCR tail 2',
start_sequence=('PCR_tail_2_start', 'TTAACCTACTTGCCTGTCGCTCTATCTTC'),
end_sequence= ('PCR_tail_2_end', 'GAAGATAGAGCGACAGGCAAGTAGGTTAA')),
Adapter('PCR adapters 3',
start_sequence=('PCR_3_start', 'TACTTGCCTGTCGCTCTATCTTC'),
end_sequence= ('PCR_3_end', 'GAAGATAGAGCGACAGGCAAGTA')),


# 1D^2 kit adapters are interesting. ONT provided the following sequences on their site:
Expand All @@ -117,6 +127,11 @@ def get_barcode_name(self):
# adapter sequences here.


Adapter('cDNA SSP',
start_sequence=('cDNA_SSP', 'TTTCTGTTGGTGCTGATATTGCTGCCATTACGGCCGGG'),
end_sequence= ('cDNA_SSP_rev', 'CCCGGCCGTAATGGCAGCAATATCAGCACCAACAGAAA')),


# Some barcoding kits (like the native barcodes) use the rev comp barcode at the start
# of the read and the forward barcode at the end of the read.
Adapter('Barcode 1 (reverse)',
Expand Down Expand Up @@ -461,11 +476,23 @@ def make_full_native_barcode_adapter(barcode_num):
end_sequence=('NB' + '%02d' % barcode_num + '_end', end_full_seq))


def make_full_rapid_barcode_adapter(barcode_num):
def make_old_full_rapid_barcode_adapter(barcode_num): # applies to SQK-RBK001
barcode = [x for x in ADAPTERS if x.name == 'Barcode ' + str(barcode_num) + ' (forward)'][0]
start_barcode_seq = barcode.start_sequence[1]

start_full_seq = 'AATGTACTTCGTTCAGTTACG' + 'TATTGCT' + start_barcode_seq + \
'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA'

return Adapter('Rapid barcoding ' + str(barcode_num) + ' (full sequence, old)',
start_sequence=('RB' + '%02d' % barcode_num + '_full', start_full_seq))


def make_new_full_rapid_barcode_adapter(barcode_num): # applies to SQK-RBK004
barcode = [x for x in ADAPTERS if x.name == 'Barcode ' + str(barcode_num) + ' (forward)'][0]
start_barcode_seq = barcode.start_sequence[1]

start_full_seq = 'AATGTACTTCGTTCAGTTACGTATTGCT' + start_barcode_seq + 'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA'
start_full_seq = 'AATGTACTTCGTTCAGTTACG' + 'GCTTGGGTGTTTAACC' + start_barcode_seq + \
'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA'

return Adapter('Rapid barcoding ' + str(barcode_num) + ' (full sequence)',
return Adapter('Rapid barcoding ' + str(barcode_num) + ' (full sequence, new)',
start_sequence=('RB' + '%02d' % barcode_num + '_full', start_full_seq))
33 changes: 25 additions & 8 deletions porechop/nanopore_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ class NanoporeRead(object):
def __init__(self, name, seq, quals):
self.name = name

self.seq = seq
self.seq = seq.upper()
if self.seq.count('U') > self.seq.count('T'):
self.rna = True
self.seq = self.seq.replace('U', 'T')
else:
self.rna = False

self.quals = quals
if len(quals) < len(seq):
self.quals += '+' * (len(seq) - len(quals))
Expand Down Expand Up @@ -96,6 +102,8 @@ def get_fasta(self, min_split_read_size, discard_middle, untrimmed=False):
seq = self.get_seq_with_start_end_adapters_trimmed()
if not seq: # Don't return empty sequences
return ''
if self.rna:
seq = seq.replace('T', 'U')
return ''.join(['>', self.name, '\n', add_line_breaks_to_sequence(seq, 70)])
elif discard_middle:
return ''
Expand All @@ -106,6 +114,8 @@ def get_fasta(self, min_split_read_size, discard_middle, untrimmed=False):
if not split_read_part[0]: # Don't return empty sequences
return ''
seq = add_line_breaks_to_sequence(split_read_part[0], 70)
if self.rna:
seq = seq.replace('T', 'U')
fasta_str += ''.join(['>', read_name, '\n', seq])
return fasta_str

Expand All @@ -119,17 +129,21 @@ def get_fastq(self, min_split_read_size, discard_middle, untrimmed=False):
quals = self.get_quals_with_start_end_adapters_trimmed()
if not seq: # Don't return empty sequences
return ''
if self.rna:
seq = seq.replace('T', 'U')
return ''.join(['@', self.name, '\n', seq, '\n+\n', quals, '\n'])
elif discard_middle:
return ''
else:
fastq_str = ''
for i, split_read_part in enumerate(self.get_split_read_parts(min_split_read_size)):
read_name = add_number_to_read_name(self.name, i + 1)
if not split_read_part[0]: # Don't return empty sequences
seq, qual = split_read_part[0], split_read_part[1]
if not seq: # Don't return empty sequences
return ''
fastq_str += ''.join(['@', read_name, '\n', split_read_part[0], '\n+\n',
split_read_part[1], '\n'])
if self.rna:
seq = seq.replace('T', 'U')
fastq_str += ''.join(['@', read_name, '\n', seq, '\n+\n', qual, '\n'])
return fastq_str

def align_adapter_set(self, adapter_set, end_size, scoring_scheme_vals):
Expand All @@ -138,10 +152,11 @@ def align_adapter_set(self, adapter_set, end_size, scoring_scheme_vals):
This is not to determine where to trim the reads, but rather to figure out which adapter
sets are present in the data.
"""
read_seq_start = self.seq[:end_size]
score, _, _, _ = align_adapter(read_seq_start, adapter_set.start_sequence[1],
scoring_scheme_vals)
adapter_set.best_start_score = max(adapter_set.best_start_score, score)
if adapter_set.start_sequence:
read_seq_start = self.seq[:end_size]
score, _, _, _ = align_adapter(read_seq_start, adapter_set.start_sequence[1],
scoring_scheme_vals)
adapter_set.best_start_score = max(adapter_set.best_start_score, score)
if adapter_set.end_sequence:
read_seq_end = self.seq[-end_size:]
score, _, _, _ = align_adapter(read_seq_end, adapter_set.end_sequence[1],
Expand All @@ -156,6 +171,8 @@ def find_start_trim(self, adapters, end_size, extra_trim_size, end_threshold,
"""
read_seq_start = self.seq[:end_size]
for adapter in adapters:
if not adapter.start_sequence:
continue
full_score, partial_score, read_start, read_end = \
align_adapter(read_seq_start, adapter.start_sequence[1], scoring_scheme_vals)
if partial_score > end_threshold and read_end != end_size and \
Expand Down
105 changes: 62 additions & 43 deletions porechop/porechop.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
from multiprocessing.dummy import Pool as ThreadPool
from collections import defaultdict
from .misc import load_fasta_or_fastq, print_table, red, bold_underline, MyHelpFormatter, int_to_str
from .adapters import ADAPTERS, make_full_native_barcode_adapter, make_full_rapid_barcode_adapter
from .adapters import ADAPTERS, make_full_native_barcode_adapter,\
make_old_full_rapid_barcode_adapter, make_new_full_rapid_barcode_adapter
from .nanopore_read import NanoporeRead
from .version import __version__

Expand All @@ -37,16 +38,17 @@ def main():
matching_sets = find_matching_adapter_sets(check_reads, args.verbosity, args.end_size,
args.scoring_scheme_vals, args.print_dest,
args.adapter_threshold, args.threads)
matching_sets = exclude_end_adapters_for_rapid(matching_sets)
matching_sets = fix_up_1d2_sets(matching_sets)
display_adapter_set_results(matching_sets, args.verbosity, args.print_dest)
matching_sets = add_full_barcode_adapter_sets(matching_sets)

if args.barcode_dir:
forward_or_reverse_barcodes = choose_barcoding_kit(matching_sets, args.verbosity,
args.print_dest)
else:
forward_or_reverse_barcodes = None

display_adapter_set_results(matching_sets, args.verbosity, args.print_dest)
matching_sets = add_full_barcode_adapter_sets(matching_sets)

if args.verbosity > 0:
print('\n', file=args.print_dest)

Expand Down Expand Up @@ -170,7 +172,7 @@ def get_arguments():
'reads with middle adapters are split) (required for '
'reads to be used with Nanopolish, this option is on by '
'default when outputting reads into barcode bins)')
middle_trim_group.add_argument('--middle_threshold', type=float, default=85.0,
middle_trim_group.add_argument('--middle_threshold', type=float, default=90.0,
help='Adapters in the middle of reads must have at least this '
'percent identity to be found (0 to 100)')
middle_trim_group.add_argument('--extra_middle_trim_good_side', type=int, default=10,
Expand Down Expand Up @@ -330,35 +332,43 @@ def choose_barcoding_kit(adapter_sets, verbosity, print_dest):
If the user is sorting reads by barcode bin, choose one barcode configuration (rev comp
barcodes at the start of the read or at the end of the read) and ignore the other.
"""
forward_barcodes = 0
reverse_barcodes = 0
# Tally up scores for forward and reverse barcodes.
forward_start_or_end, reverse_start_or_end = 0, 0
forward_start_and_end, reverse_start_and_end = 0, 0
for adapter_set in adapter_sets:
score = adapter_set.best_start_or_end_score()
if 'Barcode' in adapter_set.name and '(forward)' in adapter_set.name:
forward_barcodes += score
elif 'Barcode' in adapter_set.name and '(reverse)' in adapter_set.name:
reverse_barcodes += score
if forward_barcodes > reverse_barcodes:
if verbosity > 0:
print('\nBarcodes determined to be in forward orientation', file=print_dest)
return 'forward'
elif reverse_barcodes > forward_barcodes:
if verbosity > 0:
print('\nBarcodes determined to be in reverse orientation', file=print_dest)
return 'reverse'
else:
return None

if 'barcode' in adapter_set.name.lower():
if '(forward)' in adapter_set.name.lower():
forward_start_or_end += adapter_set.best_start_or_end_score()
forward_start_and_end += adapter_set.best_start_score
forward_start_and_end += adapter_set.best_end_score
elif '(reverse)' in adapter_set.name.lower():
reverse_start_or_end += adapter_set.best_start_or_end_score()
reverse_start_and_end += adapter_set.best_start_score
reverse_start_and_end += adapter_set.best_end_score

if forward_start_or_end == 0 and reverse_start_or_end == 0:
sys.exit('Error: no barcodes were found, so Porechop cannot perform barcode demultiplexing')

# If possible, make a decision using each barcode's best start OR end score.
orientation = None
if forward_start_or_end > reverse_start_or_end:
orientation = 'forward'
elif reverse_start_or_end > forward_start_or_end:
orientation = 'reverse'

# If that didn't work (i.e. it's a tie between forward and reverse), then choose based on the
# sum of both start AND end scores.
elif forward_start_and_end > reverse_start_and_end:
orientation = 'forward'
elif reverse_start_and_end > forward_start_and_end:
orientation = 'reverse'

if orientation is None:
sys.exit('Error: Porechop could not determine barcode orientation')

def exclude_end_adapters_for_rapid(matching_sets):
"""
Rapid reads shouldn't have end adapters, so we don't want to look for them if this seems to be
a rapid read set.
"""
if 'Rapid adapter' in [x.name for x in matching_sets]:
for s in matching_sets:
s.end_sequence = None
return matching_sets
if verbosity > 0:
print('\nBarcodes determined to be in ' + orientation + ' orientation', file=print_dest)
return orientation


def fix_up_1d2_sets(matching_sets):
Expand Down Expand Up @@ -416,8 +426,11 @@ def add_full_barcode_adapter_sets(matching_sets):

# Rapid barcode full sequences
if all(x in matching_set_names
for x in ['SQK-NSK007', 'Rapid', 'Barcode ' + str(i) + ' (forward)']):
matching_sets.append(make_full_rapid_barcode_adapter(i))
for x in ['Rapid', 'Barcode ' + str(i) + ' (forward)']):
if 'RBK004_upstream' in matching_set_names:
matching_sets.append(make_new_full_rapid_barcode_adapter(i))
elif 'SQK-NSK007' in matching_set_names:
matching_sets.append(make_old_full_rapid_barcode_adapter(i))

return matching_sets

Expand All @@ -429,11 +442,14 @@ def find_adapters_at_read_ends(reads, matching_sets, verbosity, end_size, extra_
if verbosity > 0:
print(bold_underline('Trimming adapters from read ends'),
file=print_dest)
name_len = max(max(len(x.start_sequence[0]) for x in matching_sets),
max(len(x.end_sequence[0]) if x.end_sequence else 0 for x in matching_sets))
name_len = max(max(len(x.start_sequence[0])
if x.start_sequence else 0 for x in matching_sets),
max(len(x.end_sequence[0])
if x.end_sequence else 0 for x in matching_sets))
for matching_set in matching_sets:
print(' ' + matching_set.start_sequence[0].rjust(name_len) + ': ' +
red(matching_set.start_sequence[1]), file=print_dest)
if matching_set.start_sequence:
print(' ' + matching_set.start_sequence[0].rjust(name_len) + ': ' +
red(matching_set.start_sequence[1]), file=print_dest)
if matching_set.end_sequence:
print(' ' + matching_set.end_sequence[0].rjust(name_len) + ': ' +
red(matching_set.end_sequence[1]), file=print_dest)
Expand Down Expand Up @@ -524,15 +540,18 @@ def find_adapters_in_read_middles(reads, matching_sets, verbosity, middle_thresh

adapters = []
for matching_set in matching_sets:
adapters.append(matching_set.start_sequence)
if matching_set.end_sequence and \
matching_set.end_sequence[1] != matching_set.start_sequence[1]:
adapters.append(matching_set.end_sequence)
if matching_set.start_sequence:
adapters.append(matching_set.start_sequence)
if matching_set.end_sequence:
if (not matching_set.start_sequence) or \
matching_set.end_sequence[1] != matching_set.start_sequence[1]:
adapters.append(matching_set.end_sequence)

start_sequence_names = set()
end_sequence_names = set()
for matching_set in matching_sets:
start_sequence_names.add(matching_set.start_sequence[0])
if matching_set.start_sequence:
start_sequence_names.add(matching_set.start_sequence[0])
if matching_set.end_sequence:
end_sequence_names.add(matching_set.end_sequence[0])

Expand Down
2 changes: 1 addition & 1 deletion porechop/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
The version is stored here in a separate file so it can exist in only one place.
http://stackoverflow.com/questions/458550
"""
__version__ = '0.2.3'
__version__ = '0.2.4-beta'

0 comments on commit a9ba622

Please sign in to comment.