diff --git a/porechop/adapters.py b/porechop/adapters.py index 8c5baa1..dce7a4f 100644 --- a/porechop/adapters.py +++ b/porechop/adapters.py @@ -43,7 +43,9 @@ def get_barcode_name(self): Gets the barcode name for the output files. We want a concise name, so it looks at all options and chooses the shortest. """ - possible_names = [self.name, self.start_sequence[0]] + possible_names = [self.name] + if self.start_sequence: + possible_names.append(self.start_sequence[0]) if self.end_sequence: possible_names.append(self.end_sequence[0]) barcode_name = sorted(possible_names, key=lambda x: len(x))[0] @@ -76,28 +78,36 @@ def get_barcode_name(self): start_sequence=('SQK-NSK007_Y_Top', 'AATGTACTTCGTTCAGTTACGTATTGCT'), end_sequence=('SQK-NSK007_Y_Bottom', 'GCAATACGTAACTGAACGAAGT')), + Adapter('Rapid', - start_sequence=('Rapid_adapter', 'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA')), + start_sequence=('Rapid_adapter', + 'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA')), + + Adapter('RBK004_upstream', + start_sequence=('RBK004_upstream', 'AATGTACTTCGTTCAGTTACGGCTTGGGTGTTTAACC')), + Adapter('SQK-MAP006', - start_sequence=('SQK-MAP006_Y_Top_SK63', 'GGTTGTTTCTGTTGGTGCTGATATTGCT'), - end_sequence= ('SQK-MAP006_Y_Bottom_SK64', 'GCAATATCAGCACCAACAGAAA')), + start_sequence=('SQK-MAP006_Y_Top_SK63', 'GGTTGTTTCTGTTGGTGCTGATATTGCT'), + end_sequence= ('SQK-MAP006_Y_Bottom_SK64', 'GCAATATCAGCACCAACAGAAA')), - Adapter('SQK-MAP006 Short', + Adapter('SQK-MAP006 short', start_sequence=('SQK-MAP006_Short_Y_Top_LI32', 'CGGCGTCTGCTTGGGTGTTTAACCT'), end_sequence= ('SQK-MAP006_Short_Y_Bottom_LI33', 'GGTTAAACACCCAAGCAGACGCCG')), + + # The PCR adapters are used both in PCR DNA kits and some cDNA kits. Adapter('PCR adapters 1', start_sequence=('PCR_1_start', 'ACTTGCCTGTCGCTCTATCTTC'), - end_sequence= ('PCR_1_end', 'GAAGATAGAGCGACAGGCAAGT')), + end_sequence= ('PCR_1_end', 'GAAGATAGAGCGACAGGCAAGT')), - Adapter('PCR tail 1', - start_sequence=('PCR_tail_1_start', 'TTAACCTTTCTGTTGGTGCTGATATTGC'), - end_sequence= ('PCR_tail_1_end', 'GCAATATCAGCACCAACAGAAAGGTTAA')), + Adapter('PCR adapters 2', + start_sequence=('PCR_2_start', 'TTTCTGTTGGTGCTGATATTGC'), + end_sequence= ('PCR_2_end', 'GCAATATCAGCACCAACAGAAA')), - Adapter('PCR tail 2', - start_sequence=('PCR_tail_2_start', 'TTAACCTACTTGCCTGTCGCTCTATCTTC'), - end_sequence= ('PCR_tail_2_end', 'GAAGATAGAGCGACAGGCAAGTAGGTTAA')), + Adapter('PCR adapters 3', + start_sequence=('PCR_3_start', 'TACTTGCCTGTCGCTCTATCTTC'), + end_sequence= ('PCR_3_end', 'GAAGATAGAGCGACAGGCAAGTA')), # 1D^2 kit adapters are interesting. ONT provided the following sequences on their site: @@ -117,6 +127,11 @@ def get_barcode_name(self): # adapter sequences here. + Adapter('cDNA SSP', + start_sequence=('cDNA_SSP', 'TTTCTGTTGGTGCTGATATTGCTGCCATTACGGCCGGG'), + end_sequence= ('cDNA_SSP_rev', 'CCCGGCCGTAATGGCAGCAATATCAGCACCAACAGAAA')), + + # Some barcoding kits (like the native barcodes) use the rev comp barcode at the start # of the read and the forward barcode at the end of the read. Adapter('Barcode 1 (reverse)', @@ -461,11 +476,23 @@ def make_full_native_barcode_adapter(barcode_num): end_sequence=('NB' + '%02d' % barcode_num + '_end', end_full_seq)) -def make_full_rapid_barcode_adapter(barcode_num): +def make_old_full_rapid_barcode_adapter(barcode_num): # applies to SQK-RBK001 + barcode = [x for x in ADAPTERS if x.name == 'Barcode ' + str(barcode_num) + ' (forward)'][0] + start_barcode_seq = barcode.start_sequence[1] + + start_full_seq = 'AATGTACTTCGTTCAGTTACG' + 'TATTGCT' + start_barcode_seq + \ + 'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA' + + return Adapter('Rapid barcoding ' + str(barcode_num) + ' (full sequence, old)', + start_sequence=('RB' + '%02d' % barcode_num + '_full', start_full_seq)) + + +def make_new_full_rapid_barcode_adapter(barcode_num): # applies to SQK-RBK004 barcode = [x for x in ADAPTERS if x.name == 'Barcode ' + str(barcode_num) + ' (forward)'][0] start_barcode_seq = barcode.start_sequence[1] - start_full_seq = 'AATGTACTTCGTTCAGTTACGTATTGCT' + start_barcode_seq + 'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA' + start_full_seq = 'AATGTACTTCGTTCAGTTACG' + 'GCTTGGGTGTTTAACC' + start_barcode_seq + \ + 'GTTTTCGCATTTATCGTGAAACGCTTTCGCGTTTTTCGTGCGCCGCTTCA' - return Adapter('Rapid barcoding ' + str(barcode_num) + ' (full sequence)', + return Adapter('Rapid barcoding ' + str(barcode_num) + ' (full sequence, new)', start_sequence=('RB' + '%02d' % barcode_num + '_full', start_full_seq)) diff --git a/porechop/nanopore_read.py b/porechop/nanopore_read.py index 660c644..8a03776 100755 --- a/porechop/nanopore_read.py +++ b/porechop/nanopore_read.py @@ -23,7 +23,13 @@ class NanoporeRead(object): def __init__(self, name, seq, quals): self.name = name - self.seq = seq + self.seq = seq.upper() + if self.seq.count('U') > self.seq.count('T'): + self.rna = True + self.seq = self.seq.replace('U', 'T') + else: + self.rna = False + self.quals = quals if len(quals) < len(seq): self.quals += '+' * (len(seq) - len(quals)) @@ -96,6 +102,8 @@ def get_fasta(self, min_split_read_size, discard_middle, untrimmed=False): seq = self.get_seq_with_start_end_adapters_trimmed() if not seq: # Don't return empty sequences return '' + if self.rna: + seq = seq.replace('T', 'U') return ''.join(['>', self.name, '\n', add_line_breaks_to_sequence(seq, 70)]) elif discard_middle: return '' @@ -106,6 +114,8 @@ def get_fasta(self, min_split_read_size, discard_middle, untrimmed=False): if not split_read_part[0]: # Don't return empty sequences return '' seq = add_line_breaks_to_sequence(split_read_part[0], 70) + if self.rna: + seq = seq.replace('T', 'U') fasta_str += ''.join(['>', read_name, '\n', seq]) return fasta_str @@ -119,6 +129,8 @@ def get_fastq(self, min_split_read_size, discard_middle, untrimmed=False): quals = self.get_quals_with_start_end_adapters_trimmed() if not seq: # Don't return empty sequences return '' + if self.rna: + seq = seq.replace('T', 'U') return ''.join(['@', self.name, '\n', seq, '\n+\n', quals, '\n']) elif discard_middle: return '' @@ -126,10 +138,12 @@ def get_fastq(self, min_split_read_size, discard_middle, untrimmed=False): fastq_str = '' for i, split_read_part in enumerate(self.get_split_read_parts(min_split_read_size)): read_name = add_number_to_read_name(self.name, i + 1) - if not split_read_part[0]: # Don't return empty sequences + seq, qual = split_read_part[0], split_read_part[1] + if not seq: # Don't return empty sequences return '' - fastq_str += ''.join(['@', read_name, '\n', split_read_part[0], '\n+\n', - split_read_part[1], '\n']) + if self.rna: + seq = seq.replace('T', 'U') + fastq_str += ''.join(['@', read_name, '\n', seq, '\n+\n', qual, '\n']) return fastq_str def align_adapter_set(self, adapter_set, end_size, scoring_scheme_vals): @@ -138,10 +152,11 @@ def align_adapter_set(self, adapter_set, end_size, scoring_scheme_vals): This is not to determine where to trim the reads, but rather to figure out which adapter sets are present in the data. """ - read_seq_start = self.seq[:end_size] - score, _, _, _ = align_adapter(read_seq_start, adapter_set.start_sequence[1], - scoring_scheme_vals) - adapter_set.best_start_score = max(adapter_set.best_start_score, score) + if adapter_set.start_sequence: + read_seq_start = self.seq[:end_size] + score, _, _, _ = align_adapter(read_seq_start, adapter_set.start_sequence[1], + scoring_scheme_vals) + adapter_set.best_start_score = max(adapter_set.best_start_score, score) if adapter_set.end_sequence: read_seq_end = self.seq[-end_size:] score, _, _, _ = align_adapter(read_seq_end, adapter_set.end_sequence[1], @@ -156,6 +171,8 @@ def find_start_trim(self, adapters, end_size, extra_trim_size, end_threshold, """ read_seq_start = self.seq[:end_size] for adapter in adapters: + if not adapter.start_sequence: + continue full_score, partial_score, read_start, read_end = \ align_adapter(read_seq_start, adapter.start_sequence[1], scoring_scheme_vals) if partial_score > end_threshold and read_end != end_size and \ diff --git a/porechop/porechop.py b/porechop/porechop.py index 45dea2c..25febe6 100755 --- a/porechop/porechop.py +++ b/porechop/porechop.py @@ -24,7 +24,8 @@ from multiprocessing.dummy import Pool as ThreadPool from collections import defaultdict from .misc import load_fasta_or_fastq, print_table, red, bold_underline, MyHelpFormatter, int_to_str -from .adapters import ADAPTERS, make_full_native_barcode_adapter, make_full_rapid_barcode_adapter +from .adapters import ADAPTERS, make_full_native_barcode_adapter,\ + make_old_full_rapid_barcode_adapter, make_new_full_rapid_barcode_adapter from .nanopore_read import NanoporeRead from .version import __version__ @@ -37,16 +38,17 @@ def main(): matching_sets = find_matching_adapter_sets(check_reads, args.verbosity, args.end_size, args.scoring_scheme_vals, args.print_dest, args.adapter_threshold, args.threads) - matching_sets = exclude_end_adapters_for_rapid(matching_sets) matching_sets = fix_up_1d2_sets(matching_sets) - display_adapter_set_results(matching_sets, args.verbosity, args.print_dest) - matching_sets = add_full_barcode_adapter_sets(matching_sets) if args.barcode_dir: forward_or_reverse_barcodes = choose_barcoding_kit(matching_sets, args.verbosity, args.print_dest) else: forward_or_reverse_barcodes = None + + display_adapter_set_results(matching_sets, args.verbosity, args.print_dest) + matching_sets = add_full_barcode_adapter_sets(matching_sets) + if args.verbosity > 0: print('\n', file=args.print_dest) @@ -170,7 +172,7 @@ def get_arguments(): 'reads with middle adapters are split) (required for ' 'reads to be used with Nanopolish, this option is on by ' 'default when outputting reads into barcode bins)') - middle_trim_group.add_argument('--middle_threshold', type=float, default=85.0, + middle_trim_group.add_argument('--middle_threshold', type=float, default=90.0, help='Adapters in the middle of reads must have at least this ' 'percent identity to be found (0 to 100)') middle_trim_group.add_argument('--extra_middle_trim_good_side', type=int, default=10, @@ -330,35 +332,43 @@ def choose_barcoding_kit(adapter_sets, verbosity, print_dest): If the user is sorting reads by barcode bin, choose one barcode configuration (rev comp barcodes at the start of the read or at the end of the read) and ignore the other. """ - forward_barcodes = 0 - reverse_barcodes = 0 + # Tally up scores for forward and reverse barcodes. + forward_start_or_end, reverse_start_or_end = 0, 0 + forward_start_and_end, reverse_start_and_end = 0, 0 for adapter_set in adapter_sets: - score = adapter_set.best_start_or_end_score() - if 'Barcode' in adapter_set.name and '(forward)' in adapter_set.name: - forward_barcodes += score - elif 'Barcode' in adapter_set.name and '(reverse)' in adapter_set.name: - reverse_barcodes += score - if forward_barcodes > reverse_barcodes: - if verbosity > 0: - print('\nBarcodes determined to be in forward orientation', file=print_dest) - return 'forward' - elif reverse_barcodes > forward_barcodes: - if verbosity > 0: - print('\nBarcodes determined to be in reverse orientation', file=print_dest) - return 'reverse' - else: - return None - + if 'barcode' in adapter_set.name.lower(): + if '(forward)' in adapter_set.name.lower(): + forward_start_or_end += adapter_set.best_start_or_end_score() + forward_start_and_end += adapter_set.best_start_score + forward_start_and_end += adapter_set.best_end_score + elif '(reverse)' in adapter_set.name.lower(): + reverse_start_or_end += adapter_set.best_start_or_end_score() + reverse_start_and_end += adapter_set.best_start_score + reverse_start_and_end += adapter_set.best_end_score + + if forward_start_or_end == 0 and reverse_start_or_end == 0: + sys.exit('Error: no barcodes were found, so Porechop cannot perform barcode demultiplexing') + + # If possible, make a decision using each barcode's best start OR end score. + orientation = None + if forward_start_or_end > reverse_start_or_end: + orientation = 'forward' + elif reverse_start_or_end > forward_start_or_end: + orientation = 'reverse' + + # If that didn't work (i.e. it's a tie between forward and reverse), then choose based on the + # sum of both start AND end scores. + elif forward_start_and_end > reverse_start_and_end: + orientation = 'forward' + elif reverse_start_and_end > forward_start_and_end: + orientation = 'reverse' + + if orientation is None: + sys.exit('Error: Porechop could not determine barcode orientation') -def exclude_end_adapters_for_rapid(matching_sets): - """ - Rapid reads shouldn't have end adapters, so we don't want to look for them if this seems to be - a rapid read set. - """ - if 'Rapid adapter' in [x.name for x in matching_sets]: - for s in matching_sets: - s.end_sequence = None - return matching_sets + if verbosity > 0: + print('\nBarcodes determined to be in ' + orientation + ' orientation', file=print_dest) + return orientation def fix_up_1d2_sets(matching_sets): @@ -416,8 +426,11 @@ def add_full_barcode_adapter_sets(matching_sets): # Rapid barcode full sequences if all(x in matching_set_names - for x in ['SQK-NSK007', 'Rapid', 'Barcode ' + str(i) + ' (forward)']): - matching_sets.append(make_full_rapid_barcode_adapter(i)) + for x in ['Rapid', 'Barcode ' + str(i) + ' (forward)']): + if 'RBK004_upstream' in matching_set_names: + matching_sets.append(make_new_full_rapid_barcode_adapter(i)) + elif 'SQK-NSK007' in matching_set_names: + matching_sets.append(make_old_full_rapid_barcode_adapter(i)) return matching_sets @@ -429,11 +442,14 @@ def find_adapters_at_read_ends(reads, matching_sets, verbosity, end_size, extra_ if verbosity > 0: print(bold_underline('Trimming adapters from read ends'), file=print_dest) - name_len = max(max(len(x.start_sequence[0]) for x in matching_sets), - max(len(x.end_sequence[0]) if x.end_sequence else 0 for x in matching_sets)) + name_len = max(max(len(x.start_sequence[0]) + if x.start_sequence else 0 for x in matching_sets), + max(len(x.end_sequence[0]) + if x.end_sequence else 0 for x in matching_sets)) for matching_set in matching_sets: - print(' ' + matching_set.start_sequence[0].rjust(name_len) + ': ' + - red(matching_set.start_sequence[1]), file=print_dest) + if matching_set.start_sequence: + print(' ' + matching_set.start_sequence[0].rjust(name_len) + ': ' + + red(matching_set.start_sequence[1]), file=print_dest) if matching_set.end_sequence: print(' ' + matching_set.end_sequence[0].rjust(name_len) + ': ' + red(matching_set.end_sequence[1]), file=print_dest) @@ -524,15 +540,18 @@ def find_adapters_in_read_middles(reads, matching_sets, verbosity, middle_thresh adapters = [] for matching_set in matching_sets: - adapters.append(matching_set.start_sequence) - if matching_set.end_sequence and \ - matching_set.end_sequence[1] != matching_set.start_sequence[1]: - adapters.append(matching_set.end_sequence) + if matching_set.start_sequence: + adapters.append(matching_set.start_sequence) + if matching_set.end_sequence: + if (not matching_set.start_sequence) or \ + matching_set.end_sequence[1] != matching_set.start_sequence[1]: + adapters.append(matching_set.end_sequence) start_sequence_names = set() end_sequence_names = set() for matching_set in matching_sets: - start_sequence_names.add(matching_set.start_sequence[0]) + if matching_set.start_sequence: + start_sequence_names.add(matching_set.start_sequence[0]) if matching_set.end_sequence: end_sequence_names.add(matching_set.end_sequence[0]) diff --git a/porechop/version.py b/porechop/version.py index 7ee7940..3be3007 100644 --- a/porechop/version.py +++ b/porechop/version.py @@ -2,4 +2,4 @@ The version is stored here in a separate file so it can exist in only one place. http://stackoverflow.com/questions/458550 """ -__version__ = '0.2.3' +__version__ = '0.2.4-beta'