WIP make porechop mode: now gets same/similar identity score (wrong length was being retrieved from parasail result) and has a function to preprocess first N reads and identify the barcodes which meet an identity threshold for these
Rachel Colquhoun committed Mar 3, 2020
1 parent 2a99c0e commit 9d1e620
Showing 2 changed files with 107 additions and 42 deletions.
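The length fix described in the commit message boils down to dividing the match count by the alignment length parasail reports (`stats.length`) rather than the reference length (`stats.len_ref`). A minimal sketch of the corrected calculation; the sequences, gap penalties and matrix values here are illustrative, not readucks defaults:

```python
import parasail

# Illustrative inputs only; readucks builds its matrix and gap penalties from its CLI settings.
query = "ACGTACGTACGTTTGACGTACGT"   # e.g. the first bases of a read
reference = "ACGTACGTACGT"          # a barcode/adapter sequence
matrix = parasail.matrix_create("ACGT", 3, -6)

stats = parasail.sg_qx_stats_striped_sat(query, reference, 10, 1, matrix)

old_identity = stats.matches / stats.len_ref   # previous behaviour: divide by the reference length
new_identity = stats.matches / stats.length    # this commit: divide by the alignment length
print(old_identity, new_identity)
```

Only `stats.length` reflects the aligned region the matches were counted over, which is what the commit message means by the wrong length being retrieved from the parasail result.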
72 changes: 62 additions & 10 deletions readucks/demuxer.py
@@ -23,22 +23,49 @@
gap_open = 10
gap_extend = 1

read_fragment_length = 100

def set_alignment_settings(open, extend, matrix):
global gap_open, gap_extend, nuc_matrix

gap_open = open
gap_extend = extend
nuc_matrix = matrix

def best_read_identity(reads, barcodes, barcode_set):
start_identities = {}
end_identities = {}

for barcode_id in barcodes:
start_identities[barcode_id] = 0
end_identities[barcode_id] = 0

# Look at alignment of just the barcode, not the extended adapter sequence
start_adapter_seq = barcodes[barcode_id]['start']
end_adapter_seq = barcodes[barcode_id]['end']

for read in reads:
query_start = str(read.seq)[:read_fragment_length]
result_start = get_all(barcode_id, query_start, start_adapter_seq, gap_open, gap_extend, nuc_matrix)
if result_start['identity'] > start_identities[barcode_id]:
start_identities[barcode_id] = result_start['identity']

query_end = str(read.seq)[-read_fragment_length:]
result_end = get_all(barcode_id, query_end, end_adapter_seq, gap_open, gap_extend, nuc_matrix)
if result_end['identity'] > end_identities[barcode_id]:
end_identities[barcode_id] = result_end['identity']

return start_identities, end_identities


def demux_read(read, barcodes, barcode_set, single_barcode, threshold, secondary_threshold, score_diff, mode, additional_info, verbosity):
'''
Processes a read to find barcodes and returns the results
:param name: The name of the read
:param read: The sequence
'''
query_start = str(read.seq)[:150]
query_end = str(read.seq)[-150:]
query_start = str(read.seq)[:read_fragment_length]
query_end = str(read.seq)[-read_fragment_length:]

results = []

@@ -47,9 +74,9 @@ def demux_read(read, barcodes, barcode_set, single_barcode, threshold, secondary
end_adapter_seq = get_end_adapter_seq(barcode_id, barcode_set)

if mode == 'porechop':
result_start = get_stats(barcode_id, query_start, start_adapter_seq, gap_open, gap_extend,
result_start = get_identity(barcode_id, query_start, start_adapter_seq, gap_open, gap_extend,
nuc_matrix)
result_end = get_stats(barcode_id, query_end, end_adapter_seq, gap_open, gap_extend, nuc_matrix)
result_end = get_identity(barcode_id, query_end, end_adapter_seq, gap_open, gap_extend, nuc_matrix)
else:
result_start = get_score(barcode_id, query_start, start_adapter_seq, gap_open, gap_extend, nuc_matrix)
result_end = get_score(barcode_id, query_end, end_adapter_seq, gap_open, gap_extend, nuc_matrix)
@@ -65,6 +92,7 @@ def demux_read(read, barcodes, barcode_set, single_barcode, threshold, secondary
start_best_end = get_all(results[0]['id'], query_end, get_end_adapter_seq(results[0]['id'], barcode_set), gap_open,
gap_extend, nuc_matrix)
start_best = combine_results(start_best, start_best_end, start_best)
start_second_best = None
if mode == 'porechop' and len(results) > 1:
start_second_best = get_all(results[1]['id'], query_start, get_start_adapter_seq(results[1]['id'], barcode_set), gap_open,
gap_extend, nuc_matrix)
@@ -79,6 +107,7 @@ def demux_read(read, barcodes, barcode_set, single_barcode, threshold, secondary
end_best_start = get_all(results[0]['id'], query_start, get_start_adapter_seq(results[0]['id'], barcode_set), gap_open,
gap_extend, nuc_matrix)
end_best = combine_results(end_best_start, end_best, end_best)
end_second_best = None
if mode == 'porechop' and len(results) > 1:
end_second_best = get_all(results[1]['id'], query_end, get_end_adapter_seq(results[1]['id'], barcode_set), gap_open,
gap_extend, nuc_matrix)
@@ -174,9 +203,9 @@ def call_barcode_lenient_mode(primary, secondary, threshold, secondary_threshold
def call_barcode_porechop_mode(primary, secondary, primary_second, secondary_second, single_barcode, threshold, score_diff, verbosity):

primary_over_threshold = (primary['identity'] >= threshold)
primary_good_diff = (primary['identity'] >= primary_second['identity'] + score_diff)
primary_good_diff = (primary_second is None or primary['identity'] >= primary_second['identity'] + score_diff)
secondary_over_threshold = (secondary['identity'] >= threshold)
secondary_good_diff = (secondary['identity'] >= secondary_second['identity'] + score_diff)
secondary_good_diff = (secondary_second is None or secondary['identity'] >= secondary_second['identity'] + score_diff)
ids_match = (primary['id'] == secondary['id'])

if single_barcode:
@@ -267,6 +296,28 @@ def get_score(id, query, reference, open, extend, matrix):

return result

def get_identity(id, query, reference, open, extend, matrix):
if reference is None:
result = {
'id': id,
'score': 0,
'identity': 0,
}

return result

stats = parasail.sg_qx_stats_striped_sat(query, reference, open, extend, matrix)

result = {
'id': id,
'score': stats.score,
'identity': stats.matches / stats.length
}

del stats

return result


def get_stats(id, query, reference, open, extend, matrix):
if reference is None:
@@ -285,9 +336,9 @@ def get_stats(id, query, reference, open, extend, matrix):
result = {
'id': id,
'matches': stats.matches,
'length': stats.len_ref,
'length': stats.length,
'score': stats.score,
'identity': stats.matches / stats.len_ref
'identity': stats.matches / stats.length
}

del stats
@@ -325,7 +376,8 @@ def get_all(id, query, reference, open, extend, matrix):
'id': id,
'matches': stats.matches,
'score': stats.score,
'identity': stats.matches / stats.len_ref
'length': stats.length,
'identity': stats.matches / stats.length
}

del stats
@@ -335,7 +387,7 @@ def get_all(id, query, reference, open, extend, matrix):
traceback = trace.get_traceback()
cigar = trace.get_cigar()

result['length'] = len(traceback.comp)
#result['length'] = len(traceback.comp)
result['mismatches'] = traceback.comp.count('.')
result['similarity'] = result['matches'] / (result['matches'] + result['mismatches'])

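A side note on the porechop-mode hunks above: the second-best result can now be `None` (when only one barcode was scored), so the identity-difference check guards against it. A toy illustration of that thresholding with made-up numbers:

```python
# Made-up values, purely to illustrate the thresholding logic in call_barcode_porechop_mode.
threshold = 0.75
score_diff = 0.1

primary = {'id': 'BC01', 'identity': 0.87}          # best match for this read end
primary_second = {'id': 'BC05', 'identity': 0.62}   # runner-up, or None if only one barcode scored

primary_over_threshold = primary['identity'] >= threshold
primary_good_diff = (primary_second is None
                     or primary['identity'] >= primary_second['identity'] + score_diff)

call = primary['id'] if (primary_over_threshold and primary_good_diff) else 'unassigned'
print(call)  # BC01
```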
77 changes: 45 additions & 32 deletions readucks/readucks.py
@@ -25,7 +25,7 @@
from Bio import SeqIO
import parasail

from .demuxer import set_alignment_settings, demux_read, print_result
from .demuxer import set_alignment_settings, demux_read, best_read_identity, print_result
from .barcodes import NATIVE_BARCODES, PCR_BARCODES, RAPID_BARCODES
from .misc import bold_underline, MyHelpFormatter, output_progress_line
from .version import __version__
@@ -122,40 +122,57 @@ def get_barcode_list(barcode_set, limit_barcodes_to, verbosity):

return barcode_list

def filter_barcodes_by_counts(barcode_set, barcode_counts, high_identity_barcodes, verbosity, min_count=2, min_frac=0.0025, min_ratio=0.015):
def filter_barcodes(barcode_set, start_identities, end_identities, adapter_threshold, verbosity):
subset_barcodes = []
total = sum([barcode_counts[barcode] for barcode in barcode_counts])
max_count = max([barcode_counts[barcode] for barcode in barcode_counts])

for barcode in high_identity_barcodes:
if barcode == 'unassigned' or barcode not in barcode_counts:
continue
#if barcode_counts[barcode] > min_count and barcode_counts[barcode] > min_frac*total \
# and barcode_counts[barcode] > min_ratio*max_count:
subset_barcodes.append(barcode)

for barcode_id in start_identities:
if start_identities[barcode_id] > adapter_threshold and end_identities[barcode_id] > adapter_threshold:
subset_barcodes.append(barcode_id)
barcode_list = get_barcode_list(barcode_set, subset_barcodes, verbosity)
return barcode_list

def run_check_reads(read_files, output, barcode_list, check_reads, adapter_threshold, settings, verbosity, threads, batch_size):
high_identity_barcodes = set()
barcode_counts = defaultdict(int)

if verbosity > 0:
print(bold_underline("\nProcessing files for barcodes"), flush=True)
output_progress_line(0, len(read_files))
def run_check_reads(read_files, barcode_list, check_reads, adapter_threshold, settings, batch_size):
start_identities = {}
end_identities = {}

for index, read_file in enumerate(read_files):
file_type = 'fastq'
if read_file.lower().endswith('.fasta'):
file_type = 'fasta'

file_high_identity_barcodes = process_read_file(read_file, output, barcode_list, settings, barcode_counts,
adapter_threshold, verbosity, threads, batch_size)
high_identity_barcodes.update(file_high_identity_barcodes)
records = SeqIO.index(read_file, file_type)
read_names = list(records.keys())
n_reads = min(len(records), check_reads)

total_counts = sum([barcode_counts[barcode] for barcode in barcode_counts])
if total_counts > check_reads:
barcode_list = filter_barcodes_by_counts(settings['barcode_set'], barcode_counts, high_identity_barcodes, verbosity)
for i in range(0, n_reads, batch_size):
reads = [records[r] for r in read_names[i:i + batch_size]]
batch_start_identities, batch_end_identities = best_read_identity(reads, barcode_list, settings['barcode_set'])

for id in barcode_list:
if id in start_identities:
start_identities[id] = max(start_identities[id], batch_start_identities[id])
else:
start_identities[id] = batch_start_identities[id]

if id in end_identities:
end_identities[id] = max(end_identities[id], batch_end_identities[id])
else:
end_identities[id] = batch_end_identities[id]

check_reads = check_reads - n_reads
if check_reads <= 0:
break

if settings['verbosity'] > 0:
print("Best start and end identities")
for id in barcode_list:
print("%s\tStart: %f\tEnd: %f" %(id, start_identities[id], end_identities[id]), flush=True)

barcode_list = filter_barcodes(settings['barcode_set'], start_identities, end_identities, adapter_threshold, settings['verbosity'])
return barcode_list



def process_files(input_path, output, limit_barcodes_to, check_reads, adapter_threshold, settings, verbosity, threads, batch_size):
"""
Core function to process one or more input files and create the required output files.
@@ -182,15 +199,16 @@ def process_files(input_path, output, limit_barcodes_to, check_reads, adapter_th
output['file_type'] = get_output_file_type(read_files)

if check_reads:
barcode_list = run_check_reads(read_files, output, barcode_list, check_reads, adapter_threshold, settings, verbosity, threads, batch_size)
print("Use first %i reads to subset barcodes" %check_reads)
barcode_list = run_check_reads(read_files, barcode_list, check_reads, adapter_threshold, settings, batch_size)

if verbosity > 0:
print(bold_underline("\nProcessing files"), flush=True)
output_progress_line(0, len(read_files))

for index, read_file in enumerate(read_files):

process_read_file(read_file, output, barcode_list, settings, barcode_counts, adapter_threshold, verbosity, threads, batch_size)
process_read_file(read_file, output, barcode_list, settings, barcode_counts, verbosity, threads, batch_size)

if verbosity > 0:
output_progress_line(index, len(read_files))
@@ -270,7 +288,7 @@ def get_output_file_type(read_files):
return file_type


def process_read_file(read_file, output, barcodes, settings, barcode_counts, adapter_threshold, verbosity, threads = 1, batch_size = 200):
def process_read_file(read_file, output, barcodes, settings, barcode_counts, verbosity, threads = 1, batch_size = 200):
"""
Iterates through the reads in an input file and bins or filters them into the
output files as required.
@@ -295,7 +313,6 @@ def process_read_file(read_file, output, barcodes, settings, barcode_counts, ada
n_reads = len(records)

results = []
high_identity_barcodes = set()
for i in range(0, n_reads, batch_size):
reads = [records[r] for r in read_names[i:i + batch_size]]

@@ -348,9 +365,6 @@ def process_read_file(read_file, output, barcodes, settings, barcode_counts, ada
# threadpool map function maintains the same order as the input data
for result in results:
barcode_counts[result['call']] += 1
if result['primary']['identity'] >= adapter_threshold \
and result['primary']['score'] > 0:
high_identity_barcodes.add(result['primary']['id'])

if annotation_file:
if output['extended_info']:
@@ -390,7 +404,6 @@ def process_read_file(read_file, output, barcodes, settings, barcode_counts, ada
summary_file.close()

records.close()
return high_identity_barcodes

def bin_read(read, result, output):
"""
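Taken together with readucks.py above, the intended flow is: scan the first `check_reads` reads, record each barcode's best start and end identity with the new `best_read_identity`, and keep only barcodes whose start and end identities both clear `adapter_threshold` before demultiplexing everything. A rough standalone sketch of that flow; the barcode table, sequences, threshold and alignment settings are illustrative (the real tables live in readucks/barcodes.py and the real wiring in process_files):

```python
import itertools

import parasail
from Bio import SeqIO

from readucks.demuxer import best_read_identity, set_alignment_settings

# Illustrative alignment settings; readucks derives these from its command-line options.
set_alignment_settings(10, 1, parasail.matrix_create("ACGT", 3, -6))

# Toy barcode table in the shape best_read_identity expects: id -> {'start', 'end'}.
barcodes = {
    'BC01': {'start': 'AAGAAAGTTGTCGGTGTCTTTGTG', 'end': 'CACAAAGACACCGACAACTTTCTT'},
    'BC02': {'start': 'TCGATTCCGTTTGTAGTCGTCTGT', 'end': 'ACAGACGACTACAAACGGAATCGA'},
}

check_reads = 200         # number of reads to preprocess
adapter_threshold = 0.9   # illustrative identity threshold

# Take the first N reads without loading the whole file into memory.
reads = list(itertools.islice(SeqIO.parse("reads.fastq", "fastq"), check_reads))

start_identities, end_identities = best_read_identity(reads, barcodes, 'native')

keep = [b for b in barcodes
        if start_identities[b] > adapter_threshold and end_identities[b] > adapter_threshold]
print("Barcodes kept for full demultiplexing:", keep)
```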
