Skip to content

Commit

Permalink
assemble reads and then trimmed assembled sequences down to desired l…
Browse files Browse the repository at this point in the history
…evels of coverage
  • Loading branch information
iskandr committed Oct 27, 2016
1 parent f60e654 commit d13e68d
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 81 deletions.
26 changes: 15 additions & 11 deletions isovar/allele_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,15 @@ def allele_read_from_locus_read(locus_read, n_ref):
if ref_pos_before is None:
logger.warn(
"Missing reference pos for nucleotide before variant on read: %s",
locus_read)
locus_read)
return None

ref_pos_after = reference_positions[read_pos_after]

if ref_pos_after is None:
logger.warn(
"Missing reference pos for nucleotide after variant on read: %s",
locus_read)
locus_read)
return None

if n_ref == 0:
Expand All @@ -102,9 +102,9 @@ def allele_read_from_locus_read(locus_read, n_ref):
# don't use this read
logger.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s",
ref_pos_before,
ref_pos_after,
locus_read)
ref_pos_before,
ref_pos_after,
locus_read)
return None

# insertions require a sequence of non-aligned bases
Expand All @@ -125,9 +125,9 @@ def allele_read_from_locus_read(locus_read, n_ref):
# don't use this read
logger.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s",
ref_pos_before,
ref_pos_after,
locus_read)
ref_pos_before,
ref_pos_after,
locus_read)
return None

nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]
Expand Down Expand Up @@ -206,9 +206,11 @@ def reads_overlapping_variant(
if chromosome is None:
chromosome = variant.contig

logger.info("Gathering variant reads for variant %s (chromosome=%s)",
logger.info(
"Gathering variant reads for variant %s (chromosome = %s, gene names = %s)",
variant,
chromosome)
chromosome,
variant.gene_names)

base1_position, ref, alt = trim_variant(variant)

Expand Down Expand Up @@ -271,7 +273,9 @@ def reads_overlapping_variants(variants, samfile, **kwargs):
else:
logger.warn(
"Chromosome '%s' from variant %s not in alignment file %s",
chromosome, variant, samfile.filename)
chromosome,
variant,
samfile.filename)
continue
allele_reads = reads_overlapping_variant(
samfile=samfile,
Expand Down
55 changes: 13 additions & 42 deletions isovar/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@
# I'd rather just use list.copy but Python 2.7 doesn't support it
from copy import copy

from .read_helpers import group_unique_sequences
from .variant_sequences import VariantSequence


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -68,14 +65,6 @@ def sort_by_decreasing_total_length(seq):
"""
return -len(seq.sequence)

def combine_variant_sequences(variant_sequence1, variant_sequence2):
return VariantSequence(
prefix=variant_sequence1.prefix,
alt=variant_sequence1.alt,
suffix=variant_sequence2.suffix,
reads=set(
variant_sequence1.reads).union(set(variant_sequence2.reads)))

def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
"""
Greedily merge overlapping sequences into longer sequences.
Expand Down Expand Up @@ -103,10 +92,15 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
variant_sequence2,
min_overlap_size=min_overlap_size):
continue
combined = combine_variant_sequences(
variant_sequence1, variant_sequence2)
combined = variant_sequence1.combine(variant_sequence2)
if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)):
# if the combined sequence isn't any longer, then why bother?
logger.info(
"Skipping %s since doesn't increase length beyond max(%d, %d)",
combined,
len(variant_sequence1),
len(variant_sequence2)
)
continue
if combined.sequence in merged_variant_sequences:
# it's possible to get the same merged sequence from distinct
Expand All @@ -120,8 +114,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
# sequence is supported by any one read.
existing_record_with_same_sequence = merged_variant_sequences[
combined.sequence]
combined_with_more_reads = combine_variant_sequences(
existing_record_with_same_sequence,
combined_with_more_reads = existing_record_with_same_sequence.combine(
combined)
merged_variant_sequences[combined.sequence] = combined_with_more_reads
else:
Expand Down Expand Up @@ -168,7 +161,7 @@ def sort_by_decreasing_read_count_and_sequence_lenth(variant_sequence):
"""
return -len(variant_sequence.reads), -len(variant_sequence.sequence)

def iterative_assembly(
def iterative_overlap_assembly(
variant_sequences,
min_overlap_size=30,
n_merge_iters=2):
Expand All @@ -185,9 +178,9 @@ def iterative_assembly(

logger.info(
"Iteration #%d of assembly: generated %d sequences (from %d)",
i + 1,
len(variant_sequences),
len(previous_sequences))
i + 1,
len(variant_sequences),
len(previous_sequences))

if len(variant_sequences) == 0:
# if the greedy merge procedure fails for all pairs of candidate
Expand All @@ -199,31 +192,9 @@ def iterative_assembly(
variant_sequences = collapse_substrings(variant_sequences)
logger.info(
"After collapsing subsequences, %d distinct sequences left",
len(variant_sequences))
len(variant_sequences))
if len(variant_sequences) == 1:
# once we have only one sequence then there's no point trying
# to further combine sequences
break
return variant_sequences

def assemble_reads_into_variant_sequences(
variant_reads,
min_overlap_size=30,
n_merge_iters=2):
"""
Turn a collection of AlleleRead objects into a collection of VariantSequence
objects, which are returned in order of (# supported reads, sequence length)
"""
initial_variant_sequences = [
VariantSequence(
prefix=prefix,
alt=alt,
suffix=suffix,
reads=reads)
for (prefix, alt, suffix), reads in
group_unique_sequences(variant_reads).items()
]
return iterative_assembly(
initial_variant_sequences,
min_overlap_size=min_overlap_size,
n_merge_iters=n_merge_iters)
Loading

0 comments on commit d13e68d

Please sign in to comment.