assemble reads and then trimmed assembled sequences down to desired l…

…evels of coverage
openvax · Oct 27, 2016 · d13e68d · d13e68d
1 parent f60e654
commit d13e68d
Show file tree

Hide file tree

Showing 6 changed files with 203 additions and 81 deletions.
diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py
@@ -84,15 +84,15 @@ def allele_read_from_locus_read(locus_read, n_ref):
     if ref_pos_before is None:
         logger.warn(
             "Missing reference pos for nucleotide before variant on read: %s",
-                locus_read)
+            locus_read)
         return None
 
     ref_pos_after = reference_positions[read_pos_after]
 
     if ref_pos_after is None:
         logger.warn(
             "Missing reference pos for nucleotide after variant on read: %s",
-                locus_read)
+            locus_read)
         return None
 
     if n_ref == 0:
@@ -102,9 +102,9 @@ def allele_read_from_locus_read(locus_read, n_ref):
             # don't use this read
             logger.debug(
                 "Positions before (%d) and after (%d) variant should be adjacent on read %s",
-                    ref_pos_before,
-                    ref_pos_after,
-                    locus_read)
+                ref_pos_before,
+                ref_pos_after,
+                locus_read)
             return None
 
         # insertions require a sequence of non-aligned bases
@@ -125,9 +125,9 @@ def allele_read_from_locus_read(locus_read, n_ref):
             # don't use this read
             logger.debug(
                 "Positions before (%d) and after (%d) variant should be adjacent on read %s",
-                    ref_pos_before,
-                    ref_pos_after,
-                    locus_read)
+                ref_pos_before,
+                ref_pos_after,
+                locus_read)
             return None
 
     nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]
@@ -206,9 +206,11 @@ def reads_overlapping_variant(
     if chromosome is None:
         chromosome = variant.contig
 
-    logger.info("Gathering variant reads for variant %s (chromosome=%s)",
+    logger.info(
+        "Gathering variant reads for variant %s (chromosome = %s, gene names = %s)",
         variant,
-        chromosome)
+        chromosome,
+        variant.gene_names)
 
     base1_position, ref, alt = trim_variant(variant)
 
@@ -271,7 +273,9 @@ def reads_overlapping_variants(variants, samfile, **kwargs):
         else:
             logger.warn(
                 "Chromosome '%s' from variant %s not in alignment file %s",
-                    chromosome, variant, samfile.filename)
+                chromosome,
+                variant,
+                samfile.filename)
             continue
         allele_reads = reads_overlapping_variant(
             samfile=samfile,

diff --git a/isovar/assembly.py b/isovar/assembly.py
@@ -19,9 +19,6 @@
 # I'd rather just use list.copy but Python 2.7 doesn't support it
 from copy import copy
 
-from .read_helpers import group_unique_sequences
-from .variant_sequences import VariantSequence
-
 
 logger = logging.getLogger(__name__)
 
@@ -68,14 +65,6 @@ def sort_by_decreasing_total_length(seq):
     """
     return -len(seq.sequence)
 
-def combine_variant_sequences(variant_sequence1, variant_sequence2):
-    return VariantSequence(
-        prefix=variant_sequence1.prefix,
-        alt=variant_sequence1.alt,
-        suffix=variant_sequence2.suffix,
-        reads=set(
-            variant_sequence1.reads).union(set(variant_sequence2.reads)))
-
 def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
     """
     Greedily merge overlapping sequences into longer sequences.
@@ -103,10 +92,15 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
                     variant_sequence2,
                     min_overlap_size=min_overlap_size):
                 continue
-            combined = combine_variant_sequences(
-                variant_sequence1, variant_sequence2)
+            combined = variant_sequence1.combine(variant_sequence2)
             if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)):
                 # if the combined sequence isn't any longer, then why bother?
+                logger.info(
+                    "Skipping %s since doesn't increase length beyond max(%d, %d)",
+                    combined,
+                    len(variant_sequence1),
+                    len(variant_sequence2)
+                )
                 continue
             if combined.sequence in merged_variant_sequences:
                 # it's possible to get the same merged sequence from distinct
@@ -120,8 +114,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
                 # sequence is supported by any one read.
                 existing_record_with_same_sequence = merged_variant_sequences[
                     combined.sequence]
-                combined_with_more_reads = combine_variant_sequences(
-                    existing_record_with_same_sequence,
+                combined_with_more_reads = existing_record_with_same_sequence.combine(
                     combined)
                 merged_variant_sequences[combined.sequence] = combined_with_more_reads
             else:
@@ -168,7 +161,7 @@ def sort_by_decreasing_read_count_and_sequence_lenth(variant_sequence):
     """
     return -len(variant_sequence.reads), -len(variant_sequence.sequence)
 
-def iterative_assembly(
+def iterative_overlap_assembly(
         variant_sequences,
         min_overlap_size=30,
         n_merge_iters=2):
@@ -185,9 +178,9 @@ def iterative_assembly(
 
         logger.info(
             "Iteration #%d of assembly: generated %d sequences (from %d)",
-                i + 1,
-                len(variant_sequences),
-                len(previous_sequences))
+            i + 1,
+            len(variant_sequences),
+            len(previous_sequences))
 
         if len(variant_sequences) == 0:
             # if the greedy merge procedure fails for all pairs of candidate
@@ -199,31 +192,9 @@ def iterative_assembly(
         variant_sequences = collapse_substrings(variant_sequences)
         logger.info(
             "After collapsing subsequences, %d distinct sequences left",
-                len(variant_sequences))
+            len(variant_sequences))
         if len(variant_sequences) == 1:
             # once we have only one sequence then there's no point trying
             # to further combine sequences
             break
     return variant_sequences
-
-def assemble_reads_into_variant_sequences(
-        variant_reads,
-        min_overlap_size=30,
-        n_merge_iters=2):
-    """
-    Turn a collection of AlleleRead objects into a collection of VariantSequence
-    objects, which are returned in order of (# supported reads, sequence length)
-    """
-    initial_variant_sequences = [
-        VariantSequence(
-            prefix=prefix,
-            alt=alt,
-            suffix=suffix,
-            reads=reads)
-        for (prefix, alt, suffix), reads in
-        group_unique_sequences(variant_reads).items()
-    ]
-    return iterative_assembly(
-        initial_variant_sequences,
-        min_overlap_size=min_overlap_size,
-        n_merge_iters=n_merge_iters)