diff --git a/.travis.yml b/.travis.yml index 01c9b9c..d15bdef 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,12 @@ language: python python: - "2.7" - "3.4" +cache: + pip: true + # cache directory used for Ensembl downloads of GTF and FASTA files + # along with the indexed db of intervals and ID mappings and pickles + # of sequence dictionaries + directories: /home/travis/.cache/pyensembl/ before_install: # Commands below copied from: http://conda.pydata.org/docs/travis.html # We do this conditionally because it saves us some downloading if the diff --git a/isovar/__init__.py b/isovar/__init__.py index 42419f9..f4cac02 100644 --- a/isovar/__init__.py +++ b/isovar/__init__.py @@ -14,4 +14,4 @@ from __future__ import print_function, division, absolute_import -__version__ = "0.2.4" +__version__ = "0.3.0" diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py index 4508559..f6fb32e 100644 --- a/isovar/allele_reads.py +++ b/isovar/allele_reads.py @@ -34,18 +34,15 @@ logger = logging.getLogger(__name__) - # subclassing from namedtuple to get a lightweight object with built-in # hashing and comparison while also being able to add methods -AlleleReadFields = namedtuple( +AlleleReadBase = namedtuple( "AlleleRead", "prefix allele suffix name sequence") -class AlleleRead(AlleleReadFields): +class AlleleRead(AlleleReadBase): def __new__(cls, prefix, allele, suffix, name): - # construct sequence from prefix + alt + suffix - return AlleleReadFields.__new__( - cls, + return AlleleReadBase( prefix=prefix, allele=allele, suffix=suffix, @@ -55,100 +52,102 @@ def __new__(cls, prefix, allele, suffix, name): def __len__(self): return len(self.prefix) + len(self.allele) + len(self.suffix) -def allele_read_from_locus_read(locus_read, n_ref): - """ - Given a single ReadAtLocus object, return either an AlleleRead or None - - Parameters - ---------- - locus_read : LocusRead - Read which overlaps a variant locus but doesn't necessarily contain the - alternate nucleotides - - n_ref : int - Number of reference positions we are expecting to be modified or - deleted (for insertions this should be 0) - """ - sequence = locus_read.sequence - reference_positions = locus_read.reference_positions - - # positions of the nucleotides before and after the variant within - # the read sequence - read_pos_before = locus_read.base0_read_position_before_variant - read_pos_after = locus_read.base0_read_position_after_variant - - # positions of the nucleotides before and after the variant on the - # reference genome - ref_pos_before = reference_positions[read_pos_before] - - if ref_pos_before is None: - logger.warn( - "Missing reference pos for nucleotide before variant on read: %s", + @classmethod + def from_locus_read(cls, locus_read, n_ref): + """ + Given a single LocusRead object, return either an AlleleRead or None + + Parameters + ---------- + locus_read : LocusRead + Read which overlaps a variant locus but doesn't necessarily contain the + alternate nucleotides + + n_ref : int + Number of reference positions we are expecting to be modified or + deleted (for insertions this should be 0) + """ + sequence = locus_read.sequence + reference_positions = locus_read.reference_positions + + # positions of the nucleotides before and after the variant within + # the read sequence + read_pos_before = locus_read.base0_read_position_before_variant + read_pos_after = locus_read.base0_read_position_after_variant + + # positions of the nucleotides before and after the variant on the + # reference genome + ref_pos_before = reference_positions[read_pos_before] + + if ref_pos_before is None: + logger.warn( + "Missing reference pos for nucleotide before variant on read: %s", locus_read) - return None + return None - ref_pos_after = reference_positions[read_pos_after] + ref_pos_after = reference_positions[read_pos_after] - if ref_pos_after is None: - logger.warn( - "Missing reference pos for nucleotide after variant on read: %s", + if ref_pos_after is None: + logger.warn( + "Missing reference pos for nucleotide after variant on read: %s", locus_read) - return None - - if n_ref == 0: - if ref_pos_after - ref_pos_before != 1: - # if the number of nucleotides skipped isn't the same - # as the number of reference nucleotides in the variant then - # don't use this read - logger.debug( - "Positions before (%d) and after (%d) variant should be adjacent on read %s", + return None + + if n_ref == 0: + if ref_pos_after - ref_pos_before != 1: + # if the number of nucleotides skipped isn't the same + # as the number of reference nucleotides in the variant then + # don't use this read + logger.debug( + "Positions before (%d) and after (%d) variant should be adjacent on read %s", ref_pos_before, ref_pos_after, locus_read) - return None - - # insertions require a sequence of non-aligned bases - # followed by the subsequence reference position - ref_positions_for_inserted = reference_positions[ - read_pos_before + 1:read_pos_after] - if any(insert_pos is not None for insert_pos in ref_positions_for_inserted): - # all these inserted nucleotides should *not* align to the - # reference - logger.debug( - "Skipping read, inserted nucleotides shouldn't map to reference") - return None - else: - # substitutions and deletions - if ref_pos_after - ref_pos_before != n_ref + 1: - # if the number of nucleotides skipped isn't the same - # as the number of reference nucleotides in the variant then - # don't use this read - logger.debug( - "Positions before (%d) and after (%d) variant should be adjacent on read %s", + return None + + # insertions require a sequence of non-aligned bases + # followed by the subsequence reference position + ref_positions_for_inserted = reference_positions[ + read_pos_before + 1:read_pos_after] + if any(insert_pos is not None for insert_pos in ref_positions_for_inserted): + # all these inserted nucleotides should *not* align to the + # reference + logger.debug( + "Skipping read, inserted nucleotides shouldn't map to reference") + return None + else: + # substitutions and deletions + if ref_pos_after - ref_pos_before != n_ref + 1: + # if the number of nucleotides skipped isn't the same + # as the number of reference nucleotides in the variant then + # don't use this read + logger.debug( + ("Positions before (%d) and after (%d) variant should be " + "adjacent on read %s"), ref_pos_before, ref_pos_after, locus_read) - return None + return None - nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after] + nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after] - prefix = sequence[:read_pos_before + 1] - suffix = sequence[read_pos_after:] + prefix = sequence[:read_pos_before + 1] + suffix = sequence[read_pos_after:] - prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix) - prefix, suffix = trim_N_nucleotides(prefix, suffix) + prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix) + prefix, suffix = trim_N_nucleotides(prefix, suffix) - return AlleleRead( - prefix, - nucleotides_at_variant_locus, - suffix, - name=locus_read.name) + return cls( + prefix, + nucleotides_at_variant_locus, + suffix, + name=locus_read.name) def allele_reads_from_locus_reads(locus_reads, n_ref): """ - Given a collection of ReadAtLocus objects, returns a - list of VariantRead objects (which are split into prefix/variant/suffix - nucleotides). + Given a collection of LocusRead objects, returns a + list of AlleleRead objects + (which are split into prefix/allele/suffix nucleotide strings). Parameters ---------- @@ -161,7 +160,7 @@ def allele_reads_from_locus_reads(locus_reads, n_ref): """ for locus_read in locus_reads: - allele_read = allele_read_from_locus_read(locus_read, n_ref) + allele_read = AlleleRead.from_locus_read(locus_read, n_ref) if allele_read is None: continue else: @@ -206,9 +205,11 @@ def reads_overlapping_variant( if chromosome is None: chromosome = variant.contig - logger.info("Gathering variant reads for variant %s (chromosome=%s)", + logger.info( + "Gathering variant reads for variant %s (chromosome = %s, gene names = %s)", variant, - chromosome) + chromosome, + variant.gene_names) base1_position, ref, alt = trim_variant(variant) @@ -271,7 +272,9 @@ def reads_overlapping_variants(variants, samfile, **kwargs): else: logger.warn( "Chromosome '%s' from variant %s not in alignment file %s", - chromosome, variant, samfile.filename) + chromosome, + variant, + samfile.filename) continue allele_reads = reads_overlapping_variant( samfile=samfile, diff --git a/isovar/assembly.py b/isovar/assembly.py index c19156f..86c78fc 100644 --- a/isovar/assembly.py +++ b/isovar/assembly.py @@ -19,13 +19,9 @@ # I'd rather just use list.copy but Python 2.7 doesn't support it from copy import copy -from .read_helpers import group_unique_sequences -from .variant_sequences import VariantSequence - logger = logging.getLogger(__name__) - MIN_OVERLAP_SIZE = 30 def sort_by_decreasing_prefix_length(seq): @@ -68,14 +64,6 @@ def sort_by_decreasing_total_length(seq): """ return -len(seq.sequence) -def combine_variant_sequences(variant_sequence1, variant_sequence2): - return VariantSequence( - prefix=variant_sequence1.prefix, - alt=variant_sequence1.alt, - suffix=variant_sequence2.suffix, - reads=set( - variant_sequence1.reads).union(set(variant_sequence2.reads))) - def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE): """ Greedily merge overlapping sequences into longer sequences. @@ -103,11 +91,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE): variant_sequence2, min_overlap_size=min_overlap_size): continue - combined = combine_variant_sequences( - variant_sequence1, variant_sequence2) - if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)): - # if the combined sequence isn't any longer, then why bother? - continue + combined = variant_sequence1.combine(variant_sequence2) if combined.sequence in merged_variant_sequences: # it's possible to get the same merged sequence from distinct # input sequences @@ -120,8 +104,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE): # sequence is supported by any one read. existing_record_with_same_sequence = merged_variant_sequences[ combined.sequence] - combined_with_more_reads = combine_variant_sequences( - existing_record_with_same_sequence, + combined_with_more_reads = existing_record_with_same_sequence.combine( combined) merged_variant_sequences[combined.sequence] = combined_with_more_reads else: @@ -168,7 +151,7 @@ def sort_by_decreasing_read_count_and_sequence_lenth(variant_sequence): """ return -len(variant_sequence.reads), -len(variant_sequence.sequence) -def iterative_assembly( +def iterative_overlap_assembly( variant_sequences, min_overlap_size=30, n_merge_iters=2): @@ -185,9 +168,9 @@ def iterative_assembly( logger.info( "Iteration #%d of assembly: generated %d sequences (from %d)", - i + 1, - len(variant_sequences), - len(previous_sequences)) + i + 1, + len(variant_sequences), + len(previous_sequences)) if len(variant_sequences) == 0: # if the greedy merge procedure fails for all pairs of candidate @@ -199,31 +182,9 @@ def iterative_assembly( variant_sequences = collapse_substrings(variant_sequences) logger.info( "After collapsing subsequences, %d distinct sequences left", - len(variant_sequences)) + len(variant_sequences)) if len(variant_sequences) == 1: # once we have only one sequence then there's no point trying # to further combine sequences break return variant_sequences - -def assemble_reads_into_variant_sequences( - variant_reads, - min_overlap_size=30, - n_merge_iters=2): - """ - Turn a collection of AlleleRead objects into a collection of VariantSequence - objects, which are returned in order of (# supported reads, sequence length) - """ - initial_variant_sequences = [ - VariantSequence( - prefix=prefix, - alt=alt, - suffix=suffix, - reads=reads) - for (prefix, alt, suffix), reads in - group_unique_sequences(variant_reads).items() - ] - return iterative_assembly( - initial_variant_sequences, - min_overlap_size=min_overlap_size, - n_merge_iters=n_merge_iters) diff --git a/isovar/cli/variant_sequences.py b/isovar/cli/variant_sequences.py index b27fd89..0f85713 100644 --- a/isovar/cli/variant_sequences.py +++ b/isovar/cli/variant_sequences.py @@ -38,6 +38,11 @@ def add_variant_sequence_args( default=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, help="Minimum number of reads supporting a variant sequence (default %(default)s)") + rna_sequence_group.add_argument( + "--variant-sequence-assembly", + default=False, + action="store_true") + # when cDNA sequence length can be inferred from a protein length then # we may want to omit this arg if add_sequence_length_arg: @@ -76,7 +81,8 @@ def variant_sequences_generator_from_args(args): return reads_generator_to_sequences_generator( allele_reads_generator, min_reads_supporting_cdna_sequence=args.min_reads_supporting_variant_sequence, - preferred_sequence_length=args.cdna_sequence_length) + preferred_sequence_length=args.cdna_sequence_length, + variant_cdna_sequence_assembly=args.variant_sequence_assembly) def variant_sequences_dataframe_from_args(args): variant_sequences_generator = variant_sequences_generator_from_args(args) diff --git a/isovar/default_parameters.py b/isovar/default_parameters.py index b279b05..1bbab69 100644 --- a/isovar/default_parameters.py +++ b/isovar/default_parameters.py @@ -72,3 +72,8 @@ # number of protein sequences we want to return per variant MAX_PROTEIN_SEQUENCES_PER_VARIANT = 1 + +# run overlap assembly algorithm to construct variant +# sequences from multiple reads which only partially +# overlap (rather than fully spanning a coding sequence) +VARIANT_CDNA_SEQUENCE_ASSEMBLY = False diff --git a/isovar/locus_reads.py b/isovar/locus_reads.py index 971595b..f5f8a81 100644 --- a/isovar/locus_reads.py +++ b/isovar/locus_reads.py @@ -33,154 +33,154 @@ logger = logging.getLogger(__name__) - -LocusRead = namedtuple( - "LocusRead", - [ +class LocusRead(namedtuple("LocusRead", [ "name", "sequence", "reference_positions", "quality_scores", "base0_read_position_before_variant", - "base0_read_position_after_variant", - ]) - -def create_locus_read_from_pysam_pileup_element( - pileup_element, - base0_position_before_variant, - base0_position_after_variant, - use_secondary_alignments, - use_duplicate_reads, - min_mapping_quality): - """ - Parameters - ---------- - pileup_element : pysam.PileupRead - - base0_position_before_variant : int + "base0_read_position_after_variant"])): - base0_position_after_variant : int - - use_secondary_alignments : bool - - use_duplicate_reads : bool - - min_mapping_quality : int - - Returns LocusRead or None - """ - read = pileup_element.alignment - - # For future reference, may get overlapping reads - # which can be identified by having the same name - name = read.query_name - - if name is None: - logger.warn("Read missing name at position %d", - base0_position_before_variant + 1) - return None - - if read.is_unmapped: - logger.warn( - "How did we get unmapped read '%s' in a pileup?", name) - return None - - if pileup_element.is_refskip: - # if read sequence doesn't actually align to the reference - # base before a variant, skip it - logger.debug("Skipping pileup element with CIGAR alignment N (intron)") - return None - elif pileup_element.is_del: - logger.debug( - "Skipping deletion at position %d (read name = %s)", + @classmethod + def from_pysam_pileup_element( + cls, + pileup_element, + base0_position_before_variant, + base0_position_after_variant, + use_secondary_alignments, + use_duplicate_reads, + min_mapping_quality): + """ + Parameters + ---------- + pileup_element : pysam.PileupRead + + base0_position_before_variant : int + + base0_position_after_variant : int + + use_secondary_alignments : bool + + use_duplicate_reads : bool + + min_mapping_quality : int + + Returns LocusRead or None + """ + read = pileup_element.alignment + + # For future reference, may get overlapping reads + # which can be identified by having the same name + name = read.query_name + + if name is None: + logger.warn( + "Read missing name at position %d", + base0_position_before_variant + 1) + return None + + if read.is_unmapped: + logger.warn( + "How did we get unmapped read '%s' in a pileup?", name) + return None + + if pileup_element.is_refskip: + # if read sequence doesn't actually align to the reference + # base before a variant, skip it + logger.debug("Skipping pileup element with CIGAR alignment N (intron)") + return None + elif pileup_element.is_del: + logger.debug( + "Skipping deletion at position %d (read name = %s)", base0_position_before_variant + 1, name) - return None + return None - if read.is_secondary and not use_secondary_alignments: - logger.debug("Skipping secondary alignment of read '%s'", name) - return None + if read.is_secondary and not use_secondary_alignments: + logger.debug("Skipping secondary alignment of read '%s'", name) + return None - if read.is_duplicate and not use_duplicate_reads: - logger.debug("Skipping duplicate read '%s'", name) - return None + if read.is_duplicate and not use_duplicate_reads: + logger.debug("Skipping duplicate read '%s'", name) + return None - mapping_quality = read.mapping_quality + mapping_quality = read.mapping_quality - missing_mapping_quality = mapping_quality is None - if min_mapping_quality > 0 and missing_mapping_quality: - logger.debug("Skipping read '%s' due to missing MAPQ", name) - return None - elif mapping_quality < min_mapping_quality: - logger.debug( - "Skipping read '%s' due to low MAPQ: %d < %d", + missing_mapping_quality = mapping_quality is None + if min_mapping_quality > 0 and missing_mapping_quality: + logger.debug("Skipping read '%s' due to missing MAPQ", name) + return None + elif mapping_quality < min_mapping_quality: + logger.debug( + "Skipping read '%s' due to low MAPQ: %d < %d", read.mapping_quality, mapping_quality, min_mapping_quality) - return None - - sequence = read.query_sequence - - if sequence is None: - logger.warn("Read '%s' missing sequence", name) - return None - - base_qualities = read.query_qualities - - if base_qualities is None: - logger.warn("Read '%s' missing base qualities", name) - return None - - # - # Documentation for pysam.AlignedSegment.get_reference_positions: - # ------------------------------------------------------------------ - # By default, this method only returns positions in the reference - # that are within the alignment. If full_length is set, None values - # will be included for any soft-clipped or unaligned positions - # within the read. The returned list will thus be of the same length - # as the read. - # - # Source: - # http://pysam.readthedocs.org/en/latest/ - # api.html#pysam.AlignedSegment.get_reference_positions - # - # We want a None value for every read position that does not have a - # corresponding reference position. - reference_positions = read.get_reference_positions( - full_length=True) - - # pysam uses base-0 positions everywhere except region strings - # Source: - # http://pysam.readthedocs.org/en/latest/faq.html#pysam-coordinates-are-wrong - if base0_position_before_variant not in reference_positions: - logger.debug( - "Skipping read '%s' because first position %d not mapped", + return None + + sequence = read.query_sequence + + if sequence is None: + logger.warn("Read '%s' missing sequence", name) + return None + + base_qualities = read.query_qualities + + if base_qualities is None: + logger.warn("Read '%s' missing base qualities", name) + return None + + # + # Documentation for pysam.AlignedSegment.get_reference_positions: + # ------------------------------------------------------------------ + # By default, this method only returns positions in the reference + # that are within the alignment. If full_length is set, None values + # will be included for any soft-clipped or unaligned positions + # within the read. The returned list will thus be of the same length + # as the read. + # + # Source: + # http://pysam.readthedocs.org/en/latest/ + # api.html#pysam.AlignedSegment.get_reference_positions + # + # We want a None value for every read position that does not have a + # corresponding reference position. + reference_positions = read.get_reference_positions( + full_length=True) + + # pysam uses base-0 positions everywhere except region strings + # Source: + # http://pysam.readthedocs.org/en/latest/faq.html#pysam-coordinates-are-wrong + if base0_position_before_variant not in reference_positions: + logger.debug( + "Skipping read '%s' because first position %d not mapped", name, base0_position_before_variant) - return None - else: - base0_read_position_before_variant = reference_positions.index( - base0_position_before_variant) - - if base0_position_after_variant not in reference_positions: - logger.debug( - "Skipping read '%s' because last position %d not mapped", + return None + else: + base0_read_position_before_variant = reference_positions.index( + base0_position_before_variant) + + if base0_position_after_variant not in reference_positions: + logger.debug( + "Skipping read '%s' because last position %d not mapped", name, base0_position_after_variant) - return None - else: - base0_read_position_after_variant = reference_positions.index( - base0_position_after_variant) - - if isinstance(sequence, bytes): - sequence = sequence.decode('ascii') - return LocusRead( - name=name, - sequence=sequence, - reference_positions=reference_positions, - quality_scores=base_qualities, - base0_read_position_before_variant=base0_read_position_before_variant, - base0_read_position_after_variant=base0_read_position_after_variant) + return None + else: + base0_read_position_after_variant = reference_positions.index( + base0_position_after_variant) + + if isinstance(sequence, bytes): + sequence = sequence.decode('ascii') + + return cls( + name=name, + sequence=sequence, + reference_positions=reference_positions, + quality_scores=base_qualities, + base0_read_position_before_variant=base0_read_position_before_variant, + base0_read_position_after_variant=base0_read_position_after_variant) def pileup_reads_at_position(samfile, chromosome, base0_position): """ @@ -250,9 +250,9 @@ def locus_read_generator( logger.debug( "Gathering reads at locus %s: %d-%d", - chromosome, - base1_position_before_variant, - base1_position_after_variant) + chromosome, + base1_position_before_variant, + base1_position_after_variant) base0_position_before_variant = base1_position_before_variant - 1 base0_position_after_variant = base1_position_after_variant - 1 @@ -267,7 +267,7 @@ def locus_read_generator( samfile=samfile, chromosome=chromosome, base0_position=base0_position_before_variant): - read = create_locus_read_from_pysam_pileup_element( + read = LocusRead.from_pysam_pileup_element( pileup_element, base0_position_before_variant=base0_position_before_variant, base0_position_after_variant=base0_position_after_variant, @@ -281,10 +281,10 @@ def locus_read_generator( logger.info( "Found %d reads overlapping locus %s: %d-%d", - count, - chromosome, - base1_position_before_variant, - base1_position_after_variant) + count, + chromosome, + base1_position_before_variant, + base1_position_after_variant) def locus_reads_dataframe(*args, **kwargs): """ diff --git a/isovar/protein_sequences.py b/isovar/protein_sequences.py index ff3b158..0934310 100644 --- a/isovar/protein_sequences.py +++ b/isovar/protein_sequences.py @@ -21,6 +21,7 @@ from __future__ import print_function, division, absolute_import from collections import namedtuple, defaultdict +import logging from .default_parameters import ( MIN_TRANSCRIPT_PREFIX_LENGTH, @@ -28,6 +29,7 @@ PROTEIN_SEQUENCE_LENGTH, MAX_PROTEIN_SEQUENCES_PER_VARIANT, MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, + VARIANT_CDNA_SEQUENCE_ASSEMBLY ) from .dataframe_builder import dataframe_from_generator from .translation import translate_variant_reads, TranslationKey @@ -47,6 +49,8 @@ # ########################## +logger = logging.getLogger(__name__) + ProteinSequence = namedtuple( "ProteinSequence", TranslationKey._fields + ( @@ -65,10 +69,10 @@ # number of unique read names from all the VariantSequence objects # from each translation "alt_reads_supporting_protein_sequence", - # how many reference transcripts overlap the variant locus? + # IDs of transcripts overlapping the variant locus "transcripts_overlapping_variant", - # how many reference transcripts were used to establish the - # reading frame for this protein sequence + # IDs of reference transcripts used to establish the reading frame for + # this protein sequence "transcripts_supporting_protein_sequence", # name of gene of the reference transcripts used in Translation # objects @@ -131,7 +135,9 @@ def summarize_translations(translations): def protein_sequence_sort_key(protein_sequence): """ - Sort protein sequences lexicographically by three criteria: + Sort protein sequences lexicographically by six criteria: + - TODO: min number of reads covering each nucleotide of + the protein sequence >= 2 - number of unique supporting reads - minimum mismatch versus a supporting reference transcript - number of supporting reference transcripts @@ -158,7 +164,8 @@ def reads_generator_to_protein_sequences_generator( min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, - max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT): + max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): """" Translates each coding variant in a collection to one or more Translation objects, which are then aggregated into equivalent @@ -190,6 +197,10 @@ def reads_generator_to_protein_sequences_generator( max_protein_sequences_per_variant : int Number of protein sequences to return for each ProteinSequence + variant_cdna_sequence_assembly : bool + If True, then assemble variant cDNA sequences based on overlap of + RNA reads. If False, then variant cDNA sequences must be fully spanned + and contained within RNA reads. Yields pairs of a Variant and a list of ProteinSequence objects """ @@ -214,15 +225,23 @@ def reads_generator_to_protein_sequences_generator( protein_sequence_length=protein_sequence_length, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, min_transcript_prefix_length=min_transcript_prefix_length, - max_transcript_mismatches=max_transcript_mismatches) + max_transcript_mismatches=max_transcript_mismatches, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) protein_sequences = [] for (key, equivalent_translations) in group_translations(translations).items(): + # get the variant read names, transcript IDs and gene names for # protein sequence we're about to construct alt_reads_supporting_protein_sequence, group_transcript_ids, group_gene_names = \ summarize_translations(equivalent_translations) + logger.info( + "%s: %s alt reads supporting protein sequence (gene names = %s)", + key, + len(alt_reads_supporting_protein_sequence), + group_gene_names) + protein_sequence = translation_key_to_protein_sequence( translation_key=key, translations=equivalent_translations, @@ -233,6 +252,7 @@ def reads_generator_to_protein_sequences_generator( transcripts_supporting_protein_sequence=group_transcript_ids, transcripts_overlapping_variant=overlapping_transcript_ids, gene=list(group_gene_names)) + logger.info("%s: protein sequence = %s" % (key, protein_sequence.amino_acids)) protein_sequences.append(protein_sequence) # sort protein sequences before returning the top results diff --git a/isovar/reference_coding_sequence_key.py b/isovar/reference_coding_sequence_key.py new file mode 100644 index 0000000..d37bc04 --- /dev/null +++ b/isovar/reference_coding_sequence_key.py @@ -0,0 +1,195 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import +import logging +from collections import namedtuple + +from .variant_helpers import interbase_range_affected_by_variant_on_transcript +from .reference_sequence_key import ReferenceSequenceKey + +logger = logging.getLogger(__name__) + +class ReferenceCodingSequenceKey( + namedtuple( + "ReferenceCodingSequenceKey", + ReferenceSequenceKey._fields + ( + # if the reference context includes the 5' UTR then + # this is the offset to the start codon, otherwise it's the + # offset needed to get the first base of a codon + "offset_to_first_complete_codon", + # does this context overlap a start codon? + "overlaps_start_codon", + # does this context contain the whole trinucleotide start codon? + "contains_start_codon", + # does this context contain any UTR bases? + "contains_five_prime_utr", + # translation of complete codons in the reference context + # before the variant + "amino_acids_before_variant"))): + + """ + ReferenceCodingSequenceKey includes all the fields of a ReferenceSequenceKey, + and additionally tracks the reading frame and information about where the + start codon and 5' UTR are relative to this sequence fragment. + """ + + @classmethod + def from_variant_and_transcript( + cls, + variant, + transcript, + context_size): + """ + Extracts the reference sequence around a variant locus on a particular + transcript and determines the reading frame at the start of that + sequence context. + + Parameters + ---------- + variant : varcode.Variant + + transcript : pyensembl.Transcript + + context_size : int + + Returns SequenceKeyWithReadingFrame object or None if Transcript lacks + coding sequence, protein sequence or annotated start/stop codons. + """ + if not transcript.contains_start_codon: + logger.info( + "Expected transcript %s for variant %s to have start codon", + transcript, + variant) + return None + + if not transcript.contains_stop_codon: + logger.info( + "Expected transcript %s for variant %s to have stop codon", + transcript, + variant) + return None + + if not transcript.protein_sequence: + logger.info( + "Expected transript %s for variant %s to have protein sequence", + transcript, + variant) + return None + + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( + variant=variant, + transcript=transcript, + context_size=context_size) + + if sequence_key is None: + logger.info( + "No sequence key for variant %s on transcript %s", + variant, + transcript) + return None + + # get the interbase range of offsets which capture all reference + # bases modified by the variant + variant_start_offset, variant_end_offset = \ + interbase_range_affected_by_variant_on_transcript( + variant=variant, + transcript=transcript) + + start_codon_idx = min(transcript.start_codon_spliced_offsets) + + # skip any variants which occur in the 5' UTR or overlap the start codon + # since + # (1) UTR variants have unclear coding effects and + # (2) start-loss variants may result in a new start codon / reading + # frame but we're not sure which one! + if variant_start_offset < start_codon_idx + 3: + logger.info( + "Skipping transcript %s for variant %s, must be after start codon", + transcript, + variant) + return None + + stop_codon_idx = min(transcript.stop_codon_spliced_offsets) + + # skip variants which affect the 3' UTR of the transcript since + # they don't have obvious coding effects on the protein sequence + if variant_start_offset >= stop_codon_idx + 3: + logger.info( + "Skipping transcript %s for variant %s, occurs in 3' UTR", + transcript, + variant) + return None + + n_prefix = len(sequence_key.sequence_before_variant_locus) + prefix_start_idx = variant_start_offset - n_prefix + n_bases_between_start_and_variant = variant_start_offset - start_codon_idx + n_full_codons_before_variant = n_bases_between_start_and_variant // 3 + + # if the sequence before the variant contains more bases than the + # distance to the start codon, then by definition it must contain + # some untranslated bases + contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant) + # allows for the possibility that the first base in the sequence might + # be the first nucleotide of the start codon + contains_start_codon = (n_prefix >= n_bases_between_start_and_variant) + # the sequence context might only include the 2nd or 3rd bases of + # the start codon + overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3) + + if contains_start_codon: + offset_to_first_complete_codon = start_codon_idx - prefix_start_idx + amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant] + else: + reading_frame = (prefix_start_idx - start_codon_idx) % 3 + offset_to_first_complete_codon = reading_frame_to_offset(reading_frame) + n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3 + amino_acids_before_variant = transcript.protein_sequence[ + n_full_codons_before_variant - n_codons_in_prefix: + n_full_codons_before_variant] + return cls( + strand=sequence_key.strand, + sequence_before_variant_locus=sequence_key.sequence_before_variant_locus, + sequence_at_variant_locus=sequence_key.sequence_at_variant_locus, + sequence_after_variant_locus=sequence_key.sequence_after_variant_locus, + offset_to_first_complete_codon=offset_to_first_complete_codon, + contains_start_codon=contains_start_codon, + overlaps_start_codon=overlaps_start_codon, + contains_five_prime_utr=contains_five_prime_utr, + amino_acids_before_variant=amino_acids_before_variant) + + +def reading_frame_to_offset(reading_frame_at_start_of_sequence): + """ + Given a reading frame (how many nucleotides into a codon) at the + start of a cDNA sequence, return the number of nucleotides which need + to be trimmed to start on a complete codon. + + Parameters + ---------- + + reading_frame_at_start_of_sequence : int + + Returns an int + """ + if reading_frame_at_start_of_sequence < 0: + raise ValueError("Reading frame can't be negative: %d" % ( + reading_frame_at_start_of_sequence,)) + elif reading_frame_at_start_of_sequence > 2: + raise ValueError("Reading frame must be within 0 and 2, not %d" % ( + reading_frame_at_start_of_sequence,)) + # If we're 1 nucleotide into the codon then we need to shift + # over two more to restore the ORF. Likewise, if we're 2 nucleotides in + # then we have to shift over one more. + return (3 - reading_frame_at_start_of_sequence) % 3 diff --git a/isovar/reference_context.py b/isovar/reference_context.py index 2e8dbd3..1b0d262 100644 --- a/isovar/reference_context.py +++ b/isovar/reference_context.py @@ -16,66 +16,13 @@ from collections import namedtuple, OrderedDict, defaultdict import logging -from skbio import DNA - from .effect_prediction import reference_transcripts_for_variant -from .variant_helpers import interbase_range_affected_by_variant_on_transcript from .dataframe_builder import DataFrameBuilder - +from .reference_coding_sequence_key import ReferenceCodingSequenceKey logger = logging.getLogger(__name__) -########################## -# -# SequenceKey -# ----------- -# -# Used to identify and group the distinct sequences occurring on a set of -# transcripts overlapping a variant locus -# -########################## - -SequenceKey = namedtuple( - "SequenceKey", [ - "strand", - "sequence_before_variant_locus", - "sequence_at_variant_locus", - "sequence_after_variant_locus" - ] -) - - -########################## -# -# SequenceKeyWithReadingFrame -# --------------------------- -# -# Includes all the fields of a SequenceKey, but also includes a reading frame -# at the start of the reference sequence. -# -# -########################## - -SequenceKeyWithReadingFrame = namedtuple( - "ReferenceContext", SequenceKey._fields + ( - # if the reference context includes the 5' UTR then - # this is the offset to the start codon, otherwise it's the - # offset needed to get the first base of a codon - "offset_to_first_complete_codon", - # does this context overlap a start codon? - "overlaps_start_codon", - # does this context contain the whole trinucleotide start codon? - "contains_start_codon", - # does this context contain any UTR bases? - "contains_five_prime_utr", - # translation of complete codons in the reference context - # before the variant - "amino_acids_before_variant", - ) -) - - ########################## # # ReferenceContext @@ -87,258 +34,31 @@ # ########################## -ReferenceContext = namedtuple( - "ReferenceContext", - SequenceKeyWithReadingFrame._fields + ( - "variant", - "transcripts") -) - -def variant_matches_reference_sequence(variant, ref_seq_on_transcript, strand): - """ - Make sure that reference nucleotides we expect to see on the reference - transcript from a variant are the same ones we encounter. - """ - if strand == "-": - ref_seq_on_transcript = str( - DNA(ref_seq_on_transcript).reverse_complement()) - - return ref_seq_on_transcript == variant.ref - -def sequence_key_for_variant_on_transcript(variant, transcript, context_size): - """ - Extracts the reference sequence around a variant locus on a particular - transcript. - - Parameters - ---------- - variant : varcode.Variant - - transcript : pyensembl.Transcript - - context_size : int - - Returns SequenceKey object with the following fields: - - strand - - sequence_before_variant_locus - - sequence_at_variant_locus - - sequence_after_variant_locus - - Can also return None if Transcript lacks sufficiently long sequence - """ - - full_sequence = transcript.sequence - - if full_sequence is None: - logger.warn( - "Expected transcript %s (overlapping %s) to have sequence", - transcript, - variant) - return None - - if len(full_sequence) < 6: - # need at least 6 nucleotides for a start and stop codon - logger.warn( - "Sequence of %s (overlapping %s) too short: %d", - transcript, - variant, - len(full_sequence)) - return None - - # get the interbase range of offsets which capture all reference - # bases modified by the variant - variant_start_offset, variant_end_offset = \ - interbase_range_affected_by_variant_on_transcript( - variant=variant, - transcript=transcript) - - logger.info("Interbase offset range on %s for variant %s = %d:%d", - transcript, - variant, - variant_start_offset, - variant_end_offset) - - prefix = full_sequence[ - max(0, variant_start_offset - context_size): - variant_start_offset] - - suffix = full_sequence[ - variant_end_offset: - variant_end_offset + context_size] - - ref_nucleotides_at_variant = full_sequence[ - variant_start_offset:variant_end_offset] - if not variant_matches_reference_sequence( - variant=variant, - strand=transcript.strand, - ref_seq_on_transcript=ref_nucleotides_at_variant): - # TODO: once we're more confident about other logic in isovar, - # change this to a warning and return None to allow for modest - # differences between reference sequence patches, since - # GRCh38.p1 may differ at some positions from GRCh38.p5 - raise ValueError( - "Wrong reference sequence for variant %s on transcript %s" % ( - variant, - transcript)) - return SequenceKey( - strand=transcript.strand, - sequence_before_variant_locus=prefix, - sequence_at_variant_locus=ref_nucleotides_at_variant, - sequence_after_variant_locus=suffix) - - -def reading_frame_to_offset(reading_frame_at_start_of_sequence): - """ - Given a reading frame (how many nucleotides into a codon) at the - start of a cDNA sequence, return the number of nucleotides which need - to be trimmed to start on a complete codon. - - Parameters - ---------- - - reading_frame_at_start_of_sequence : int - - Returns an int - """ - if reading_frame_at_start_of_sequence < 0: - raise ValueError("Reading frame can't be negative: %d" % ( - reading_frame_at_start_of_sequence,)) - elif reading_frame_at_start_of_sequence > 2: - raise ValueError("Reading frame must be within 0 and 2, not %d" % ( - reading_frame_at_start_of_sequence,)) - # If we're 1 nucleotide into the codon then we need to shift - # over two more to restore the ORF. Likewise, if we're 2 nucleotides in - # then we have to shift over one more. - return (3 - reading_frame_at_start_of_sequence) % 3 - - -def sequence_key_with_reading_frame_for_variant_on_transcript( - variant, - transcript, - context_size): - """ - Extracts the reference sequence around a variant locus on a particular - transcript and determines the reading frame at the start of that - sequence context. - - Parameters - ---------- - variant : varcode.Variant +class ReferenceContext(namedtuple( + "ReferenceContext", + ReferenceCodingSequenceKey._fields + ("variant", "transcripts"))): - transcript : pyensembl.Transcript - - context_size : int - - Returns SequenceKeyWithReadingFrame object or None if Transcript lacks - coding sequence, protein sequence or annotated start/stop codons. - """ - if not transcript.contains_start_codon: - logger.info( - "Expected transcript %s for variant %s to have start codon", - transcript, - variant) - return None - - if not transcript.contains_stop_codon: - logger.info( - "Expected transcript %s for variant %s to have stop codon", - transcript, - variant) - return None - - if not transcript.protein_sequence: - logger.info( - "Expected transript %s for variant %s to have protein sequence", - transcript, - variant) - return None - - sequence_key = sequence_key_for_variant_on_transcript( - variant=variant, - transcript=transcript, - context_size=context_size) - - if sequence_key is None: - logger.info("No sequence key for variant %s on transcript %s", - variant, - transcript) - return None - - # get the interbase range of offsets which capture all reference - # bases modified by the variant - variant_start_offset, variant_end_offset = \ - interbase_range_affected_by_variant_on_transcript( + @classmethod + def from_reference_coding_sequence_key(cls, key, variant, transcripts): + return cls( + strand=key.strand, + sequence_before_variant_locus=key.sequence_before_variant_locus, + sequence_at_variant_locus=key.sequence_at_variant_locus, + sequence_after_variant_locus=key.sequence_after_variant_locus, + offset_to_first_complete_codon=key.offset_to_first_complete_codon, + contains_start_codon=key.contains_start_codon, + overlaps_start_codon=key.overlaps_start_codon, + contains_five_prime_utr=key.contains_five_prime_utr, + amino_acids_before_variant=key.amino_acids_before_variant, variant=variant, - transcript=transcript) - - start_codon_idx = min(transcript.start_codon_spliced_offsets) + transcripts=transcripts) - # skip any variants which occur in the 5' UTR or overlap the start codon - # since - # (1) UTR variants have unclear coding effects and - # (2) start-loss variants may result in a new start codon / reading - # frame but we're not sure which one! - if variant_start_offset < start_codon_idx + 3: - logger.info( - "Skipping transcript %s for variant %s, must be after start codon", - transcript, - variant) - return None - - stop_codon_idx = min(transcript.stop_codon_spliced_offsets) - - # skip variants which affect the 3' UTR of the transcript since - # they don't have obvious coding effects on the protein sequence - if variant_start_offset >= stop_codon_idx + 3: - logger.info( - "Skipping transcript %s for variant %s, occurs in 3' UTR", - transcript, - variant) - return None - - n_prefix = len(sequence_key.sequence_before_variant_locus) - prefix_start_idx = variant_start_offset - n_prefix - n_bases_between_start_and_variant = variant_start_offset - start_codon_idx - n_full_codons_before_variant = n_bases_between_start_and_variant // 3 - - # if the sequence before the variant contains more bases than the - # distance to the start codon, then by definition it must contain - # some untranslated bases - contains_five_prime_utr = (n_prefix > n_bases_between_start_and_variant) - # allows for the possibility that the first base in the sequence might - # be the first nucleotide of the start codon - contains_start_codon = (n_prefix >= n_bases_between_start_and_variant) - # the sequence context might only include the 2nd or 3rd bases of - # the start codon - overlaps_start_codon = (n_prefix > n_bases_between_start_and_variant - 3) - - if contains_start_codon: - offset_to_first_complete_codon = start_codon_idx - prefix_start_idx - amino_acids_before_variant = transcript.protein_sequence[:n_full_codons_before_variant] - else: - reading_frame = (prefix_start_idx - start_codon_idx) % 3 - offset_to_first_complete_codon = reading_frame_to_offset(reading_frame) - n_codons_in_prefix = (n_prefix - offset_to_first_complete_codon) // 3 - amino_acids_before_variant = transcript.protein_sequence[ - n_full_codons_before_variant - n_codons_in_prefix: - n_full_codons_before_variant] - return SequenceKeyWithReadingFrame( - strand=sequence_key.strand, - sequence_before_variant_locus=sequence_key.sequence_before_variant_locus, - sequence_at_variant_locus=sequence_key.sequence_at_variant_locus, - sequence_after_variant_locus=sequence_key.sequence_after_variant_locus, - offset_to_first_complete_codon=offset_to_first_complete_codon, - contains_start_codon=contains_start_codon, - overlaps_start_codon=overlaps_start_codon, - contains_five_prime_utr=contains_five_prime_utr, - amino_acids_before_variant=amino_acids_before_variant) - -def sort_key_decreasing_max_length_transcript_cds(reference_context): - """ - Used to sort a sequence of ReferenceContext objects by the longest CDS - in each context's list of transcripts. - """ - return -max(len(t.coding_sequence) for t in reference_context.transcripts) + def sort_key_decreasing_max_length_transcript_cds(self): + """ + Used to sort a sequence of ReferenceContext objects by the longest CDS + in each context's list of transcripts. + """ + return -max(len(t.coding_sequence) for t in self.transcripts) def reference_contexts_for_variant( variant, @@ -367,7 +87,7 @@ def reference_contexts_for_variant( for transcript in overlapping_transcripts: sequence_key_with_reading_frame = \ - sequence_key_with_reading_frame_for_variant_on_transcript( + ReferenceCodingSequenceKey.from_variant_and_transcript( variant=variant, transcript=transcript, context_size=context_size) @@ -375,21 +95,12 @@ def reference_contexts_for_variant( sequence_groups[sequence_key_with_reading_frame].append(transcript) reference_contexts = [ - ReferenceContext( - strand=key.strand, - sequence_before_variant_locus=key.sequence_before_variant_locus, - sequence_at_variant_locus=key.sequence_at_variant_locus, - sequence_after_variant_locus=key.sequence_after_variant_locus, - offset_to_first_complete_codon=key.offset_to_first_complete_codon, - contains_start_codon=key.contains_start_codon, - overlaps_start_codon=key.overlaps_start_codon, - contains_five_prime_utr=key.contains_five_prime_utr, - amino_acids_before_variant=key.amino_acids_before_variant, - variant=variant, - transcripts=matching_transcripts) + ReferenceContext.from_reference_coding_sequence_key( + key, variant, matching_transcripts) for (key, matching_transcripts) in sequence_groups.items() ] - reference_contexts.sort(key=sort_key_decreasing_max_length_transcript_cds) + reference_contexts.sort( + key=ReferenceContext.sort_key_decreasing_max_length_transcript_cds) return reference_contexts def reference_contexts_for_variants( diff --git a/isovar/reference_sequence_key.py b/isovar/reference_sequence_key.py new file mode 100644 index 0000000..d7f1548 --- /dev/null +++ b/isovar/reference_sequence_key.py @@ -0,0 +1,127 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import +import logging +from collections import namedtuple + +from skbio import DNA + +from .variant_helpers import interbase_range_affected_by_variant_on_transcript + +logger = logging.getLogger(__name__) + + +class ReferenceSequenceKey(namedtuple("ReferenceSequenceKey", [ + "strand", + "sequence_before_variant_locus", + "sequence_at_variant_locus", + "sequence_after_variant_locus"])): + """ + Used to identify and group the distinct sequences occurring on a set of + transcripts overlapping a variant locus + """ + + @classmethod + def from_variant_and_transcript(cls, variant, transcript, context_size): + """ + Extracts the reference sequence around a variant locus on a particular + transcript. + + Parameters + ---------- + variant : varcode.Variant + + transcript : pyensembl.Transcript + + context_size : int + + Returns SequenceKey object with the following fields: + - strand + - sequence_before_variant_locus + - sequence_at_variant_locus + - sequence_after_variant_locus + + Can also return None if Transcript lacks sufficiently long sequence + """ + full_transcript_sequence = transcript.sequence + + if full_transcript_sequence is None: + logger.warn( + "Expected transcript %s (overlapping %s) to have sequence", + transcript, + variant) + return None + + if len(full_transcript_sequence) < 6: + # need at least 6 nucleotides for a start and stop codon + logger.warn( + "Sequence of %s (overlapping %s) too short: %d", + transcript, + variant, + len(full_transcript_sequence)) + return None + + # get the interbase range of offsets which capture all reference + # bases modified by the variant + variant_start_offset, variant_end_offset = \ + interbase_range_affected_by_variant_on_transcript( + variant=variant, + transcript=transcript) + + logger.info( + "Interbase offset range on %s for variant %s = %d:%d", + transcript, + variant, + variant_start_offset, + variant_end_offset) + + prefix = full_transcript_sequence[ + max(0, variant_start_offset - context_size): + variant_start_offset] + + suffix = full_transcript_sequence[ + variant_end_offset: + variant_end_offset + context_size] + + ref_nucleotides_at_variant = full_transcript_sequence[ + variant_start_offset:variant_end_offset] + if not variant_matches_reference_sequence( + variant=variant, + strand=transcript.strand, + ref_seq_on_transcript=ref_nucleotides_at_variant): + # TODO: once we're more confident about other logic in isovar, + # change this to a warning and return None to allow for modest + # differences between reference sequence patches, since + # GRCh38.p1 may differ at some positions from GRCh38.p5 + raise ValueError( + "Wrong reference sequence for variant %s on transcript %s" % ( + variant, + transcript)) + return cls( + strand=transcript.strand, + sequence_before_variant_locus=prefix, + sequence_at_variant_locus=ref_nucleotides_at_variant, + sequence_after_variant_locus=suffix) + + +def variant_matches_reference_sequence(variant, ref_seq_on_transcript, strand): + """ + Make sure that reference nucleotides we expect to see on the reference + transcript from a variant are the same ones we encounter. + """ + if strand == "-": + ref_seq_on_transcript = str( + DNA(ref_seq_on_transcript).reverse_complement()) + return ref_seq_on_transcript == variant.ref diff --git a/isovar/translation.py b/isovar/translation.py index fc87fed..a246201 100644 --- a/isovar/translation.py +++ b/isovar/translation.py @@ -26,13 +26,14 @@ from skbio import DNA from .reference_context import reference_contexts_for_variant -from .variant_sequences import supporting_reads_to_variant_sequences +from .variant_sequences import reads_to_variant_sequences from .default_parameters import ( MIN_TRANSCRIPT_PREFIX_LENGTH, MAX_REFERENCE_TRANSCRIPT_MISMATCHES, PROTEIN_SEQUENCE_LENGTH, MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, + VARIANT_CDNA_SEQUENCE_ASSEMBLY, ) from .dataframe_builder import dataframe_from_generator @@ -467,9 +468,9 @@ def translate_variant_sequence( logger.warn( ("Truncating amino acid sequence %s from variant sequence %s " "to only %d elements loses all variant residues"), - variant_amino_acids, - variant_sequence, - protein_sequence_length) + variant_amino_acids, + variant_sequence, + protein_sequence_length) return None # if the protein is too long then short it, which implies we're no longer # stopping due to a stop codon and that the variant amino acids might @@ -542,7 +543,8 @@ def translate_variant_reads( transcript_id_whitelist=None, min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, - max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES): + max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): if len(variant_reads) == 0: logger.info("No supporting reads for variant %s", variant) return [] @@ -551,10 +553,12 @@ def translate_variant_reads( # need to clip nucleotides at the start/end of the sequence cdna_sequence_length = (protein_sequence_length + 1) * 3 - variant_sequences = supporting_reads_to_variant_sequences( - variant_reads=variant_reads, + variant_sequences = reads_to_variant_sequences( + variant=variant, + reads=variant_reads, preferred_sequence_length=cdna_sequence_length, - min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence) + min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) if not variant_sequences: logger.info("No spanning cDNA sequences for variant %s", variant) @@ -572,7 +576,7 @@ def translate_variant_reads( if context_size < min_transcript_prefix_length: logger.info( "Skipping variant %s, none of the cDNA sequences have sufficient context", - variant) + variant) return [] reference_contexts = reference_contexts_for_variant( @@ -592,7 +596,8 @@ def translate_variants( protein_sequence_length=PROTEIN_SEQUENCE_LENGTH, min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, - max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES): + max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): """ Translates each coding variant in a collection to one or more protein fragment sequences (if the variant is not filtered and its spanning RNA @@ -636,7 +641,8 @@ def translate_variants( transcript_id_whitelist=transcript_id_whitelist, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, min_transcript_prefix_length=min_transcript_prefix_length, - max_transcript_mismatches=max_transcript_mismatches) + max_transcript_mismatches=max_transcript_mismatches, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) yield variant, translations def translations_generator_to_dataframe(translations_generator): diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py index a12fa0c..4fa52db 100644 --- a/isovar/variant_sequences.py +++ b/isovar/variant_sequences.py @@ -13,8 +13,10 @@ # limitations under the License. from __future__ import print_function, division, absolute_import -from collections import namedtuple import logging +from collections import namedtuple + +import numpy as np from .read_helpers import ( get_single_allele_from_reads, @@ -24,16 +26,16 @@ from .default_parameters import ( MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, VARIANT_CDNA_SEQUENCE_LENGTH, + VARIANT_CDNA_SEQUENCE_ASSEMBLY, ) from .dataframe_builder import dataframe_from_generator - +from .assembly import iterative_overlap_assembly, collapse_substrings logger = logging.getLogger(__name__) - # using this base class to define the core fields of a VariantSequence # but inheriting it from it to allow the addition of helper methods -VariantSequenceFields = namedtuple( +VariantSequenceBase = namedtuple( "VariantSequence", [ # nucleotides before a variant @@ -45,13 +47,12 @@ # since we often want to look at prefix+alt+suffix, let's cache it "sequence", # reads which were used to determine this sequences - "reads", - ]) + "reads"]) -class VariantSequence(VariantSequenceFields): +class VariantSequence(VariantSequenceBase): def __new__(cls, prefix, alt, suffix, reads): # construct sequence from prefix + alt + suffix - return VariantSequenceFields.__new__( + return VariantSequenceBase.__new__( cls, prefix=prefix, alt=alt, @@ -81,8 +82,6 @@ def contains(self, other): def left_overlaps(self, other, min_overlap_size=1): """ Does this VariantSequence overlap another on the left side? - The alleles must match and they must share at least `min_overlap_size` - nucleotides in their prefix or suffix. """ if self.alt != other.alt: @@ -107,10 +106,14 @@ def left_overlaps(self, other, min_overlap_size=1): # p2 a2 s2 = XX Y ZZZZZZZZZ # ... # then we can combine them into a longer sequence - return ( + sequence_overlaps = ( self.prefix.endswith(other.prefix) and other.suffix.startswith(self.suffix) ) + prefix_overlap_size = len(self.prefix) - len(other.prefix) + suffix_overlap_size = len(other.suffix) - len(self.suffix) + overlap_size = prefix_overlap_size + suffix_overlap_size + len(self.alt) + return sequence_overlaps and overlap_size >= min_overlap_size def add_reads(self, reads): """ @@ -122,10 +125,111 @@ def add_reads(self, reads): suffix=self.suffix, reads=self.reads.union(reads)) -def sort_key_decreasing_read_count(variant_sequence): - return -len(variant_sequence.reads) + def combine(self, other_sequence): + """ + If this sequence is the prefix of another sequence, combine + them into a single VariantSequence object. + """ + if other_sequence.alt != self.alt: + raise ValueError( + "Cannot combine %s and %s with mismatching alt sequences" % ( + self, + other_sequence)) + elif self.left_overlaps(other_sequence): + # If sequences are like AABC and ABCC + return VariantSequence( + prefix=self.prefix, + alt=self.alt, + suffix=other_sequence.suffix, + reads=self.reads.union(other_sequence.reads)) + elif other_sequence.left_overlaps(self): + return other_sequence.combine(self) + elif self.contains(other_sequence): + return self.add_reads(other_sequence.reads) + elif other_sequence.contains(self): + return other_sequence.add_reads(self.reads) + else: + raise ValueError("%s does not overlap with %s" % (self, other_sequence)) + + def variant_indices(self): + """ + When we combine prefix + alt + suffix into a single string, + what are is base-0 index interval which gets us back the alt + sequence? First returned index is inclusive, the second is exclusive. + """ + variant_start_index = len(self.prefix) + variant_len = len(self.alt) + variant_end_index = variant_start_index + variant_len + return variant_start_index, variant_end_index + + def coverage(self): + """ + Returns NumPy array indicating number of reads covering each + nucleotides of this sequence. + """ + variant_start_index, variant_end_index = self.variant_indices() + n_nucleotides = len(self) + coverage_array = np.zeros(n_nucleotides, dtype="int32") + for read in self.reads: + coverage_array[ + max(0, variant_start_index - len(read.prefix)): + min(n_nucleotides, variant_end_index + len(read.suffix))] += 1 + return coverage_array + + def min_coverage(self): + return np.min(self.coverage()) + + def mean_coverage(self): + return np.mean(self.coverage()) + + def trim_by_coverage(self, min_reads): + """ + Given the min number of reads overlapping each nucleotide of + a variant sequence, trim this sequence by getting rid of positions + which are overlapped by fewer reads than specified. + """ + read_count_array = self.coverage() + logger.info("Coverage: %s (len=%d)" % ( + read_count_array, len(read_count_array))) + sufficient_coverage_mask = read_count_array >= min_reads + sufficient_coverage_indices = np.argwhere(sufficient_coverage_mask) + if len(sufficient_coverage_indices) == 0: + logger.debug("No bases in %s have coverage >= %d" % (self, min_reads)) + return VariantSequence(prefix="", alt="", suffix="", reads=self.reads) + variant_start_index, variant_end_index = self.variant_indices() + # assuming that coverage drops off monotonically away from + # variant nucleotides + first_covered_index = sufficient_coverage_indices.min() + last_covered_index = sufficient_coverage_indices.max() + # adding 1 to last_covered_index since it's an inclusive index + # whereas variant_end_index is the end of a half-open interval + if (first_covered_index > variant_start_index or + last_covered_index + 1 < variant_end_index): + # Example: + # Nucleotide sequence: + # ACCCTTTT|AA|GGCGCGCC + # Coverage: + # 12222333|44|33333211 + # Then the mask for bases covered >= 4x would be: + # ________|**|________ + # with indices: + # first_covered_index = 9 + # last_covered_index = 10 + # variant_start_index = 9 + # variant_end_index = 11 + logger.debug("Some variant bases in %s don't have coverage >= %d" % ( + self, min_reads)) + return VariantSequence(prefix="", alt="", suffix="", reads=self.reads) + return VariantSequence( + prefix=self.prefix[first_covered_index:], + alt=self.alt, + suffix=self.suffix[:last_covered_index - variant_end_index + 1], + reads=self.reads) + + def sort_key_decreasing_read_count(self): + return -len(self.reads) -def all_variant_sequences_supported_by_variant_reads( +def initial_variant_sequences_from_reads( variant_reads, max_nucleotides_before_variant=None, max_nucleotides_after_variant=None): @@ -148,6 +252,7 @@ def all_variant_sequences_supported_by_variant_reads( for ((prefix, alt, suffix), reads) in unique_sequence_groups.items() ] + def filter_variant_sequences_by_read_support( variant_sequences, min_reads_supporting_cdna_sequence): @@ -155,11 +260,12 @@ def filter_variant_sequences_by_read_support( variant_sequences = [ s for s in variant_sequences - if len(s.reads) >= min_reads_supporting_cdna_sequence + if s.min_coverage() >= min_reads_supporting_cdna_sequence ] n_dropped = n_total - len(variant_sequences) if n_dropped > 0: - logger.info("Dropped %d/%d variant sequences less than %d supporting reads", + logger.info( + "Dropped %d/%d variant sequences less than %d supporting reads", n_dropped, n_total, min_reads_supporting_cdna_sequence) @@ -179,19 +285,38 @@ def filter_variant_sequences_by_length( min_required_sequence_length = min( max_observed_sequence_length, preferred_sequence_length) - variant_sequences = [ s for s in variant_sequences if len(s.sequence) >= min_required_sequence_length ] n_dropped = n_total - len(variant_sequences) if n_dropped > 0: - logger.info("Dropped %d/%d variant sequences shorter than %d", + logger.info( + "Dropped %d/%d variant sequences shorter than %d", n_dropped, n_total, min_required_sequence_length) return variant_sequences +def trim_variant_sequences(variant_sequences, min_reads_supporting_cdna_sequence): + """ + Trim VariantSequences to desired coverage and then combine any + subsequences which get generated. + """ + n_total = len(variant_sequences) + trimmed_variant_sequences = [ + variant_sequence.trim_by_coverage(min_reads_supporting_cdna_sequence) + for variant_sequence in variant_sequences + ] + collapsed_variant_sequences = collapse_substrings(trimmed_variant_sequences) + n_after_trimming = len(collapsed_variant_sequences) + logger.info( + "Kept %d/%d variant sequences after read coverage trimming to >=%dx", + n_after_trimming, + n_total, + min_reads_supporting_cdna_sequence) + return collapsed_variant_sequences + def filter_variant_sequences( variant_sequences, preferred_sequence_length, @@ -200,25 +325,28 @@ def filter_variant_sequences( Drop variant sequences which are shorter than request or don't have enough supporting reads. """ - variant_sequences = filter_variant_sequences_by_read_support( - variant_sequences=variant_sequences, - min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence) + variant_sequences = trim_variant_sequences( + variant_sequences, min_reads_supporting_cdna_sequence) return filter_variant_sequences_by_length( variant_sequences=variant_sequences, preferred_sequence_length=preferred_sequence_length) -def supporting_reads_to_variant_sequences( - variant_reads, +def reads_to_variant_sequences( + variant, + reads, preferred_sequence_length, - min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE): + min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): """ - Collapse variant-support RNA reads into consensus sequences of approximately - the preferred length (may differ at the ends of transcripts), filter - consensus sequences by length and number of supporting RNA reads. + Collapse variant-supporting RNA reads into consensus sequences of + approximately the preferred length (may differ at the ends of transcripts), + filter consensus sequences by length and number of supporting RNA reads. Parameters ---------- - variant_reads : list of AlleleRead objects + variant : varcode.Variant + + reads : list of AlleleRead objects Should all support the same variant allele nucleotides. preferred_sequence_length : int @@ -232,10 +360,14 @@ def supporting_reads_to_variant_sequences( Drop sequences which don't at least have this number of fully spanning reads. + reads_to_variant_sequences : bool + Construct variant sequences by merging overlapping reads. If False + then variant sequences must be fully spanned by cDNA reads. + Returns a collection of VariantSequence objects """ # just in case variant_reads is a generator, convert it to a list - variant_reads = list(variant_reads) + variant_reads = list(filter_non_alt_reads_for_variant(variant, reads)) if len(variant_reads) == 0: return [] @@ -254,36 +386,56 @@ def supporting_reads_to_variant_sequences( max_nucleotides_before_variant = ( n_surrounding_nucleotides - max_nucleotides_after_variant) - variant_sequences = all_variant_sequences_supported_by_variant_reads( + variant_sequences = initial_variant_sequences_from_reads( variant_reads=variant_reads, max_nucleotides_before_variant=max_nucleotides_before_variant, max_nucleotides_after_variant=max_nucleotides_after_variant) + logger.info( + "Initial pool of %d variant sequences (min length=%d, max length=%d)", + len(variant_sequences), + min(len(s) for s in variant_sequences), + max(len(s) for s in variant_sequences)) + + if variant_cdna_sequence_assembly: + variant_sequences = iterative_overlap_assembly(variant_sequences) + + if variant_sequences: + logger.info( + "After overlap assembly: %d variant sequences (min length=%d, max length=%d)", + len(variant_sequences), + min(len(s) for s in variant_sequences), + max(len(s) for s in variant_sequences)) + else: + logger.info("After overlap assembly: 0 variant sequences") + return [] + variant_sequences = filter_variant_sequences( variant_sequences=variant_sequences, preferred_sequence_length=preferred_sequence_length, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence) + if variant_sequences: + logger.info( + ("After coverage & length filtering: %d variant sequences " + "(min length=%d, max length=%d)"), + len(variant_sequences), + min(len(s) for s in variant_sequences), + max(len(s) for s in variant_sequences)) + else: + logger.info("After coverage & length filtering: 0 variant sequences") + return [] + # sort VariantSequence objects by decreasing order of supporting read # counts - variant_sequences.sort(key=sort_key_decreasing_read_count) + variant_sequences.sort(key=VariantSequence.sort_key_decreasing_read_count) return variant_sequences -def overlapping_reads_to_variant_sequences( - variant, - overlapping_reads, - min_reads_supporting_cdna_sequence, - preferred_sequence_length): - variant_reads = filter_non_alt_reads_for_variant(variant, overlapping_reads) - return supporting_reads_to_variant_sequences( - variant_reads, - preferred_sequence_length=preferred_sequence_length, - min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence) - def reads_generator_to_sequences_generator( variant_and_reads_generator, min_reads_supporting_cdna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, - preferred_sequence_length=VARIANT_CDNA_SEQUENCE_LENGTH): + preferred_sequence_length=VARIANT_CDNA_SEQUENCE_LENGTH, + variant_cdna_sequence_assembly=VARIANT_CDNA_SEQUENCE_ASSEMBLY): """ For each variant, collect all possible sequence contexts around the variant which are spanned by at least min_reads. @@ -300,16 +452,21 @@ def reads_generator_to_sequences_generator( sequence_length : int Desired sequence length, including variant nucleotides + variant_cdna_sequence_assembly : bool + Construct variant sequences by merging overlapping reads. If False + then variant sequences must be fully spanned by cDNA reads. + Yields pairs with the following fields: - Variant - list of VariantSequence objects """ for variant, variant_reads in variant_and_reads_generator: - variant_sequences = overlapping_reads_to_variant_sequences( + variant_sequences = reads_to_variant_sequences( variant=variant, - overlapping_reads=variant_reads, + reads=variant_reads, min_reads_supporting_cdna_sequence=min_reads_supporting_cdna_sequence, - preferred_sequence_length=preferred_sequence_length) + preferred_sequence_length=preferred_sequence_length, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) yield variant, variant_sequences def variant_sequences_generator_to_dataframe(variant_sequences_generator): diff --git a/script/isovar-variant-sequences.py b/script/isovar-variant-sequences.py index 122908c..397fc69 100644 --- a/script/isovar-variant-sequences.py +++ b/script/isovar-variant-sequences.py @@ -24,8 +24,8 @@ variant_sequences_dataframe_from_args ) - -logging.config.fileConfig(pkg_resources.resource_filename('isovar.cli', 'logging.conf')) +logging.config.fileConfig( + pkg_resources.resource_filename('isovar.cli', 'logging.conf')) logger = logging.getLogger(__name__) parser = make_variant_sequences_arg_parser(add_sequence_length_arg=True) diff --git a/test/data/b16.f10/isovar-translate-variants-results.csv b/test/data/b16.f10/isovar-translate-variants-results.csv index bc59adb..f70f03f 100644 --- a/test/data/b16.f10/isovar-translate-variants-results.csv +++ b/test/data/b16.f10/isovar-translate-variants-results.csv @@ -1,9 +1,3 @@ -,chr,pos,ref,alt,amino_acids,variant_aa_interval_start,variant_aa_interval_end,ends_with_stop_codon,frameshift,translations,supporting_variant_reads,total_variant_reads,supporting_transcripts,total_transcripts,gene -0,13,5864876,C,CG,DSQEDLWTKFILARGGEEGGIRTEDFF,14,27,False,True,1,16,27,1,1,Klf6 -1,13,5864876,C,CG,DSQEDLWTKFILARGGRRRRNQN,14,23,True,True,1,8,27,1,1,Klf6 -2,13,5864876,C,CG,DSQEDLWTKFILARGGEEGGTTDCARN,14,27,False,True,1,2,27,1,1,Klf6 -3,13,5864876,C,CG,DSQEDLWTKFILARGGEEGGIRTDDFF,14,27,False,True,1,1,27,1,1,Klf6 -4,4,45802539,G,C,CLQGRTTSYSTAAPLPNPIPNPEICY,14,15,False,False,1,3,3,1,1,Aldh1b1 -5,9,82927102,G,T,TMVAWDRHDNTVINYNCVVMSIPSYHS,14,15,False,False,2,11,13,1,1,Phip -6,9,82927102,G,T,TMVAWDRHDNTVIQYNCVVMSIPSYHS,14,15,False,False,1,2,13,1,1,Phip -7,X,8125624,C,A,NKLQGHSAPVLDVIDVKHGRAVALQLV,14,15,False,False,2,8,8,3,3,Wdr13 +,chr,pos,ref,alt,amino_acids,variant_aa_interval_start,variant_aa_interval_end,ends_with_stop_codon,frameshift,translations,overlapping_reads,ref_reads,alt_reads,alt_reads_supporting_protein_sequence,transcripts_overlapping_variant,transcripts_supporting_protein_sequence,gene +0,9,82927102,G,T,AWDRHDNTVIHAVNNMTLKV,10,11,False,False,1,56,39,17,5,1,1,Phip +1,X,8125624,C,A,QGHSAPVLDVIVNCDESLLA,10,11,False,False,1,71,46,25,8,3,3,Wdr13 diff --git a/test/test_allele_counts.py b/test/test_allele_counts.py index 4d76abe..777b286 100644 --- a/test/test_allele_counts.py +++ b/test/test_allele_counts.py @@ -12,7 +12,7 @@ def test_allele_count_dataframe(): ] df = allele_counts_dataframe([(variant, reads)]) assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df,) - row = df.irow(0) + row = df.iloc[0] eq_(row.n_ref, 2) eq_(row.n_alt, 1) eq_(row.n_other, 0) diff --git a/test/test_allele_reads.py b/test/test_allele_reads.py index b2864ce..1c4aab8 100644 --- a/test/test_allele_reads.py +++ b/test/test_allele_reads.py @@ -1,5 +1,5 @@ -from isovar.allele_reads import AlleleRead, allele_read_from_locus_read +from isovar.allele_reads import AlleleRead from isovar.locus_reads import LocusRead from nose.tools import eq_ @@ -16,7 +16,7 @@ def make_read_at_locus(prefix, alt, suffix, base_quality=30, name="dummy"): def test_allele_read_from_single_read_at_locus_trim_N_nucleotides(): read_at_locus = make_read_at_locus(prefix="NCCN", alt="A", suffix="TNNA") - allele_read = allele_read_from_locus_read(read_at_locus, n_ref=1) + allele_read = AlleleRead.from_locus_read(read_at_locus, n_ref=1) print(allele_read) expected = AlleleRead(prefix="", allele="A", suffix="T", name="dummy") eq_(allele_read, expected) diff --git a/test/test_assembly.py b/test/test_assembly.py index 3a8fda1..3ea5d4c 100644 --- a/test/test_assembly.py +++ b/test/test_assembly.py @@ -15,7 +15,10 @@ from __future__ import print_function, division, absolute_import from isovar.variant_reads import reads_supporting_variant -from isovar.assembly import assemble_reads_into_variant_sequences +from isovar.variant_sequences import initial_variant_sequences_from_reads +from isovar.allele_reads import AlleleRead +from isovar.variant_sequences import VariantSequence +from isovar.assembly import iterative_overlap_assembly from pyensembl import ensembl_grch38 from varcode import Variant from nose.tools import eq_ @@ -39,8 +42,8 @@ def test_assemble_transcript_fragments_snv(): samfile=samfile, chromosome=chromosome,) - sequences = assemble_reads_into_variant_sequences( - variant_reads, + sequences = iterative_overlap_assembly( + initial_variant_sequences_from_reads(variant_reads), min_overlap_size=30) assert len(sequences) > 0 @@ -57,5 +60,45 @@ def test_assemble_transcript_fragments_snv(): "Expected assembled sequences to be longer than read length (%d)" % ( max_read_length,) -if __name__ == "__main__": - test_assemble_transcript_fragments_snv() +def test_assembly_of_simple_sequence_from_mock_reads(): + # + # Read sequences: + # AAAAA|CC|TTTTT + # AAAAA|CC|TTTTT + # GAAAAA|CC|TTTTTG + # AAAA|CC|TTTT + reads = [ + # two identical reads with sequence AAAAA|CC|TTTTT + AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup1"), + AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup2"), + # longer sequence GAAAAA|CC|TTTTTG + AlleleRead(prefix="G" + "A" * 5, allele="CC", suffix="T" * 5 + "G", name="longer"), + # shorter sequence AAAA|CC|TTTT + AlleleRead(prefix="A" * 4, allele="CC", suffix="T" * 4, name="shorter"), + ] + expected_variant_sequence = VariantSequence( + prefix="G" + "A" * 5, alt="CC", suffix="T" * 5 + "G", reads=reads) + initial_variant_sequences = initial_variant_sequences_from_reads(reads) + # expecting one fewer sequence than reads since two of the reads are + # duplicates + eq_(len(initial_variant_sequences), len(reads) - 1) + + assembled_variant_sequences = iterative_overlap_assembly( + initial_variant_sequences, + min_overlap_size=1) + + # since no reads contradict each other then we should get back a single + # assembled sequence + eq_(len(assembled_variant_sequences), + 1, + "Unexpected number of variant sequences: %s" % (assembled_variant_sequences,)) + assembled_variant_sequence = assembled_variant_sequences[0] + eq_(assembled_variant_sequence, expected_variant_sequence) + + eq_(len(assembled_variant_sequence.reads), len(reads)) + + eq_(assembled_variant_sequence.min_coverage(), 1) + # 2 bases with 1/4 reads, 2 bases with 3/4 reads, remaining 10 bases with + # all 4/4 reads + expected_mean_coverage = (2 * 1 + 2 * 3 + 10 * 4) / 14 + eq_(assembled_variant_sequence.mean_coverage(), expected_mean_coverage) diff --git a/test/test_protein_sequences.py b/test/test_protein_sequences.py index caf129a..5973203 100644 --- a/test/test_protein_sequences.py +++ b/test/test_protein_sequences.py @@ -152,32 +152,58 @@ def test_sort_protein_sequences(): ] eq_(sort_protein_sequences(unsorted_protein_sequences), expected_order) - -def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant(): - expressed_variants = load_vcf("data/b16.f10/b16.expressed.vcf") - not_expressed_variants = load_vcf("data/b16.f10/b16.not-expressed.vcf") +def variants_to_protein_sequences_dataframe( + expressed_vcf="data/b16.f10/b16.expressed.vcf", + not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf", + tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam", + min_mapping_quality=0, + max_protein_sequences_per_variant=1, + variant_cdna_sequence_assembly=False): + """ + Helper function to load pair of VCFs and tumor RNA BAM + and use them to generate a DataFrame of expressed variant protein + sequences. + """ + expressed_variants = load_vcf(expressed_vcf) + not_expressed_variants = load_vcf(not_expressed_vcf) combined_variants = VariantCollection( list(expressed_variants) + list(not_expressed_variants)) - samfile = load_bam("data/b16.f10/b16.combined.sorted.bam") + samfile = load_bam(tumor_rna_bam) allele_reads_generator = reads_overlapping_variants( variants=combined_variants, samfile=samfile, - min_mapping_quality=0) + min_mapping_quality=min_mapping_quality) protein_sequences_generator = reads_generator_to_protein_sequences_generator( allele_reads_generator, - max_protein_sequences_per_variant=1) + max_protein_sequences_per_variant=max_protein_sequences_per_variant, + variant_cdna_sequence_assembly=variant_cdna_sequence_assembly) df = protein_sequences_generator_to_dataframe(protein_sequences_generator) + return df, expressed_variants, combined_variants + +def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_with_assembly(): + df, expressed_variants, combined_variants = \ + variants_to_protein_sequences_dataframe(variant_cdna_sequence_assembly=True) print(df) - eq_( - len(df), + eq_(len(df), + len(expressed_variants), + "Expected %d/%d entries to have RNA support, got %d" % ( + len(expressed_variants), + len(combined_variants), + len(df))) + +def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_without_assembly(): + df, expressed_variants, combined_variants = \ + variants_to_protein_sequences_dataframe(variant_cdna_sequence_assembly=False) + print(df) + eq_(len(df), len(expressed_variants), "Expected %d/%d entries to have RNA support, got %d" % ( len(expressed_variants), len(combined_variants), - len(df),)) + len(df))) def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality(): # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped @@ -207,7 +233,7 @@ def test_variants_to_protein_sequences_dataframe_protein_sequence_length(): "--vcf", data_path("data/b16.f10/b16.vcf"), "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"), "--max-protein-sequences-per-variant", "1", - "--protein-sequence-length", str(desired_length) + "--protein-sequence-length", str(desired_length), ]) df = protein_sequences_dataframe_from_args(args) eq_( @@ -220,10 +246,12 @@ def test_variants_to_protein_sequences_dataframe_protein_sequence_length(): protein_sequences = df["amino_acids"] print(protein_sequences) protien_sequence_lengths = protein_sequences.str.len() - assert (protien_sequence_lengths == desired_length).all(), protien_sequence_lengths + assert (protien_sequence_lengths == desired_length).all(), ( + protien_sequence_lengths,) if __name__ == "__main__": test_sort_protein_sequences() - test_variants_to_protein_sequences_dataframe_one_sequence_per_variant() + test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_with_assembly() + test_variants_to_protein_sequences_dataframe_one_sequence_per_variant_without_assembly() test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality() test_variants_to_protein_sequences_dataframe_protein_sequence_length() diff --git a/test/test_reference_sequence_key_with_reading_frame.py b/test/test_reference_coding_sequence_key.py similarity index 82% rename from test/test_reference_sequence_key_with_reading_frame.py rename to test/test_reference_coding_sequence_key.py index 79f9ed9..f60dabd 100644 --- a/test/test_reference_sequence_key_with_reading_frame.py +++ b/test/test_reference_coding_sequence_key.py @@ -14,10 +14,9 @@ from __future__ import print_function, division, absolute_import -from isovar.reference_context import ( +from isovar.reference_coding_sequence_key import ( reading_frame_to_offset, - sequence_key_with_reading_frame_for_variant_on_transcript, - SequenceKeyWithReadingFrame, + ReferenceCodingSequenceKey, ) from varcode import Variant from pyensembl import ensembl_grch38 @@ -51,12 +50,11 @@ def test_sequence_key_with_reading_frame_substitution_with_five_prime_utr(): # 4th codon: CCG # 5th codon: CAG # first nt of 6th codon: T - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_substitution, - transcript=tp53_001, - context_size=10) - expected = SequenceKeyWithReadingFrame( + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_substitution, + transcript=tp53_001, + context_size=10) + expected = ReferenceCodingSequenceKey( strand="-", sequence_before_variant_locus="CACTGCCATG", sequence_at_variant_locus="GAG", @@ -89,12 +87,11 @@ def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr(): # 5th codon: CAG # first nt of 6th codon: T - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_deletion, - transcript=tp53_001, - context_size=10) - expected = SequenceKeyWithReadingFrame( + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_deletion, + transcript=tp53_001, + context_size=10) + expected = ReferenceCodingSequenceKey( strand="-", sequence_before_variant_locus="CACTGCCATG", sequence_at_variant_locus="GAG", @@ -127,13 +124,12 @@ def test_sequence_key_with_reading_frame_insertion(): # 5th codon: CAG # first nt of 6th codon: T - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=10) + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=10) - expected = SequenceKeyWithReadingFrame( + expected = ReferenceCodingSequenceKey( strand="-", sequence_before_variant_locus="TGCCATGGAG", sequence_at_variant_locus="", @@ -145,7 +141,7 @@ def test_sequence_key_with_reading_frame_insertion(): amino_acids_before_variant="ME") assert_equal_fields(result, expected) -def test_sequence_key_with_reading_frame_insertion_inside_start_codon(): +def test_reference_coding_sequence_key_insertion_inside_start_codon(): # insert nucleotide "C" in the middle of the start codon of TP53-001, # keeping only 1 nucleotide of context. In the reverse complement this # becomes 'T'>'TG' @@ -154,11 +150,10 @@ def test_sequence_key_with_reading_frame_insertion_inside_start_codon(): tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=1) + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=1) assert result is None, "Expected result to be None when variant affects start codon" def test_sequence_key_with_reading_frame_insertion_before_start_codon(): @@ -167,11 +162,10 @@ def test_sequence_key_with_reading_frame_insertion_before_start_codon(): tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=1) + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=1) assert result is None, "Expected result to be None when variant before start codon" @@ -192,13 +186,12 @@ def test_sequence_key_with_reading_frame_insertion_context_6nt_contains_start(): # 3rd codon: GAG # 4th codon: CCG - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=6) + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=6) - expected = SequenceKeyWithReadingFrame( + expected = ReferenceCodingSequenceKey( strand="-", sequence_before_variant_locus="ATGGAG", sequence_at_variant_locus="", @@ -229,13 +222,12 @@ def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start(): # 3rd codon: GAG # first two nt of 4th codon: CC - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=5) + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=5) - expected = SequenceKeyWithReadingFrame( + expected = ReferenceCodingSequenceKey( strand="-", sequence_before_variant_locus="TGGAG", sequence_at_variant_locus="", @@ -264,13 +256,12 @@ def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start(): # <---- insertion variant occurs between these two codons # 3rd codon: GAG - result = \ - sequence_key_with_reading_frame_for_variant_on_transcript( - variant=tp53_insertion, - transcript=tp53_001, - context_size=3) + result = ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=tp53_insertion, + transcript=tp53_001, + context_size=3) - expected = SequenceKeyWithReadingFrame( + expected = ReferenceCodingSequenceKey( strand="-", sequence_before_variant_locus="GAG", sequence_at_variant_locus="", diff --git a/test/test_reference_sequence_key.py b/test/test_reference_sequence_key.py index fa40b94..766b7d0 100644 --- a/test/test_reference_sequence_key.py +++ b/test/test_reference_sequence_key.py @@ -14,10 +14,7 @@ from __future__ import print_function, division, absolute_import -from isovar.reference_context import ( - sequence_key_for_variant_on_transcript, - SequenceKey, -) +from isovar.reference_sequence_key import ReferenceSequenceKey from varcode import Variant from pyensembl import ensembl_grch38 from nose.tools import eq_ @@ -36,11 +33,11 @@ def test_sequence_key_for_variant_on_transcript_substitution(): eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG") print(brca2_ref_seq) # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=brca2_variant_rs769125639, transcript=brca2_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="+", sequence_before_variant_locus=brca2_ref_seq[:5], sequence_at_variant_locus="T", @@ -59,11 +56,11 @@ def test_sequence_key_for_variant_on_transcript_deletion(): eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG") print(brca2_ref_seq) # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=brca2_variant_deletion, transcript=brca2_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="+", sequence_before_variant_locus=brca2_ref_seq[:5], sequence_at_variant_locus="T", @@ -81,14 +78,14 @@ def test_sequence_key_for_variant_on_transcript_insertion(): eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG") print(brca2_ref_seq) # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=brca2_variant_insertion, transcript=brca2_001, context_size=10) # expecting nothing at the variant locus since we're inserting between # two reference nucleotides - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="+", sequence_before_variant_locus=brca2_ref_seq[:6], sequence_at_variant_locus="", @@ -108,12 +105,12 @@ def test_sequence_key_for_variant_on_transcript_substitution_reverse_strand(): eq_(tp53_001.sequence[190 - 10:190 + 13], "GGTCACTGCCATGGAGGAGCCGC") # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=tp53_substitution, transcript=tp53_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="-", sequence_before_variant_locus="GGTCACTGCC", sequence_at_variant_locus="ATG", @@ -132,12 +129,12 @@ def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand(): eq_(tp53_001.sequence[190 - 10:190 + 13], "GGTCACTGCCATGGAGGAGCCGC") # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=tp53_deletion, transcript=tp53_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="-", sequence_before_variant_locus="GGTCACTGCC", sequence_at_variant_locus="ATG", @@ -160,12 +157,12 @@ def test_sequence_key_for_variant_on_transcript_insertion_reverse_strand(): # GCGGCTCCTC_CAT_GGCAGTGACC # get the 5 nucleotides before the variant and 10 nucleotides after - sequence_key = sequence_key_for_variant_on_transcript( + sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=tp53_insertion, transcript=tp53_001, context_size=10) - expected_sequence_key = SequenceKey( + expected_sequence_key = ReferenceSequenceKey( strand="-", sequence_before_variant_locus="CACTGCCATG", sequence_at_variant_locus="", diff --git a/test/test_variant_sequences.py b/test/test_variant_sequences.py index 2191bdf..42ea0a1 100644 --- a/test/test_variant_sequences.py +++ b/test/test_variant_sequences.py @@ -16,7 +16,7 @@ from nose.tools import eq_ from varcode import Variant -from isovar.variant_sequences import supporting_reads_to_variant_sequences +from isovar.variant_sequences import reads_to_variant_sequences from isovar.variant_reads import reads_supporting_variant from testing_helpers import load_bam @@ -34,8 +34,9 @@ def test_sequence_counts_snv(): chromosome=chromosome, variant=variant) - variant_sequences = supporting_reads_to_variant_sequences( - variant_reads, + variant_sequences = reads_to_variant_sequences( + variant=variant, + reads=variant_reads, preferred_sequence_length=61) assert len(variant_sequences) == 1 for variant_sequence in variant_sequences: