Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

overlap assembly and coverage trimming to create VariantSequence candidates #57

Merged
merged 11 commits into from
Nov 8, 2016
6 changes: 6 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ language: python
python:
- "2.7"
- "3.4"
cache:
pip: true
# cache directory used for Ensembl downloads of GTF and FASTA files
# along with the indexed db of intervals and ID mappings and pickles
# of sequence dictionaries
directories: /home/travis/.cache/pyensembl/
before_install:
# Commands below copied from: http://conda.pydata.org/docs/travis.html
# We do this conditionally because it saves us some downloading if the
Expand Down
2 changes: 1 addition & 1 deletion isovar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@

from __future__ import print_function, division, absolute_import

__version__ = "0.2.4"
__version__ = "0.3.0"
177 changes: 90 additions & 87 deletions isovar/allele_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,15 @@

logger = logging.getLogger(__name__)


# subclassing from namedtuple to get a lightweight object with built-in
# hashing and comparison while also being able to add methods
AlleleReadFields = namedtuple(
AlleleReadBase = namedtuple(
"AlleleRead",
"prefix allele suffix name sequence")

class AlleleRead(AlleleReadFields):
class AlleleRead(AlleleReadBase):
def __new__(cls, prefix, allele, suffix, name):
# construct sequence from prefix + alt + suffix
return AlleleReadFields.__new__(
cls,
return AlleleReadBase(
prefix=prefix,
allele=allele,
suffix=suffix,
Expand All @@ -55,100 +52,102 @@ def __new__(cls, prefix, allele, suffix, name):
def __len__(self):
return len(self.prefix) + len(self.allele) + len(self.suffix)

def allele_read_from_locus_read(locus_read, n_ref):
"""
Given a single ReadAtLocus object, return either an AlleleRead or None

Parameters
----------
locus_read : LocusRead
Read which overlaps a variant locus but doesn't necessarily contain the
alternate nucleotides

n_ref : int
Number of reference positions we are expecting to be modified or
deleted (for insertions this should be 0)
"""
sequence = locus_read.sequence
reference_positions = locus_read.reference_positions

# positions of the nucleotides before and after the variant within
# the read sequence
read_pos_before = locus_read.base0_read_position_before_variant
read_pos_after = locus_read.base0_read_position_after_variant

# positions of the nucleotides before and after the variant on the
# reference genome
ref_pos_before = reference_positions[read_pos_before]

if ref_pos_before is None:
logger.warn(
"Missing reference pos for nucleotide before variant on read: %s",
@classmethod
def from_locus_read(cls, locus_read, n_ref):
"""
Given a single LocusRead object, return either an AlleleRead or None

Parameters
----------
locus_read : LocusRead
Read which overlaps a variant locus but doesn't necessarily contain the
alternate nucleotides

n_ref : int
Number of reference positions we are expecting to be modified or
deleted (for insertions this should be 0)
"""
sequence = locus_read.sequence
reference_positions = locus_read.reference_positions

# positions of the nucleotides before and after the variant within
# the read sequence
read_pos_before = locus_read.base0_read_position_before_variant
read_pos_after = locus_read.base0_read_position_after_variant

# positions of the nucleotides before and after the variant on the
# reference genome
ref_pos_before = reference_positions[read_pos_before]

if ref_pos_before is None:
logger.warn(
"Missing reference pos for nucleotide before variant on read: %s",
locus_read)
return None
return None

ref_pos_after = reference_positions[read_pos_after]
ref_pos_after = reference_positions[read_pos_after]

if ref_pos_after is None:
logger.warn(
"Missing reference pos for nucleotide after variant on read: %s",
if ref_pos_after is None:
logger.warn(
"Missing reference pos for nucleotide after variant on read: %s",
locus_read)
return None

if n_ref == 0:
if ref_pos_after - ref_pos_before != 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logger.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s",
return None

if n_ref == 0:
if ref_pos_after - ref_pos_before != 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logger.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s",
ref_pos_before,
ref_pos_after,
locus_read)
return None

# insertions require a sequence of non-aligned bases
# followed by the subsequence reference position
ref_positions_for_inserted = reference_positions[
read_pos_before + 1:read_pos_after]
if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
# all these inserted nucleotides should *not* align to the
# reference
logger.debug(
"Skipping read, inserted nucleotides shouldn't map to reference")
return None
else:
# substitutions and deletions
if ref_pos_after - ref_pos_before != n_ref + 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logger.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s",
return None

# insertions require a sequence of non-aligned bases
# followed by the subsequence reference position
ref_positions_for_inserted = reference_positions[
read_pos_before + 1:read_pos_after]
if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
# all these inserted nucleotides should *not* align to the
# reference
logger.debug(
"Skipping read, inserted nucleotides shouldn't map to reference")
return None
else:
# substitutions and deletions
if ref_pos_after - ref_pos_before != n_ref + 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logger.debug(
("Positions before (%d) and after (%d) variant should be "
"adjacent on read %s"),
ref_pos_before,
ref_pos_after,
locus_read)
return None
return None

nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]
nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]

prefix = sequence[:read_pos_before + 1]
suffix = sequence[read_pos_after:]
prefix = sequence[:read_pos_before + 1]
suffix = sequence[read_pos_after:]

prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
prefix, suffix = trim_N_nucleotides(prefix, suffix)
prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
prefix, suffix = trim_N_nucleotides(prefix, suffix)

return AlleleRead(
prefix,
nucleotides_at_variant_locus,
suffix,
name=locus_read.name)
return cls(
prefix,
nucleotides_at_variant_locus,
suffix,
name=locus_read.name)

def allele_reads_from_locus_reads(locus_reads, n_ref):
"""
Given a collection of ReadAtLocus objects, returns a
list of VariantRead objects (which are split into prefix/variant/suffix
nucleotides).
Given a collection of LocusRead objects, returns a
list of AlleleRead objects
(which are split into prefix/allele/suffix nucleotide strings).

Parameters
----------
Expand All @@ -161,7 +160,7 @@ def allele_reads_from_locus_reads(locus_reads, n_ref):
"""

for locus_read in locus_reads:
allele_read = allele_read_from_locus_read(locus_read, n_ref)
allele_read = AlleleRead.from_locus_read(locus_read, n_ref)
if allele_read is None:
continue
else:
Expand Down Expand Up @@ -206,9 +205,11 @@ def reads_overlapping_variant(
if chromosome is None:
chromosome = variant.contig

logger.info("Gathering variant reads for variant %s (chromosome=%s)",
logger.info(
"Gathering variant reads for variant %s (chromosome = %s, gene names = %s)",
variant,
chromosome)
chromosome,
variant.gene_names)

base1_position, ref, alt = trim_variant(variant)

Expand Down Expand Up @@ -271,7 +272,9 @@ def reads_overlapping_variants(variants, samfile, **kwargs):
else:
logger.warn(
"Chromosome '%s' from variant %s not in alignment file %s",
chromosome, variant, samfile.filename)
chromosome,
variant,
samfile.filename)
continue
allele_reads = reads_overlapping_variant(
samfile=samfile,
Expand Down
53 changes: 7 additions & 46 deletions isovar/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,9 @@
# I'd rather just use list.copy but Python 2.7 doesn't support it
from copy import copy

from .read_helpers import group_unique_sequences
from .variant_sequences import VariantSequence


logger = logging.getLogger(__name__)


MIN_OVERLAP_SIZE = 30

def sort_by_decreasing_prefix_length(seq):
Expand Down Expand Up @@ -68,14 +64,6 @@ def sort_by_decreasing_total_length(seq):
"""
return -len(seq.sequence)

def combine_variant_sequences(variant_sequence1, variant_sequence2):
return VariantSequence(
prefix=variant_sequence1.prefix,
alt=variant_sequence1.alt,
suffix=variant_sequence2.suffix,
reads=set(
variant_sequence1.reads).union(set(variant_sequence2.reads)))

def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
"""
Greedily merge overlapping sequences into longer sequences.
Expand Down Expand Up @@ -103,11 +91,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
variant_sequence2,
min_overlap_size=min_overlap_size):
continue
combined = combine_variant_sequences(
variant_sequence1, variant_sequence2)
if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)):
# if the combined sequence isn't any longer, then why bother?
continue
combined = variant_sequence1.combine(variant_sequence2)
if combined.sequence in merged_variant_sequences:
# it's possible to get the same merged sequence from distinct
# input sequences
Expand All @@ -120,8 +104,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
# sequence is supported by any one read.
existing_record_with_same_sequence = merged_variant_sequences[
combined.sequence]
combined_with_more_reads = combine_variant_sequences(
existing_record_with_same_sequence,
combined_with_more_reads = existing_record_with_same_sequence.combine(
combined)
merged_variant_sequences[combined.sequence] = combined_with_more_reads
else:
Expand Down Expand Up @@ -168,7 +151,7 @@ def sort_by_decreasing_read_count_and_sequence_lenth(variant_sequence):
"""
return -len(variant_sequence.reads), -len(variant_sequence.sequence)

def iterative_assembly(
def iterative_overlap_assembly(
variant_sequences,
min_overlap_size=30,
n_merge_iters=2):
Expand All @@ -185,9 +168,9 @@ def iterative_assembly(

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't comment on the actual line, but curious what the reasoning is behind a default min_overlap_size of 30?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Additionally, it should probably be set to MINIMUM_OVERLAP_SIZE?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Otherwise, I reviewed this entire module, and it looks good to me!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ihodes I'm worried about "false" assemblies that only overlap on the variant nucleotides (but actually come from different splice isoforms). It seems that requiring some amount of overlap beyond the variant nucleotides will decrease the chances of false assembly but the exact amount (30) is pretty arbitrarily chosen: shorter than a read length but long enough to be a kmer unlikely to happen by chance.

logger.info(
"Iteration #%d of assembly: generated %d sequences (from %d)",
i + 1,
len(variant_sequences),
len(previous_sequences))
i + 1,
len(variant_sequences),
len(previous_sequences))

if len(variant_sequences) == 0:
# if the greedy merge procedure fails for all pairs of candidate
Expand All @@ -199,31 +182,9 @@ def iterative_assembly(
variant_sequences = collapse_substrings(variant_sequences)
logger.info(
"After collapsing subsequences, %d distinct sequences left",
len(variant_sequences))
len(variant_sequences))
if len(variant_sequences) == 1:
# once we have only one sequence then there's no point trying
# to further combine sequences
break
return variant_sequences

def assemble_reads_into_variant_sequences(
variant_reads,
min_overlap_size=30,
n_merge_iters=2):
"""
Turn a collection of AlleleRead objects into a collection of VariantSequence
objects, which are returned in order of (# supported reads, sequence length)
"""
initial_variant_sequences = [
VariantSequence(
prefix=prefix,
alt=alt,
suffix=suffix,
reads=reads)
for (prefix, alt, suffix), reads in
group_unique_sequences(variant_reads).items()
]
return iterative_assembly(
initial_variant_sequences,
min_overlap_size=min_overlap_size,
n_merge_iters=n_merge_iters)
8 changes: 7 additions & 1 deletion isovar/cli/variant_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ def add_variant_sequence_args(
default=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
help="Minimum number of reads supporting a variant sequence (default %(default)s)")

rna_sequence_group.add_argument(
"--variant-sequence-assembly",
default=False,
action="store_true")

# when cDNA sequence length can be inferred from a protein length then
# we may want to omit this arg
if add_sequence_length_arg:
Expand Down Expand Up @@ -76,7 +81,8 @@ def variant_sequences_generator_from_args(args):
return reads_generator_to_sequences_generator(
allele_reads_generator,
min_reads_supporting_cdna_sequence=args.min_reads_supporting_variant_sequence,
preferred_sequence_length=args.cdna_sequence_length)
preferred_sequence_length=args.cdna_sequence_length,
variant_cdna_sequence_assembly=args.variant_sequence_assembly)

def variant_sequences_dataframe_from_args(args):
variant_sequences_generator = variant_sequences_generator_from_args(args)
Expand Down
5 changes: 5 additions & 0 deletions isovar/default_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,8 @@

# number of protein sequences we want to return per variant
MAX_PROTEIN_SEQUENCES_PER_VARIANT = 1

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should MIN_OVERLAPPING_READS go here as well? What makes it into here?

# run overlap assembly algorithm to construct variant
# sequences from multiple reads which only partially
# overlap (rather than fully spanning a coding sequence)
VARIANT_CDNA_SEQUENCE_ASSEMBLY = False
Loading