diff --git a/.gitignore b/.gitignore index 1dbc687..c053039 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,7 @@ target/ #Ipython Notebook .ipynb_checkpoints + +# PyCharm +.idea/ +vcs.xml diff --git a/.travis.yml b/.travis.yml index 1a60cc4..a2f3cb0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -58,8 +58,10 @@ before_script: --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.87/ script: - > - nosetests test --with-coverage --cover-package=isovar && - nosetests openvax-integration-tests/test + nosetests test --with-coverage --cover-package=isovar + # removing integration tests until Vaxrank is updated to use new API + # && + # nosetests openvax-integration-tests/test after_success: coveralls deploy: diff --git a/README.md b/README.md index 3fc1465..fdaab9c 100644 --- a/README.md +++ b/README.md @@ -8,57 +8,386 @@ PyPI -# isovar -Isovar assembles protein subsequences around mutations from cancer RNA-Seq data. Since Isovar uses sequenced reads to determine a mutant coding sequence it is able to correctly phase somatic variants with adjacent germline variants, as well as sometimes recovering alternatively spliced isoforms. +# Isovar -## Example +* [Overview](#overview) +* [Python API](#python-api) +* [Commandline](#commandline) +* [Internal Design](#internal-design) +* [Other Isovar Commandline Tools](#other-isovar-commandline-tools) +* [Sequencing Recommendations](#sequencing-recommendations) + +## Overview +Isovar determines mutant protein subsequences around mutations from cancer RNAseq data. + +Isovar works by: + + 1) collecting RNA reads which spanning the location of a variant, + + 2) filtering the RNA reads to those which support the mutation, + + 3) assembling mutant reads into longer coding sequences, + + 4) matching mutant coding sequences against reference annotated reading +frames, and + + 5) translating coding sequences determined directly from RNA into mutant protein sequences. + +The assembled coding sequences may incorporate proximal +(germline and somatic) variants, along with any splicing alterations +which occur due to modified splice signals. + +## Python API + +In the example below, `isovar.run_isovar` returns a list of `isovar.IsovarResult` objects. +Each of these objects corresponds to a single input variant and contains all of the information about the RNA evidence at that variant's location and any mutant protein sequences which were assembled for the variant. + +```python + +from isovar import run_isovar + +isovar_results = run_isovar( + variants="cancer-mutations.vcf", + alignment_file="tumor-rna.bam") + +# this code traverses every variant and prints the number +# of RNA reads which support the alt allele for variants +# which had a successfully assembled/translated protein sequence +for isovar_result in isovar_results: + # if any protein sequences were assembled from RNA + # then the one with most supporting reads can be + # accessed from a property called `top_protein_sequence`. 
+ if isovar_result.top_protein_sequence is not None: + # print number of distinct fragments supporting the + # the variant allele for this mutation + print(isovar_result.variant, isovar_result.num_alt_fragments) + +``` + +A collection of `IsovarResult` objects can also be flattened into a Pandas DataFrame: + +```python + +from isovar import run_isovar, isovar_results_to_dataframe + +df = isovar_results_to_dataframe( + run_isovar( + variants="cancer-mutations.vcf", + alignment_file="tumor-rna.bam")) +``` + + +### Python API options for collecting RNA reads + +To change how Isovar collects and filters RNA reads you can create +your own instance of the `isovar.ReadCollector` class and pass it to `run_isovar`. +```python +from isovar import run_isovar, ReadCollector + +# create a custom ReadCollector to change options for how RNA reads are processed +read_collector = ReadCollector( + use_duplicate_reads=True, + use_secondary_alignments=True, + use_soft_clipped_bases=True) + +isovar_results = run_isovar( + variants="cancer-mutations.vcf", + alignment_file="tumor-rna.bam", + read_collector=read_collector) + +```` + + +### Python API options for coding sequence assembly and translation + +To change how Isovar assembles RNA reads into coding sequences, determines their +reading frames, and groups translated amino acid sequences you can create your +own instance of the `isovar.ProteinSequenceCreator` class and pass it to `run_isovar`. + + +```python +from isovar import run_isovar, ProteinSequenceCreator + +# create a custom ProteinSequenceCreator to change options for how +# protein sequences are assembled from RNA reads +protein_sequence_creator = ProteinSequenceCreator( + # number of amino acids we're aiming for, coding sequences + # might still give us a shorter sequence due to an early stop + # codon or poor coverage + protein_sequence_length=30, + # minimum number of reads covering each base of the coding sequence + min_variant_sequence_coverage=2, + # how much of a reference transcript should a coding sequence match before + # we use it to establish a reading frame + min_transcript_prefix_length=20, + # how many mismatches allowed between coding sequence (before the variant) + # and transcript (before the variant location) + max_transcript_mismatches=2, + # also count mismatches after the variant location toward + # max_transcript_mismatches + count_mismatches_after_variant=False, + # if more than one protein sequence can be assembled for a variant + # then drop any beyond this number + max_protein_sequences_per_variant=1, + # if set to False then coding sequence will be derived from + # a single RNA read with the variant closest to its center + variant_sequence_assembly=True, + # how many nucleotides must two reads overlap before they are combined + # into a single coding sequence + min_assembly_overlap_size=30) + +isovar_results = run_isovar( + variants="cancer-mutations.vcf", + alignment_file="tumor-rna.bam", + protein_sequence_creator=protein_sequence_creator) +``` + +### Python API for filtering results + +You can filter a collection of `IsovarResult` objects by any of their numerical properties using the `filter_thresholds` option +of the `run_isovar` function. The value expected for this argument is a dictionary whose keys have named like `'min_fraction_ref_reads'` or `'max_num_alt_fragments'` and whose values are numerical thresholds. +Everything after the `'min_'` or `'max_'` at the start of a key is expected to be the name of a property of `IsovarResult`. 
+Many of the commonly accessed properties regarding RNA read evidence follow the pattern: +``` +{num|fraction}_{ref|alt|other}_{reads|fragments} +``` + +For example, in the following code the results are filtered to have 10 or more alt reads supporting a variant and no more than 25% of the fragments supporting an allele other than the ref or alt. +```python +from isovar import run_isovar + +isovar_results = run_isovar( + variants="cancer-mutations.vcf", + alignment_file="tumor-rna.bam", + filter_thresholds={"min_num_alt_reads": 10, "max_fraction_other_fragments": 0.25}) + +for isovar_result in isovar_results: + # print each variant and whether it passed both filters + print(isovar_result.variant, isovar_result.passes_all_filters) +``` + +A variant which fails one or more filters is not excluded from the result collection but it has `False` values in its corresponding +`filter_values` dictionary property and will have a `False` value for the `passes_all_filters` property. + +If a result collection is flattened into a DataFrame then each filter is included as a column. + +It's also possible to filter on boolean properties (without numerical thresholds) by passing `filter_flags` to `run_isovar`. These boolean +properties can be further negated by prepending 'not_' to the property name, so that both `'protein_sequence_matches_predicted_effect'` and `'not_protein_sequence_matches_predicted_effect'` are valid names for `filter_flags`. + +## Commandline + +Basic example: ```sh -$ isovar-protein-sequences \ +$ isovar \ --vcf somatic-variants.vcf \ --bam rnaseq.bam \ - --min-reads 2 \ --protein-sequence-length 30 \ --output isovar-results.csv +``` + +### Commandline options for loading variants + +### +``` + --vcf VCF Genomic variants in VCF format + + --maf MAF Genomic variants in TCGA's MAF format + + --variant CHR POS REF ALT + Individual variant as 4 arguments giving chromsome, + position, ref, and alt. Example: chr1 3848 C G. Use + '.' to indicate empty alleles for insertions or + deletions. + + --genome GENOME What reference assembly your variant coordinates are + using. Examples: 'hg19', 'GRCh38', or 'mm9'. This + argument is ignored for MAF files, since each row + includes the reference. For VCF files, this is used if + specified, and otherwise is guessed from the header. + For variants specfied on the commandline with + --variant, this option is required. + + --download-reference-genome-data + Automatically download genome reference data required + for annotation using PyEnsembl. Otherwise you must + first run 'pyensembl install' for the release/species + corresponding to the genome used in your VCF. + + --json-variants JSON_VARIANTS + Path to Varcode.VariantCollection object serialized as + a JSON file. + +``` + +### Commandline options for loading aligned tumor RNA-seq reads + +``` + --bam BAM BAM file containing RNAseq reads + + --min-mapping-quality MIN_MAPPING_QUALITY + Minimum MAPQ value to allow for a read (default 1) + + --use-duplicate-reads + By default, reads which have been marked as duplicates + are excluded.Use this option to include duplicate + reads. + + --drop-secondary-alignments + By default, secondary alignments are included in + reads, use this option to instead only use primary + alignments. 
+``` - chr pos ref alt amino_acids \ -0 22 46931060 A C FGVEAVDHGWPSMSSGSSWRASRGPPPPPR +### Commandline options for coding sequence assembly +``` + --min-variant-sequence-coverage MIN_VARIANT_SEQUENCE_COVERAGE + Minimum number of reads supporting a variant sequence + (default 2) + + --disable-variant-sequence-assembly + Disable assemble variant cDNA sequence from + overlapping reads +``` - variant_aa_interval_start variant_aa_interval_end ends_with_stop_codon \ -0 16 17 False +### Commandline options for translating cDNA to protein sequence +``` + --protein-sequence-length PROTEIN_SEQUENCE_LENGTH + + --max-reference-transcript-mismatches MAX_REFERENCE_TRANSCRIPT_MISMATCHES + Maximum number of mismatches between variant sequence + reference sequence before a candidate reading frame is + ignored. + + --count-mismatches-after-variant + If true, mismatches after the variant locus will count + toward the --max-reference-transcript-mismatches + filter. + + --min-transcript-prefix-length MIN_TRANSCRIPT_PREFIX_LENGTH + Number of nucleotides before the variant we try to + match against a reference transcript. Values greater + than zero exclude variants near the start codon of + transcripts without 5' UTRs. + + --max-protein-sequences-per-variant MAX_PROTEIN_SEQUENCES_PER_VARIANT - frameshift translations_count supporting_variant_reads_count \ -0 False 1 1 +``` + +### Commandline options for filtering - total_variant_reads supporting_transcripts_count total_transcripts gene -0 130 2 2 CELSR1 ``` + --min-alt-rna-reads MIN_ALT_RNA_READS + Minimum number of reads supporting variant allele + (default 3) -## Algorithm/Design + --min-alt-rna-fragments MIN_ALT_RNA_FRAGMENTS + Minimum number of fragments supporting variant allele + (default 2). Note that this option is the same as + --min-alt-rna-reads for single-end sequencing. -The one line explanation of Isovar: `ProteinSequence = VariantSequence + ReferenceContext`. + --min-alt-rna-fraction MIN_ALT_RNA_FRACTION + Minimum ratio of fragments supporting variant allele + to total RNA fragments (default 0.005). -A little more detail about the algorithm: - 1. Scan through an RNAseq BAM file and extract sequences overlapping a variant locus (represented by `LocusRead`) - 2. Make sure that the read contains the variant allele and split its sequence into prefix/alt/suffix string parts (represented by `AlleleRead`) - 3. Assemble overlapping `AlleleRead`s (which agree with the variant allele) into a `VariantSequence` - 4. Gather possible reading frames for distinct reference sequences around the variant locus (represented by `ReferenceContext`). - 5. Use the reading frame from a `ReferenceContext` to translate a `VariantSequence` into a protein fragment (represented by `Translation`). - 6. Multiple distinct variant sequences and reference contexts can generate the same translations, so we aggregate those equivalent `Translation` objects into a `ProteinSequence`. + --min-ratio-alt-to-other-fragments MIN_RATIO_ALT_TO_OTHER_FRAGMENTS + At loci where alleles other than the ref and a single + alt are supported, this parameter controls how many + more times fragments supporting the variant allele are + required relative to other non-reference alleles + (default 3.0). 
+``` -Since we may not want to deal with *every* possible translation of *every* distinct sequence detected around a variant, Isovar sorts the variant sequences by the number of supporting reads and the reference contexts in order of protein length and a configurable number of translated protein fragments can be kept from this ordering. +### Commandline options for writing an output CSV -## Sequencing Recommendations +``` + --output OUTPUT Output CSV file + + --output-columns OUTPUT_COLUMNS [OUTPUT_COLUMNS ...] + Subset of columns to write -Isovar works best with high quality / high coverage mRNA sequence data. This means that you will get best results from >100M paired-end reads sequenced on an Illumina HiSeq from a library enriched with poly-A capture. The number of reads varies depending on degree of RNA degradation and tumor purity. The read length will determine the longest protein sequence you can recover, since Isovar's cDNA assembly only considers reads that overlap a variant. With 100bp reads you will be able to assemble at most 199bp of sequence around a somatic single nucleotide variant, and consequently only be to determine 66 amino acids from the protein sequence. If you disable the cDNA assembly algorithm then a 100bp read will only be able to determine 33 amino acids. +``` -## Commandline Tools -* `isovar-protein-sequences` -* `isovar-allele-counts` -* `isovar-allele-reads` -* `isovar-translations` -* `isovar-reference-contexts` -* `isovar-variant-reads` -* `isovar-variant-sequences` +## Internal Design + +![](isovar_design.png) + +The inputs to Isovar are one or more somatic variant call (VCF) files, along with a BAM file +containing aligned tumor RNA reads. The following objects are used to aggregate information within Isovar: + +* [LocusRead](https://github.com/openvax/isovar/blob/master/isovar/locus_read.py): Isovar examines each variant locus and extracts reads overlapping that locus, +represented by `LocusRead`. The `LocusRead` representation allows filtering based +on quality and alignment criteria (e.g. MAPQ > 0) which are thrown away in later stages +of Isovar. + +* [AlleleRead](https://github.com/openvax/isovar/blob/master/isovar/allele_read.py): Once `LocusRead` objects have been filtered, they are converted into a simplified +representation called `AlleleRead`. Each `AlleleRead` contains only the cDNA sequences +*before*, *at*, and *after* the variant locus. + +* [ReadEvidence](https://github.com/openvax/isovar/blob/master/isovar/read_evidence.py): +The set of `AlleleRead` objects overlapping a mutation's location may support many different +distinct allele. The `ReadEvidence` type represents the grouping of these reads into +*ref*, *alt* and *other* `AlleleRead` sets, where *ref* reads agree with the reference + sequence, *alt* reads agree with the given mutation, and *other* reads contain all + non-ref/non-alt alleles. The *alt* reads will be used later to determine +a mutant coding sequence, but the *ref* and *other* groups are also kept in case they are +useful for filtering. + +* [VariantSequence](https://github.com/openvax/isovar/blob/master/isovar/variant_sequence.py): +Overlapping `AlleleRead`s containing the same mutation are assembled into a longer +sequence. The `VariantSequence` object represents this candidate coding sequence, as well +as all the `AlleleRead` objects which were used to create it. 
+
+* [ReferenceContext](https://github.com/openvax/isovar/blob/master/isovar/reference_context.py): To determine the reading frame in which to translate a `VariantSequence`, Isovar
+looks at all Ensembl annotated transcripts overlapping the locus and collapses them
+into one or more `ReferenceContext` objects. Each `ReferenceContext` represents the
+cDNA sequence upstream of the variant locus and which of the {0, +1, +2} reading frames
+it is translated in.
+
+* [Translation](https://github.com/openvax/isovar/blob/master/isovar/translation.py): The reading frame from a `ReferenceContext` is used to translate a `VariantSequence`
+into a protein fragment, represented by a `Translation`.
+
+* [ProteinSequence](https://github.com/openvax/isovar/blob/master/isovar/protein_sequence.py):
+Multiple distinct variant sequences and reference contexts can generate the same translations,
+so we aggregate those equivalent `Translation` objects into a `ProteinSequence`.
+
+* [IsovarResult](https://github.com/openvax/isovar/blob/master/isovar/isovar_result.py): Since a single variant locus might have reads which assemble into multiple
+incompatible coding sequences, an `IsovarResult` represents a variant and the one or more `ProteinSequence`
+objects associated with it. We typically don't want to deal with *every* possible translation of *every*
+distinct sequence detected around a variant, so the protein sequences are sorted by their number of
+supporting fragments and the best protein sequence is made easy to access. The `IsovarResult` object also
+has many informative properties such as `num_alt_fragments`, `fraction_ref_reads`, etc.
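+
+As a small sketch of the read-grouping step described above, the internal helpers in
+`isovar/allele_read_helpers.py` can be called directly (the reads and alleles below are
+invented for illustration):
+
+```python
+from isovar import AlleleRead
+from isovar.allele_read_helpers import group_reads_by_allele, split_reads_into_ref_alt_other
+
+# three reads covering the same locus, differing only in the allele they carry
+reads = [
+    AlleleRead(prefix="ACT", allele="G", suffix="TTA", name="read1"),  # alt allele
+    AlleleRead(prefix="ACT", allele="A", suffix="TTA", name="read2"),  # ref allele
+    AlleleRead(prefix="ACT", allele="C", suffix="TTA", name="read3"),  # some other allele
+]
+
+# dictionary mapping each observed allele to the reads supporting it
+reads_by_allele = group_reads_by_allele(reads)
+
+# partition the reads into ref/alt/other groups for a variant with ref="A", alt="G",
+# mirroring how reads are grouped into ReadEvidence
+ref_reads, alt_reads, other_reads = split_reads_into_ref_alt_other(
+    ref="A", alt="G", overlapping_reads=reads)
+```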
+
+## Other Isovar Commandline Tools
+
+* `isovar-protein-sequences --vcf variants.vcf --bam rna.bam`
+
+  All protein sequences which can be assembled from RNA reads for any of the given variants.
+
+* `isovar-allele-counts --vcf variants.vcf --bam rna.bam`
+
+  Counts of reads and fragments supporting the ref, alt, and other alleles at all given variant locations.
+
+* `isovar-allele-reads --vcf variants.vcf --bam rna.bam`
+
+  Sequences of all reads overlapping any of the given variants.
+
+* `isovar-translations --vcf variants.vcf --bam rna.bam`
+
+  All possible translations of any assembled cDNA sequence containing any of the given variants, in the reading frame of any matching transcript.
+
+* `isovar-reference-contexts --vcf variants.vcf`
+
+  All candidate reference contexts (sequence and reading frame) before each variant, derived from overlapping reference coding transcripts.
+
+* `isovar-variant-reads --vcf variants.vcf --bam rna.bam`
+
+  Like `isovar-allele-reads`, but limited to reads which support the alt allele.
+
+* `isovar-variant-sequences --vcf variants.vcf --bam rna.bam`
+
+  All assembled cDNA coding sequences supporting any of the given variants.
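+
+For example, the per-variant allele counts can be written to a CSV file using the `--output`
+option described above (a usage sketch; the file names are placeholders):
+
+```sh
+$ isovar-allele-counts \
+    --vcf somatic-variants.vcf \
+    --bam rnaseq.bam \
+    --output allele-counts.csv
+```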
+ +## Sequencing Recommendations + +Isovar works best with high quality / high coverage mRNA sequence data. +This means that you will get best results from >100M paired-end reads sequenced on an +Illumina HiSeq from a library enriched with poly-A capture. The number of reads varies +depending on degree of RNA degradation and tumor purity. The read length will determine +the longest protein sequence you can recover, since Isovar's cDNA assembly only +considers reads that overlap a variant. With 100bp reads you will be able to assemble +at most 199bp of sequence around a somatic single nucleotide variant, and consequently +only be to determine 66 amino acids from the protein sequence. If you disable the cDNA +assembly algorithm then a 100bp read will only be able to determine 33 amino acids. diff --git a/isovar/__init__.py b/isovar/__init__.py index 4c7cfbc..867af11 100644 --- a/isovar/__init__.py +++ b/isovar/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,4 +14,34 @@ from __future__ import print_function, division, absolute_import -__version__ = "0.9.0" +__version__ = "1.0.0" + + +from .allele_read import AlleleRead +from .dataframe_helpers import isovar_results_to_dataframe +from .isovar_result import IsovarResult +from .locus_read import LocusRead +from .main import run_isovar +from .protein_sequence import ProteinSequence +from .protein_sequence_creator import ProteinSequenceCreator +from .read_collector import ReadCollector +from .read_evidence import ReadEvidence +from .variant_orf import VariantORF +from .variant_sequence import VariantSequence +from .variant_sequence_creator import VariantSequenceCreator + + +__all__ = [ + "run_isovar", + "isovar_results_to_dataframe", + "AlleleRead", + "IsovarResult", + "LocusRead", + "ProteinSequence", + "ProteinSequenceCreator", + "ReadCollector", + "ReadEvidence", + "VariantORF", + "VariantSequence", + "VariantSequenceCreator", +] diff --git a/isovar/alignment_score.py b/isovar/alignment_score.py new file mode 100644 index 0000000..56d9eeb --- /dev/null +++ b/isovar/alignment_score.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Sequence alignment helpers +""" + +from __future__ import print_function, division, absolute_import + + +def alignment_score(a, b, min_subsequence_length=1): + """ + Number of mismatches between all two input sequences, allows + for trimming of ends of sequences but not insertions or deletions + within the sequences. Number of trimmed amino acids from each sequence + count toward the mismatch total. + + Parameters + ---------- + a : str + + b : str + + min_subsequence_length : int + Only consider subsequences which are at least this long. 
+ Returns int + """ + + n_a = len(a) + n_b = len(b) + + # swap a and b if a is longer since the loops below expect the first + # sequence to be shorter. This happens because any subsequence of `a` + # is expected to be of a length that can be found within `b` + if n_a > n_b: + a, b = b, a + n_a, n_b = n_b, n_a + + # compare all subsequences of a and b and count the number of mismatches + # between + best_score = n_a + n_b + + # if we need to make sequence of at least length `min_subsequence_length` + # then we can't start the sequence more than that number of characters + # from the end of the string. + # For example, if n_a = 7 and min_subsequence_length = 6 then + # this loop should only consider start_a = {0, 1} but not any value + # higher than that. + for start_a in range(n_a - min_subsequence_length + 1): + # Similarly, only consider end indices for the subsequence of `a` + # which span the minimum number of characters required. + # For example, if n_a = 7, min_subsequence_length = 6, start_a = 1 + # then this loop should only consider end_a = 7. + for end_a in range(start_a + min_subsequence_length, n_a + 1): + subseq_a = a[start_a:end_a] + n_subseq_a = len(subseq_a) + n_trimmed_a = n_a - n_subseq_a + # consider all subsequences of the second string of the same length + # as the subsequence extracted from the first string + for start_b in range(n_b - n_subseq_a + 1): + subseq_b = b[start_b:start_b + n_subseq_a] + n_subseq_b = len(subseq_b) + assert n_subseq_a == n_subseq_b + n_trimmed_b = n_b - n_subseq_b + # now that we have two subsequences of the same length, + # count up the number of mismatching characters between them + n_mismatches = sum([ + a_char != b_char + for (a_char, b_char) + in zip(subseq_a, subseq_b)]) + score = n_trimmed_a + n_trimmed_b + n_mismatches + if score < best_score: + best_score = score + return best_score diff --git a/isovar/allele_counts.py b/isovar/allele_counts.py deleted file mode 100644 index 920391e..0000000 --- a/isovar/allele_counts.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -Summarizes which alleles are found at a locus overlapping a variant. 
-""" - -from __future__ import print_function, division, absolute_import -from collections import namedtuple - -from .variant_helpers import trim_variant -from .dataframe_builder import DataFrameBuilder -from .read_helpers import group_reads_by_allele - -AlleleCount = namedtuple( - "AlleleCount", [ - "n_ref", - "n_alt", - "n_other", - ]) - - -def count_alleles_at_variant_locus(variant, allele_reads): - allele_reads = list(allele_reads) - n_total = len(allele_reads) - allele_to_reads_dict = group_reads_by_allele(allele_reads) - _, ref, alt = trim_variant(variant) - n_ref = len(allele_to_reads_dict[ref]) - n_alt = len(allele_to_reads_dict[alt]) - n_other = n_total - (n_ref + n_alt) - return AlleleCount(n_ref=n_ref, n_alt=n_alt, n_other=n_other) - - -def allele_counts_dataframe(variant_and_allele_reads_generator): - """ - Creates a DataFrame containing number of reads supporting the - ref vs. alt alleles for each variant. - """ - df_builder = DataFrameBuilder( - AlleleCount, - extra_column_fns={ - "gene": lambda variant, _: ";".join(variant.gene_names), - }) - for variant, allele_reads in variant_and_allele_reads_generator: - counts = count_alleles_at_variant_locus(variant, allele_reads) - df_builder.add(variant, counts) - return df_builder.to_dataframe() diff --git a/isovar/allele_read.py b/isovar/allele_read.py new file mode 100644 index 0000000..123ae84 --- /dev/null +++ b/isovar/allele_read.py @@ -0,0 +1,108 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Reads overlapping a locus of interest split into prefix, +allele (ref, alt, or otherwise), and suffix portions +""" + +from __future__ import print_function, division, absolute_import +import logging + +from .string_helpers import convert_from_bytes_if_necessary, trim_N_nucleotides +from .value_object import ValueObject + +logger = logging.getLogger(__name__) + + +class AlleleRead(ValueObject): + """ + Extremely simplified representation of a read at a locus: just the allele + at the locus and sequence before/after. We're ignoring the base qualities + and any additional information about splicing, clipping or alignment. 
+ """ + __slots__ = ["prefix", "allele", "suffix", "name", "sequence"] + + def __init__(self, prefix, allele, suffix, name): + self.prefix = prefix + self.allele = allele + self.suffix = suffix + self.name = name + self.sequence = prefix + allele + suffix + + def __len__(self): + return len(self.sequence) + + @classmethod + def from_locus_read(cls, locus_read): + """ + Given a single LocusRead object, return either an AlleleRead or None + + Parameters + ---------- + locus_read : LocusRead + Read which overlaps a variant locus but doesn't necessarily contain the + alternate nucleotides + """ + sequence = locus_read.sequence + read_name = locus_read.name + + reference_base0_start_inclusive = locus_read.reference_base0_start_inclusive + reference_base0_end_exclusive = locus_read.reference_base0_end_exclusive + + read_base0_start_inclusive = locus_read.read_base0_start_inclusive + read_base0_end_exclusive = locus_read.read_base0_end_exclusive + + if read_base0_start_inclusive is None or read_base0_end_exclusive is None: + logger.debug( + "Skipping read '%s' because required bases in reference interval %s:%s aren't mapped", + read_name, + reference_base0_start_inclusive, + reference_base0_end_exclusive) + return None + + reference_positions = locus_read.reference_positions + + n_ref_bases = reference_base0_end_exclusive - reference_base0_start_inclusive + + insertion = (n_ref_bases == 0) + + if insertion: + # insertions require a sequence of non-aligned bases + # followed by the subsequence reference position + for read_index in range(read_base0_start_inclusive, read_base0_end_exclusive): + # all the inserted nucleotides should *not* align to the reference + if reference_positions[read_index] is not None: + logger.debug( + "Skipping read '%s', inserted nucleotides shouldn't map to reference", + read_name) + return None + + nucleotides_at_variant_locus = convert_from_bytes_if_necessary( + sequence[read_base0_start_inclusive:read_base0_end_exclusive]) + + if "N" in nucleotides_at_variant_locus: + logger.debug( + "Skipping read '%s', found N nucleotides at variant locus", + read_name) + prefix = convert_from_bytes_if_necessary(sequence[:read_base0_start_inclusive]) + suffix = convert_from_bytes_if_necessary(sequence[read_base0_end_exclusive:]) + + prefix, suffix = trim_N_nucleotides(prefix, suffix) + + return AlleleRead( + prefix, + nucleotides_at_variant_locus, + suffix, + name=read_name) diff --git a/isovar/read_helpers.py b/isovar/allele_read_helpers.py similarity index 57% rename from isovar/read_helpers.py rename to isovar/allele_read_helpers.py index a8bba30..19f189a 100644 --- a/isovar/read_helpers.py +++ b/isovar/allele_read_helpers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,25 @@ # limitations under the License. """ -Helper functions for working with RNA reads +Functions for filtering, grouping, and summarizing collections of +AlleleRead objects. """ -from __future__ import print_function, division, absolute_import from collections import defaultdict from .common import groupby +from .logging import get_logger +from .allele_read import AlleleRead + +logger = get_logger(__name__) + + +def group_reads_by_allele(allele_reads): + """ + Returns dictionary mapping each allele's nucleotide sequence to a list of + supporting AlleleRead objects. 
+ """ + return groupby(allele_reads, lambda read: read.allele) def get_single_allele_from_reads(allele_reads): @@ -39,10 +51,6 @@ def get_single_allele_from_reads(allele_reads): return seq -def make_prefix_suffix_pairs(allele_reads): - return [(r.prefix, r.suffix) for r in allele_reads] - - def group_unique_sequences( allele_reads, max_prefix_size=None, @@ -66,28 +74,46 @@ def group_unique_sequences( return groups -def count_unique_sequences( - allele_reads, - max_prefix_size=None, - max_suffix_size=None): +def allele_reads_from_locus_reads(locus_reads): """ - Given a list of AlleleRead objects, extracts all unique - (prefix, allele, suffix) sequences and associate each with the number - of reads that contain that sequence. + Attempt to convert each LocusRead object to an AlleleRead and return + the successfully converted objects. + + Parameters + ---------- + locus_reads : list of LocusRead + + Returns list of AlleleRead + ------- + """ - groups = group_unique_sequences( - allele_reads, - max_prefix_size=max_prefix_size, - max_suffix_size=max_suffix_size) - return { - seq_tuple: len(read_names) - for (seq_tuple, read_names) in groups.items() - } + allele_reads = [] + for locus_read in locus_reads: + allele_read = AlleleRead.from_locus_read(locus_read) + if allele_read is None: + continue + else: + allele_reads.append(allele_read) + return allele_reads -def group_reads_by_allele(allele_reads): +def split_reads_into_ref_alt_other(ref, alt, overlapping_reads): """ - Returns dictionary mapping each allele's nucleotide sequence to a list of - supporting AlleleRead objects. + Returns three lists of AlleleRead objects + - reads which support the reference allele + - reads which support the variant's alt allele + - reads which support other alleles """ - return groupby(allele_reads, lambda read: read.allele) + # convert to list in case it's a generator since + # we want to traverse the sequence repeatedly + overlapping_reads = list(overlapping_reads) + + reads_grouped_by_allele = group_reads_by_allele(overlapping_reads) + ref_reads = reads_grouped_by_allele.get(ref, []) + alt_reads = reads_grouped_by_allele.get(alt, []) + other_reads = [] + for allele, allele_reads in reads_grouped_by_allele.items(): + if allele in {ref, alt}: + continue + other_reads.extend(allele_reads) + return ref_reads, alt_reads, other_reads \ No newline at end of file diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py deleted file mode 100644 index fccaa07..0000000 --- a/isovar/allele_reads.py +++ /dev/null @@ -1,308 +0,0 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Reads overlapping a locus of interest split into prefix, -allele (ref, alt, or otherwise), and suffix portions -""" - -from __future__ import print_function, division, absolute_import -from collections import defaultdict -import logging - -from .locus_reads import locus_read_generator -from .default_parameters import ( - MIN_READ_MAPPING_QUALITY, - USE_SECONDARY_ALIGNMENTS, - USE_DUPLICATE_READS, -) -from .variant_helpers import trim_variant -from .dataframe_builder import DataFrameBuilder -from .string_helpers import convert_from_bytes_if_necessary, trim_N_nucleotides -from .value_object import ValueObject - -logger = logging.getLogger(__name__) - - -class AlleleRead(ValueObject): - __slots__ = ["prefix", "allele", "suffix", "name", "sequence"] - - def __init__(self, prefix, allele, suffix, name): - self.prefix = prefix - self.allele = allele - self.suffix = suffix - self.name = name - self.sequence = prefix + allele + suffix - - def __len__(self): - return len(self.sequence) - - @classmethod - def from_locus_read(cls, locus_read, n_ref): - """ - Given a single LocusRead object, return either an AlleleRead or None - - Parameters - ---------- - locus_read : LocusRead - Read which overlaps a variant locus but doesn't necessarily contain the - alternate nucleotides - - n_ref : int - Number of reference positions we are expecting to be modified or - deleted (for insertions this should be 0) - """ - sequence = locus_read.sequence - reference_positions = locus_read.reference_positions - - # positions of the nucleotides before and after the variant within - # the read sequence - read_pos_before = locus_read.base0_read_position_before_variant - read_pos_after = locus_read.base0_read_position_after_variant - - # positions of the nucleotides before and after the variant on the - # reference genome - ref_pos_before = reference_positions[read_pos_before] - - if ref_pos_before is None: - logger.warn( - "Missing reference pos for nucleotide before variant on read: %s", - locus_read) - return None - - ref_pos_after = reference_positions[read_pos_after] - - if ref_pos_after is None: - logger.warn( - "Missing reference pos for nucleotide after variant on read: %s", - locus_read) - return None - - if n_ref == 0: - if ref_pos_after - ref_pos_before != 1: - # if the number of nucleotides skipped isn't the same - # as the number of reference nucleotides in the variant then - # don't use this read - logger.debug( - "Positions before (%d) and after (%d) variant should be adjacent on read %s", - ref_pos_before, - ref_pos_after, - locus_read) - return None - - # insertions require a sequence of non-aligned bases - # followed by the subsequence reference position - ref_positions_for_inserted = reference_positions[ - read_pos_before + 1:read_pos_after] - if any(insert_pos is not None for insert_pos in ref_positions_for_inserted): - # all these inserted nucleotides should *not* align to the - # reference - logger.debug( - "Skipping read, inserted nucleotides shouldn't map to reference") - return None - else: - # substitutions and deletions - if ref_pos_after - ref_pos_before != n_ref + 1: - # if the number of nucleotides skipped isn't the same - # as the number of reference nucleotides in the variant then - # don't use this read - logger.debug( - ("Positions before (%d) and after (%d) variant should be " - "adjacent on read %s"), - ref_pos_before, - ref_pos_after, - locus_read) - return None - - nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after] - - prefix = sequence[:read_pos_before 
+ 1] - suffix = sequence[read_pos_after:] - - prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix) - prefix, suffix = trim_N_nucleotides(prefix, suffix) - - return cls( - prefix, - nucleotides_at_variant_locus, - suffix, - name=locus_read.name) - -def allele_reads_from_locus_reads(locus_reads, n_ref): - """ - Given a collection of LocusRead objects, returns a - list of AlleleRead objects - (which are split into prefix/allele/suffix nucleotide strings). - - Parameters - ---------- - locus_reads : sequence of LocusRead records - - n_ref : int - Number of reference nucleotides affected by variant. - - Generates AlleleRead objects. - """ - - for locus_read in locus_reads: - allele_read = AlleleRead.from_locus_read(locus_read, n_ref) - if allele_read is None: - continue - else: - yield allele_read - -def reads_overlapping_variant( - samfile, - variant, - chromosome=None, - use_duplicate_reads=USE_DUPLICATE_READS, - use_secondary_alignments=USE_SECONDARY_ALIGNMENTS, - min_mapping_quality=MIN_READ_MAPPING_QUALITY): - """ - Find reads in the given SAM/BAM file which overlap the given variant and - return them as a list of AlleleRead objects. - - Parameters - ---------- - samfile : pysam.AlignmentFile - - variant : varcode.Variant - - chromosome : str - - use_duplicate_reads : bool - Should we use reads that have been marked as PCR duplicates - - use_secondary_alignments : bool - Should we use reads at locations other than their best alignment - - min_mapping_quality : int - Drop reads below this mapping quality - - only_alt_allele : bool - Filter reads to only include those that support the alt allele of - the variant. - - Returns sequence of AlleleRead objects. - """ - logger.info("Gathering reads for %s", variant) - if chromosome is None: - chromosome = variant.contig - - logger.info( - "Gathering variant reads for variant %s (chromosome = %s, gene names = %s)", - variant, - chromosome, - variant.gene_names) - - base1_position, ref, alt = trim_variant(variant) - - if len(ref) == 0: - # if the variant is an insertion - base1_position_before_variant = base1_position - base1_position_after_variant = base1_position + 1 - else: - base1_position_before_variant = base1_position - 1 - base1_position_after_variant = base1_position + len(ref) - - locus_reads = locus_read_generator( - samfile=samfile, - chromosome=chromosome, - base1_position_before_variant=base1_position_before_variant, - base1_position_after_variant=base1_position_after_variant, - use_duplicate_reads=use_duplicate_reads, - use_secondary_alignments=use_secondary_alignments, - min_mapping_quality=min_mapping_quality) - - allele_reads = allele_reads_from_locus_reads( - locus_reads=locus_reads, - n_ref=len(ref)) - - return allele_reads - -def reads_overlapping_variants(variants, samfile, **kwargs): - """ - Generates sequence of tuples, each containing a variant paired with - a list of AlleleRead objects. - - Parameters - ---------- - variants : varcode.VariantCollection - - samfile : pysam.AlignmentFile - - use_duplicate_reads : bool - Should we use reads that have been marked as PCR duplicates - - use_secondary_alignments : bool - Should we use reads at locations other than their best alignment - - min_mapping_quality : int - Drop reads below this mapping quality - """ - chromosome_names = set(samfile.references) - for variant in variants: - # I imagine the conversation went like this: - # A: "Hey, I have an awesome idea" - # B: "What's up?" 
- # A: "Let's make two nearly identical reference genomes" - # B: "But...that sounds like it might confuse people." - # A: "Nah, it's cool, we'll give the chromosomes different prefixes!" - # B: "OK, sounds like a good idea." - if variant.contig in chromosome_names: - chromosome = variant.contig - elif "chr" + variant.contig in chromosome_names: - chromosome = "chr" + variant.contig - else: - logger.warn( - "Chromosome '%s' from variant %s not in alignment file %s", - chromosome, - variant, - samfile.filename) - yield variant, [] - continue - allele_reads = reads_overlapping_variant( - samfile=samfile, - chromosome=chromosome, - variant=variant, - **kwargs) - yield variant, allele_reads - - -def group_reads_by_allele(allele_reads): - """ - Returns dictionary mapping each allele's nucleotide sequence to a list of - supporting AlleleRead objects. - """ - allele_to_reads_dict = defaultdict(list) - for allele_read in allele_reads: - allele_to_reads_dict[allele_read.allele].append(allele_read) - return allele_to_reads_dict - -def reads_to_dataframe(variants_and_allele_reads): - """ - Parameters - ---------- - variants_and_allele_reads : sequence - List or generator of pairs whose first element is a Variant and - whose second element is a sequence of AlleleRead objects. - """ - df_builder = DataFrameBuilder( - AlleleRead, - extra_column_fns={ - "gene": lambda variant, _: ";".join(variant.gene_names), - }) - for variant, allele_reads in variants_and_allele_reads: - df_builder.add_many(variant, allele_reads) - return df_builder.to_dataframe() diff --git a/isovar/assembly.py b/isovar/assembly.py index 067de40..a4edb31 100644 --- a/isovar/assembly.py +++ b/isovar/assembly.py @@ -42,7 +42,7 @@ def greedy_merge_helper( # arguments if sequence1 is on the right of sequence2 for j in range(i + 1, len(variant_sequences)): sequence2 = variant_sequences[j] - combined = sequence1.combine(sequence2) + combined = sequence1.combine(sequence2, min_overlap_size=min_overlap_size) if combined is None: continue if combined.sequence in merged_variant_sequences: @@ -57,6 +57,7 @@ def greedy_merge_helper( result = list(merged_variant_sequences.values()) + list(unmerged_variant_sequences) return result, merged_any + def greedy_merge( variant_sequences, min_overlap_size=MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE): @@ -75,6 +76,7 @@ def greedy_merge( min_overlap_size=min_overlap_size) return variant_sequences + def collapse_substrings(variant_sequences): """ Combine shorter sequences which are fully contained in longer sequences. @@ -115,6 +117,7 @@ def collapse_substrings(variant_sequences): for variant_sequence in result_list ] + def iterative_overlap_assembly( variant_sequences, min_overlap_size=MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE): diff --git a/isovar/cli/__init__.py b/isovar/cli/__init__.py index c2d0eee..312aa6d 100644 --- a/isovar/cli/__init__.py +++ b/isovar/cli/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine +# Copyright (c) 2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,3 +14,10 @@ from __future__ import print_function, division, absolute_import + +from .main_args import make_isovar_arg_parser, run_isovar_from_parsed_args + +__all__ = [ + "make_isovar_arg_parser", + "run_isovar_from_parsed_args" +] \ No newline at end of file diff --git a/isovar/cli/filter_args.py b/isovar/cli/filter_args.py new file mode 100644 index 0000000..008f98d --- /dev/null +++ b/isovar/cli/filter_args.py @@ -0,0 +1,83 @@ +# Copyright (c) 2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Common command-line arguments for filtering Isovar results +""" + +from __future__ import print_function, division, absolute_import + +from collections import OrderedDict + +from ..default_parameters import ( + MIN_NUM_RNA_ALT_FRAGMENTS, + MIN_NUM_RNA_ALT_READS, + MIN_FRACTION_RNA_ALT_FRAGMENTS, + MIN_RATIO_RNA_ALT_TO_OTHER_FRAGMENTS +) + +def add_filter_args(parser): + """ + Extends an ArgumentParser instance with commandline arguments related + to filtering variants and/or their associated protein sequences. + """ + filter_group = parser.add_argument_group("Filtering") + filter_group.add_argument( + "--min-alt-rna-reads", + type=int, + default=MIN_NUM_RNA_ALT_READS, + help="Minimum number of reads supporting variant allele (default %(default)s)") + + filter_group.add_argument( + "--min-alt-rna-fragments", + type=int, + default=MIN_NUM_RNA_ALT_FRAGMENTS, + help=( + "Minimum number of fragments supporting variant allele (default %(default)s). " + "Note that this option is the same as --min-alt-rna-reads for single-end " + "sequencing.")) + + filter_group.add_argument( + "--min-alt-rna-fraction", + type=float, + default=MIN_FRACTION_RNA_ALT_FRAGMENTS, + help=( + "Minimum ratio of fragments supporting variant allele to total RNA fragments " + "(default %(default)s).")) + + filter_group.add_argument( + "--min-ratio-alt-to-other-fragments", + type=float, + default=MIN_RATIO_RNA_ALT_TO_OTHER_FRAGMENTS, + help=( + "At loci where alleles other than the ref and a single alt are supported, " + "this parameter controls how many more times fragments supporting " + "the variant allele are required relative to other non-reference " + "alleles (default %(default)s).")) + return filter_group + + +def filter_threshold_dict_from_args(args): + """ + Convert names of filters from external CLI options to more + consistent naming scheme of {min|max}_{Isovar property}. + + Returns OrderedDict + """ + d = OrderedDict() + d["min_ratio_alt_to_other_fragments"] = args.min_ratio_alt_to_other_fragments + d["min_fraction_alt_fragments"] = args.min_alt_rna_fraction + d["min_num_alt_fragments"] = args.min_alt_rna_fragments + d["min_num_alt_reads"] = args.min_alt_rna_reads + return d diff --git a/isovar/cli/isovar_allele_counts.py b/isovar/cli/isovar_allele_counts.py index 9a298cf..ce47fb8 100644 --- a/isovar/cli/isovar_allele_counts.py +++ b/isovar/cli/isovar_allele_counts.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. 
Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,12 +19,9 @@ from __future__ import print_function, division, absolute_import import sys -from ..allele_counts import allele_counts_dataframe from ..logging import get_logger -from .rna_args import ( - make_rna_reads_arg_parser, - allele_reads_generator_from_args -) + +from .rna_args import make_rna_reads_arg_parser, read_evidence_dataframe_from_args from .output_args import add_output_args, write_dataframe @@ -42,9 +39,8 @@ def run(args=None): args = sys.argv[1:] args = parser.parse_args(args) logger.info(args) - variants_and_allele_reads_generator = allele_reads_generator_from_args(args) - allele_counts_df = allele_counts_dataframe(variants_and_allele_reads_generator) - logger.info(allele_counts_df) + df = read_evidence_dataframe_from_args(args) + logger.info(df) write_dataframe( - df=allele_counts_df, + df=df, args=args) diff --git a/isovar/cli/isovar_allele_reads.py b/isovar/cli/isovar_allele_reads.py index b10df45..abdd4fd 100644 --- a/isovar/cli/isovar_allele_reads.py +++ b/isovar/cli/isovar_allele_reads.py @@ -22,7 +22,7 @@ from ..logging import get_logger from .rna_args import ( make_rna_reads_arg_parser, - allele_reads_dataframe_from_args + read_evidence_dataframe_from_args ) from .output_args import add_output_args, write_dataframe @@ -38,6 +38,6 @@ def run(args=None): args = sys.argv[1:] args = parser.parse_args(args) logger.info(args) - df = allele_reads_dataframe_from_args(args) + df = read_evidence_dataframe_from_args(args) logger.info(df) write_dataframe(df, args) diff --git a/isovar/cli/isovar_main.py b/isovar/cli/isovar_main.py new file mode 100644 index 0000000..c4c6387 --- /dev/null +++ b/isovar/cli/isovar_main.py @@ -0,0 +1,48 @@ +# Copyright (c) 2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Primary Isovar command, used to collect information about variants, +the RNA reads which overlap and protein sequences which can be constructed +from reads that support the variant. 
+""" +from __future__ import print_function, division, absolute_import + +import sys + + + +from ..logging import get_logger +from ..dataframe_helpers import isovar_results_to_dataframe + + +from .main_args import run_isovar_from_parsed_args, make_isovar_arg_parser + +from .output_args import add_output_args, write_dataframe + +logger = get_logger(__name__) + +def run(args=None): + if args is None: + args = sys.argv[1:] + parser = make_isovar_arg_parser() + parser = add_output_args( + parser, + filename="isovar-results.csv") + args = parser.parse_args(args) + logger.info(args) + isovar_results = run_isovar_from_parsed_args(args) + df = isovar_results_to_dataframe(isovar_results) + logger.info(df) + write_dataframe(df, args) diff --git a/isovar/cli/isovar_translations.py b/isovar/cli/isovar_translations.py index a3d7158..a88a3b5 100644 --- a/isovar/cli/isovar_translations.py +++ b/isovar/cli/isovar_translations.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,14 +21,11 @@ import sys from ..logging import get_logger -from ..translation import ( - translate_variants, - translations_generator_to_dataframe -) -from .translation_args import ( - make_translation_arg_parser, -) -from .rna_args import variant_reads_generator_from_args +from ..protein_sequence_creator import ProteinSequenceCreator +from ..dataframe_helpers import translations_generator_to_dataframe + +from .translation_args import make_translation_arg_parser +from .rna_args import read_evidence_generator_from_args from .output_args import add_output_args, write_dataframe logger = get_logger(__name__) @@ -41,19 +38,26 @@ def translations_generator_from_args(args): - variant_reads_generator = variant_reads_generator_from_args(args) - return translate_variants( - variant_reads_generator, + """ + Given parsed commandline arguments, returns a generator whose elements + are (varcode.Variant, [Translation]) + """ + read_evidence_generator = read_evidence_generator_from_args(args) + protein_sequence_creator = ProteinSequenceCreator( protein_sequence_length=args.protein_sequence_length, - min_alt_rna_reads=args.min_alt_rna_reads, min_variant_sequence_coverage=args.min_variant_sequence_coverage, variant_sequence_assembly=args.variant_sequence_assembly, min_transcript_prefix_length=args.min_transcript_prefix_length, max_transcript_mismatches=args.max_reference_transcript_mismatches, - include_mismatches_after_variant=args.include_mismatches_after_variant) + count_mismatches_after_variant=args.count_mismatches_after_variant) + return protein_sequence_creator.translate_variants(read_evidence_generator) def translations_dataframe_from_args(args): + """ + Collects Translation objects based on commandline arguments and + converts them into a DataFrame. 
+ """ translations_generator = translations_generator_from_args(args) return translations_generator_to_dataframe(translations_generator) diff --git a/isovar/cli/isovar_variant_reads.py b/isovar/cli/isovar_variant_reads.py index 662e7be..1ba3203 100644 --- a/isovar/cli/isovar_variant_reads.py +++ b/isovar/cli/isovar_variant_reads.py @@ -21,7 +21,7 @@ from ..logging import get_logger from .rna_args import ( - variant_reads_dataframe_from_args, + read_evidence_dataframe_from_args, make_rna_reads_arg_parser, ) from .output_args import add_output_args, write_dataframe @@ -41,6 +41,6 @@ def run(args=None): args = sys.argv[1:] args = parser.parse_args(args) logger.info(args) - df = variant_reads_dataframe_from_args(args) + df = read_evidence_dataframe_from_args(args) logger.info(df) write_dataframe(df, args) diff --git a/isovar/cli/isovar_variant_sequences.py b/isovar/cli/isovar_variant_sequences.py index 80498d8..64fcdf7 100644 --- a/isovar/cli/isovar_variant_sequences.py +++ b/isovar/cli/isovar_variant_sequences.py @@ -13,15 +13,16 @@ # limitations under the License. from __future__ import print_function, division, absolute_import + import sys from ..logging import get_logger -from ..variant_sequences import ( - reads_generator_to_sequences_generator, - variant_sequences_generator_to_dataframe -) -from .rna_args import allele_reads_generator_from_args +from ..variant_sequence_creator import VariantSequenceCreator +from ..dataframe_helpers import variant_sequences_generator_to_dataframe + + +from .rna_args import read_evidence_generator_from_args from .variant_sequences_args import make_variant_sequences_arg_parser from .output_args import add_output_args, write_dataframe @@ -35,14 +36,16 @@ def variant_sequences_generator_from_args(args): - allele_reads_generator = allele_reads_generator_from_args(args) - return reads_generator_to_sequences_generator( - allele_reads_generator, - min_alt_rna_reads=args.min_alt_rna_reads, + """ + Use parsed commandline arguments to load variants and RNA reads and + generate a sequence of (Variant, list of VariantSequence) pairs. + """ + read_evidence_generator = read_evidence_generator_from_args(args) + variant_sequence_creator = VariantSequenceCreator( min_variant_sequence_coverage=args.min_variant_sequence_coverage, preferred_sequence_length=args.variant_sequence_length, variant_sequence_assembly=args.variant_sequence_assembly) - + return variant_sequence_creator.sequences_from_read_evidence_generator(read_evidence_generator) def variant_sequences_dataframe_from_args(args): variant_sequences_generator = variant_sequences_generator_from_args(args) diff --git a/isovar/cli/main_args.py b/isovar/cli/main_args.py new file mode 100644 index 0000000..6f6addf --- /dev/null +++ b/isovar/cli/main_args.py @@ -0,0 +1,59 @@ +# Copyright (c) 2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Create parser and run Isovar from parsed args +""" + +from __future__ import print_function, division, absolute_import + +from varcode.cli import variant_collection_from_args + +from ..main import run_isovar + +from .protein_sequence_args import ( + make_protein_sequences_arg_parser, + protein_sequence_creator_from_args +) +from .filter_args import add_filter_args, filter_threshold_dict_from_args +from .rna_args import read_collector_from_args, alignment_file_from_args + +def make_isovar_arg_parser(**kwargs): + """ + Create argument parser with all options required to run Isovar + + Parameters + ---------- + **kwargs : dict + Passed directly to argparse.ArgumentParser + """ + parser = make_protein_sequences_arg_parser(**kwargs) + add_filter_args(parser) + return parser + +def run_isovar_from_parsed_args(args): + """ + Extract parameters from parsed arguments and use them to run Isovar + """ + variants = variant_collection_from_args(args) + read_collector = read_collector_from_args(args) + alignment_file = alignment_file_from_args(args) + protein_sequence_creator = protein_sequence_creator_from_args(args) + filter_thresholds = filter_threshold_dict_from_args(args) + return run_isovar( + variants=variants, + alignment_file=alignment_file, + read_collector=read_collector, + protein_sequence_creator=protein_sequence_creator, + filter_thresholds=filter_thresholds) diff --git a/isovar/cli/output_args.py b/isovar/cli/output_args.py index 5e128b3..0b423d3 100644 --- a/isovar/cli/output_args.py +++ b/isovar/cli/output_args.py @@ -38,6 +38,10 @@ def add_output_args( def write_dataframe(df, args): + """ + Write a DataFrame to location specified in commandline arguments, + optionally filtered by specific columns + """ assert len(args.output) > 0 if args.output_columns is not None and len(args.output_columns) > 0: valid_columns = set(df.columns) diff --git a/isovar/cli/protein_sequence_args.py b/isovar/cli/protein_sequence_args.py index c53edba..995022b 100644 --- a/isovar/cli/protein_sequence_args.py +++ b/isovar/cli/protein_sequence_args.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
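Taken together, `make_isovar_arg_parser`, `run_isovar_from_parsed_args`, `add_output_args`, and `write_dataframe` are enough to build a small standalone entry point. A sketch of such a script follows (the script itself and its default output filename are invented for illustration; it mirrors the `run()` function shown earlier in this diff):

```python
import sys

from isovar.cli.main_args import (
    make_isovar_arg_parser,
    run_isovar_from_parsed_args,
)
from isovar.cli.output_args import add_output_args, write_dataframe
from isovar.dataframe_helpers import isovar_results_to_dataframe


def main(raw_args=None):
    if raw_args is None:
        raw_args = sys.argv[1:]
    # build the full Isovar argument parser and attach --output options
    parser = add_output_args(
        make_isovar_arg_parser(),
        filename="my-isovar-results.csv")
    args = parser.parse_args(raw_args)
    # run the pipeline, flatten the IsovarResult objects, write them out
    isovar_results = run_isovar_from_parsed_args(args)
    df = isovar_results_to_dataframe(isovar_results)
    write_dataframe(df, args)


if __name__ == "__main__":
    main()
```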
@@ -16,12 +16,10 @@ from __future__ import print_function, division, absolute_import from ..default_parameters import MAX_PROTEIN_SEQUENCES_PER_VARIANT -from ..protein_sequences import ( - protein_sequences_generator_to_dataframe, - reads_generator_to_protein_sequences_generator -) +from ..main import ProteinSequenceCreator +from ..dataframe_helpers import protein_sequences_generator_to_dataframe -from .rna_args import allele_reads_generator_from_args +from .rna_args import read_evidence_generator_from_args from .translation_args import make_translation_arg_parser @@ -41,6 +39,19 @@ def add_protein_sequence_args(parser): return protein_sequence_group +def protein_sequence_creator_from_args(args): + """ + Create ProteinSequenceCreator instance from parsed commandline arguments + """ + return ProteinSequenceCreator( + protein_sequence_length=args.protein_sequence_length, + min_variant_sequence_coverage=args.min_variant_sequence_coverage, + min_transcript_prefix_length=args.min_transcript_prefix_length, + max_transcript_mismatches=args.max_reference_transcript_mismatches, + max_protein_sequences_per_variant=args.max_protein_sequences_per_variant, + variant_sequence_assembly=args.variant_sequence_assembly) + + def make_protein_sequences_arg_parser(**kwargs): """ Parameters @@ -60,18 +71,22 @@ def make_protein_sequences_arg_parser(**kwargs): def protein_sequences_generator_from_args(args): - allele_reads_generator = allele_reads_generator_from_args(args) - return reads_generator_to_protein_sequences_generator( - allele_reads_generator, - protein_sequence_length=args.protein_sequence_length, - min_alt_rna_reads=args.min_alt_rna_reads, - min_variant_sequence_coverage=args.min_variant_sequence_coverage, - min_transcript_prefix_length=args.min_transcript_prefix_length, - max_transcript_mismatches=args.max_reference_transcript_mismatches, - max_protein_sequences_per_variant=args.max_protein_sequences_per_variant, - variant_sequence_assembly=args.variant_sequence_assembly) + """ + Uses parsed commandline arguments to load variants and aligned + reads and uses them to generate sequence (Variant, list of ProteinSequence) + pairs. + """ + read_evidence_generator = read_evidence_generator_from_args(args) + protein_sequence_creator = protein_sequence_creator_from_args(args) + return protein_sequence_creator.protein_sequences_from_read_evidence_generator( + read_evidence_generator) def protein_sequences_dataframe_from_args(args): + """ + Use parsed commandline arguments to load variants, aligned RNA reads, + create protein sequences for each variant and generate a DataFrame + for them all. 
+ """ protein_sequences_generator = protein_sequences_generator_from_args(args) return protein_sequences_generator_to_dataframe(protein_sequences_generator) diff --git a/isovar/cli/reference_context_args.py b/isovar/cli/reference_context_args.py index ff3c6b9..3fce8a0 100644 --- a/isovar/cli/reference_context_args.py +++ b/isovar/cli/reference_context_args.py @@ -19,8 +19,9 @@ variant_collection_from_args ) +from ..reference_context_helpers import reference_contexts_generator from ..default_parameters import CDNA_CONTEXT_SIZE -from ..reference_context import variants_to_reference_contexts_dataframe +from ..dataframe_helpers import variants_to_reference_contexts_dataframe def add_reference_context_args(parser): @@ -54,7 +55,12 @@ def make_reference_context_arg_parser(**kwargs): def reference_contexts_dataframe_from_args(args): + """ + Generate a DataFrame for variants and their associated reference contexts + loaded based on parsed commandline arguments. + """ variants = variant_collection_from_args(args) - return variants_to_reference_contexts_dataframe( + reference_context_gen = reference_contexts_generator( variants=variants, context_size=args.context_size) + return variants_to_reference_contexts_dataframe(reference_context_gen) diff --git a/isovar/cli/rna_args.py b/isovar/cli/rna_args.py index 35cac3d..5eee2a2 100644 --- a/isovar/cli/rna_args.py +++ b/isovar/cli/rna_args.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Common command-line arguments for all Isovar commands which use RNA +""" from __future__ import print_function, division, absolute_import @@ -20,8 +23,9 @@ from varcode.cli import make_variants_parser, variant_collection_from_args from ..default_parameters import MIN_READ_MAPPING_QUALITY -from ..allele_reads import reads_overlapping_variants, reads_to_dataframe -from ..variant_reads import reads_supporting_variants + +from ..read_collector import ReadCollector +from ..dataframe_helpers import allele_reads_to_dataframe, read_evidence_generator_to_dataframe def add_rna_args( @@ -30,7 +34,6 @@ def add_rna_args( """ Extends an ArgumentParser instance with the following commandline arguments: --bam - --min-reads --min-mapping-quality --use-duplicate-reads --drop-secondary-alignments @@ -50,12 +53,19 @@ def add_rna_args( rna_group.add_argument( "--use-duplicate-reads", default=False, - action="store_true") + action="store_true", + help=( + "By default, reads which have been marked as duplicates are excluded." + "Use this option to include duplicate reads.")) rna_group.add_argument( "--drop-secondary-alignments", default=False, - action="store_true") + action="store_true", + help=( + "By default, secondary alignments are included in reads, " + "use this option to instead only use primary alignments.")) + return rna_group @@ -74,35 +84,57 @@ def make_rna_reads_arg_parser(**kwargs): return parser -def samfile_from_args(args): +def alignment_file_from_args(args): + """ + Use parsed arguments to load a file of aligned RNA reads. 
+ """ return AlignmentFile(args.bam) -def allele_reads_generator_from_args(args): - variants = variant_collection_from_args(args) - samfile = samfile_from_args(args) - return reads_overlapping_variants( - variants=variants, - samfile=samfile, +def read_collector_from_args(args): + """ + Use parsed arguments to create a ReadCollector object + """ + return ReadCollector( + min_mapping_quality=args.min_mapping_quality, use_duplicate_reads=args.use_duplicate_reads, - use_secondary_alignments=not args.drop_secondary_alignments, - min_mapping_quality=args.min_mapping_quality) + use_secondary_alignments=not args.drop_secondary_alignments) -def allele_reads_dataframe_from_args(args): - return reads_to_dataframe(allele_reads_generator_from_args(args)) +def read_evidence_generator_from_args(args): + """ + Creates a generator of (Variant, ReadEvidence) pairs from parsed + arguments. + """ + variants = variant_collection_from_args(args) + samfile = alignment_file_from_args(args) + read_creator = read_collector_from_args(args) + return read_creator.read_evidence_generator( + variants=variants, + alignment_file=samfile) def variant_reads_generator_from_args(args): - variants = variant_collection_from_args(args) - samfile = samfile_from_args(args) - return reads_supporting_variants( - variants=variants, - samfile=samfile, - use_duplicate_reads=args.use_duplicate_reads, - use_secondary_alignments=not args.drop_secondary_alignments, - min_mapping_quality=args.min_mapping_quality) + """ + Creates a generator of (Variant, list of AlleleRead) from parsed + arguments, where all AlleleRead objects must have alleles matching + the variant. + """ + for variant, read_evidence in read_evidence_generator_from_args(args): + yield variant, read_evidence.alt_reads -def variant_reads_dataframe_from_args(args): - return reads_to_dataframe(variant_reads_generator_from_args(args)) +def read_evidence_dataframe_from_args(args): + """ + Collect ReadEvidence for each variant and turn them into a DataFrame + """ + return read_evidence_generator_to_dataframe( + read_evidence_generator_from_args(args)) + + +def variants_reads_dataframe_from_args(args): + """ + Collect variant reads for each variant and turn them into a DataFrame + """ + return allele_reads_to_dataframe( + read_evidence_generator_from_args(args)) diff --git a/isovar/cli/translation_args.py b/isovar/cli/translation_args.py index 1daa118..95693f3 100644 --- a/isovar/cli/translation_args.py +++ b/isovar/cli/translation_args.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Common command-line arguments for all Isovar commands which translate +cDNA into protein sequences. 
+""" from __future__ import print_function, division, absolute_import from ..default_parameters import ( MIN_TRANSCRIPT_PREFIX_LENGTH, MAX_REFERENCE_TRANSCRIPT_MISMATCHES, - INCLUDE_MISMATCHES_AFTER_VARIANT, + COUNT_MISMATCHES_AFTER_VARIANT, PROTEIN_SEQUENCE_LENGTH, ) from .variant_sequences_args import make_variant_sequences_arg_parser @@ -42,9 +46,9 @@ def add_translation_args(parser): " reference sequence before a candidate reading frame is ignored.")) translation_group.add_argument( - "--include-mismatches-after-variant", + "--count-mismatches-after-variant", action="store_true", - default=INCLUDE_MISMATCHES_AFTER_VARIANT, + default=COUNT_MISMATCHES_AFTER_VARIANT, help="If true, mismatches after the variant locus will count toward the " "--max-reference-transcript-mismatches filter.") diff --git a/isovar/cli/variant_sequences_args.py b/isovar/cli/variant_sequences_args.py index 23db10c..ea31c30 100644 --- a/isovar/cli/variant_sequences_args.py +++ b/isovar/cli/variant_sequences_args.py @@ -16,7 +16,6 @@ from __future__ import print_function, division, absolute_import from ..default_parameters import ( - MIN_ALT_RNA_READS, MIN_VARIANT_SEQUENCE_COVERAGE, VARIANT_SEQUENCE_LENGTH, ) @@ -29,12 +28,6 @@ def add_variant_sequence_args( rna_sequence_group = parser.add_argument_group( "Determine coding sequence from RNA") - rna_sequence_group.add_argument( - "--min-alt-rna-reads", - type=int, - default=MIN_ALT_RNA_READS, - help="Minimum number of reads supporting variant allele (default %(default)s)") - rna_sequence_group.add_argument( "--min-variant-sequence-coverage", type=int, diff --git a/isovar/common.py b/isovar/common.py index 097e10a..8132cca 100644 --- a/isovar/common.py +++ b/isovar/common.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ from __future__ import print_function, division, absolute_import from collections import defaultdict +import numpy as np def list_to_string(list_of_anything, sep=";"): @@ -35,3 +36,27 @@ def groupby(xs, key_fn): key = key_fn(x) result[key].append(x) return result + + +def safediv(x, y): + """ + Compute ratio between two fields safely, so that + if numerator is zero, result is zero and if denominator + is zero then result is infinity. + + Parameters + ---------- + x : int or float + Numerator value + + y : int or float + Denominator value + + Returns float + """ + if x == 0: + return 0.0 + elif y == 0: + return np.inf + else: + return x / y diff --git a/isovar/dataframe_builder.py b/isovar/dataframe_builder.py index 08b5645..d6e1687 100644 --- a/isovar/dataframe_builder.py +++ b/isovar/dataframe_builder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine +# Copyright (c) 2016-2019 Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -129,8 +129,20 @@ def __init__( self.columns_dict = OrderedDict(columns_list) def add(self, variant, element): + """ + Add one row to the DataFrame + + Parameters + ---------- + variant : varcode.Variant + + element : must have type self.element_class + """ if self.variant_columns: - assert isinstance(variant, Variant) + assert isinstance(variant, Variant), \ + "Expected %s : %s to be a Variant" % ( + variant, + type(variant)) self.columns_dict["chr"].append(variant.contig) self.columns_dict["pos"].append(variant.original_start) self.columns_dict["ref"].append(variant.original_ref) @@ -138,7 +150,9 @@ def add(self, variant, element): else: assert variant is None - assert isinstance(element, self.element_class) + assert isinstance(element, self.element_class), \ + "Expected %s : %s to have type %s" % ( + element, type(element), self.element_class) for name in self.original_field_names: value = getattr(element, name) @@ -149,6 +163,8 @@ def add(self, variant, element): if isinstance(value, COLLECTION_TYPES) and self.convert_collections_to_size: value = len(value) + elif value is None: + value = None elif not isinstance(value, VALID_ELEMENT_TYPES): raise ValueError( "Please provider converter for field '%s' : %s to make a scalar or string" % ( @@ -182,15 +198,12 @@ def _check_column_lengths(self): "Mismatch between lengths of columns: %s" % (column_lengths_dict,)) def to_dataframe(self): + """ + Creates dataframe from accumulated rows + + Returns + ------- + pandas.DataFrame + """ self._check_column_lengths() return pd.DataFrame(self.columns_dict) - - -def dataframe_from_generator( - element_class, - variant_and_elements_generator, - **kwargs): - builder = DataFrameBuilder(element_class, **kwargs) - for variant, elements in variant_and_elements_generator: - builder.add_many(variant, elements) - return builder.to_dataframe() diff --git a/isovar/dataframe_helpers.py b/isovar/dataframe_helpers.py new file mode 100644 index 0000000..67a97a5 --- /dev/null +++ b/isovar/dataframe_helpers.py @@ -0,0 +1,220 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import + +import pandas as pd + +from .allele_read import AlleleRead +from .common import list_to_string +from .dataframe_builder import DataFrameBuilder +from .locus_read import LocusRead +from .protein_sequence import ProteinSequence +from .read_collector import ReadCollector +from .read_evidence import ReadEvidence +from .reference_context import ReferenceContext +from .translation import Translation +from .variant_sequence import VariantSequence + + +def dataframe_from_generator( + element_class, + variant_and_elements_generator, + **kwargs): + """ + Creates a DataFrame from a generator whose elements + are varcode.Variant objects paired with objects + of the `element_class` type. 
+ + Parameters + ---------- + element_class : type + + variant_and_elements_generator : generator + Elements are (varcode.Variant, element_class) + + **kwargs : dict + Additional arguments to pass to DataFrameBuilder. + + Returns + ------- + pandas.DataFrame + """ + builder = DataFrameBuilder(element_class, **kwargs) + for variant, x in variant_and_elements_generator: + if isinstance(x, (list, tuple)): + builder.add_many(variant, x) + else: + builder.add(variant, x) + return builder.to_dataframe() + + +def protein_sequences_generator_to_dataframe(variant_and_protein_sequences_generator): + """ + Given a generator which yields (Variant, [ProteinSequence]) elements, + returns a pandas.DataFrame + """ + return dataframe_from_generator( + element_class=ProteinSequence, + variant_and_elements_generator=variant_and_protein_sequences_generator, + extra_column_fns=dict( + gene=lambda _, x: ";".join(x.gene_names))) + + +def allele_counts_dataframe(read_evidence_generator): + """ + Creates a DataFrame containing number of reads supporting the + ref vs. alt alleles for each variant. + """ + return dataframe_from_generator( + element_class=ReadEvidence, + variant_and_elements_generator=read_evidence_generator, + # DataFrameBuilder will take the length of these fields' values + rename_dict={ + "ref_reads": "num_ref_reads", + "alt_reads": "num_alt_reads", + "other_reads": "num_other_reads", + }, + extra_column_fns={ + "num_ref_fragments": lambda _, x: len(x.ref_read_names), + "num_alt_fragments": lambda _, x: len(x.alt_read_names), + "num_other_fragments": lambda _, x: len(x.other_read_names) + }) + + +def allele_reads_to_dataframe(variants_and_allele_reads): + """ + Parameters + ---------- + variants_and_allele_reads : sequence + List or generator of pairs whose first element is a Variant and + whose second element is a sequence of AlleleRead objects. + """ + df_builder = DataFrameBuilder( + AlleleRead, + extra_column_fns={ + "gene": lambda v, _: ";".join(v.gene_names), + }) + for variant, allele_reads in variants_and_allele_reads: + df_builder.add_many(variant, allele_reads) + return df_builder.to_dataframe() + + +def locus_reads_dataframe(alignments, chromosome, base0_start, base0_end, *args, **kwargs): + """ + Traverse an alignment file (typeically a BAM) to find all the reads + overlapping a specified locus. + + Extra parameters are the same as those for ReadCreator + """ + df_builder = DataFrameBuilder( + LocusRead, + variant_columns=False, + converters={ + "reference_positions": list_to_string, + "quality_scores": list_to_string, + }) + read_creator = ReadCollector(*args, **kwargs) + for locus_read in read_creator.get_locus_reads( + alignments, chromosome, base0_start, base0_end): + df_builder.add(variant=None, element=locus_read) + return df_builder.to_dataframe() + + +def variants_to_reference_contexts_dataframe( + variant_and_reference_contexts_generator): + """ + Given a generator of (Variant, [ReferenceContext]) pairs, create a + DataFrame. + + Returns a DataFrame with {"chr", "pos", "ref", "alt"} columns for variants, + as well as all the fields of ReferenceContext. 
+ """ + + df_builder = DataFrameBuilder( + ReferenceContext, + exclude=["variant"], + converters=dict(transcripts=lambda ts: ";".join(t.name for t in ts)), + extra_column_fns={ + "gene": lambda variant, _: ";".join(variant.gene_names), + }) + for variant, reference_contexts in variant_and_reference_contexts_generator: + df_builder.add_many(variant, reference_contexts) + return df_builder.to_dataframe() + + +def variant_sequences_generator_to_dataframe(variant_sequences_generator): + """ + Creates a dataframe from a generator which yields + (Variant, [VariantSequence]) pairs. + + Returns pandas.DataFrame + """ + return dataframe_from_generator( + VariantSequence, + variant_sequences_generator, + rename_dict={"alt": "allele"}, + extra_column_fns={ + "gene": lambda variant, _: ";".join(variant.gene_names), + }) + + +def translations_generator_to_dataframe(translations_generator): + """ + Given a generator of (Variant, [Translation]) pairs, + returns a DataFrame of translated protein fragments with columns + for each field of a Translation object (and chr/pos/ref/alt per variant). + """ + return dataframe_from_generator( + element_class=Translation, + variant_and_elements_generator=translations_generator, + exclude=[], + converters={ + "untrimmed_variant_sequence": lambda vs: vs.sequence, + "variant_orf": ( + lambda variant_orf: variant_orf.in_frame_cdna_sequence), + "reference_context": ( + lambda rc: ";".join([ + transcript.name for + transcript in rc.transcripts])) + }, + extra_column_fns={ + "untrimmed_variant_sequence_read_count": ( + lambda _, t: len(t.untrimmed_variant_sequence.reads)), + }) + + +def read_evidence_generator_to_dataframe(read_evidence_generator): + """ + Create a DataFrame from generator of (Variant, ReadEvidence) pairs. + """ + return dataframe_from_generator( + element_class=ReadEvidence, + variant_and_elements_generator=read_evidence_generator) + + +def isovar_results_to_dataframe(isovar_results): + """ + Create a DataFrame from a sequence of IsovarResult objects. + + Parameters + ---------- + isovar_results : list or generator of IsovarResult + + Returns pandas.DataFrame + """ + records = [] + for isovar_result in isovar_results: + records.append(isovar_result.to_record()) + return pd.DataFrame.from_records(records) \ No newline at end of file diff --git a/isovar/default_parameters.py b/isovar/default_parameters.py index 3234bb8..ca70d23 100644 --- a/isovar/default_parameters.py +++ b/isovar/default_parameters.py @@ -56,9 +56,6 @@ # parameter directly CDNA_CONTEXT_SIZE = VARIANT_SEQUENCE_LENGTH // 2 -# minimum number of total RNA reads supporting a variant allele -MIN_ALT_RNA_READS = 2 - # minimum number of reads supporting each nucleotide of a # variant coding sequence MIN_VARIANT_SEQUENCE_COVERAGE = 2 @@ -73,7 +70,7 @@ # whether to include mismatches after a variant locus toward the # MAX_REFERENCE_TRANSCRIPT_MISMATCHES count -INCLUDE_MISMATCHES_AFTER_VARIANT = False +COUNT_MISMATCHES_AFTER_VARIANT = False # number of amino acids / codons we're trying to translate PROTEIN_SEQUENCE_LENGTH = 20 @@ -91,3 +88,51 @@ # to minimize false assembly of isoforms which don't # actually exist. 
 MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE = 30
+
+# include sequences from reads if they were clipped
+# by the aligner
+USE_SOFT_CLIPPED_BASES = False
+
+# minimum number of RNA reads supporting a variant allele
+MIN_NUM_RNA_ALT_READS = 3
+
+# minimum number of total RNA fragments supporting a variant allele,
+# differs from MIN_NUM_RNA_ALT_READS for paired end sequencing but is the same
+# for single-end sequencing
+MIN_NUM_RNA_ALT_FRAGMENTS = 2
+
+# minimum ratio of # alt reads / # total overlapping reads
+MIN_FRACTION_RNA_ALT_READS = 0.005 # (at least e.g. 3 in 600)
+
+# minimum ratio of # alt fragments / # total overlapping fragments
+MIN_FRACTION_RNA_ALT_FRAGMENTS = 0.005 # (at least e.g. 3 in 600)
+
+# maximum number of RNA reads supporting a reference allele
+MAX_NUM_RNA_REF_READS = 10 ** 9
+
+# maximum number of total RNA fragments supporting a reference allele
+MAX_NUM_RNA_REF_FRAGMENTS = 10 ** 9
+
+# maximum ratio of # ref reads / # total overlapping reads
+MAX_FRACTION_RNA_REF_READS = 1.0
+
+# maximum ratio of # ref fragments / # total overlapping fragments
+MAX_FRACTION_RNA_REF_FRAGMENTS = 1.0
+
+# maximum number of RNA reads supporting an allele other than ref or alt
+MAX_NUM_RNA_OTHER_READS = 10 ** 9
+
+# maximum number of total RNA fragments supporting an allele other than ref or alt
+MAX_NUM_RNA_OTHER_FRAGMENTS = 10 ** 9
+
+# maximum ratio of # other (non-ref/non-alt) reads / # total overlapping reads
+MAX_FRACTION_RNA_OTHER_READS = 0.5
+
+# maximum ratio of # other fragments (non-ref/non-alt) / # total overlapping fragments
+MAX_FRACTION_RNA_OTHER_FRAGMENTS = 0.5
+
+# At loci where there is RNA support for both the alt allele and other
+# non-reference alleles, we want the number of reads supporting the alt
+# to be at least this many times greater than the total counts for the
+# third and fourth alleles.
+MIN_RATIO_RNA_ALT_TO_OTHER_FRAGMENTS = 3.0
diff --git a/isovar/dna.py b/isovar/dna.py
index f23b4cd..8cdb53a 100644
--- a/isovar/dna.py
+++ b/isovar/dna.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016-2018. Mount Sinai School of Medicine
+# Copyright (c) 2016-2019. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,11 @@
 
 from __future__ import print_function, division, absolute_import
 
+"""
+This module implements basic DNA functionality using Python strings,
+to avoid having to depend on a bigger library such as BioPython.
+"""
+
 dna_complement_dictionary = {
     "A": "T",
     "T": "A",
@@ -29,8 +34,26 @@
 
 
 def complement_dna(seq):
+    """
+    Convert every A->T, T->A, C->G, G->C in a DNA sequence
+
+    Parameters
+    ----------
+    seq : str
+
+    Returns str
+    """
     return "".join(dna_complement_dictionary[nt] for nt in seq)
 
 
 def reverse_complement_dna(seq):
+    """
+    Reverse complement of a DNA sequence
+
+    Parameters
+    ----------
+    seq : str
+
+    Returns str
+    """
     return complement_dna(seq)[::-1]
diff --git a/isovar/effect_prediction.py b/isovar/effect_prediction.py
index fcb770a..a16620d 100644
--- a/isovar/effect_prediction.py
+++ b/isovar/effect_prediction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016-2018. Mount Sinai School of Medicine
+# Copyright (c) 2016-2019. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
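The docstrings added to `isovar/dna.py` describe simple string-based helpers; a quick usage example (a sketch, assuming the functions are importable as shown and given uppercase input):

```python
from isovar.dna import complement_dna, reverse_complement_dna

assert complement_dna("ACGT") == "TGCA"
assert reverse_complement_dna("ACGT") == "ACGT"  # ACGT is its own reverse complement
assert reverse_complement_dna("AACG") == "CGTT"
```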
@@ -24,7 +24,9 @@ def predicted_effects_for_variant( variant, transcript_id_whitelist=None, - only_coding_changes=True): + only_coding_transcripts=False, + drop_silent_and_noncoding=False, + require_mutant_protein_sequence=False): """ For a given variant, return its set of predicted effects. Optionally filter to transcripts where this variant results in a non-synonymous @@ -37,24 +39,29 @@ def predicted_effects_for_variant( transcript_id_whitelist : set Filter effect predictions to only include these transcripts + only_coding_transcripts : bool + If True, then only return effects on protein coding transcripts + + drop_silent_and_noncoding : bool + If True, drop effects which aren't predicted to change the protein + sequence. + + require_mutant_protein_sequence : bool + Drop effects for which we can't predict what the new protein sequence + will be. + Returns a varcode.EffectCollection object """ - effects = [] for transcript in variant.transcripts: - if only_coding_changes and not transcript.complete: - logger.info( - "Skipping transcript %s for variant %s because it's incomplete", - transcript.name, - variant) + if (only_coding_transcripts and not ( + transcript.complete and transcript.is_protein_coding)): continue - if transcript_id_whitelist and transcript.id not in transcript_id_whitelist: logger.info( - "Skipping transcript %s for variant %s because it's not one of %d allowed", + "Skipping transcript %s for variant %s because it's not in whitelist", transcript.name, - variant, - len(transcript_id_whitelist)) + variant) continue effects.append(variant.effect_on_transcript(transcript)) @@ -64,9 +71,7 @@ def predicted_effects_for_variant( logger.info("Predicted total %d effects for variant %s" % ( n_total_effects, variant)) - if not only_coding_changes: - return effects - else: + if drop_silent_and_noncoding: nonsynonymous_coding_effects = effects.drop_silent_and_noncoding() logger.info( "Keeping %d/%d effects which affect protein coding sequence for %s: %s", @@ -74,24 +79,49 @@ def predicted_effects_for_variant( n_total_effects, variant, nonsynonymous_coding_effects) + effects = nonsynonymous_coding_effects - usable_effects = [ + if require_mutant_protein_sequence: + effects_with_mut_sequence = [ effect for effect in nonsynonymous_coding_effects if effect.mutant_protein_sequence is not None ] logger.info( "Keeping %d effects with predictable AA sequences for %s: %s", - len(usable_effects), + len(effects_with_mut_sequence), variant, - usable_effects) - return usable_effects + effects_with_mut_sequence) + effects = effects_with_mut_sequence + return effects -def reference_transcripts_for_variant( +def top_varcode_effect(variant, transcript_id_whitelist=None): + """ + Find the best predicted effect for the given variant. If we have a + transcript whitelist (based on filtering bulk expression) then use + it to eliminate some of the effect predictions. 
+ Returns subclass of varcode.MutationEffect + """ + effects = predicted_effects_for_variant( variant, - transcript_id_whitelist=None, - only_coding_changes=True): + transcript_id_whitelist=transcript_id_whitelist) + if len(effects) == 0 and transcript_id_whitelist is not None: + # if everything got filtered due to the transcript whitelist, + # we still need to return some kind of "top" effect so look + # at those which got filtered out by expression + effects = predicted_effects_for_variant( + variant, + transcript_id_whitelist=None) + if len(effects) == 0: + raise ValueError( + "Could not determine top effect prediction for %s" % variant) + return effects.top_priority_effect() + + +def reference_coding_transcripts_for_variant( + variant, + transcript_id_whitelist=None): """ For a given variant, find all the transcripts which overlap the variant and for which it has a predictable effect on the amino acid @@ -100,5 +130,8 @@ def reference_transcripts_for_variant( predicted_effects = predicted_effects_for_variant( variant=variant, transcript_id_whitelist=transcript_id_whitelist, - only_coding_changes=only_coding_changes) + only_coding_transcripts=True, + drop_silent_and_noncoding=True, + require_mutant_protein_sequence=True) return [effect.transcript for effect in predicted_effects] + diff --git a/isovar/filtering.py b/isovar/filtering.py new file mode 100644 index 0000000..d634f1c --- /dev/null +++ b/isovar/filtering.py @@ -0,0 +1,179 @@ +# Copyright (c) 2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functions used to annotate IsovarResult objects with filters. +""" + +from __future__ import print_function, division, absolute_import + +from collections import OrderedDict +import operator + + +def evaluate_threshold_filters(isovar_result, filter_thresholds): + """ + Helper method used by apply_filters + + Parameters + ---------- + isovar_result : IsovarResult + + filter_thresholds : dict or OrderedDict + Every argument is supposed to be something like "max_alt_reads" + where the first three characters are "min" or "max" and the + rest of the name is either a field of IsovarResult or + a numeric field like "num_alt_reads". The name of each filter + maps to a cutoff value. Filters starting with "max" + require that the corresponding field on CoverageStats + is <= cutoff, whereas filters starting with + "min" require >= cutoff. 
+
+    Returns OrderedDict
+    """
+    filter_values_dict = OrderedDict()
+    for name, threshold in filter_thresholds.items():
+        parts = name.split("_")
+        min_or_max = parts[0]
+        field_name = "_".join(parts[1:])
+        if min_or_max == "min":
+            comparison_fn = operator.ge
+        elif min_or_max == "max":
+            comparison_fn = operator.le
+        else:
+            raise ValueError(
+                "Invalid filter '%s', must start with 'min' or 'max'" % name)
+        if hasattr(isovar_result, field_name):
+            field_value = getattr(isovar_result, field_name)
+        else:
+            raise ValueError(
+                "Invalid filter '%s' IsovarResult does not have property '%s'" % (
+                    name,
+                    field_name))
+        filter_values_dict[name] = comparison_fn(field_value, threshold)
+    return filter_values_dict
+
+
+def evaluate_boolean_filters(isovar_result, filter_flags):
+    """
+    Helper function used by apply_filters.
+
+    Parameters
+    ----------
+    isovar_result : IsovarResult
+
+    filter_flags : list of str
+        Every element should be a boolean property of IsovarResult
+        or "not_" and the name of a property to be negated.
+
+    Returns OrderedDict
+    """
+    filter_values = OrderedDict()
+    for boolean_filter_name in filter_flags:
+        if boolean_filter_name.startswith("not_"):
+            boolean_field_name = boolean_filter_name[4:]
+            negate = True
+        else:
+            boolean_field_name = boolean_filter_name
+            negate = False
+        if hasattr(isovar_result, boolean_field_name):
+            field_value = getattr(isovar_result, boolean_field_name)
+        else:
+            raise ValueError(
+                "IsovarResult does not have field name '%s'" % boolean_field_name)
+        if field_value not in {True, False}:
+            raise ValueError("Expected filter '%s' to be boolean but got %s" % (
+                boolean_filter_name,
+                field_value))
+        filter_values[boolean_filter_name] = (
+            not field_value if negate else field_value
+        )
+    return filter_values
+
+
+def evaluate_filters(
+        isovar_result,
+        filter_thresholds,
+        filter_flags=[]):
+    """
+    Creates a dictionary whose keys are the names of different filter
+    conditions and whose values are booleans: True indicates that this
+    IsovarResult passes the filter and False indicates that it failed.
+
+    Parameters
+    ----------
+    isovar_result : IsovarResult
+
+    filter_thresholds : dict or OrderedDict
+        Every argument is supposed to be something like "max_alt_reads"
+        where the first three characters are "min" or "max" and the
+        rest of the name is either a field of IsovarResult or
+        a numeric field like "num_alt_reads". The name of each filter
+        maps to a cutoff value. Filters starting with "max"
+        require that the corresponding field on IsovarResult
+        is <= cutoff, whereas filters starting with
+        "min" require >= cutoff.
+
+    filter_flags : list of str
+        Every element should be a boolean property of IsovarResult
+        or "not_" and the name of a property to be negated.
+
+    Returns
+    -------
+    Dictionary of filter names mapped to boolean value indicating
+    whether this locus passed the filter.
+    """
+    filter_values_dict = evaluate_boolean_filters(isovar_result, filter_flags)
+    filter_values_dict.update(
+        evaluate_threshold_filters(isovar_result, filter_thresholds))
+    return filter_values_dict
+
+
+def apply_filters(
+        isovar_result,
+        filter_thresholds={},
+        filter_flags=[]):
+    """
+    Given an IsovarResult object, evaluates the given filters
+    and returns a copy of the IsovarResult with new filter
+    values.
+ + Parameters + ---------- + isovar_result : IsovarResult + + filter_thresholds : dict or OrderedDict + Every argument is supposed to be something like "max_alt_reads" + where the first three characters are "min" or "max" and the + rest of the name is either a field of IsovarResult or + a numeric field like "num_alt_reads". The name of each filter + maps to a cutoff value. Filters starting with "max" + require that the corresponding field on CoverageStats + is <= cutoff, whereas filters starting with + "min" require >= cutoff. + + filter_flags : list of str + Every element should be a boolean property of IsovarResult + or "not_" and the name of a property to be negated. + + Returns IsovarResult + """ + filter_values = OrderedDict(isovar_result.filter_values.items()) + new_filter_values = evaluate_filters( + isovar_result, + filter_thresholds=filter_thresholds, + filter_flags=filter_flags) + filter_values.update(new_filter_values) + return isovar_result.clone_with_updates(filter_values=filter_values) diff --git a/isovar/genetic_code.py b/isovar/genetic_code.py index a9279c9..5c17c62 100644 --- a/isovar/genetic_code.py +++ b/isovar/genetic_code.py @@ -14,8 +14,18 @@ from __future__ import print_function, division, absolute_import +""" +GeneticCode objects contain the rules for translating cDNA into a protein +sequence: the set of valid start and stop codons, as well as which +amino acid each DNA triplet is translated into. +""" + class GeneticCode(object): + """ + Represents distinct translation tables to go from cDNA triplets to amino + acids. + """ def __init__(self, name, start_codons, stop_codons, codon_table): self.name = name self.start_codons = set(start_codons) diff --git a/isovar/isovar_result.py b/isovar/isovar_result.py new file mode 100644 index 0000000..9b68ed7 --- /dev/null +++ b/isovar/isovar_result.py @@ -0,0 +1,864 @@ +# Copyright (c) 2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +IsovarResult is a collection of all information gathered about a variant +and any protein sequences which were successfully translated for it. +""" + +from __future__ import print_function, division, absolute_import + +from collections import OrderedDict + +from cached_property import cached_property + +from .common import safediv +from .alignment_score import alignment_score + +class IsovarResult(object): + """ + This object represents all information gathered about a variant, + which includes the AlleleReads supporting any allele at this variant's + locus and any protein sequences generated from an alt-allele cDNA + assembly. 
+ """ + + def __init__( + self, + variant, + read_evidence, + predicted_effect, + sorted_protein_sequences=None, + filter_values=None): + self.variant = variant + self.read_evidence = read_evidence + self.predicted_effect = predicted_effect + + if sorted_protein_sequences is None: + sorted_protein_sequences = [] + + self.sorted_protein_sequences = sorted_protein_sequences + + if filter_values is None: + self.filter_values = OrderedDict() + else: + self.filter_values = filter_values + + @property + def fields(self): + """ + List of field names used to construct an IsovarResult instance. + """ + return [ + "variant", + "predicted_effect", + "read_evidence", + "sorted_protein_sequences", + "filter_values" + ] + + def __str__(self): + field_strings = ["%s=%s" % (k, v) for (k, v) in self.to_dict()] + return "%s(%s)" % ( + self.__class__.__name, + ", ".join(field_strings) + ) + + def __repr__(self): + return str(self) + + def to_dict(self): + """ + Dictionary representation of fields used to construct this IsovarResult + + Returns dict + """ + return OrderedDict([ + (k, getattr(self, k)) + for k in self.fields + ]) + + def clone_with_updates(self, **kwargs): + """ + Create a copy of this IsovarResult object including any new + parameters in `kwargs`. + + Returns IsovarResult + """ + for (k, v) in self.to_dict().items(): + if k not in kwargs: + kwargs[k] = v + return IsovarResult(**kwargs) + + def to_record(self): + """ + Create an OrderedDict of essential information from + this IsovarResult to be used for building a DataFrame across + variants. + + Returns OrderedDict + """ + d = OrderedDict([ + ("variant", self.variant.short_description), + ("overlapping_gene_names", + ";".join(self.overlapping_gene_names(only_coding=False))), + ("overlapping_gene_ids", + ";".join(self.overlapping_gene_ids(only_coding=False))), + ("overlapping_coding_gene_names", + ";".join(self.overlapping_gene_names(only_coding=True))), + ("overlapping_coding_gene_ids", + ";".join(self.overlapping_gene_ids(only_coding=True))), + + ]) + + # get all quantitative fields from this object + for key in dir(self): + if key.startswith("num_") or key.startswith("fraction_") or key.startswith("ratio_"): + d[key] = getattr(self, key) + + ######################################################################## + # predicted protein changes without looking at RNA reads + ######################################################################## + effect = self.predicted_effect + + d["predicted_effect"] = effect.short_description + d["predicted_effect_class"] = effect.__class__.__name__ + + # list of field names on varcode effect properties + effect_properties = [ + "gene_name", + "gene_id", + "transcript_id", + "transcript_name", + "modifies_protein_sequence", + "original_protein_sequence", + "aa_mutation_start_offset", + "aa_mutation_end_offset", + "mutant_protein_sequence" + ] + for field_name in effect_properties: + # store effect fields with prefix 'predicted_effect_' and use + # getattr in case the field is not available for all effects + d["predicted_effect_%s" % field_name] = getattr( + effect, + field_name, + None) + + ######################################################################## + # get the top protein sequence, if one exists + ######################################################################## + protein_sequence = self.top_protein_sequence + + # list of names we want to use in the result dictionary, + # paired with names of fields on ProteinSequence + protein_sequence_properties = [ + ("protein_sequence", 
"amino_acids"), + ("protein_sequence_mutation_start", "variant_aa_interval_start"), + ("protein_sequence_mutation_end", "variant_aa_interval_stop"), + ("protein_sequence_ends_with_stop_codon", "ends_with_stop_codon"), + ("protein_sequence_gene_names", "gene_names"), + ("protein_sequence_gene_ids", "gene_ids"), + ("protein_sequence_transcript_names", "transcript_names"), + ("protein_sequence_transcript_ids", "transcript_ids"), + ] + for (name, protein_sequence_field) in protein_sequence_properties: + value = getattr(protein_sequence, protein_sequence_field, None) + if isinstance(value, (list, set, tuple)): + value = ";".join(value) + d[name] = value + + d["trimmed_predicted_mutant_protein_sequence"] = self.trimmed_predicted_mutant_protein_sequence + d["trimmed_reference_protein_sequence"] = self.trimmed_reference_protein_sequence + d["protein_sequence_matches_predicted_effect"] = self.protein_sequence_matches_predicted_effect + + ######################################################################## + # filters + ######################################################################## + for filter_name, filter_value in self.filter_values.items(): + d["filter:%s" % filter_name] = filter_value + d["pass"] = d["passes_all_filters"] = self.passes_all_filters + + return d + + + @cached_property + def passes_all_filters(self): + """ + Does this IsovarResult have True for all the filter values in + self.filter_values? + """ + if len(self.filter_values) == 0: + return True + else: + return all(list(self.filter_values.values())) + + @cached_property + def top_protein_sequence(self): + """ + If any protein sequences were assembled for this variant then + return the best according to coverage, number of mismatches + relative to the reference, number of reference transcripts + which match sequence before the variant and protein + sequence length. + + Returns ProteinSequence or None + """ + if len(self.sorted_protein_sequences) > 0: + return self.sorted_protein_sequences[0] + else: + return None + + @cached_property + def num_cdna_mismatches_in_top_protein_sequence(self): + """ + How many nucleotide positions aside from the variant location don't + match the reference transcript sequence in the cDNA sequence used + to translate the top protein sequence. When multiple cDNA sequences + were used then take the min across them. + + Returns int + """ + if self.top_protein_sequence is None: + return None + return self.top_protein_sequence.num_mismatches + + @cached_property + def num_cdna_mismatches_before_variant_in_top_protein_sequence(self): + """ + How many nucleotide positions before the variant location don't + match the reference transcript sequence in the cDNA sequence used + to translate the top protein sequence. When multiple cDNA sequences + were used then take the min across them. + + Returns int + """ + if self.top_protein_sequence is None: + return None + return self.top_protein_sequence.num_mismatches_before_variant + + @cached_property + def num_cdna_mismatches_after_variant_in_top_protein_sequence(self): + """ + How many nucleotide positions after the variant location don't + match the reference transcript sequence in the cDNA sequence used + to translate the top protein sequence. When multiple cDNA sequences + were used then take the min across them. 
+ + Returns int + """ + if self.top_protein_sequence is None: + return None + return self.top_protein_sequence.num_mismatches_after_variant + + @cached_property + def num_reads_supporting_top_protein_sequence(self): + """ + How many reads support the assembled protein sequence? + + Returns int + """ + if self.top_protein_sequence is None: + return 0 + return self.top_protein_sequence.num_supporting_reads + + @cached_property + def num_fragments_supporting_top_protein_sequence(self): + """ + How many fragments (unique read names) support the assembled + protein sequence? + + Returns int + """ + if self.top_protein_sequence is None: + return 0 + return self.top_protein_sequence.num_supporting_fragments + + @cached_property + def num_translations_for_top_protein_sequence(self): + """ + How many distinct translations were used to create the top + protein sequence? These can arise either from different cDNA sequences + or different reading frames. + + Returns int + """ + if self.top_protein_sequence is None: + return 0 + return len(self.top_protein_sequence.translations) + + @cached_property + def num_cdna_sequences_for_top_protein_sequence(self): + """ + How many distinct cDNA sequences were used to create the top + protein sequence? + + Returns int + """ + if self.top_protein_sequence is None: + return 0 + return self.top_protein_sequence.num_cdna_sequences + + @cached_property + def trimmed_predicted_mutant_protein_sequence(self): + """ + Trim the predicted mutant protein sequence from Varcode + to match the length of the protein subsequence assembled from RNA. + + Returns str or None + """ + p = self.top_protein_sequence + e = self.predicted_effect + if e is None or p is None: + return None + if e.mutant_protein_sequence is None: + return None + if e.aa_mutation_start_offset is None: + return None + n_before_mutation = p.variant_aa_interval_start + n_after_mutation = len(p.amino_acids) - p.variant_aa_interval_end + return e.mutant_protein_sequence[ + e.aa_mutation_start_offset - n_before_mutation: + e.aa_mutation_start_offset + len(e.aa_alt) + n_after_mutation] + + @cached_property + def trimmed_reference_protein_sequence(self): + """ + Trim the reference protein sequence from the top Varcode effect + to match the length of the protein sequence assembled from RNA. + + Returns str + """ + p = self.top_protein_sequence + e = self.predicted_effect + if e is None or p is None: + return None + if e.original_protein_sequence is None: + return None + if e.aa_mutation_start_offset is None: + return None + n_before_mutation = p.variant_aa_interval_start + n_total = len(p.amino_acids) + start_index_in_original_protein = ( + e.aa_mutation_start_offset - n_before_mutation) + return e.original_protein_sequence[ + start_index_in_original_protein: + start_index_in_original_protein + n_total] + + @cached_property + def num_amino_acid_mismatches_from_predicted_effect(self): + """ + Compute the number of mismatches between the mutant protein sequence + predicted by Varcode and the best supported sequence translated + from assembled RNA reads by Isovar. We're not allowing any + insertions or deletions in the middle of the sequences but do + allow a shorter sequence to start anywhere within a longer one. 
+ + Returns int + """ + predicted_sequence = self.trimmed_predicted_mutant_protein_sequence + if predicted_sequence is None: + return None + + protein_sequence_object = self.top_protein_sequence + if protein_sequence_object is None: + return None + assembled_protein_sequence = protein_sequence_object.amino_acids + + return alignment_score( + assembled_protein_sequence, + predicted_sequence) + + + @cached_property + def protein_sequence_matches_predicted_effect(self): + """ + Does the top protein sequence translated from RNA reads + match the predicted protein change determined by Varcode? + + Returns bool + """ + return self.num_amino_acid_mismatches_from_predicted_effect == 0 + + @cached_property + def num_protein_sequences(self): + """ + Number of distinct protein sequences which were translated from + assembled RNA reads. + + Returns int + """ + return len(self.sorted_protein_sequences) + + def transcripts_from_protein_sequences(self, max_num_protein_sequences=None): + """ + Ensembl transcript IDs of all transcripts which support the reading + frame used by protein sequences in this IsovarResult. + + Parameters + ---------- + max_num_protein_sequences : int or None + If supplied then only consider the top protein sequences up to + this number. + + Returns list of pyensembl.Transcript + """ + transcript_set = set([]) + for p in self.sorted_protein_sequences[:max_num_protein_sequences]: + transcript_set.update(p.transcripts) + return sorted(transcript_set) + + def transcript_ids_from_protein_sequences(self, max_num_protein_sequences=None): + """ + Ensembl transcripts IDs which support the reading frame used by protein + sequences in this IsovarResult. + + Parameters + ---------- + max_num_protein_sequences : int or None + If supplied then only consider the top protein sequences up to + this number. + + Returns list of str + """ + return sorted({t.id for t in self.transcripts_from_protein_sequences( + max_num_protein_sequences=max_num_protein_sequences)}) + + @cached_property + def num_transcripts_from_protein_sequences(self): + """ + Number of genes used by any translated protein sequence associated + with this IsovarResult. + + Returns int + """ + return len(self.transcript_ids_from_protein_sequences( + max_num_protein_sequences=None)) + + @cached_property + def num_transcripts_from_top_protein_sequence(self): + """ + Number of genes used by any translated protein sequence associated + with this IsovarResult. + + Returns int + """ + return len(self.transcript_ids_from_protein_sequences( + max_num_protein_sequences=1)) + + def genes_from_protein_sequences(self, max_num_protein_sequences=None): + """ + Ensembl genes which support the reading frame used by protein + sequences in this IsovarResult. + + Parameters + ---------- + max_num_protein_sequences : int or None + If supplied then only consider the top protein sequences up to + this number. + + Returns list of pyensembl.Gene + """ + transcripts = self.transcripts_from_protein_sequences( + max_num_protein_sequences=max_num_protein_sequences) + genes = [t.gene for t in transcripts] + return sorted(genes) + + def gene_ids_from_protein_sequences(self, max_num_protein_sequences=None): + """ + Ensembl genes IDs which support the reading frame used by protein + sequences in this IsovarResult. + + Parameters + ---------- + max_num_protein_sequences : int or None + If supplied then only consider the top protein sequences up to + this number. 
+ + Returns list of str + """ + return sorted({ + g.id + for g + in + self.genes_from_protein_sequences( + max_num_protein_sequences=max_num_protein_sequences) + }) + + @cached_property + def num_genes_from_protein_sequences(self): + """ + Number of genes used by any translated protein sequence associated + with this IsovarResult. + + Returns int + """ + return len(self.gene_ids_from_protein_sequences( + max_num_protein_sequences=None)) + + @cached_property + def num_genes_from_top_protein_sequence(self): + """ + Number of genes used by any translated protein sequence associated + with this IsovarResult. + + Returns int + """ + return len(self.gene_ids_from_protein_sequences( + max_num_protein_sequences=1)) + + def overlapping_transcripts(self, only_coding=True): + """ + Transcripts which this variant overlaps. + + Parameters + ---------- + only_coding : bool + Only return transcripts which are annotated as coding for a + protein (default=True) + + Returns set of pyensembl.Transcript objects + """ + return { + t + for t in self.variant.transcripts + if not only_coding or t.is_protein_coding + } + + def overlapping_transcript_ids(self, only_coding=True): + """ + Transcript IDs which this variant overlaps. + + Parameters + ---------- + only_coding : bool + Only return transcripts which are annotated as coding for a + protein (default=True) + Returns set of str + """ + return { + t.id + for t in self.variant.transcripts + if not only_coding or t.is_protein_coding + } + + @cached_property + def num_overlapping_transcripts(self): + """ + Number of transcripts overlapped by the variant + + Returns int + """ + return len(self.overlapping_transcript_ids(only_coding=False)) + + @cached_property + def num_overlapping_coding_transcripts(self): + """ + Number of coding transcripts overlapped by the variant + + Returns int + """ + return len(self.overlapping_transcript_ids(only_coding=True)) + + def overlapping_genes(self, only_coding=True): + """ + Genes which this variant overlaps. + + Parameters + ---------- + only_coding : bool + Only return genes which are annotated as coding for a + protein (default=True) + + Returns list of pyensembl.Gene objects + """ + return sorted({ + g + for g in self.variant.genes + if not only_coding or g.is_protein_coding + }) + + def overlapping_gene_names(self, only_coding=True): + """ + Names of genes which this variant overlaps. + + Parameters + ---------- + only_coding : bool + Only return genes which are annotated as coding for a + protein (default=True) + + Returns list of str + """ + return [ + g.name for g in self.overlapping_genes(only_coding=only_coding) + ] + + def overlapping_gene_ids(self, only_coding=True): + """ + Gene IDs which this variant overlaps. 
+ + Parameters + ---------- + only_coding : bool + Only return genes which are annotated as coding for a + protein (default=True) + + Returns set of str + """ + return { + g.id + for g in self.variant.genes + if not only_coding or g.is_protein_coding + } + + @cached_property + def num_overlapping_genes(self): + """ + Number of genes overlapped by the variant + + Returns int + """ + return len(self.overlapping_gene_ids(only_coding=False)) + + @cached_property + def num_overlapping_coding_genes(self): + """ + Number of coding genes overlapped by the variant + + Returns int + """ + return len(self.overlapping_gene_ids(only_coding=True)) + + @cached_property + def ref_reads(self): + """ + AlleleRead objects at this locus which support the reference allele + """ + return self.read_evidence.ref_reads + + @cached_property + def alt_reads(self): + """ + AlleleRead objects at this locus which support the mutant allele + """ + return self.read_evidence.alt_reads + + @cached_property + def other_reads(self): + """ + AlleleRead objects at this locus which support some allele other than + either the reference or alternate. + """ + return self.read_evidence.other_reads + + @cached_property + def ref_read_names(self): + """ + Names of reference reads at this locus. + """ + return {r.name for r in self.ref_reads} + + @cached_property + def alt_read_names(self): + """ + Names of alt reads at this locus. + """ + return {r.name for r in self.alt_reads} + + @cached_property + def ref_and_alt_read_names(self): + """ + Names of reads which support either the ref or alt alleles. + """ + return self.ref_read_names.union(self.alt_read_names) + + @cached_property + def other_read_names(self): + """ + Names of other (non-alt, non-ref) reads at this locus. + """ + return {r.name for r in self.other_reads} + + @cached_property + def all_read_names(self): + """ + Names of all reads at this locus. + """ + return self.ref_read_names.union(self.alt_read_names).union(self.other_read_names) + + @cached_property + def num_total_reads(self): + """ + Total number of reads at this locus, regardless of allele. + """ + return self.num_ref_reads + self.num_alt_reads + self.num_other_reads + + @cached_property + def num_total_fragments(self): + """ + Total number of distinct fragments at this locus, which also corresponds + to the total number of read names. + """ + return len(self.all_read_names) + + @cached_property + def num_ref_reads(self): + """ + Number of reads which support the reference allele. + """ + return len(self.ref_reads) + + @cached_property + def num_ref_fragments(self): + """ + Number of distinct fragments which support the reference allele. + """ + return len(self.ref_read_names) + + @cached_property + def num_alt_reads(self): + """ + Number of reads which support the alt allele. + """ + return len(self.alt_reads) + + @cached_property + def num_alt_fragments(self): + """ + Number of distinct fragments which support the alt allele. + """ + return len(self.alt_read_names) + + @cached_property + def num_other_reads(self): + """ + Number of reads which support neither the reference nor alt alleles. + """ + return len(self.other_reads) + + @cached_property + def num_other_fragments(self): + """ + Number of distinct fragments which support neither the reference nor + alt alleles. + """ + return len(self.other_read_names) + + @cached_property + def fraction_ref_reads(self): + """ + Allelic fraction of the reference allele among all reads at this site. 
+ """ + return safediv(self.num_ref_reads, self.num_total_reads) + + @cached_property + def fraction_ref_fragments(self): + """ + Allelic fraction of the reference allele among all fragments at this site. + """ + return safediv(self.num_ref_fragments, self.num_total_fragments) + + @cached_property + def fraction_alt_reads(self): + """ + Allelic fraction of the variant allele among all reads at this site. + """ + return safediv(self.num_alt_reads, self.num_total_reads) + + @cached_property + def fraction_alt_fragments(self): + """ + Allelic fraction of the variant allele among all fragments at this site. + """ + return safediv(self.num_alt_fragments, self.num_total_fragments) + + @cached_property + def fraction_other_reads(self): + """ + Allelic fraction of the "other" (non-ref, non-alt) alleles among all + reads at this site. + """ + return safediv(self.num_other_reads, self.num_total_reads) + + @cached_property + def fraction_other_fragments(self): + """ + Allelic fraction of the "other" (non-ref, non-alt) alleles among all + fragments at this site. + """ + return safediv(self.num_other_fragments, self.num_total_fragments) + + @cached_property + def ratio_other_to_ref_reads(self): + """ + Ratio of the number of reads which support alleles which are neither + ref/alt to the number of ref reads. + """ + return safediv(self.num_other_reads, self.num_ref_reads) + + @cached_property + def ratio_other_to_ref_fragments(self): + """ + Ratio of the number of fragments which support alleles which are neither + ref/alt to the number of ref fragments. + """ + return safediv(self.num_other_fragments, self.num_ref_fragments) + + @cached_property + def ratio_other_to_alt_reads(self): + """ + Ratio of the number of reads which support alleles which are neither + ref/alt to the number of alt reads. + """ + return safediv(self.num_other_reads, self.num_alt_reads) + + @cached_property + def ratio_other_to_alt_fragments(self): + """ + Ratio of the number of fragments which support alleles which are neither + ref/alt to the number of alt fragments. + """ + return safediv(self.num_other_fragments, self.num_alt_fragments) + + @cached_property + def ratio_ref_to_other_reads(self): + """ + Ratio of the number of reference reads to non-ref/non-alt reads + """ + return safediv(self.num_ref_reads, self.num_other_reads) + + @cached_property + def ratio_ref_to_other_fragments(self): + """ + Ratio of the number of reference fragments to non-ref/non-alt fragments + """ + return safediv(self.num_ref_fragments, self.num_other_fragments) + + @cached_property + def ratio_alt_to_other_reads(self): + """ + Ratio of alt allele reads to non-ref/non-alt reads + """ + return safediv(self.num_alt_reads, self.num_other_reads) + + @cached_property + def ratio_alt_to_other_fragments(self): + """ + Ratio of the number of fragments which support the alt allele + to the number of non-alt/non-ref allele fragments. + """ + return safediv(self.num_alt_fragments, self.num_other_fragments) + + diff --git a/isovar/locus_read.py b/isovar/locus_read.py new file mode 100644 index 0000000..b42a108 --- /dev/null +++ b/isovar/locus_read.py @@ -0,0 +1,146 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
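Taken together, the per-allele read and fragment counts, fractions, and ratios defined on `IsovarResult` above are what the RNA evidence filters operate on. A minimal sketch of inspecting them, assuming the `run_isovar` entrypoint added later in this diff (file paths are placeholders):

```python
from isovar import run_isovar

# placeholder paths; run_isovar yields one IsovarResult per input variant
for result in run_isovar(variants="my-variants.vcf", alignment_file="my-rna.bam"):
    print(
        result.variant,
        result.overlapping_gene_names(only_coding=True),
        result.num_alt_reads,
        result.num_alt_fragments,
        result.fraction_alt_reads)
```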
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module wraps pysam and gives us a view of any reads overlapping +a variant locus which includes offsets into the read sequence & qualities +for extracting variant nucleotides. +""" + +from __future__ import print_function, division, absolute_import + +from .value_object import ValueObject + + +class LocusRead(ValueObject): + """ + Minimal set of information extracted from SAM/BAM alignment file at a particular + locus to later figure out the allele at this locus. + """ + __slots__ = [ + "name", + "sequence", + "reference_positions", + "quality_scores", + "reference_base0_start_inclusive", + "reference_base0_end_exclusive", + "read_base0_start_inclusive", + "read_base0_end_exclusive" + ] + + def __init__( + self, + name, + sequence, + reference_positions, + quality_scores, + reference_base0_start_inclusive, + reference_base0_end_exclusive, + read_base0_start_inclusive, + read_base0_end_exclusive): + """ + Parameters + ---------- + name : str + Fragment name, paired reads from the same fragment will have the same name + + sequence : str + cDNA sequence + + reference_positions : list of (int or None) + For every base in the sequence, which base-1 reference position + does it map to, or None if the read base is an insertion or soft-clipped + + quality_scores : array of int + Base qualities for every character in the sequence + + reference_base0_start_inclusive : int + Start index of reference locus which is overlapped + by this read (base 0, inclusive) + + reference_base0_end_exclusive : int + End index of reference locus which is overlapped + by this read (base 0, exclusive) + + read_base0_start_inclusive : int or None + Start index of base in read which corresponds to + start of reference locus (if it's mapped) + + read_base0_end_exclusive : int or None + End index after last base in sequence which + corresponds to reference locus (if it's mapped) + """ + ###################################################################### + # When can the start or end of the read interval be None? + # --------------------------------------------------------- + # If a locus goes from [x:y) on the reference chromosome then there are + # a few possibilities for whether x and y will have corresponding mapped + # positions on a read. + # + # If x=y, then we expect the read to either match the reference or have some bases + # inserted between x and y. In this case, the only reason why x and y wouldn't be mapped + # is if they occur after the end of a read, but the read still overlaps some + # part of the interval. + # + # If y > x, then we're selecting some non-zero reference bases and either matching them + # or deleting them. If they CIGAR operation is M (match), then x and y should have + # corresponding positions on the read (unless, like previously, only part of the + # interval is covered by a read). + # + # In the case of a deletion, however, the selected reference bases do not have + # corresponding positions on the read. 
+ # + # Diagram of a deletion: + # + # REFERENCE + # 19124 19125 19126 19127 19128 19129 19130 19131 19132 19133 19134 + # A C T G G C A T T T T + # + # If we're interested in checking whether the sequence 'GCA' is deleted + # between 19128:19131 then we'll look at an RNA read. + # + # RNA WHICH SUPPORTS REFERENCE + # 00024 00025 00026 00027 00028 00029 00030 00031 00032 00033 00034 + # A C T G G C A T T T T + # + # RNA WHICH SUPPORTS MUTATION + # 00024 00025 00026 00027 00028 00029 00030 00031 00032 00033 00034 + # A C T G T T T T A A A + # + # In the RNA read which does not support the mutation (matches reference) the + # reference index 19128 is mapped to 28 and 19131 is mapped to 31. + # + # On the mutant RNA, however, the position 19128, 19129, 19130 are unmapped. + # + # 19124 19125 19126 19127 19128 19129 19130 19131 19132 19133 19134 + # A C T G G C A T T T T + # | | | | | | | | + # | | | | *-----------------* | | | + # | | | | | *-----------------* | | + # | | | | | | |-----------------* | + # | | | | | | | *----------------* + # | | | | | | | | + # 00024 00025 00026 00027 00028 00029 00030 00031 00032 00033 00034 + # A C T G T T T T A A A + + self.name = name + self.sequence = sequence + self.reference_positions = reference_positions + self.quality_scores = quality_scores + self.reference_base0_start_inclusive = reference_base0_start_inclusive + self.reference_base0_end_exclusive = reference_base0_end_exclusive + self.read_base0_start_inclusive = read_base0_start_inclusive + self.read_base0_end_exclusive = read_base0_end_exclusive + + diff --git a/isovar/locus_reads.py b/isovar/locus_reads.py deleted file mode 100644 index bfa7537..0000000 --- a/isovar/locus_reads.py +++ /dev/null @@ -1,335 +0,0 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This module wraps pysam and gives us a view of any reads overlapping -a variant locus which includes offsets into the read sequence & qualities -for extracting variant nucleotides. 
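The comment block above spells out when `read_base0_start_inclusive` and `read_base0_end_exclusive` can be `None` or collapse to an empty interval. A small illustrative helper (not part of Isovar) that uses only the documented `LocusRead` fields to pull out the bases a read places at the locus:

```python
def bases_at_locus(locus_read):
    """
    Illustrative helper, not part of Isovar: return the nucleotides which a
    LocusRead aligns against the reference locus, None if either endpoint of
    the locus is unmapped on the read, and "" for a deletion.
    """
    start = locus_read.read_base0_start_inclusive
    end = locus_read.read_base0_end_exclusive
    if start is None or end is None:
        # e.g. the locus extends past the end of the read
        return None
    # for a deletion start == end, so this slice is the empty string
    return locus_read.sequence[start:end]
```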
-""" - -from __future__ import print_function, division, absolute_import - -from .default_parameters import ( - MIN_READ_MAPPING_QUALITY, - USE_DUPLICATE_READS, - USE_SECONDARY_ALIGNMENTS, -) -from .common import list_to_string -from .dataframe_builder import DataFrameBuilder -from .value_object import ValueObject -from .logging import get_logger - -logger = get_logger(__name__) - - -class LocusRead(ValueObject): - __slots__ = [ - "name", - "sequence", - "reference_positions", - "quality_scores", - "base0_read_position_before_variant", - "base0_read_position_after_variant" - ] - - def __init__( - self, - name, - sequence, - reference_positions, - quality_scores, - base0_read_position_before_variant, - base0_read_position_after_variant): - self.name = name - self.sequence = sequence - self.reference_positions = reference_positions - self.quality_scores = quality_scores - self.base0_read_position_before_variant = base0_read_position_before_variant - self.base0_read_position_after_variant = base0_read_position_after_variant - - @classmethod - def from_pysam_pileup_element( - cls, - pileup_element, - base0_position_before_variant, - base0_position_after_variant, - use_secondary_alignments, - use_duplicate_reads, - min_mapping_quality, - use_soft_clipped_bases=False): - """ - Parameters - ---------- - pileup_element : pysam.PileupRead - - base0_position_before_variant : int - - base0_position_after_variant : int - - use_secondary_alignments : bool - - use_duplicate_reads : bool - - min_mapping_quality : int - - use_soft_clipped_bases : bool. Default false; set to true to keep soft-clipped bases - - Returns LocusRead or None - """ - read = pileup_element.alignment - - # For future reference, may get overlapping reads - # which can be identified by having the same name - name = read.query_name - - if name is None: - logger.warn( - "Read missing name at position %d", - base0_position_before_variant + 1) - return None - - if read.is_unmapped: - logger.warn( - "How did we get unmapped read '%s' in a pileup?", name) - return None - - if pileup_element.is_refskip: - # if read sequence doesn't actually align to the reference - # base before a variant, skip it - logger.debug("Skipping pileup element with CIGAR alignment N (intron)") - return None - elif pileup_element.is_del: - logger.debug( - "Skipping deletion at position %d (read name = %s)", - base0_position_before_variant + 1, - name) - return None - - if read.is_secondary and not use_secondary_alignments: - logger.debug("Skipping secondary alignment of read '%s'", name) - return None - - if read.is_duplicate and not use_duplicate_reads: - logger.debug("Skipping duplicate read '%s'", name) - return None - - mapping_quality = read.mapping_quality - - missing_mapping_quality = mapping_quality is None - - if min_mapping_quality > 0 and missing_mapping_quality: - logger.debug("Skipping read '%s' due to missing MAPQ", name) - return None - elif mapping_quality < min_mapping_quality: - logger.debug( - "Skipping read '%s' due to low MAPQ: %d < %d", - read.mapping_quality, - mapping_quality, - min_mapping_quality) - return None - - sequence = read.query_sequence - - if sequence is None: - logger.warn("Read '%s' missing sequence", name) - return None - - base_qualities = read.query_qualities - - if base_qualities is None: - logger.warn("Read '%s' missing base qualities", name) - return None - - # Documentation for pysam.AlignedSegment.get_reference_positions: - # ------------------------------------------------------------------ - # By default, this method 
only returns positions in the reference - # that are within the alignment. If full_length is set, None values - # will be included for any soft-clipped or unaligned positions - # within the read. The returned list will thus be of the same length - # as the read. - # - # Source: - # http://pysam.readthedocs.org/en/latest/ - # api.html#pysam.AlignedSegment.get_reference_positions - # - # We want a None value for every read position that does not have a - # corresponding reference position. - reference_positions = read.get_reference_positions( - full_length=True) - - # pysam uses base-0 positions everywhere except region strings - # Source: - # http://pysam.readthedocs.org/en/latest/faq.html#pysam-coordinates-are-wrong - if base0_position_before_variant not in reference_positions: - logger.debug( - "Skipping read '%s' because first position %d not mapped", - name, - base0_position_before_variant) - return None - else: - base0_read_position_before_variant = reference_positions.index( - base0_position_before_variant) - - if base0_position_after_variant not in reference_positions: - logger.debug( - "Skipping read '%s' because last position %d not mapped", - name, - base0_position_after_variant) - return None - else: - base0_read_position_after_variant = reference_positions.index( - base0_position_after_variant) - - if isinstance(sequence, bytes): - sequence = sequence.decode('ascii') - - if not use_soft_clipped_bases: - start = read.query_alignment_start - end = read.query_alignment_end - sequence = sequence[start:end] - reference_positions = reference_positions[start:end] - base_qualities = base_qualities[start:end] - base0_read_position_before_variant -= start - base0_read_position_after_variant -= start - - return cls( - name=name, - sequence=sequence, - reference_positions=reference_positions, - quality_scores=base_qualities, - base0_read_position_before_variant=base0_read_position_before_variant, - base0_read_position_after_variant=base0_read_position_after_variant) - - -def pileup_reads_at_position(samfile, chromosome, base0_position): - """ - Returns a pileup column at the specified position. Unclear if a function - like this is hiding somewhere in pysam API. - """ - - # TODO: I want to pass truncate=True, stepper="all" - # but for some reason I get this error: - # pileup() got an unexpected keyword argument 'truncate' - # ...even though these options are listed in the docs for pysam 0.9.0 - # - for column in samfile.pileup( - chromosome, - start=base0_position, - end=base0_position + 1): - - if column.pos != base0_position: - # if this column isn't centered on the base before the - # variant then keep going - continue - - return column.pileups - - # if we get to this point then we never saw a pileup at the - # desired position - return [] - - -def locus_read_generator( - samfile, - chromosome, - base1_position_before_variant, - base1_position_after_variant, - use_duplicate_reads=USE_DUPLICATE_READS, - use_secondary_alignments=USE_SECONDARY_ALIGNMENTS, - min_mapping_quality=MIN_READ_MAPPING_QUALITY): - """ - Generator that yields a sequence of ReadAtLocus records for reads which - contain the positions before and after a variant. The actual work to figure - out if what's between those positions matches a variant happens later in - the `variant_reads` module. 
- - Parameters - ---------- - samfile : pysam.AlignmentFile - - chromosome : str - - base1_position_before_variant : int - Genomic position of reference nucleotide before a variant - - base1_position_after_variant : int - Genomic position of reference nucleotide before a variant - - use_duplicate_reads : bool - By default, we're ignoring any duplicate reads - - use_secondary_alignments : bool - By default we are using secondary alignments, set this to False to - only use primary alignments of reads. - - min_mapping_quality : int - Drop reads below this mapping quality - - Yields ReadAtLocus objects - """ - logger.debug( - "Gathering reads at locus %s: %d-%d", - chromosome, - base1_position_before_variant, - base1_position_after_variant) - base0_position_before_variant = base1_position_before_variant - 1 - base0_position_after_variant = base1_position_after_variant - 1 - - count = 0 - - # We get a pileup at the base before the variant and then check to make sure - # that reads also overlap the reference position after the variant. - # - # TODO: scan over a wider interval of pileups and collect reads that don't - # overlap the bases before/after a variant due to splicing - for pileup_element in pileup_reads_at_position( - samfile=samfile, - chromosome=chromosome, - base0_position=base0_position_before_variant): - read = LocusRead.from_pysam_pileup_element( - pileup_element, - base0_position_before_variant=base0_position_before_variant, - base0_position_after_variant=base0_position_after_variant, - use_secondary_alignments=use_secondary_alignments, - use_duplicate_reads=use_duplicate_reads, - min_mapping_quality=min_mapping_quality) - - if read is not None: - count += 1 - yield read - - logger.info( - "Found %d reads overlapping locus %s: %d-%d", - count, - chromosome, - base1_position_before_variant, - base1_position_after_variant) - - -def locus_reads_dataframe(*args, **kwargs): - """ - Traverse a BAM file to find all the reads overlapping a specified locus. - - Parameters are the same as those for read_locus_generator. - """ - df_builder = DataFrameBuilder( - LocusRead, - variant_columns=False, - converters={ - "reference_positions": list_to_string, - "quality_scores": list_to_string, - }) - for locus_read in locus_read_generator(*args, **kwargs): - df_builder.add(variant=None, element=locus_read) - return df_builder.to_dataframe() diff --git a/isovar/main.py b/isovar/main.py new file mode 100644 index 0000000..fb443f0 --- /dev/null +++ b/isovar/main.py @@ -0,0 +1,163 @@ +# Copyright (c) 2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
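The deleted pileup-based helpers above are superseded by the `ReadCollector` API used by `isovar/main.py` below. A sketch of collecting read evidence directly with the new API, assuming it behaves as shown in this diff (file paths are placeholders):

```python
from pysam import AlignmentFile
from varcode import load_vcf

from isovar import ReadCollector

read_collector = ReadCollector()

# yields (Variant, ReadEvidence) pairs, one per input variant
for variant, read_evidence in read_collector.read_evidence_generator(
        variants=load_vcf("my-variants.vcf"),
        alignment_file=AlignmentFile("my-rna.bam")):
    print(variant, len(read_evidence.alt_reads), len(read_evidence.ref_reads))
```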
+ + +from __future__ import print_function, division, absolute_import + +from six import string_types +from varcode import load_vcf +from pysam import AlignmentFile +from collections import OrderedDict + +from .protein_sequence_creator import ProteinSequenceCreator +from .read_collector import ReadCollector +from .logging import get_logger +from .isovar_result import IsovarResult +from .default_parameters import ( + MIN_NUM_RNA_ALT_READS, + MIN_NUM_RNA_ALT_FRAGMENTS, + MIN_FRACTION_RNA_ALT_READS, + MIN_FRACTION_RNA_ALT_FRAGMENTS, + MAX_NUM_RNA_REF_READS, + MAX_NUM_RNA_REF_FRAGMENTS, + MAX_FRACTION_RNA_REF_READS, + MAX_FRACTION_RNA_REF_FRAGMENTS, + MAX_NUM_RNA_OTHER_READS, + MAX_NUM_RNA_OTHER_FRAGMENTS, + MAX_FRACTION_RNA_OTHER_READS, + MAX_FRACTION_RNA_OTHER_FRAGMENTS, + MIN_RATIO_RNA_ALT_TO_OTHER_FRAGMENTS, +) +from .effect_prediction import top_varcode_effect +from .filtering import apply_filters + +logger = get_logger(__name__) + + +DEFAULT_FILTER_THRESHOLDS = OrderedDict([ + # alt allele + ("min_num_alt_reads", MIN_NUM_RNA_ALT_READS), + ("min_num_alt_fragments", MIN_NUM_RNA_ALT_FRAGMENTS), + ("min_fraction_alt_reads", MIN_FRACTION_RNA_ALT_READS), + ("min_fraction_alt_fragments", MIN_FRACTION_RNA_ALT_FRAGMENTS), + + # ref allele coverage and VAF + ("max_num_ref_reads", MAX_NUM_RNA_REF_READS), + ("max_num_ref_fragments", MAX_NUM_RNA_REF_FRAGMENTS), + ("max_fraction_ref_reads", MAX_FRACTION_RNA_REF_READS), + ("max_fraction_ref_fragments", MAX_FRACTION_RNA_REF_FRAGMENTS), + + # other alleles + ("max_num_other_reads", MAX_NUM_RNA_OTHER_READS), + ("max_num_other_fragments", MAX_NUM_RNA_OTHER_FRAGMENTS), + ("max_fraction_other_reads", MAX_FRACTION_RNA_OTHER_READS), + ("max_fraction_other_fragments", MAX_FRACTION_RNA_OTHER_FRAGMENTS), + + # misc. filters + ("min_ratio_alt_to_other_fragments", MIN_RATIO_RNA_ALT_TO_OTHER_FRAGMENTS) +]) + + +def run_isovar( + variants, + alignment_file, + transcript_id_whitelist=None, + read_collector=None, + protein_sequence_creator=None, + filter_thresholds=DEFAULT_FILTER_THRESHOLDS, + filter_flags=[]): + """ + This is the main entrypoint into the Isovar library, which collects + RNA reads supporting variants and translates their coding sequence + into amino acid sequences. Collects both the read evidence and + protein sequences into IsovarResult objects. The values of any filters + which are supplied in the filter_thresholds argument are attached to + each IsovarResult's filter_values_dict field. + + Parameters + ---------- + variants : varcode.VariantCollection + Somatic variants + + alignment_file : pysam.AlignmentFile + Aligned tumor RNA reads + + transcript_id_whitelist : set of str or None + Which transcripts should be considered when predicting DNA-only + coding effects of mutations and also when trying to establish a + reading frame for identified cDNA sequences. + + read_collector : ReadCollector or None + Object used to collect ReadEvidence for each variant, created + with default settings if not supplied. + + protein_sequence_creator : ProteinSequenceCreator or None + Object used to turn (Variant, ReadEvidence) into one or more + ProteinSequence objects. Created with default settings if not + supplied. + + filter_thresholds : dict or OrderedDict + Dictionary whose entries have names like "min_num_alt_reads" + mapping to a numerical threshold value. In general, the keys + must start with either "min_" or "max_" followed by a property + of the IsovarResult class. 
+
+    filter_flags : list of str
+        List of boolean fields of IsovarResult used for filtering; each
+        can also be negated by prepending "not_", as in
+        "not_has_protein_sequence".
+
+    Yields IsovarResult objects, one for each variant. The
+    `protein_sequences` field of each IsovarResult will be empty
+    if no sequences could be determined.
+    """
+    if isinstance(variants, string_types):
+        variants = load_vcf(variants)
+
+    if isinstance(alignment_file, string_types):
+        alignment_file = AlignmentFile(alignment_file)
+
+    if read_collector is None:
+        read_collector = ReadCollector()
+
+    if protein_sequence_creator is None:
+        protein_sequence_creator = ProteinSequenceCreator()
+
+    # create generator which returns (Variant, ReadEvidence) pairs
+    read_evidence_gen = \
+        read_collector.read_evidence_generator(
+            variants=variants,
+            alignment_file=alignment_file)
+
+    for variant, read_evidence in read_evidence_gen:
+        # generate protein sequences by assembling variant reads
+        protein_sequences = \
+            protein_sequence_creator.sorted_protein_sequences_for_variant(
+                variant=variant,
+                read_evidence=read_evidence,
+                transcript_id_whitelist=transcript_id_whitelist)
+        predicted_effect = top_varcode_effect(
+            variant=variant,
+            transcript_id_whitelist=transcript_id_whitelist)
+        isovar_result = IsovarResult(
+            variant=variant,
+            predicted_effect=predicted_effect,
+            read_evidence=read_evidence,
+            sorted_protein_sequences=protein_sequences)
+        isovar_result = apply_filters(
+            isovar_result,
+            filter_thresholds=filter_thresholds,
+            filter_flags=filter_flags)
+        yield isovar_result
+
diff --git a/isovar/nucleotide_counts.py b/isovar/nucleotide_counts.py
deleted file mode 100644
index c82d5ed..0000000
--- a/isovar/nucleotide_counts.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright (c) 2016-2018. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function, division, absolute_import
-
-import numpy as np
-
-from .dna import (
-    dna_nucleotide_to_index,
-    index_to_dna_nucleotide,
-)
-from .read_helpers import (
-    make_prefix_suffix_pairs,
-    get_single_allele_from_reads,
-)
-
-
-def nucleotide_counts(variant_reads):
-    """
-    Count the number of times {A, C, T, G} occur at each position to the
-    left and right of the variant.
-
-    Parameters
-    ----------
-    variant_reads : list of AlleleRead objects
-        Expected to all contain the same variant allele.
-
-    Returns a tuple with the following elements:
-        - a matrix with four rows and as many columns as the sum of the longest
-        prefix preceding the variant, the longest suffix after the variant and
-        the number of variant nucleotids.
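With `run_isovar` now fully shown, here is a hedged sketch of customizing the `filter_thresholds` and `filter_flags` arguments documented above. The threshold keys follow the `min_`/`max_` plus `IsovarResult` property convention, the flag name is borrowed from the docstring's `not_has_protein_sequence` example, and note that a custom dictionary replaces (rather than merges with) `DEFAULT_FILTER_THRESHOLDS`:

```python
from isovar import run_isovar

results = run_isovar(
    variants="my-variants.vcf",       # placeholder paths
    alignment_file="my-rna.bam",
    # replaces DEFAULT_FILTER_THRESHOLDS entirely
    filter_thresholds={
        "min_num_alt_fragments": 2,
        "max_fraction_other_reads": 0.1,
    },
    # boolean IsovarResult fields; prepend "not_" to negate
    filter_flags=["has_protein_sequence"])

for isovar_result in results:
    # filter outcomes are attached to each result via filter_values_dict
    print(isovar_result.variant, isovar_result.filter_values_dict)
```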
- - the column indices for the variant nucleotides - """ - variant_seq = get_single_allele_from_reads(variant_reads) - - prefix_suffix_pairs = make_prefix_suffix_pairs(variant_reads) - n_reads = len(prefix_suffix_pairs) - max_prefix_length = max(len(p) for (p, _) in prefix_suffix_pairs) - max_suffix_length = max(len(s) for (_, s) in prefix_suffix_pairs) - n_variant_nucleotides = len(variant_seq) - - n_cols = max_prefix_length + max_suffix_length + n_variant_nucleotides - - counts = np.zeros((4, n_cols), dtype=int) - - variant_column_indices = [] - - # first fill in the variant nucleotide counts, since they'll - # be invariant across all the supporting reads - for i, nucleotide in enumerate(variant_seq): - variant_col_idx = max_prefix_length + i - variant_column_indices.append(variant_col_idx) - row_idx = dna_nucleotide_to_index[dna_nucleotide_to_index] - counts[row_idx, variant_col_idx] = n_reads - - for p, s in prefix_suffix_pairs: - for i, prefix_col_idx in enumerate(range( - max_prefix_length - len(p), - max_prefix_length)): - row_idx = dna_nucleotide_to_index[p[i]] - counts[row_idx, prefix_col_idx] += 1 - for i, suffix_col_idx in enumerate(range( - max_prefix_length + n_variant_nucleotides, - max_prefix_length + n_variant_nucleotides + len(s))): - row_idx = dna_nucleotide_to_index[s[i]] - counts[row_idx, suffix_col_idx] += 1 - return counts, variant_column_indices - - -def most_common_nucleotides(partitioned_read_sequences): - """ - Find the most common nucleotide at each offset to the left and - right of a variant. - - Parameters - ---------- - partitioned_read_sequences : list of tuples - Each tuple has three elements: - - sequence before mutant nucleotides - - mutant nucleotides - - sequence after mutant nucleotides - - Returns a tuple with the following elements: - - nucleotide sequence from most common nucleotide at each offset - relative to the variant - - an array of counts indicating how many reads supported this nucleotide - - an array of counts for all the *other* nucleotides at that position - """ - counts, variant_column_indices = nucleotide_counts( - partitioned_read_sequences) - max_count_per_column = counts.max(axis=0) - - assert len(max_count_per_column) == counts.shape[1] - max_nucleotide_index_per_column = np.argmax(counts, axis=0) - assert len(max_nucleotide_index_per_column) == counts.shape[1] - nucleotides = [ - index_to_dna_nucleotide[idx] - for idx in max_nucleotide_index_per_column - ] - other_nucleotide_counts = counts.sum(axis=0) - max_count_per_column - return "".join(nucleotides), max_count_per_column, other_nucleotide_counts diff --git a/isovar/protein_sequence.py b/isovar/protein_sequence.py new file mode 100644 index 0000000..9880cca --- /dev/null +++ b/isovar/protein_sequence.py @@ -0,0 +1,279 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +ProteinSequence is a representation of a translated coding sequence, +associated with its supporting (and non-supporting but overlapping) RNA reads. 
+""" + +from __future__ import print_function, division, absolute_import + +from .translation_key import TranslationKey +from .translation import Translation +from .logging import get_logger + +logger = get_logger(__name__) + + +class ProteinSequence(TranslationKey): + """ + Translated amino acid sequence aggregated across possibly multiple + VariantSequence and ReferenceContext objects (e.g. imagine two distinct + sequences which contain synonymous codons). + + This is the final result of the isovar variant->expressed peptide pipeline. + """ + __slots__ = [ + # list of all the Translation objects which support this distinct + # amino acid sequence + "translations", + ] + + def __init__(self, translations): + """ + Initialize fields of ProteinSequence. Fields inherited from TranslationKey + (e.g. frameshift, ends_with_stop_codon, &c) are inferred from the + translation objects which must all have the same values for these + fields. + + Parameters + ---------- + translations : list of Translation + Equivalent translations which might have different cDNA sequences + but agree in their amino acid sequences. + """ + if len(translations) == 0: + raise ValueError("Cannot create ProteinSequence without at least one Translation") + + self.translations = translations + + # fill in fields inherited from TranslationKey by taking value + # from first Translation iobject and then check to make sure + # other translations are consistent with this + first_translation = translations[0] + for field_name in TranslationKey.__slots__: + field_value = getattr(first_translation, field_name) + setattr(self, field_name, field_value) + # check other translations to make sure they have the same value + # for this field + for other_translation in translations[1:]: + other_translation_field_value = getattr(other_translation, field_name) + if other_translation_field_value != field_value: + raise ValueError( + "All translations must have same value %s=%s but got %s" % ( + field_name, + field_value, + other_translation_field_value)) + + def __len__(self): + return len(self.amino_acids) + + @property + def supporting_reads(self): + """ + Reads used to create cDNA coding sequence for any Translation + associated with this ProteinSequence. + + Returns set of AlleleRead + """ + read_set = set([]) + for translation in self.translations: + read_set.update(translation.reads) + return read_set + + @property + def read_names_supporting_protein_sequence(self): + """ + Names of reads used to create cDNA coding sequence for any Translation + associated with this ProteinSequence. + + Returns set of str + """ + return {r.name for r in self.supporting_reads} + + @property + def num_supporting_fragments(self): + """ + Number of unique read names used to construct the cDNA sequences from + which this protein sequence was translated. + + Returns int + """ + return len({r.name for r in self.supporting_reads}) + + @property + def num_supporting_reads(self): + """ + Number of reads used to construct the cDNA sequences from + which this protein sequence was translated. + + Returns int + """ + return len(self.supporting_reads) + + @property + def num_mismatches_before_variant(self): + """ + Since a ProteinSequence may arise from multiple equivalent translations, + take the minimum mismatch score from all the translations. 
+ + Returns int + """ + return min(t.num_mismatches_before_variant for t in self.translations) + + @property + def num_mismatches_after_variant(self): + """ + Since a ProteinSequence may arise from multiple equivalent translations, + take the minimum mismatch score from all the translations. + + Returns int + """ + return min(t.num_mismatches_after_variant for t in self.translations) + + @property + def num_mismatches(self): + """ + Add up the mismatches before and after the variant across all + translations used to create this ProteinSequence. + + Returns int + """ + return self.num_mismatches_before_variant + self.num_mismatches_after_variant + + @property + def transcripts(self): + """ + Ensembl transcripts which support the reading frame used by + Translation objects in this ProteinSequence. + + Returns list of pyensembl.Transcript + """ + transcript_set = set([]) + for translation in self.translations: + transcript_set.update(translation.reference_context.transcripts) + return sorted(transcript_set) + + @property + def transcript_names(self): + """ + Ensembl transcript names which support the reading frame used by + Translation objects used in this ProteinSequence. + + Returns list of str + """ + return [ + t.name + for t + in self.transcripts + ] + + @property + def transcript_ids(self): + """ + Ensembl transcript IDs of all transcripts which support the reading + frame used by Translation objects associated with this + ProteinSequence. + + Returns list of str + """ + return [ + transcript.id + for transcript in self.transcripts + ] + + @property + def genes(self): + """ + Ensembl genes which support the reading frame used by Translation + objects associated with this ProteinSequence. + + Returns list of pyensembl.Gene + """ + transcripts = self.transcripts + genes = {t.gene for t in transcripts} + return sorted(genes) + + @property + def gene_names(self): + """ + Ensembl genes names which support the reading frame used by + Translation objects used in this ProteinSequence. + + Returns list of str + """ + return [ + g.name + for g + in self.genes + ] + + @property + def gene_name(self): + """ + Return gene name if only one gene is being used to determine the + reading from to translate this ProteinSequence, or in the very + unlikely case that multiple genes are being used, concatenate their + names with a semi-colon separator. + + Returns str + """ + return ";".join(self.gene_names) + + @property + def gene_ids(self): + """ + Ensembl genes IDs which support the reading frame used by + Translation objects used in this ProteinSequence. + + Returns list of str + """ + return [g.id for g in self.genes] + + @property + def cdna_sequences(self): + """ + Distinct cDNA sequences used to create this protein sequence. + + Returns set of str + """ + return {t.cdna_sequence for t in self.translations} + + @property + def num_cdna_sequences(self): + """ + Number of distinct cDNA sequences used to translate this protein + sequence. 
+ + Returns int + """ + return len(self.cdna_sequences) + + def ascending_sort_key(self): + """ + Sort key function used to sort protein sequences lexicographically by these criteria: + - number of unique supporting fragments + - number of unique supporting reads (either 1 or 2 per fragment) + - minimum mismatch versus a supporting reference transcript before variant + - minimum mismatch versus a supporting reference transcript after variant + - all else being equal, prefer longer sequences + """ + return ( + self.num_supporting_fragments, + self.num_supporting_reads, + -self.num_mismatches_before_variant, + -self.num_mismatches_after_variant, + len(self.amino_acids), + ) \ No newline at end of file diff --git a/isovar/protein_sequence_creator.py b/isovar/protein_sequence_creator.py new file mode 100644 index 0000000..ea5c32a --- /dev/null +++ b/isovar/protein_sequence_creator.py @@ -0,0 +1,376 @@ +# Copyright (c) 2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import print_function, division, absolute_import + +from .default_parameters import ( + MIN_TRANSCRIPT_PREFIX_LENGTH, + MAX_REFERENCE_TRANSCRIPT_MISMATCHES, + COUNT_MISMATCHES_AFTER_VARIANT, + PROTEIN_SEQUENCE_LENGTH, + MAX_PROTEIN_SEQUENCES_PER_VARIANT, + MIN_VARIANT_SEQUENCE_COVERAGE, + VARIANT_SEQUENCE_ASSEMBLY, + MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE, +) + +from .genetic_code import translate_cdna +from .protein_sequence_helpers import ( + sort_protein_sequences, + group_equivalent_translations +) +from .reference_context_helpers import reference_contexts_for_variant +from .translation import Translation +from .translation_helpers import find_mutant_amino_acid_interval +from .value_object import ValueObject +from .variant_sequence_creator import VariantSequenceCreator +from .variant_orf_helpers import match_variant_sequence_to_reference_context + +from .logging import get_logger + +logger = get_logger(__name__) + + +class ProteinSequenceCreator(ValueObject): + """ + Creates ProteinSequence objects for each variant by translating + cDNA into one or more Translation objects and then grouping them + by identical amino acid sequences. Each Translation comes from the + combination of a variant cDNA sequence with the reading frames of + annotated reference transcripts. 
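A small sketch (not part of Isovar) showing how the `ProteinSequence` properties defined above might be summarized for reporting; `describe_protein_sequence` is a hypothetical helper name:

```python
def describe_protein_sequence(protein_sequence):
    """
    Hypothetical helper: one-line summary of a ProteinSequence using only
    the properties defined above.
    """
    return "%s (gene=%s, fragments=%d, reads=%d, mismatches=%d, distinct cDNAs=%d)" % (
        protein_sequence.amino_acids,
        protein_sequence.gene_name,
        protein_sequence.num_supporting_fragments,
        protein_sequence.num_supporting_reads,
        protein_sequence.num_mismatches,
        protein_sequence.num_cdna_sequences)
```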
+    """
+
+    def __init__(
+            self,
+            protein_sequence_length=PROTEIN_SEQUENCE_LENGTH,
+            min_variant_sequence_coverage=MIN_VARIANT_SEQUENCE_COVERAGE,
+            min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH,
+            max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES,
+            count_mismatches_after_variant=COUNT_MISMATCHES_AFTER_VARIANT,
+            max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT,
+            variant_sequence_assembly=VARIANT_SEQUENCE_ASSEMBLY,
+            min_assembly_overlap_size=MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE):
+        """
+        Parameters
+        ----------
+        protein_sequence_length : int
+            Try to translate protein sequences of this length, though sometimes
+            we'll have to return something shorter (depending on the RNAseq data
+            and the presence of stop codons).
+
+        min_variant_sequence_coverage : int
+            Trim variant sequences to positions supported by at least this number
+            of RNA reads.
+
+        min_transcript_prefix_length : int
+            Minimum number of bases we need to try matching between the reference
+            context and variant sequence.
+
+        max_transcript_mismatches : int
+            Don't try to determine the reading frame for a transcript if more
+            than this number of bases differ.
+
+        count_mismatches_after_variant : bool
+            Include mismatches after the variant locus in the count compared
+            against max_transcript_mismatches.
+
+        max_protein_sequences_per_variant : int
+            Maximum number of protein sequences to return for each variant
+
+        variant_sequence_assembly : bool
+            If True, then assemble variant cDNA sequences based on overlap of
+            RNA reads. If False, then variant cDNA sequences must be fully spanned
+            and contained within RNA reads.
+
+        min_assembly_overlap_size : int
+            Minimum number of nucleotides that two reads need to overlap before they
+            can be merged into a single coding sequence.
+        """
+        self.protein_sequence_length = protein_sequence_length
+        self.min_variant_sequence_coverage = min_variant_sequence_coverage
+        self.min_transcript_prefix_length = min_transcript_prefix_length
+        self.max_transcript_mismatches = max_transcript_mismatches
+        self.count_mismatches_after_variant = count_mismatches_after_variant
+        self.variant_sequence_assembly = variant_sequence_assembly
+        self.min_assembly_overlap_size = min_assembly_overlap_size
+
+        # Adding an extra codon to the desired RNA sequence length in case we
+        # need to clip nucleotides at the start/end of the sequence
+        self._cdna_sequence_length = (self.protein_sequence_length + 1) * 3
+
+        self._variant_sequence_creator = VariantSequenceCreator(
+            min_variant_sequence_coverage=self.min_variant_sequence_coverage,
+            preferred_sequence_length=self._cdna_sequence_length,
+            variant_sequence_assembly=self.variant_sequence_assembly,
+            min_assembly_overlap_size=self.min_assembly_overlap_size)
+
+        self.max_protein_sequences_per_variant = max_protein_sequences_per_variant
+
+    def translation_from_variant_sequence_and_reference_context(
+            self,
+            variant_sequence,
+            reference_context):
+        """
+        Attempt to translate a single VariantSequence using the reading frame
+        from a single ReferenceContext.
+
+        Parameters
+        ----------
+        variant_sequence : VariantSequence
+
+        reference_context : ReferenceContext
+
+        Returns either a Translation object or None if the number of
+        mismatches between the RNA and reference transcript sequences exceeds
+        the given threshold.
+ """ + variant_orf = match_variant_sequence_to_reference_context( + variant_sequence, + reference_context, + min_transcript_prefix_length=self.min_transcript_prefix_length, + max_transcript_mismatches=self.max_transcript_mismatches, + count_mismatches_after_variant=self.count_mismatches_after_variant) + + if variant_orf is None: + logger.info("Unable to determine reading frame for %s", variant_sequence) + return None + + cdna_sequence = variant_orf.cdna_sequence + cdna_codon_offset = variant_orf.offset_to_first_complete_codon + + # get the offsets into the cDNA sequence which pick out the variant nucleotides + cdna_variant_start_offset = variant_orf.variant_cdna_interval_start + cdna_variant_end_offset = variant_orf.variant_cdna_interval_end + + # TODO: determine if the first codon is the start codon of a + # transcript, for now any of the unusual start codons like CTG + # will translate to leucine instead of methionine. + variant_amino_acids, ends_with_stop_codon = translate_cdna( + cdna_sequence[cdna_codon_offset:], + first_codon_is_start=False, + mitochondrial=reference_context.mitochondrial) + + variant_aa_interval_start, variant_aa_interval_end, frameshift = \ + find_mutant_amino_acid_interval( + cdna_sequence=cdna_sequence, + cdna_first_codon_offset=cdna_codon_offset, + cdna_variant_start_offset=cdna_variant_start_offset, + cdna_variant_end_offset=cdna_variant_end_offset, + n_ref=len(reference_context.sequence_at_variant_locus), + n_amino_acids=len(variant_amino_acids)) + + if self.protein_sequence_length: + if len(variant_amino_acids) > self.protein_sequence_length: + if self.protein_sequence_length <= variant_aa_interval_start: + logger.warn( + ("Truncating amino acid sequence %s " + "to only %d elements loses all variant residues"), + variant_amino_acids, + self.protein_sequence_length) + return None + else: + # if the protein is too long then shorten it, which implies + # we're no longer stopping due to a stop codon and that the variant + # amino acids might need a new stop index + variant_amino_acids = variant_amino_acids[:self.protein_sequence_length] + variant_aa_interval_end = min( + variant_aa_interval_end, + self.protein_sequence_length) + ends_with_stop_codon = False + + return Translation( + amino_acids=variant_amino_acids, + frameshift=frameshift, + ends_with_stop_codon=ends_with_stop_codon, + variant_aa_interval_start=variant_aa_interval_start, + variant_aa_interval_end=variant_aa_interval_end, + untrimmed_variant_sequence=variant_sequence, + reference_context=reference_context, + variant_orf=variant_orf) + + def all_pairs_translations( + self, + variant_sequences, + reference_contexts): + """ + Given all a list of VariantSequence objects for a particular variant + and all the ReferenceContext objects for that locus, attempt to + translate all pairs of sequences and reference contexts. + + Parameters + ---------- + variant_sequences : list of VariantSequence + + reference_contexts : list of ReferenceContext + + Return list of Translation objects. 
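The translation step above relies on `translate_cdna` from `isovar.genetic_code`. A hedged standalone sketch of that call on a made-up 12-nucleotide cDNA; the keyword arguments and the two-element return value are inferred from the usage above, and the exact output shown in the comment is an assumption:

```python
from isovar.genetic_code import translate_cdna

# hypothetical cDNA: ATG TTT AAG TGA -> Met, Phe, Lys, stop
amino_acids, ends_with_stop_codon = translate_cdna(
    "ATGTTTAAGTGA",
    first_codon_is_start=True,
    mitochondrial=False)

print(amino_acids, ends_with_stop_codon)  # expected along the lines of: MFK True
```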
+ """ + translations = [] + for reference_context in reference_contexts: + for variant_sequence in variant_sequences: + translation = self.translation_from_variant_sequence_and_reference_context( + variant_sequence=variant_sequence, + reference_context=reference_context) + if translation is not None: + translations.append(translation) + return translations + + def translate_variant_reads( + self, + variant, + variant_reads, + transcript_id_whitelist=None): + """ + Given a variant and its associated alt reads, construct variant sequences + and translate them into Translation objects. + + Returns 0 or more Translation objects. + + Parameters + ---------- + variant : varcode.Variant + + variant_reads : sequence or generator + AlleleRead objects supporting the variant + + transcript_id_whitelist : set, optional + If given, expected to be a set of transcript IDs which we should use + for determining the reading frame around a variant. If omitted, then + try to use all overlapping reference transcripts. + + Returns list of Translation objects + """ + if len(variant_reads) == 0: + logger.info("No supporting reads for variant %s", variant) + return [] + + variant_sequences = self._variant_sequence_creator.reads_to_variant_sequences( + variant=variant, + reads=variant_reads) + + if not variant_sequences: + logger.info("No spanning cDNA sequences for variant %s", variant) + return [] + + # try translating the variant sequences from the same set of + # ReferenceContext objects, which requires using the longest + # context_size to be compatible with all of the sequences. Some + # sequences maybe have fewer nucleotides than this before the variant + # and will thus have to be trimmed. + context_size = max( + len(variant_sequence.prefix) + for variant_sequence in variant_sequences) + + reference_contexts = reference_contexts_for_variant( + variant, + context_size=context_size, + transcript_id_whitelist=transcript_id_whitelist) + + return self.all_pairs_translations( + variant_sequences=variant_sequences, + reference_contexts=reference_contexts) + + def translate_variants( + self, + variants_with_read_evidence_generator, + transcript_id_whitelist=None): + """ + Translates each coding variant in a collection to one or more protein + fragment sequences (if the variant is not filtered and its spanning RNA + sequences can be given a reading frame). + + Parameters + ---------- + variants_with_read_evidence_generator : sequence or generator + Each item of this sequence should be a pair containing a varcode.Variant + and a ReadEvidence object + + transcript_id_whitelist : set, optional + If given, expected to be a set of transcript IDs which we should use + for determining the reading frame around a variant. If omitted, then + try to use all overlapping reference transcripts. + + Yields pairs of a Variant and a sequence of all its candidate + Translation objects. + """ + for variant, read_evidence in variants_with_read_evidence_generator: + translations = self.translate_variant_reads( + variant=variant, + variant_reads=read_evidence.alt_reads, + transcript_id_whitelist=transcript_id_whitelist) + yield variant, translations + + def sorted_protein_sequences_for_variant( + self, + variant, + read_evidence, + transcript_id_whitelist=None): + """" + Translates a coding variant and its overlapping RNA reads into Translation + objects, which are aggregated into ProteinSequence objects by their + amino acid sequence (when they have equivalent coding sequences). 
+ + Parameters + ---------- + variant : varcode.Variant + + read_evidence : ReadEvidence object + + transcript_id_whitelist : set, optional + If given, expected to be a set of transcript IDs which we should use + for determining the reading frame around a variant. If omitted, then + try to use all overlapping reference transcripts. + + Returns a list of ProteinSequence objects + """ + translations = self.translate_variant_reads( + variant=variant, + variant_reads=read_evidence.alt_reads, + transcript_id_whitelist=transcript_id_whitelist) + + # group distinct cDNA translations into ProteinSequence objects + # by their amino acid sequence + protein_sequences = group_equivalent_translations(translations) + + # sort protein sequences before returning the top results + protein_sequences = sort_protein_sequences(protein_sequences) + return protein_sequences + + def protein_sequences_from_read_evidence_generator( + self, + read_evidence_generator, + transcript_id_whitelist=None): + """ + + Parameters + ---------- + read_evidence_generator : generator of (varcode.Variant, ReadEvidence) + Generator which yields sequence of Variant objects paired with + their corresponding ReadEvidence + + transcript_id_whitelist : set of str or None + Which transcripts should be considered when predicting DNA-only + coding effects of mutations and also when trying to establish a + reading frame for identified cDNA sequences. + + Generates sequence of (varcode.Variant, ProteinSequence list) pairs. + """ + for variant, read_evidence in read_evidence_generator: + protein_sequences = \ + self.sorted_protein_sequences_for_variant( + variant=variant, + read_evidence=read_evidence, + transcript_id_whitelist=transcript_id_whitelist) + if self.max_protein_sequences_per_variant: + protein_sequences = protein_sequences[:self.max_protein_sequences_per_variant] + yield variant, protein_sequences diff --git a/isovar/protein_sequence_helpers.py b/isovar/protein_sequence_helpers.py new file mode 100644 index 0000000..104370c --- /dev/null +++ b/isovar/protein_sequence_helpers.py @@ -0,0 +1,60 @@ +# Copyright (c) 2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Since multiple variant sequences can translate to the same amino acid sequence, +this module aggregates equivalent Translation objects into a single +ProteinSequence. 
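With `ProteinSequenceCreator` complete, the lower-level pipeline can also be driven without `run_isovar`. A sketch combining it with `ReadCollector`, assuming both behave as shown in this diff (file paths are placeholders):

```python
from pysam import AlignmentFile
from varcode import load_vcf

from isovar import ProteinSequenceCreator, ReadCollector

read_evidence_generator = ReadCollector().read_evidence_generator(
    variants=load_vcf("my-variants.vcf"),
    alignment_file=AlignmentFile("my-rna.bam"))

creator = ProteinSequenceCreator(protein_sequence_length=30)

# yields (Variant, [ProteinSequence]) pairs, sorted and truncated per variant
for variant, protein_sequences in creator.protein_sequences_from_read_evidence_generator(
        read_evidence_generator):
    if protein_sequences:
        print(variant, protein_sequences[0].amino_acids)
```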
+""" + +from __future__ import print_function, division, absolute_import + +from .common import groupby +from .logging import get_logger +from .protein_sequence import ProteinSequence +from .translation import Translation + +logger = get_logger(__name__) + + +def sort_protein_sequences(protein_sequences): + """ + Sort protein sequences in decreasing order of priority + """ + return list( + sorted( + protein_sequences, + key=ProteinSequence.ascending_sort_key, + reverse=True)) + + +def group_equivalent_translations(translations): + """ + Convert a list of Translation objects into a (potentially smaller) list + of ProteinSequence objects by grouping the equivalent amino acid sequences. + + Parameters + ---------- + translations : list of Translation objects + + Returns list of ProteinSequence objects + """ + protein_sequences = [] + translation_groups = groupby( + translations, + key_fn=Translation.as_translation_key) + for equivalent_translations in translation_groups.values(): + protein_sequences.append(ProteinSequence(equivalent_translations)) + return protein_sequences diff --git a/isovar/protein_sequences.py b/isovar/protein_sequences.py deleted file mode 100644 index 91ac4a3..0000000 --- a/isovar/protein_sequences.py +++ /dev/null @@ -1,323 +0,0 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -Since multiple variant sequences can translate to the same amino acid sequence, -this module aggregates equivalent Translation objects into a single -ProteinSequence. -""" - -from __future__ import print_function, division, absolute_import - -from .common import groupby -from .default_parameters import ( - MIN_TRANSCRIPT_PREFIX_LENGTH, - MAX_REFERENCE_TRANSCRIPT_MISMATCHES, - INCLUDE_MISMATCHES_AFTER_VARIANT, - PROTEIN_SEQUENCE_LENGTH, - MAX_PROTEIN_SEQUENCES_PER_VARIANT, - MIN_ALT_RNA_READS, - MIN_VARIANT_SEQUENCE_COVERAGE, - VARIANT_SEQUENCE_ASSEMBLY -) -from .dataframe_builder import dataframe_from_generator -from .translation import translate_variant_reads, Translation, TranslationKey -from .read_helpers import group_reads_by_allele -from .variant_helpers import trim_variant -from .logging import get_logger - -logger = get_logger(__name__) - - -class ProteinSequence(TranslationKey): - """ - Translated amino acid sequence aggregated across possibly multiple - VariantSequence and ReferenceContext objects (e.g. imagine two distinct - sequences which contain synonymous codons). - - This is the final result of the isovar variant->expressed peptide pipeline. 
- """ - __slots__ = [ - # list of all the Translation objects which support this distinct - # amino acid sequence - "translations", - # number of reads overlapping the variant locus supporting any allele, - # including the reference, alt, or anything else - "overlapping_reads", - # number of reads overlapping this locus which support the reference - # allele - "ref_reads", - # total number of reads at the locus which contained the variant - # nucleotides, even if they supported other phased sequences - "alt_reads", - # number of unique read names from all the VariantSequence objects - # from each translation - "alt_reads_supporting_protein_sequence", - # IDs of transcripts overlapping the variant locus - "transcripts_overlapping_variant", - # IDs of reference transcripts used to establish the reading frame for - # this protein sequence - "transcripts_supporting_protein_sequence", - # name of gene of the reference transcripts used in Translation - # objects - "gene", - ] - - def __init__( - self, - amino_acids, - variant_aa_interval_start, - variant_aa_interval_end, - ends_with_stop_codon, - frameshift, - translations, - overlapping_reads, - ref_reads, - alt_reads, - alt_reads_supporting_protein_sequence, - transcripts_overlapping_variant, - transcripts_supporting_protein_sequence, - gene): - self.amino_acids = amino_acids - self.variant_aa_interval_start = variant_aa_interval_start - self.variant_aa_interval_end = variant_aa_interval_end - self.ends_with_stop_codon = ends_with_stop_codon - self.frameshift = frameshift - self.translations = translations - self.overlapping_reads = overlapping_reads - self.ref_reads = ref_reads - self.alt_reads = alt_reads - self.alt_reads_supporting_protein_sequence = ( - alt_reads_supporting_protein_sequence) - self.transcripts_overlapping_variant = transcripts_overlapping_variant - self.transcripts_supporting_protein_sequence = ( - transcripts_supporting_protein_sequence) - self.gene = gene - - @classmethod - def _summarize_translations(cls, translations): - """ - Summarize a collection of Translation objects into three values: - 1) List of unique reads supporting underlying variant sequences - 2) Set of unique transcript names for establishing reading frames of the - translations. - 3) Set of unique gene names for all transcripts used by translations. - """ - read_name_to_reads = {} - gene_names = set([]) - transcript_ids = set([]) - for translation in translations: - for read in translation.reads: - read_name_to_reads[read.name] = read - for transcript in translation.reference_context.transcripts: - transcript_ids.add(transcript.id) - gene_names.add(transcript.gene.name) - unique_reads = list(read_name_to_reads.values()) - return unique_reads, transcript_ids, gene_names - - @classmethod - def from_translation_key( - cls, - translation_key, - translations, - overlapping_reads, - ref_reads, - alt_reads, - alt_reads_supporting_protein_sequence, - transcripts_overlapping_variant, - transcripts_supporting_protein_sequence, - gene): - """ - Create a ProteinSequence object from a TranslationKey, along with - all the extra fields a ProteinSequence requires. 
- """ - return cls( - amino_acids=translation_key.amino_acids, - variant_aa_interval_start=translation_key.variant_aa_interval_start, - variant_aa_interval_end=translation_key.variant_aa_interval_end, - ends_with_stop_codon=translation_key.ends_with_stop_codon, - frameshift=translation_key.frameshift, - translations=translations, - overlapping_reads=overlapping_reads, - ref_reads=ref_reads, - alt_reads=alt_reads, - alt_reads_supporting_protein_sequence=( - alt_reads_supporting_protein_sequence), - transcripts_overlapping_variant=transcripts_overlapping_variant, - transcripts_supporting_protein_sequence=( - transcripts_supporting_protein_sequence), - gene=gene) - - def ascending_sort_key(self): - """ - Sort protein sequences lexicographically by three criteria: - - number of unique supporting reads - - minimum mismatch versus a supporting reference transcript before variant - - minimum mismatch versus a supporting reference transcript after variant - - number of supporting reference transcripts - - TODO: Add sort criterion: - - min number of reads covering each nucleotide of - the protein sequence >= 2 - """ - return ( - len(self.alt_reads_supporting_protein_sequence), - min(t.number_mismatches_before_variant for t in self.translations), - min(t.number_mismatches_after_variant for t in self.translations), - len(self.transcripts_supporting_protein_sequence) - ) - -def sort_protein_sequences(protein_sequences): - """ - Sort protein sequences in decreasing order of priority - """ - return list( - sorted( - protein_sequences, - key=ProteinSequence.ascending_sort_key, - reverse=True)) - -def reads_generator_to_protein_sequences_generator( - variant_and_overlapping_reads_generator, - transcript_id_whitelist=None, - protein_sequence_length=PROTEIN_SEQUENCE_LENGTH, - min_alt_rna_reads=MIN_ALT_RNA_READS, - min_variant_sequence_coverage=MIN_VARIANT_SEQUENCE_COVERAGE, - min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, - max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, - include_mismatches_after_variant=INCLUDE_MISMATCHES_AFTER_VARIANT, - max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT, - variant_sequence_assembly=VARIANT_SEQUENCE_ASSEMBLY): - """" - Translates each coding variant in a collection to one or more - Translation objects, which are then aggregated into equivalent - ProteinSequence objects. - - Parameters - ---------- - variant_and_overlapping_reads_generator : generator - Yields sequence of varcode.Variant objects paired with sequences - of AlleleRead objects that support that variant. - - transcript_id_whitelist : set, optional - If given, expected to be a set of transcript IDs which we should use - for determining the reading frame around a variant. If omitted, then - try to use all overlapping reference transcripts. - - protein_sequence_length : int - Try to translate protein sequences of this length, though sometimes - we'll have to return something shorter (depending on the RNAseq data, - and presence of stop codons). - - min_alt_rna_reads : int - Drop variant sequences at loci with fewer than this number of reads - supporting the alt allele. - - min_variant_sequence_coverage : int - Trim variant sequences to positions supported by at least this number - of RNA reads. - - min_transcript_prefix_length : int - Minimum number of bases we need to try matching between the reference - context and variant sequence. 
- - max_transcript_mismatches : int - Don't try to determine the reading frame for a transcript if more - than this number of bases differ. - - include_mismatches_after_variant : bool - Include mismatches after the variant locus in the count compared - against max_transcript_mismatches. - - max_protein_sequences_per_variant : int - Number of protein sequences to return for each ProteinSequence - - variant_cdna_sequence_assembly : bool - If True, then assemble variant cDNA sequences based on overlap of - RNA reads. If False, then variant cDNA sequences must be fully spanned - and contained within RNA reads. - - Yields pairs of a Variant and a list of ProteinSequence objects - """ - - for (variant, overlapping_reads) in variant_and_overlapping_reads_generator: - overlapping_transcript_ids = [ - t.id - for t in variant.transcripts - if t.is_protein_coding - ] - _, ref, alt = trim_variant(variant) - overlapping_reads = list(overlapping_reads) - reads_grouped_by_allele = group_reads_by_allele(overlapping_reads) - - ref_reads = reads_grouped_by_allele.get(ref, []) - alt_reads = reads_grouped_by_allele.get(alt, []) - - translations = translate_variant_reads( - variant=variant, - variant_reads=alt_reads, - transcript_id_whitelist=transcript_id_whitelist, - protein_sequence_length=protein_sequence_length, - min_alt_rna_reads=min_alt_rna_reads, - min_variant_sequence_coverage=min_variant_sequence_coverage, - min_transcript_prefix_length=min_transcript_prefix_length, - max_transcript_mismatches=max_transcript_mismatches, - include_mismatches_after_variant=include_mismatches_after_variant, - variant_sequence_assembly=variant_sequence_assembly) - - protein_sequences = [] - for (key, equivalent_translations) in groupby( - translations, key_fn=Translation.as_translation_key).items(): - - # get the variant read names, transcript IDs and gene names for - # protein sequence we're about to construct - alt_reads_supporting_protein_sequence, group_transcript_ids, group_gene_names = \ - ProteinSequence._summarize_translations(equivalent_translations) - - logger.info( - "%s: %s alt reads supporting protein sequence (gene names = %s)", - key, - len(alt_reads_supporting_protein_sequence), - group_gene_names) - - protein_sequence = ProteinSequence.from_translation_key( - translation_key=key, - translations=equivalent_translations, - overlapping_reads=overlapping_reads, - alt_reads=alt_reads, - ref_reads=ref_reads, - alt_reads_supporting_protein_sequence=alt_reads_supporting_protein_sequence, - transcripts_supporting_protein_sequence=group_transcript_ids, - transcripts_overlapping_variant=overlapping_transcript_ids, - gene=list(group_gene_names)) - logger.info("%s: protein sequence = %s" % (key, protein_sequence.amino_acids)) - protein_sequences.append(protein_sequence) - - # sort protein sequences before returning the top results - protein_sequences = sort_protein_sequences(protein_sequences) - - yield variant, protein_sequences[:max_protein_sequences_per_variant] - - -def protein_sequences_generator_to_dataframe(variant_and_protein_sequences_generator): - """ - Given a generator which yields (Variant, [ProteinSequence]) elements, - returns a pandas.DataFrame - """ - return dataframe_from_generator( - element_class=ProteinSequence, - variant_and_elements_generator=variant_and_protein_sequences_generator, - converters=dict( - gene=lambda x: ";".join(x))) diff --git a/isovar/read_collector.py b/isovar/read_collector.py new file mode 100644 index 0000000..afb8bad --- /dev/null +++ b/isovar/read_collector.py @@ -0,0 
+1,562 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import + +from six import integer_types + +from .default_parameters import ( + USE_SECONDARY_ALIGNMENTS, + USE_DUPLICATE_READS, + MIN_READ_MAPPING_QUALITY, + USE_SOFT_CLIPPED_BASES, +) +from .locus_read import LocusRead +from .logging import get_logger +from .allele_read_helpers import allele_reads_from_locus_reads +from .variant_helpers import trim_variant +from .read_evidence import ReadEvidence + +logger = get_logger(__name__) + + +class ReadCollector(object): + """ + ReadCollector holds options related to extracting reads from SAM/BAM alignment files + and provides methods for different ways to create LocusRead objects. + """ + def __init__( + self, + use_secondary_alignments=USE_SECONDARY_ALIGNMENTS, + use_duplicate_reads=USE_DUPLICATE_READS, + min_mapping_quality=MIN_READ_MAPPING_QUALITY, + use_soft_clipped_bases=USE_SOFT_CLIPPED_BASES): + """ + Parameters + ---------- + use_secondary_alignments : bool + Use a read even when it's not the primary alignment at a locus + + use_duplicate_reads : bool + Use a read even if it's been marked as a duplicate + + min_mapping_quality : int + Minimum MAPQ (mapping quality) to use a read + + use_soft_clipped_bases : bool + Include soft-clipped positions on a read which were ignored by the aligner + """ + self.use_secondary_alignments = use_secondary_alignments + self.use_duplicate_reads = use_duplicate_reads + self.min_mapping_quality = min_mapping_quality + self.use_soft_clipped_bases = use_soft_clipped_bases + + def locus_read_from_pysam_aligned_segment( + self, + pysam_aligned_segment, + base0_start_inclusive, + base0_end_exclusive): + """ + Create LocusRead from pysam.AlignedSegment object and the start/end indices + of the locus of interest. If any essential fields of the aligned segment + are missing then None is returned instead. 
+ + Parameters + ---------- + pysam_aligned_segment : pysam.AlignedSegment + AlignedSegment is expected to overlap the locus + + base0_start_inclusive : int + + base0_end_exclusive : int + + Returns + ------- + LocusRead or None + """ + if not isinstance(base0_start_inclusive, integer_types): + raise TypeError("Expected base0_start_inclusive to be an integer but got %s" % ( + type(base0_start_inclusive),)) + if not isinstance(base0_end_exclusive, integer_types): + raise TypeError("Expected base0_end_exclusive to be an integer but got %s" % ( + type(base0_end_exclusive),)) + + name = pysam_aligned_segment.query_name + if name is None: + logger.warn( + "Read missing name at position %d", + base0_start_inclusive + 1) + return None + + if pysam_aligned_segment.is_unmapped: + logger.warn( + "How did we get unmapped read '%s' in a pileup?", name) + return None + + if pysam_aligned_segment.is_secondary and not self.use_secondary_alignments: + logger.debug("Skipping secondary alignment of read '%s'", name) + return None + + if pysam_aligned_segment.is_duplicate and not self.use_duplicate_reads: + logger.debug("Skipping duplicate read '%s'", name) + return None + + mapping_quality = pysam_aligned_segment.mapping_quality + + if self.min_mapping_quality > 0 and (mapping_quality is None): + logger.debug("Skipping read '%s' due to missing MAPQ" % name) + return None + elif mapping_quality < self.min_mapping_quality: + logger.debug( + "Skipping read '%s' due to low MAPQ: %d < %d", + name, + mapping_quality, + self.min_mapping_quality) + return None + + sequence = pysam_aligned_segment.query_sequence + if sequence is None: + logger.warn("Skipping read '%s' due to missing sequence" % name) + return None + + base_qualities = pysam_aligned_segment.query_qualities + + if base_qualities is None: + logger.warn("Skipping read '%s' due to missing base qualities" % name) + return None + elif len(base_qualities) != len(sequence): + logger.warn( + "Skipping read '%s' due to mismatch in length of sequence (%d) and qualities (%d)" % ( + name, + len(sequence), + len(base_qualities))) + return None + # By default, AlignedSegment.get_reference_positions only returns base-1 positions + # from the reference that are within the alignment. If full_length is set, + # None values will be included for any soft-clipped or unaligned positions + # within the read. The returned list will thus be of the same + # length as the read. + + base0_reference_positions = pysam_aligned_segment.get_reference_positions(full_length=True) + + if len(base0_reference_positions) != len(base_qualities): + logger.warn( + "Skipping read '%s' due to mismatch in length of positions (%d) and qualities (%d)" % ( + name, + len(base0_reference_positions), + len(base_qualities))) + return None + + base0_reference_positions_dict = { + base0_reference_pos: base0_read_pos + for (base0_read_pos, base0_reference_pos) + in enumerate(base0_reference_positions) + if base0_reference_pos is not None + } + + reference_interval_size = base0_end_exclusive - base0_start_inclusive + if reference_interval_size < 0: + raise ValueError("Unexpected interval start after interval end") + + # TODO: Consider how to handle variants before splice sites, where + # the bases before or after on the genome will not be mapped on the + # read + # + # we have a dictionary mapping base-1 reference positions to base-0 + # read indices and we need to use that to convert the reference + # half-open interval into a half-open interval on the read. 
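+        #
+        # Illustrative example (hypothetical read, not taken from the code
+        # below): a 4bp read aligned without gaps or soft-clipping starting
+        # at reference position 10000 yields
+        #   base0_reference_positions_dict == {10000: 0, 10001: 1, 10002: 2, 10003: 3}
+        # so a reference interval [10001, 10003) maps to the read interval [1, 3).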
+        if reference_interval_size == 0:
+            # Reference interval is between two bases but read may contain
+            # insertion.
+            #
+            # Reference:
+            #   Insertion location:           *
+            #   Reference position: 10000 | 10001 10002 10003 10004 10005 10006 10007
+            #   Base sequence:        A   |   T     G     C     A     A     A     A
+            #
+            # Read with inserted nucleotide:
+            #   Read position:      00000 00001 00002 00003 00004 00005 00006 00007
+            #   Base sequence:        A    *A*    T     G     C     A     A     A
+            #   Reference position: 10000 ----- 10001 10002 10003 10004 10005 10006
+            #
+            # The start/end of the reference interval may be mapped to a read position,
+            # in this case reference:10000 -> read:00000, but it would be incorrect
+            # to take this position as the start/end of the insertion on the read
+            # since it does not cover the inserted bases. Instead, we look at the
+            # read position of the next base in the reference and, if it's more than
+            # 1 base away from the start, use that as the end of the interval. If it's
+            # next to the start of the interval then we return the empty "between bases"
+            # interval of [start, start).
+            #
+            # To deal with insertions at the beginning and end of a read we're
+            # going to allow the start/end to be None.
+            reference_position_before_insertion = base0_start_inclusive - 1
+            reference_position_after_insertion = base0_start_inclusive
+            read_base0_before_insertion = base0_reference_positions_dict.get(
+                reference_position_before_insertion)
+            read_base0_after_insertion = base0_reference_positions_dict.get(
+                reference_position_after_insertion)
+
+            if read_base0_before_insertion is None:
+                logger.warning("Cannot use read '%s' because reference position %d is not mapped" % (
+                    name,
+                    reference_position_before_insertion))
+                return None
+            elif read_base0_after_insertion is None:
+                logger.warning("Cannot use read '%s' because reference position %d is not mapped" % (
+                    name,
+                    reference_position_after_insertion))
+                return None
+            elif read_base0_after_insertion - read_base0_before_insertion == 1:
+                read_base0_start_inclusive = read_base0_end_exclusive = read_base0_before_insertion + 1
+            else:
+                read_base0_start_inclusive = read_base0_before_insertion + 1
+                read_base0_end_exclusive = read_base0_after_insertion
+        else:
+            # Reference bases are selected for match or deletion.
+            #
+            # What happens if the reference bases are interspersed with insertions?
+            # Reference:
+            #   10000 10001 10002 10003 10004 10005 10006 10007
+            #     A     T     G     C     A     A     A     A
+            #
+            # Read:
+            #   00000 00001 00002 00003 00004 00005 00006 00007
+            #     A    *A*    T     G     C     A     A     A
+            #
+            # ...and our reference interval is base-1 inclusive 10000:10001
+            # but the read has an inserted 'A' in between the two bases.
+            #
+            # In this case we need to figure out the first and last positions
+            # which match the inclusive interval and then convert it to a half-open
+            # interval. One possibly more obvious alternative is just to
+            # figure out which read indices correspond to base0_start_inclusive and
+            # base0_end_exclusive but this would fail if base0_end_exclusive is
+            # after the end of the read.
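+            #
+            # Worked example (illustrative): with the read pictured above,
+            # base0_reference_positions_dict maps 10001 -> 2, 10002 -> 3, and so on,
+            # so a single-base reference interval [10002, 10003) becomes the
+            # read interval [3, 4); the inserted base shifts every read index
+            # after 00000 up by one.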
+            read_base0_start_inclusive = base0_reference_positions_dict.get(base0_start_inclusive)
+            if read_base0_start_inclusive is None:
+                # if first base of reference locus isn't mapped, try getting the base
+                # before it and then adding one to its corresponding base index
+                reference_base0_position_before_locus = base0_start_inclusive - 1
+                if reference_base0_position_before_locus in base0_reference_positions_dict:
+                    read_base0_position_before_locus = base0_reference_positions_dict[
+                        reference_base0_position_before_locus]
+                    read_base0_start_inclusive = read_base0_position_before_locus + 1
+                else:
+                    logger.warning(
+                        "Cannot use read '%s' because neither reference position %d nor %d is mapped" % (
+                            name,
+                            base0_start_inclusive,
+                            reference_base0_position_before_locus))
+                    return None
+
+            read_base0_end_exclusive = base0_reference_positions_dict.get(base0_end_exclusive)
+            if read_base0_end_exclusive is None:
+                # if the exclusive last index of the reference interval doesn't have a
+                # corresponding base position, then try getting the base position of the
+                # reference position before it and adding one
+                reference_base0_end_inclusive = base0_end_exclusive - 1
+                if reference_base0_end_inclusive in base0_reference_positions_dict:
+                    read_base0_end_inclusive = base0_reference_positions_dict[
+                        reference_base0_end_inclusive]
+                    read_base0_end_exclusive = read_base0_end_inclusive + 1
+                else:
+                    logger.warning(
+                        "Cannot use read '%s' because neither reference position %d nor %d is mapped" % (
+                            name,
+                            base0_end_exclusive,
+                            reference_base0_end_inclusive))
+                    return None
+
+
+        if isinstance(sequence, bytes):
+            sequence = sequence.decode('ascii')
+
+        if not self.use_soft_clipped_bases:
+            # if we're not allowing soft-clipped bases then
+            # the fraction of the read which is usable may be smaller
+            # than the sequence, qualities, and alignment positions
+            # we've extracted, so slice through those to get rid of
+            # soft-clipped ends of the read
+            aligned_subsequence_start = pysam_aligned_segment.query_alignment_start
+            aligned_subsequence_end = pysam_aligned_segment.query_alignment_end
+            sequence = sequence[aligned_subsequence_start:aligned_subsequence_end]
+            base0_reference_positions = base0_reference_positions[
+                aligned_subsequence_start:aligned_subsequence_end]
+            base_qualities = base_qualities[aligned_subsequence_start:aligned_subsequence_end]
+            if read_base0_start_inclusive is not None:
+                read_base0_start_inclusive -= aligned_subsequence_start
+            if read_base0_end_exclusive is not None:
+                read_base0_end_exclusive -= aligned_subsequence_start
+        return LocusRead(
+            name=name,
+            sequence=sequence,
+            reference_positions=base0_reference_positions,
+            quality_scores=base_qualities,
+            reference_base0_start_inclusive=base0_start_inclusive,
+            reference_base0_end_exclusive=base0_end_exclusive,
+            read_base0_start_inclusive=read_base0_start_inclusive,
+            read_base0_end_exclusive=read_base0_end_exclusive)
+
+    def get_locus_reads(
+            self,
+            alignment_file,
+            chromosome,
+            base0_start_inclusive,
+            base0_end_exclusive):
+        """
+        Create LocusRead objects for reads which overlap the given chromosome,
+        start, and end positions. The actual work to figure out whether what's between
+        those positions matches a variant happens later when LocusRead objects are
+        converted to AlleleRead objects.
+
+        Parameters
+        ----------
+        alignment_file : pysam.AlignmentFile
+
+        chromosome : str
+
+        base0_start_inclusive : int
+            Start of genomic interval, base 0 and inclusive
+
+        base0_end_exclusive : int
+            End of genomic interval, base 0 and exclusive
+
+        Returns a sequence of LocusRead objects
+        """
+        logger.debug(
+            "Gathering reads at locus %s:%d-%d",
+            chromosome,
+            base0_start_inclusive,
+            base0_end_exclusive)
+        reads = []
+        for aligned_segment in alignment_file.fetch(
+                chromosome,
+                base0_start_inclusive,
+                base0_end_exclusive):
+            read = self.locus_read_from_pysam_aligned_segment(
+                aligned_segment,
+                base0_start_inclusive=base0_start_inclusive,
+                base0_end_exclusive=base0_end_exclusive)
+            if read is not None:
+                reads.append(read)
+        logger.info(
+            "Found %d reads overlapping locus %s:%d-%d",
+            len(reads),
+            chromosome,
+            base0_start_inclusive,
+            base0_end_exclusive)
+        return reads
+
+    @staticmethod
+    def _infer_chromosome_name(variant_chromosome_name, valid_chromosome_names):
+        """
+        In case the variant is using an hg19 reference name and the alignment
+        was against b37 (or vice versa) we have to check whether adding or removing
+        the prefix "chr" is necessary.
+
+        Parameters
+        ----------
+        variant_chromosome_name : str
+
+        valid_chromosome_names : set of str
+
+        Returns
+        -------
+        str or None
+        """
+        # I imagine the conversation went like this:
+        # A: "Hey, I have an awesome idea"
+        # B: "What's up?"
+        # A: "Let's make two nearly identical reference genomes"
+        # B: "But...that sounds like it might confuse people."
+        # A: "Nah, it's cool, we'll give the chromosomes different prefixes!"
+        # B: "OK, sounds like a good idea."
+        candidate_names = {variant_chromosome_name}
+        if variant_chromosome_name.startswith("chr"):
+            candidate_names.add(variant_chromosome_name[3:])
+        else:
+            candidate_names.add("chr" + variant_chromosome_name)
+        for candidate in list(candidate_names):
+            candidate_names.add(candidate.lower())
+            candidate_names.add(candidate.upper())
+        for candidate in candidate_names:
+            if candidate in valid_chromosome_names:
+                return candidate
+        return None
+
+    def locus_reads_overlapping_variant(
+            self,
+            alignment_file,
+            variant,
+            chromosome=None):
+        """
+        Find reads in the given SAM/BAM file which overlap the given variant and
+        return them as a list of LocusRead objects.
+
+        Parameters
+        ----------
+        alignment_file : pysam.AlignmentFile
+
+        variant : varcode.Variant
+
+        chromosome : str or None
+
+        Returns sequence of LocusRead objects.
+        """
+        if chromosome is None:
+            # if a chromosome name isn't manually specified then try
+            # to figure out whether adding or removing "chr" is necessary
+            # to match chromosome names used for variant calling and those
+            # found in read alignments
+            chromosome = self._infer_chromosome_name(
+                variant_chromosome_name=variant.contig,
+                valid_chromosome_names=set(alignment_file.references))
+
+        if chromosome is None:
+            # failed to infer a chromosome name for this variant which
+            # matches names used in SAM/BAM file
+            logger.warning(
+                "Chromosome '%s' from variant %s not in alignment file %s",
+                variant.contig,
+                variant,
+                alignment_file.filename)
+            return []
+
+        logger.info(
+            "Gathering variant reads for variant %s (with gene names %s)",
+            variant,
+            variant.gene_names)
+
+        base1_position, ref, alt = trim_variant(variant)
+
+        if len(ref) == 0:
+            # If there is no reference sequence in the variant
+            # then it's an insertion and the base 0 coordinates
+            # will select the space between two bases.
+            #
+            # For example, an insertion between base-1 positions chr1:3 and chr1:4
+            #
+            # Base 1 inclusive:
+            #   |  1  |  2  |  3  |  4  |  5  |
+            # Base 0 half-open:
+            #   0     1     2     3     4     5
+            #
+            # The space between chr1:3 and chr1:4 in base-0 coordinates is chr1 3:3
+            #
+            # So, to convert an insertion from base-1 inclusive to base-0 half-open we
+            # keep the original position
+            base0_start_inclusive = base1_position
+            base0_end_exclusive = base1_position
+        else:
+            # if variant is SNV or deletion then some number of reference bases
+            # are selected, so just get the interval for those.
+            #
+            # For example, if two bases at positions chr1:1000 and 1001 are deleted
+            # then the base0 indices will be 999:1001
+            base0_start_inclusive = base1_position - 1
+            base0_end_exclusive = base0_start_inclusive + len(ref)
+
+        return self.get_locus_reads(
+            alignment_file=alignment_file,
+            chromosome=chromosome,
+            base0_start_inclusive=base0_start_inclusive,
+            base0_end_exclusive=base0_end_exclusive)
+
+    def allele_reads_overlapping_variant(
+            self,
+            variant,
+            alignment_file):
+        """
+        Find reads in the given SAM/BAM file which overlap the given variant and
+        return them as a list of AlleleRead objects.
+
+        Parameters
+        ----------
+        variant : varcode.Variant
+
+        alignment_file : pysam.AlignmentFile
+            Aligned RNA reads
+
+        Returns sequence of AlleleRead objects.
+        """
+        return allele_reads_from_locus_reads(
+            self.locus_reads_overlapping_variant(
+                alignment_file=alignment_file,
+                variant=variant))
+
+    def read_evidence_for_variant(
+            self,
+            variant,
+            alignment_file):
+        """
+        Find reads in the given SAM/BAM file which overlap the given variant and
+        return them as a ReadEvidence object, which splits the reads into
+        ref/alt/other groups.
+
+        Parameters
+        ----------
+        variant : varcode.Variant
+
+        alignment_file : pysam.AlignmentFile
+            Aligned RNA reads
+
+        Returns ReadEvidence
+        """
+        allele_reads = self.allele_reads_overlapping_variant(
+            variant=variant,
+            alignment_file=alignment_file)
+        return ReadEvidence.from_variant_and_allele_reads(
+            variant,
+            allele_reads)
+
+    def allele_reads_supporting_variant(self, variant, alignment_file):
+        """
+        Gather AlleleRead objects which contain the same allele as the variant.
+
+        Parameters
+        ----------
+        variant : varcode.Variant
+
+        alignment_file : pysam.AlignmentFile
+            Aligned RNA reads
+
+        Returns list of AlleleRead
+        """
+        read_evidence = self.read_evidence_for_variant(
+            variant=variant,
+            alignment_file=alignment_file)
+        return read_evidence.alt_reads
+
+    def read_evidence_generator(self, variants, alignment_file):
+        """
+        Consumes a generator of varcode.Variant objects, collects read evidence
+        for each variant from the alignment_file, and generates a sequence
+        of (Variant, ReadEvidence) pairs.
+
+        Parameters
+        ----------
+        variants : varcode.VariantCollection
+            Variants to collect read evidence for
+
+        alignment_file : pysam.AlignmentFile
+            Aligned RNA reads
+
+        Generates sequence of (varcode.Variant, ReadEvidence) pairs
+        """
+        for variant in variants:
+            read_evidence = self.read_evidence_for_variant(
+                variant=variant,
+                alignment_file=alignment_file)
+            yield variant, read_evidence
diff --git a/isovar/read_evidence.py b/isovar/read_evidence.py
new file mode 100644
index 0000000..8396fbc
--- /dev/null
+++ b/isovar/read_evidence.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2018-2019. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+Collect AlleleReads overlapping each variant grouped by whether
+they support the reference, somatic allele, or some other allele.
+"""
+
+from __future__ import print_function, division, absolute_import
+
+from .allele_read_helpers import split_reads_into_ref_alt_other
+from .variant_helpers import trim_variant
+from .value_object import ValueObject
+
+
+class ReadEvidence(ValueObject):
+    """
+    This class represents the reads at a variant locus partitioned
+    by allele (ref/alt/other) relative to a variant.
+    """
+
+    __slots__ = [
+        "trimmed_base1_start",
+        "trimmed_ref",
+        "trimmed_alt",
+        "ref_reads",
+        "alt_reads",
+        "other_reads"
+    ]
+
+    @classmethod
+    def from_variant_and_allele_reads(
+            cls,
+            variant,
+            allele_reads):
+        """
+        Create a ReadEvidence object from a variant and the set of reads overlapping
+        the location of that variant.
+
+        Parameters
+        ----------
+        variant : varcode.Variant
+
+        allele_reads : list of AlleleRead
+
+        Returns ReadEvidence
+
+        """
+        trimmed_base1_start, trimmed_ref, trimmed_alt = \
+            trim_variant(variant)
+        ref_reads, alt_reads, other_reads = split_reads_into_ref_alt_other(
+            ref=trimmed_ref,
+            alt=trimmed_alt,
+            overlapping_reads=allele_reads)
+        return cls(
+            trimmed_base1_start=trimmed_base1_start,
+            trimmed_ref=trimmed_ref,
+            trimmed_alt=trimmed_alt,
+            ref_reads=ref_reads,
+            alt_reads=alt_reads,
+            other_reads=other_reads)
+
+    def __init__(
+            self,
+            trimmed_base1_start,
+            trimmed_ref,
+            trimmed_alt,
+            ref_reads,
+            alt_reads,
+            other_reads):
+        """
+        Parameters
+        ----------
+        trimmed_base1_start : int
+            Base-1 inclusive genomic position indicating start of variant,
+            trimmed to avoid any shared prefix between the ref and alt sequences
+
+        trimmed_ref : str
+            Reference allele, trimmed to avoid any shared prefix with the alt
+            sequence.
+
+        trimmed_alt : str
+            Alternate allele, trimmed to avoid any shared prefix with the ref
+            sequence.
+
+        ref_reads : list of AlleleRead
+            Reads supporting the reference allele
+
+        alt_reads : list of AlleleRead
+            Reads supporting the alt allele
+
+        other_reads : list of AlleleRead
+            Reads supporting some allele other than ref or alt.
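+
+        For example (illustrative), for a SNV chr1:100 A>G, trimmed_ref would
+        be "A", trimmed_alt would be "G", and each overlapping read would fall
+        into ref_reads, alt_reads, or other_reads depending on the nucleotide
+        it shows at that position.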
+ """ + self.trimmed_base1_start = trimmed_base1_start + self.trimmed_ref = trimmed_ref + self.trimmed_alt = trimmed_alt + self.ref_reads = ref_reads + self.alt_reads = alt_reads + self.other_reads = other_reads + + @property + def ref_read_names(self): + """ + Names of reads which match the ref allele + + Returns set of str + """ + return {r.name for r in self.ref_reads} + + @property + def alt_read_names(self): + """ + Names of reads which match the alt allele + + Returns set of str + """ + return {r.name for r in self.alt_reads} + + @property + def other_read_names(self): + """ + Names of reads which match non-ref/non-alt alleles + + Returns set of str + """ + return {r.name for r in self.other_reads} diff --git a/isovar/reference_coding_sequence_key.py b/isovar/reference_coding_sequence_key.py index 8dcae3c..3225770 100644 --- a/isovar/reference_coding_sequence_key.py +++ b/isovar/reference_coding_sequence_key.py @@ -14,9 +14,9 @@ from __future__ import print_function, division, absolute_import -from .variant_helpers import interbase_range_affected_by_variant_on_transcript -from .reference_sequence_key import ReferenceSequenceKey from .logging import get_logger +from .reference_sequence_key import ReferenceSequenceKey +from .variant_helpers import interbase_range_affected_by_variant_on_transcript logger = get_logger(__name__) diff --git a/isovar/reference_context.py b/isovar/reference_context.py index 4286093..5d11b21 100644 --- a/isovar/reference_context.py +++ b/isovar/reference_context.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,8 +15,7 @@ from __future__ import print_function, division, absolute_import from collections import OrderedDict, defaultdict -from .effect_prediction import reference_transcripts_for_variant -from .dataframe_builder import DataFrameBuilder + from .reference_coding_sequence_key import ReferenceCodingSequenceKey from .logging import get_logger @@ -28,13 +27,18 @@ # ReferenceContext # ---------------- # -# Includes all the fields of SequenceKeyWithReadingFrame in addition to which +# Includes all the fields of ReferenceCodingSequenceKey in addition to which # variant we're examining and all transcripts overlapping that variant # which produced this particular sequence context and reading frame. # ########################## class ReferenceContext(ReferenceCodingSequenceKey): + """ + Representation of the sequence preceding a variant and its associated + reading frame. + """ + # additional fields on top of slots for ReferenceCodingSequenceKey __slots__ = ["variant", "transcripts"] @@ -67,6 +71,18 @@ def __init__( @classmethod def from_reference_coding_sequence_key(cls, key, variant, transcripts): + """ + Construct a ReferenceContext object from a ReferenceSequenceKey, variant, + and a set of transcript. 
+ + Parameters + ---------- + key : ReferenceSequenceKey + variant : varcode.Variant + transcripts : list of pyensembl.Transcript + + Returns ReferenceContext + """ return ReferenceContext( strand=key.strand, sequence_before_variant_locus=key.sequence_before_variant_locus, @@ -95,111 +111,3 @@ def mitochondrial(self): """ return self.variant.contig.lower() in {"chrm", "m", "chrmt", "mt"} -def reference_contexts_for_variant( - variant, - context_size, - transcript_id_whitelist=None): - """ - variant : varcode.Variant - - context_size : int - Max of nucleotides to include to the left and right of the variant - in the context sequence. - - transcript_id_whitelist : set, optional - If given, then only consider transcripts whose IDs are in this set. - - Returns list of ReferenceContext objects, sorted by maximum length of - coding sequence of any supporting transcripts. - """ - overlapping_transcripts = reference_transcripts_for_variant( - variant=variant, - transcript_id_whitelist=transcript_id_whitelist) - - # dictionary mapping SequenceKeyWithReadingFrame keys to list of - # transcript objects - sequence_groups = defaultdict(list) - - for transcript in overlapping_transcripts: - sequence_key_with_reading_frame = \ - ReferenceCodingSequenceKey.from_variant_and_transcript( - variant=variant, - transcript=transcript, - context_size=context_size) - if sequence_key_with_reading_frame is not None: - sequence_groups[sequence_key_with_reading_frame].append(transcript) - - reference_contexts = [ - ReferenceContext.from_reference_coding_sequence_key( - key, variant, matching_transcripts) - for (key, matching_transcripts) in sequence_groups.items() - ] - reference_contexts.sort( - key=ReferenceContext.sort_key_decreasing_max_length_transcript_cds) - return reference_contexts - -def reference_contexts_for_variants( - variants, - context_size, - transcript_id_whitelist=None): - """ - Extract a set of reference contexts for each variant in the collection. - - Parameters - ---------- - variants : varcode.VariantCollection - - context_size : int - Max of nucleotides to include to the left and right of the variant - in the context sequence. - - transcript_id_whitelist : set, optional - If given, then only consider transcripts whose IDs are in this set. - - Returns a dictionary from variants to lists of ReferenceContext objects, - sorted by max coding sequence length of any transcript. - """ - result = OrderedDict() - for variant in variants: - result[variant] = reference_contexts_for_variant( - variant=variant, - context_size=context_size, - transcript_id_whitelist=transcript_id_whitelist) - return result - -def variants_to_reference_contexts_dataframe( - variants, - context_size, - transcript_id_whitelist=None): - """ - Given a collection of variants, find all reference sequence contexts - around each variant. - - Parameters - ---------- - variants : varcode.VariantCollection - - context_size : int - Max of nucleotides to include to the left and right of the variant - in the context sequence. - - transcript_id_whitelist : set, optional - If given, then only consider transcripts whose IDs are in this set. - - Returns a DataFrame with {"chr", "pos", "ref", "alt"} columns for variants, - as well as all the fields of ReferenceContext. 
- """ - - df_builder = DataFrameBuilder( - ReferenceContext, - exclude=["variant"], - converters=dict(transcripts=lambda ts: ";".join(t.name for t in ts)), - extra_column_fns={ - "gene": lambda variant, _: ";".join(variant.gene_names), - }) - for variant, reference_contexts in reference_contexts_for_variants( - variants=variants, - context_size=context_size, - transcript_id_whitelist=transcript_id_whitelist).items(): - df_builder.add_many(variant, reference_contexts) - return df_builder.to_dataframe() diff --git a/isovar/reference_context_helpers.py b/isovar/reference_context_helpers.py new file mode 100644 index 0000000..733b4a8 --- /dev/null +++ b/isovar/reference_context_helpers.py @@ -0,0 +1,96 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import + +from collections import OrderedDict, defaultdict + + +from .effect_prediction import reference_coding_transcripts_for_variant +from .reference_context import ReferenceContext +from .reference_coding_sequence_key import ReferenceCodingSequenceKey + + +def reference_contexts_for_variant( + variant, + context_size, + transcript_id_whitelist=None): + """ + variant : varcode.Variant + + context_size : int + Max of nucleotides to include to the left and right of the variant + in the context sequence. + + transcript_id_whitelist : set, optional + If given, then only consider transcripts whose IDs are in this set. + + Returns list of ReferenceContext objects, sorted by maximum length of + coding sequence of any supporting transcripts. + """ + overlapping_transcripts = reference_coding_transcripts_for_variant( + variant=variant, + transcript_id_whitelist=transcript_id_whitelist) + + # dictionary mapping SequenceKeyWithReadingFrame keys to list of + # transcript objects + sequence_groups = defaultdict(list) + + for transcript in overlapping_transcripts: + reference_coding_sequence_key = \ + ReferenceCodingSequenceKey.from_variant_and_transcript( + variant=variant, + transcript=transcript, + context_size=context_size) + if reference_coding_sequence_key is not None: + sequence_groups[reference_coding_sequence_key].append(transcript) + + reference_contexts = [ + ReferenceContext.from_reference_coding_sequence_key( + key, variant, matching_transcripts) + for (key, matching_transcripts) in sequence_groups.items() + ] + reference_contexts.sort( + key=ReferenceContext.sort_key_decreasing_max_length_transcript_cds) + return reference_contexts + + +def reference_contexts_generator( + variants, + context_size, + transcript_id_whitelist=None): + """ + Extract a set of reference contexts for each variant in the collection. + + Parameters + ---------- + variants : varcode.VariantCollection + + context_size : int + Max of nucleotides to include to the left and right of the variant + in the context sequence. + + transcript_id_whitelist : set, optional + If given, then only consider transcripts whose IDs are in this set. 
+ + Generate a series of (Variant, [ReferenceContext]) pairs, where the + to list of ReferenceContext objects for each variant is sorted by + max coding sequence length of any transcript. + """ + for variant in variants: + reference_contexts = reference_contexts_for_variant( + variant=variant, + context_size=context_size, + transcript_id_whitelist=transcript_id_whitelist) + yield variant, reference_contexts \ No newline at end of file diff --git a/isovar/reference_sequence_key.py b/isovar/reference_sequence_key.py index ca9047f..f15d7a4 100644 --- a/isovar/reference_sequence_key.py +++ b/isovar/reference_sequence_key.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,10 +14,12 @@ from __future__ import print_function, division, absolute_import -from .value_object import ValueObject -from .dna import reverse_complement_dna -from .variant_helpers import interbase_range_affected_by_variant_on_transcript from .logging import get_logger +from .value_object import ValueObject +from .variant_helpers import ( + interbase_range_affected_by_variant_on_transcript, + variant_matches_reference_sequence +) logger = get_logger(__name__) @@ -126,13 +128,3 @@ def from_variant_and_transcript( sequence_before_variant_locus=reference_cdna_before_variant, sequence_at_variant_locus=reference_cdna_at_variant, sequence_after_variant_locus=reference_cdna_after_variant) - - -def variant_matches_reference_sequence(variant, ref_seq_on_transcript, strand): - """ - Make sure that reference nucleotides we expect to see on the reference - transcript from a variant are the same ones we encounter. - """ - if strand == "-": - ref_seq_on_transcript = reverse_complement_dna(ref_seq_on_transcript) - return ref_seq_on_transcript == variant.ref diff --git a/isovar/string_helpers.py b/isovar/string_helpers.py index d491f19..e20039d 100644 --- a/isovar/string_helpers.py +++ b/isovar/string_helpers.py @@ -24,6 +24,14 @@ def trim_N_nucleotides(prefix, suffix): """ Drop all occurrences of 'N' from prefix and suffix nucleotide strings by trimming. + + Parameters + ---------- + prefix : str + + suffix : str + + Returns str """ if 'N' in prefix: # trim prefix to exclude all occurrences of N @@ -43,16 +51,13 @@ def trim_N_nucleotides(prefix, suffix): return prefix, suffix -def convert_from_bytes_if_necessary(prefix, suffix): + +def convert_from_bytes_if_necessary(seq): """ Depending on how we extract data from pysam we may end up with either a string or a byte array of nucleotides. For consistency and simplicity, we want to only use strings in the rest of our code. """ - if isinstance(prefix, bytes): - prefix = prefix.decode('ascii') - - if isinstance(suffix, bytes): - suffix = suffix.decode('ascii') - - return prefix, suffix + if isinstance(seq, bytes): + seq = seq.decode('ascii') + return seq diff --git a/isovar/translation.py b/isovar/translation.py index 816d15a..db7c0b9 100644 --- a/isovar/translation.py +++ b/isovar/translation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,62 +20,23 @@ from __future__ import print_function, division, absolute_import -import math - -from .reference_context import reference_contexts_for_variant -from .variant_sequences import reads_to_variant_sequences -from .genetic_code import translate_cdna -from .variant_sequence_in_reading_frame import ( - match_variant_sequence_to_reference_context, -) -from .default_parameters import ( - MIN_TRANSCRIPT_PREFIX_LENGTH, - MAX_REFERENCE_TRANSCRIPT_MISMATCHES, - INCLUDE_MISMATCHES_AFTER_VARIANT, - PROTEIN_SEQUENCE_LENGTH, - MIN_ALT_RNA_READS, - MIN_VARIANT_SEQUENCE_COVERAGE, - VARIANT_SEQUENCE_ASSEMBLY, -) -from .dataframe_builder import dataframe_from_generator -from .value_object import ValueObject from .logging import get_logger - -logger = get_logger(__name__) +from .translation_key import TranslationKey -class TranslationKey(ValueObject): - """ - TranslationKey contains fields related to a translated protein sequence - which should be used to combine multiple equivalent mutated amino acid - sequences. - """ - __slots__ = [ - # translated sequence of a variant sequence in the ORF established - # by a reference context - "amino_acids", - # half-open interval coordinates for variant amino acids - # in the translated sequence - "variant_aa_interval_start", - "variant_aa_interval_end", - # did the amino acid sequence end due to a stop codon or did we - # just run out of sequence context around the variant? - "ends_with_stop_codon", - # was the variant a frameshift relative to the reference sequence? - "frameshift" - ] +logger = get_logger(__name__) class Translation(TranslationKey): """ - Translated amino acid sequence of a VariantSequenceInReadingFrame for a - particular ReferenceContext and VariantSequence. + Translated amino acid sequence of a VariantORF for a particular + ReferenceContext and VariantSequence. """ __slots__ = [ "untrimmed_variant_sequence", "reference_context", - "variant_sequence_in_reading_frame" + "variant_orf" ] def __init__( @@ -87,20 +48,20 @@ def __init__( frameshift, untrimmed_variant_sequence, reference_context, - variant_sequence_in_reading_frame): - # TODO: get rid of untrimmed_variant_sequence by making - # VariantSequenceInReadingFrame keep track of its inputs + variant_orf): + # TODO: + # get rid of untrimmed_variant_sequence by making + # VariantORF keep track of its inputs self.amino_acids = amino_acids self.variant_aa_interval_start = variant_aa_interval_start self.variant_aa_interval_end = variant_aa_interval_end self.ends_with_stop_codon = ends_with_stop_codon self.frameshift = frameshift # this variant sequence might differ from the one - # in variant_sequence_in_reading_frame due to trimming - # required to match the reference + # in variant_orf due to trimming required to match the reference self.untrimmed_variant_sequence = untrimmed_variant_sequence self.reference_context = reference_context - self.variant_sequence_in_reading_frame = variant_sequence_in_reading_frame + self.variant_orf = variant_orf @property def reads(self): @@ -112,40 +73,67 @@ def reads(self): @property def reference_cdna_sequence_before_variant(self): + """ + + Returns str + """ return ( - self. - variant_sequence_in_reading_frame. - reference_cdna_sequence_before_variant) + self.variant_orf.reference_cdna_sequence_before_variant) + @property - def number_mismatches(self): - """Only counting number of mismatches before the variant locus. 
+ def num_mismatches_before_variant(self): """ - return self.number_mismatches_before_variant + Number of nucleotides in the variant cDNA sequence which + don't match the ReferenceContext transcript sequence at + positions before the variant locus. - @property - def number_mismatches_before_variant(self): - return self.variant_sequence_in_reading_frame.number_mismatches_before_variant + Returns int + """ + return self.variant_orf.num_mismatches_before_variant @property - def number_mismatches_after_variant(self): - return self.variant_sequence_in_reading_frame.number_mismatches_after_variant + def num_mismatches_after_variant(self): + """ + Number of nucleotides in the variant cDNA sequence which + don't match the ReferenceContext transcript sequence at + positions after the variant locus. + + Returns int + """ + return self.variant_orf.num_mismatches_after_variant @property def cdna_sequence(self): - return self.variant_sequence_in_reading_frame.cdna_sequence + """ + cDNA sequence assembled from variant supporting reads + + Returns str + """ + return self.variant_orf.cdna_sequence @property def offset_to_first_complete_codon(self): - return self.variant_sequence_in_reading_frame.offset_to_first_complete_codon + """ + Offset to first complete codon in the cDNA sequence + + Returns int in {0, 1, 2} + """ + return self.variant_orf.offset_to_first_complete_codon @property def variant_cdna_interval_start(self): - return self.variant_sequence_in_reading_frame.variant_cdna_interval_start + """ + Interbase start coordinate of variant interval in the cDNA sequence + """ + return self.variant_orf.variant_cdna_interval_start @property def variant_cdna_interval_end(self): - return self.variant_sequence_in_reading_frame.variant_cdna_interval_end + """ + Interbase end coordinate of variant interval in the cDNA sequence + """ + return self.variant_orf.variant_cdna_interval_end def as_translation_key(self): """ @@ -157,446 +145,4 @@ def as_translation_key(self): name: getattr(self, name) for name in TranslationKey._fields}) - @classmethod - def from_variant_sequence_and_reference_context( - cls, - variant_sequence, - reference_context, - min_transcript_prefix_length, - max_transcript_mismatches, - include_mismatches_after_variant, - protein_sequence_length=None): - """ - Attempt to translate a single VariantSequence using the reading frame - from a single ReferenceContext. - - Parameters - ---------- - variant_sequence : VariantSequence - - reference_context : ReferenceContext - - min_transcript_prefix_length : int - Minimum number of nucleotides before the variant to test whether - our variant sequence can use the reading frame from a reference - transcript. - - max_transcript_mismatches : int - Don't use the reading frame from a context where the cDNA variant - sequences disagrees at more than this number of positions before the - variant nucleotides. - - include_mismatches_after_variant : bool - If true, mismatches after the variant nucleotides will also count - against max_transcript_mismatches filtering. - - protein_sequence_length : int, optional - Truncate protein to be at most this long - - Returns either a ProteinSequence object or None if the number of - mismatches between the RNA and reference transcript sequences exceeds - given threshold. 
- """ - variant_sequence_in_reading_frame = match_variant_sequence_to_reference_context( - variant_sequence, - reference_context, - min_transcript_prefix_length=min_transcript_prefix_length, - max_transcript_mismatches=max_transcript_mismatches, - include_mismatches_after_variant=include_mismatches_after_variant) - - if variant_sequence_in_reading_frame is None: - logger.info("Unable to determine reading frame for %s", variant_sequence) - return None - - cdna_sequence = variant_sequence_in_reading_frame.cdna_sequence - cdna_codon_offset = variant_sequence_in_reading_frame.offset_to_first_complete_codon - - # get the offsets into the cDNA sequence which pick out the variant nucleotides - cdna_variant_start_offset = variant_sequence_in_reading_frame.variant_cdna_interval_start - cdna_variant_end_offset = variant_sequence_in_reading_frame.variant_cdna_interval_end - - # TODO: determine if the first codon is the start codon of a - # transcript, for now any of the unusual start codons like CTG - # will translate to leucine instead of methionine. - variant_amino_acids, ends_with_stop_codon = translate_cdna( - cdna_sequence[cdna_codon_offset:], - first_codon_is_start=False, - mitochondrial=reference_context.mitochondrial) - - variant_aa_interval_start, variant_aa_interval_end, frameshift = \ - find_mutant_amino_acid_interval( - cdna_sequence=cdna_sequence, - cdna_first_codon_offset=cdna_codon_offset, - cdna_variant_start_offset=cdna_variant_start_offset, - cdna_variant_end_offset=cdna_variant_end_offset, - n_ref=len(reference_context.sequence_at_variant_locus), - n_amino_acids=len(variant_amino_acids)) - - if protein_sequence_length and len(variant_amino_acids) > protein_sequence_length: - if protein_sequence_length <= variant_aa_interval_start: - logger.warn( - ("Truncating amino acid sequence %s " - "to only %d elements loses all variant residues"), - variant_amino_acids, - protein_sequence_length) - return None - # if the protein is too long then shorten it, which implies - # we're no longer stopping due to a stop codon and that the variant - # amino acids might need a new stop index - variant_amino_acids = variant_amino_acids[:protein_sequence_length] - variant_aa_interval_end = min(variant_aa_interval_end, protein_sequence_length) - ends_with_stop_codon = False - - return Translation( - amino_acids=variant_amino_acids, - frameshift=frameshift, - ends_with_stop_codon=ends_with_stop_codon, - variant_aa_interval_start=variant_aa_interval_start, - variant_aa_interval_end=variant_aa_interval_end, - untrimmed_variant_sequence=variant_sequence, - reference_context=reference_context, - variant_sequence_in_reading_frame=variant_sequence_in_reading_frame) - - -def find_mutant_amino_acid_interval( - cdna_sequence, - cdna_first_codon_offset, - cdna_variant_start_offset, - cdna_variant_end_offset, - n_ref, - n_amino_acids): - """ - Parameters - ---------- - cdna_sequence : skbio.DNA or str - cDNA sequence found in RNAseq data - - cdna_first_codon_offset : int - Offset into cDNA sequence to first complete codon, lets us skip - past UTR region and incomplete codons. - - cdna_variant_start_offset : int - Interbase start offset into cDNA sequence for selecting mutant - nucleotides. - - cdna_variant_end_offset : int - Interbase end offset into cDNA sequence for selecting mutant - nucleotides. 
- - n_ref : int - Number of reference nucleotides - - n_amino_acids : int - Number of translated amino acids - - Returns tuple with three fields: - 1) Start offset for interval of mutant amino acids in translated sequence - 2) End offset for interval of mutant amino acids in translated sequence - 3) Boolean flag indicating whether the variant was a frameshift. - """ - cdna_alt_nucleotides = cdna_sequence[ - cdna_variant_start_offset:cdna_variant_end_offset] - - n_alt = len(cdna_alt_nucleotides) - # sequence of nucleotides before the variant starting from the first codon - cdna_coding_prefix = cdna_sequence[cdna_first_codon_offset:cdna_variant_start_offset] - - # rounding down since a change in the middle of a codon should count - # toward the variant codons - n_coding_nucleotides_before_variant = len(cdna_coding_prefix) - - n_complete_prefix_codons = n_coding_nucleotides_before_variant // 3 - - frame_of_variant_nucleotides = n_coding_nucleotides_before_variant % 3 - frameshift = abs(n_ref - n_alt) % 3 != 0 - indel = n_ref != n_alt - - variant_aa_interval_start = n_complete_prefix_codons - - if frameshift: - # if mutation is a frame shift then every amino acid from the - # first affected codon to the stop is considered mutant - # - # TODO: what if the first k amino acids are synonymous with the - # reference sequence? - variant_aa_interval_end = n_amino_acids - else: - n_alt_codons = int(math.ceil(n_alt / 3.0)) - if indel: - # We need to adjust the number of affected codons by whether the - # variant is aligned with codon boundaries, since in-frame indels - # may still be split across multiple codons. - # - # Example of in-frame deletion of 3 nucleotides which leaves - # 0 variant codons in the sequence (interval = 1:1) - # ref = CCC|AAA|GGG|TTT - # alt = CCC|GGG|TTT - # - # Example of in-frame deletion of 3 nucleotides which leaves - # 1 variant codon in the sequence (interval = 1:2) - # ref = CCC|AAA|GGG|TTT - # alt = CCC|AGG|TTT - # - # Example of in-frame insertion of 3 nucleotides which - # yields two variant codons: - # ref = CCC|AAA|GGG|TTT - # alt = CTT|TCC|AAA|GGG|TTT - extra_affected_codon = int(frame_of_variant_nucleotides != 0) - variant_aa_interval_end = ( - variant_aa_interval_start + n_alt_codons + extra_affected_codon) - else: - # if the variant is a simple substitution then it only affects - # as many codons as are in the alternate sequence - variant_aa_interval_end = variant_aa_interval_start + n_alt_codons - return variant_aa_interval_start, variant_aa_interval_end, frameshift - - -def translation_generator( - variant_sequences, - reference_contexts, - min_transcript_prefix_length, - max_transcript_mismatches, - include_mismatches_after_variant, - protein_sequence_length=None): - """ - Given all detected VariantSequence objects for a particular variant - and all the ReferenceContext objects for that locus, translate - multiple protein sequences, up to the number specified by the argument - max_protein_sequences_per_variant. - - Parameters - ---------- - variant_sequences : list of VariantSequence objects - Variant sequences overlapping a single original variant - - reference_contexts : list of ReferenceContext objects - Reference sequence contexts from the same variant as the variant_sequences - - min_transcript_prefix_length : int - Minimum number of nucleotides before the variant to test whether - our variant sequence can use the reading frame from a reference - transcript. 
- - max_transcript_mismatches : int - Maximum number of mismatches between coding sequence before variant - and reference transcript we're considering for determing the reading - frame. - - include_mismatches_after_variant : bool - If true, mismatches occurring after the variant locus will also count - toward max_transcript_mismatches filtering. - - protein_sequence_length : int, optional - Truncate protein to be at most this long. - - Yields a sequence of Translation objects. - """ - for reference_context in reference_contexts: - for variant_sequence in variant_sequences: - translation = Translation.from_variant_sequence_and_reference_context( - variant_sequence=variant_sequence, - reference_context=reference_context, - min_transcript_prefix_length=min_transcript_prefix_length, - max_transcript_mismatches=max_transcript_mismatches, - include_mismatches_after_variant=include_mismatches_after_variant, - protein_sequence_length=protein_sequence_length) - if translation is not None: - yield translation - - -def translate_variant_reads( - variant, - variant_reads, - protein_sequence_length, - transcript_id_whitelist=None, - min_alt_rna_reads=MIN_ALT_RNA_READS, - min_variant_sequence_coverage=MIN_VARIANT_SEQUENCE_COVERAGE, - min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, - max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, - include_mismatches_after_variant=INCLUDE_MISMATCHES_AFTER_VARIANT, - variant_sequence_assembly=VARIANT_SEQUENCE_ASSEMBLY): - """ - Given a variant and its associated alt reads, construct variant sequences - and translate them into Translation objects. - - Returns 0 or more Translation objects. - - Parameters - ---------- - variant : varcode.Variant - - variant_reads : sequence or generator - AlleleRead objects supporting the variant - - protein_sequence_length : int - Try to translate protein sequences of this length, though sometimes - we'll have to return something shorter (depending on the RNAseq data, - and presence of stop codons). - - transcript_id_whitelist : set, optional - If given, expected to be a set of transcript IDs which we should use - for determining the reading frame around a variant. If omitted, then - try to use all overlapping reference transcripts. - - min_alt_rna_reads : int - Drop variant sequences from loci with fewer than this number of - RNA reads supporting the alt allele. - - min_variant_sequence_coverage : int - Trim variant sequences to nucleotides covered by at least this many - reads. - - min_transcript_prefix_length : int - Minimum number of bases we need to try matching between the reference - context and variant sequence. - - max_transcript_mismatches : int - Don't try to determine the reading frame for a transcript if more - than this number of bases differ. - - include_mismatches_after_variant : bool - Include mismatches after the variant locus in the count compared - against max_transcript_mismatches. - - variant_sequence_assembly : bool - Use overlap assembly to construct longer variant cDNA sequences. 
- """ - if len(variant_reads) == 0: - logger.info("No supporting reads for variant %s", variant) - return [] - - # Adding an extra codon to the desired RNA sequence length in case we - # need to clip nucleotides at the start/end of the sequence - cdna_sequence_length = (protein_sequence_length + 1) * 3 - - variant_sequences = reads_to_variant_sequences( - variant=variant, - reads=variant_reads, - preferred_sequence_length=cdna_sequence_length, - min_alt_rna_reads=min_alt_rna_reads, - min_variant_sequence_coverage=min_variant_sequence_coverage, - variant_sequence_assembly=variant_sequence_assembly) - - if not variant_sequences: - logger.info("No spanning cDNA sequences for variant %s", variant) - return [] - - # try translating the variant sequences from the same set of - # ReferenceContext objects, which requires using the longest - # context_size to be compatible with all of the sequences. Some - # sequences maybe have fewer nucleotides than this before the variant - # and will thus have to be trimmed. - context_size = max( - len(variant_sequence.prefix) - for variant_sequence in variant_sequences) - - reference_contexts = reference_contexts_for_variant( - variant, - context_size=context_size, - transcript_id_whitelist=transcript_id_whitelist) - - return list(translation_generator( - variant_sequences=variant_sequences, - reference_contexts=reference_contexts, - min_transcript_prefix_length=min_transcript_prefix_length, - max_transcript_mismatches=max_transcript_mismatches, - include_mismatches_after_variant=include_mismatches_after_variant, - protein_sequence_length=protein_sequence_length)) - - -def translate_variants( - variants_with_supporting_reads, - transcript_id_whitelist=None, - protein_sequence_length=PROTEIN_SEQUENCE_LENGTH, - min_alt_rna_reads=MIN_ALT_RNA_READS, - min_variant_sequence_coverage=MIN_VARIANT_SEQUENCE_COVERAGE, - min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, - max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, - include_mismatches_after_variant=INCLUDE_MISMATCHES_AFTER_VARIANT, - variant_sequence_assembly=VARIANT_SEQUENCE_ASSEMBLY): - """ - Translates each coding variant in a collection to one or more protein - fragment sequences (if the variant is not filtered and its spanning RNA - sequences can be given a reading frame). - - Parameters - ---------- - variants_with_reads : sequence or generator - Each item of this sequence should be a pair containing a varcode.Variant - and a list of AlleleRead objects supporting that variant. - - transcript_id_whitelist : set, optional - If given, expected to be a set of transcript IDs which we should use - for determining the reading frame around a variant. If omitted, then - try to use all overlapping reference transcripts. - - protein_sequence_length : int - Try to translate protein sequences of this length, though sometimes - we'll have to return something shorter (depending on the RNAseq data, - and presence of stop codons). - - min_alt_rna_reads : int - Drop variant sequences from loci with fewer than this number of - RNA reads supporting the alt allele. - - min_variant_sequence_coverage : int - Trim variant sequences to nucleotides covered by at least this many - reads. - - min_transcript_prefix_length : int - Minimum number of bases we need to try matching between the reference - context and variant sequence. - - max_transcript_mismatches : int - Don't try to determine the reading frame for a transcript if more - than this number of bases differ. 
- - include_mismatches_after_variant : bool - Include mismatches after the variant locus in the count compared - against max_transcript_mismatches. - - variant_sequence_assembly : bool - Use overlap assembly to construct longer variant cDNA sequences. - - Yields pairs of a Variant and a sequence of all its candidate - Translation objects. - """ - for variant, variant_reads in variants_with_supporting_reads: - translations = translate_variant_reads( - variant=variant, - variant_reads=variant_reads, - protein_sequence_length=protein_sequence_length, - transcript_id_whitelist=transcript_id_whitelist, - min_alt_rna_reads=min_alt_rna_reads, - min_variant_sequence_coverage=min_variant_sequence_coverage, - min_transcript_prefix_length=min_transcript_prefix_length, - max_transcript_mismatches=max_transcript_mismatches, - include_mismatches_after_variant=include_mismatches_after_variant, - variant_sequence_assembly=variant_sequence_assembly) - yield variant, translations - - -def translations_generator_to_dataframe(translations_generator): - """ - Given a generator of (Variant, [Translation]) pairs, - returns a DataFrame of translated protein fragments with columns - for each field of a Translation object (and chr/pos/ref/alt per variant). - """ - return dataframe_from_generator( - element_class=Translation, - variant_and_elements_generator=translations_generator, - exclude=[], - converters={ - "untrimmed_variant_sequence": lambda vs: vs.sequence, - "variant_sequence_in_reading_frame": ( - lambda vs: vs.in_frame_cdna_sequence), - "reference_context": ( - lambda rc: ";".join([ - transcript.name for - transcript in rc.transcripts])) - }, - extra_column_fns={ - "untrimmed_variant_sequence_read_count": ( - lambda _, t: len(t.untrimmed_variant_sequence.reads)), - }) diff --git a/isovar/translation_helpers.py b/isovar/translation_helpers.py new file mode 100644 index 0000000..3bb79e6 --- /dev/null +++ b/isovar/translation_helpers.py @@ -0,0 +1,117 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Helper functions used for creating translating a variant's cDNA sequence +into a particular reading frame. +""" + +from __future__ import print_function, division, absolute_import + +import math + +def find_mutant_amino_acid_interval( + cdna_sequence, + cdna_first_codon_offset, + cdna_variant_start_offset, + cdna_variant_end_offset, + n_ref, + n_amino_acids): + """ + Parameters + ---------- + cdna_sequence : skbio.DNA or str + cDNA sequence found in RNAseq data + + cdna_first_codon_offset : int + Offset into cDNA sequence to first complete codon, lets us skip + past UTR region and incomplete codons. + + cdna_variant_start_offset : int + Interbase start offset into cDNA sequence for selecting mutant + nucleotides. + + cdna_variant_end_offset : int + Interbase end offset into cDNA sequence for selecting mutant + nucleotides. 
+ + n_ref : int + Number of reference nucleotides + + n_amino_acids : int + Number of translated amino acids + + Returns + ------- + tuple with three fields: + 1) Start offset for interval of mutant amino acids in translated sequence + 2) End offset for interval of mutant amino acids in translated sequence + 3) Boolean flag indicating whether the variant was a frameshift. + """ + cdna_alt_nucleotides = cdna_sequence[ + cdna_variant_start_offset:cdna_variant_end_offset] + + n_alt = len(cdna_alt_nucleotides) + + # sequence of nucleotides before the variant starting from the first codon + cdna_coding_prefix = cdna_sequence[cdna_first_codon_offset:cdna_variant_start_offset] + + # rounding down since a change in the middle of a codon should count + # toward the variant codons + n_coding_nucleotides_before_variant = len(cdna_coding_prefix) + + n_complete_prefix_codons = n_coding_nucleotides_before_variant // 3 + + frame_of_variant_nucleotides = n_coding_nucleotides_before_variant % 3 + frameshift = abs(n_ref - n_alt) % 3 != 0 + indel = n_ref != n_alt + + variant_aa_interval_start = n_complete_prefix_codons + + if frameshift: + # if mutation is a frame shift then every amino acid from the + # first affected codon to the stop is considered mutant + # + # TODO: what if the first k amino acids are synonymous with the reference sequence? + variant_aa_interval_end = n_amino_acids + else: + n_alt_codons = int(math.ceil(n_alt / 3.0)) + if indel: + # We need to adjust the number of affected codons by whether the + # variant is aligned with codon boundaries, since in-frame indels + # may still be split across multiple codons. + # + # Example of in-frame deletion of 3 nucleotides which leaves + # 0 variant codons in the sequence (interval = 1:1) + # ref = CCC|AAA|GGG|TTT + # alt = CCC|GGG|TTT + # + # Example of in-frame deletion of 3 nucleotides which leaves + # 1 variant codon in the sequence (interval = 1:2) + # ref = CCC|AAA|GGG|TTT + # alt = CCC|AGG|TTT + # + # Example of in-frame insertion of 3 nucleotides which + # yields two variant codons: + # ref = CCC|AAA|GGG|TTT + # alt = CTT|TCC|AAA|GGG|TTT + extra_affected_codon = int(frame_of_variant_nucleotides != 0) + variant_aa_interval_end = ( + variant_aa_interval_start + n_alt_codons + extra_affected_codon) + else: + # if the variant is a simple substitution then it only affects + # as many codons as are in the alternate sequence + variant_aa_interval_end = variant_aa_interval_start + n_alt_codons + return variant_aa_interval_start, variant_aa_interval_end, frameshift diff --git a/isovar/translation_key.py b/isovar/translation_key.py new file mode 100644 index 0000000..f7f6c8d --- /dev/null +++ b/isovar/translation_key.py @@ -0,0 +1,50 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module combines variant cDNA sequences collected from a BAM file with +the reading frames of annotated reference transcripts to create candidate +translations. 
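# A worked example for find_mutant_amino_acid_interval, defined in
# isovar/translation_helpers.py above. This assumes isovar at this revision is
# importable; the sequence and offsets are invented purely for illustration.
from isovar.translation_helpers import find_mutant_amino_acid_interval

# 2 nt of UTR ("CT"), 4 complete codons of coding prefix, then one mutant
# nucleotide at interbase offsets (14, 15)
assert find_mutant_amino_acid_interval(
    cdna_sequence="CTATGGCCGCCGCCA",
    cdna_first_codon_offset=2,
    cdna_variant_start_offset=14,
    cdna_variant_end_offset=15,
    n_ref=1,                              # simple substitution
    n_amino_acids=5) == (4, 5, False)     # only the 5th codon is mutant

# the same alt nucleotide treated as an insertion (n_ref=0) shifts the frame,
# so every amino acid from the first affected codon onward counts as mutant
assert find_mutant_amino_acid_interval(
    cdna_sequence="CTATGGCCGCCGCCA",
    cdna_first_codon_offset=2,
    cdna_variant_start_offset=14,
    cdna_variant_end_offset=15,
    n_ref=0,
    n_amino_acids=10) == (4, 10, True)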
+""" + + +from __future__ import print_function, division, absolute_import + +from .logging import get_logger +from .value_object import ValueObject + + +logger = get_logger(__name__) + + +class TranslationKey(ValueObject): + """ + TranslationKey contains fields related to a translated protein sequence + which should be used to combine multiple equivalent mutated amino acid + sequences. + """ + __slots__ = [ + # translated sequence of a variant sequence in the ORF established + # by a reference context + "amino_acids", + # half-open interval coordinates for variant amino acids + # in the translated sequence + "variant_aa_interval_start", + "variant_aa_interval_end", + # did the amino acid sequence end due to a stop codon or did we + # just run out of sequence context around the variant? + "ends_with_stop_codon", + # was the variant a frameshift relative to the reference sequence? + "frameshift" + ] diff --git a/isovar/value_object.py b/isovar/value_object.py index cca107f..3f11bf5 100644 --- a/isovar/value_object.py +++ b/isovar/value_object.py @@ -38,6 +38,7 @@ def __init__(self, *args, **kwargs): getattr(cls, '__slots__', []) for cls in inherited_class_order)) + @add_metaclass(MetaclassCollectSlots) class ValueObject(object): """ diff --git a/isovar/variant_helpers.py b/isovar/variant_helpers.py index 91794d3..f6ea6a0 100644 --- a/isovar/variant_helpers.py +++ b/isovar/variant_helpers.py @@ -20,6 +20,7 @@ from six.moves import range from .logging import get_logger +from .dna import reverse_complement_dna logger = get_logger(__name__) @@ -124,6 +125,7 @@ def base0_interval_for_variant(variant): ref=ref, alt=alt) + def interbase_range_affected_by_variant_on_transcript(variant, transcript): """ Convert from a variant's position in global genomic coordinates on the @@ -142,13 +144,10 @@ def interbase_range_affected_by_variant_on_transcript(variant, transcript): which indicates which bases in the reference sequence are affected by a variant. - Example: - The insertion of "TTT" into the middle of an exon would result in an - offset pair such as (100,100) since no reference bases are changed - or deleted by an insertion. - - On the other hand, deletion the preceding "CGG" at that same locus could - result in an offset pair such as (97, 100) + For example, the insertion of "TTT" into the middle of an exon would result in an + offset pair such as (100,100) since no reference bases are changed or deleted by + an insertion. On the other hand, deletion the preceding "CGG" at that same locus could + result in an offset pair such as (97, 100) """ if variant.is_insertion: if transcript.strand == "+": @@ -188,3 +187,13 @@ def interbase_range_affected_by_variant_on_transcript(variant, transcript): start_offset = min(offsets) end_offset = max(offsets) + 1 return (start_offset, end_offset) + + +def variant_matches_reference_sequence(variant, ref_seq_on_transcript, strand): + """ + Make sure that reference nucleotides we expect to see on the reference + transcript from a variant are the same ones we encounter. + """ + if strand == "-": + ref_seq_on_transcript = reverse_complement_dna(ref_seq_on_transcript) + return ref_seq_on_transcript == variant.ref diff --git a/isovar/variant_sequence_in_reading_frame.py b/isovar/variant_orf.py similarity index 67% rename from isovar/variant_sequence_in_reading_frame.py rename to isovar/variant_orf.py index 9c16c8f..7872b07 100644 --- a/isovar/variant_sequence_in_reading_frame.py +++ b/isovar/variant_orf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2018. 
Mount Sinai School of Medicine +# Copyright (c) 2016-2019. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,16 +20,16 @@ from __future__ import print_function, division, absolute_import -from six.moves import range, zip +from six.moves import zip from .dna import reverse_complement_dna -from .value_object import ValueObject from .logging import get_logger +from .value_object import ValueObject logger = get_logger(__name__) -class VariantSequenceInReadingFrame(ValueObject): +class VariantORF(ValueObject): """ A variant cDNA sequence (possibly trimmed to get rid of low coverage tails) assigned to a particular strand ('+' or '-') and reading frame @@ -49,8 +49,8 @@ class VariantSequenceInReadingFrame(ValueObject): "variant_cdna_interval_end", "reference_cdna_sequence_before_variant", "reference_cdna_sequence_after_variant", - "number_mismatches_before_variant", - "number_mismatches_after_variant" + "num_mismatches_before_variant", + "num_mismatches_after_variant" ] def __init__( @@ -61,8 +61,8 @@ def __init__( variant_cdna_interval_end, reference_cdna_sequence_before_variant, reference_cdna_sequence_after_variant, - number_mismatches_before_variant, - number_mismatches_after_variant): + num_mismatches_before_variant, + num_mismatches_after_variant): self.cdna_sequence = cdna_sequence self.offset_to_first_complete_codon = offset_to_first_complete_codon self.variant_cdna_interval_start = variant_cdna_interval_start @@ -71,11 +71,14 @@ def __init__( reference_cdna_sequence_before_variant) self.reference_cdna_sequence_after_variant = ( reference_cdna_sequence_after_variant) - self.number_mismatches_before_variant = number_mismatches_before_variant - self.number_mismatches_after_variant = number_mismatches_after_variant + self.num_mismatches_before_variant = num_mismatches_before_variant + self.num_mismatches_after_variant = num_mismatches_after_variant @property def in_frame_cdna_sequence(self): + """ + Subsequence of cDNA from start to codon until last complete codon. + """ from_first_codon = self.cdna_sequence[self.offset_to_first_complete_codon:] # get rid of any trailing out of frame nucleotides in_frame_length = len(from_first_codon) // 3 * 3 @@ -88,13 +91,14 @@ def from_variant_sequence_and_reference_context( Combines a VariantSequence with the reading frame implied by a ReferenceContext, reverse complementing if necessary and finding the offset to the first complete codon in the cDNA sequence. 
+ Parameters ---------- variant_sequence : VariantSequence reference_context : ReferenceContext - Returns a VariantSequenceInReadingFrame object + Returns a VariantORF object """ (cdna_prefix, cdna_alt, cdna_suffix, reference_prefix, reference_suffix, n_trimmed_from_reference) = trim_sequences( @@ -110,8 +114,10 @@ def from_variant_sequence_and_reference_context( reference_suffix, n_trimmed_from_reference) - n_mismatch_before_variant = count_mismatches_before_variant(reference_prefix, cdna_prefix) - n_mismatch_after_variant = count_mismatches_after_variant(reference_suffix, cdna_suffix) + n_mismatch_before_variant = count_mismatches_before_variant( + reference_prefix, cdna_prefix) + n_mismatch_after_variant = count_mismatches_after_variant( + reference_suffix, cdna_suffix) ref_codon_offset = reference_context.offset_to_first_complete_codon @@ -126,15 +132,15 @@ def from_variant_sequence_and_reference_context( variant_interval_start = len(cdna_prefix) variant_interval_end = variant_interval_start + len(cdna_alt) - return VariantSequenceInReadingFrame( + return VariantORF( cdna_sequence=cdna_sequence, offset_to_first_complete_codon=offset_to_first_complete_codon, variant_cdna_interval_start=variant_interval_start, variant_cdna_interval_end=variant_interval_end, reference_cdna_sequence_before_variant=reference_prefix, reference_cdna_sequence_after_variant=reference_suffix, - number_mismatches_before_variant=n_mismatch_before_variant, - number_mismatches_after_variant=n_mismatch_after_variant) + num_mismatches_before_variant=n_mismatch_before_variant, + num_mismatches_after_variant=n_mismatch_after_variant) def trim_sequences(variant_sequence, reference_context): @@ -280,113 +286,3 @@ def compute_offset_to_first_complete_codon( offset_to_first_complete_reference_codon) frame = n_nucleotides_trimmed_after_first_codon % 3 return (3 - frame) % 3 - - -def match_variant_sequence_to_reference_context( - variant_sequence, - reference_context, - min_transcript_prefix_length, - max_transcript_mismatches, - include_mismatches_after_variant=False, - max_trimming_attempts=2): - """ - Iteratively trim low-coverage subsequences of a variant sequence - until it either matches the given reference context or there - are too few nucleotides left in the variant sequence. - - Parameters - ---------- - variant_sequence : VariantSequence - Assembled sequence from RNA reads, will need to be to be reverse - complemented if matching against a reference transcript on the - negative strand. - - reference_context : ReferenceContext - Sequence of reference transcript before the variant and associated - metadata. - - min_transcript_prefix_length : int - Minimum number of nucleotides we try to match against a reference - transcript. - - max_transcript_mismatches : int - Maximum number of nucleotide differences between reference transcript - sequence and the variant sequence. - - include_mismatches_after_variant : bool - Set to true if the number of mismatches after the variant locus should - count toward the total max_transcript_mismatches, which by default - only counts mismatches before the variant locus. - - max_trimming_attempts : int - How many times do we try trimming the VariantSequence to higher - levels of coverage before giving up? 
- - Returns VariantSequenceInReadingFrame or None - """ - variant_sequence_in_reading_frame = None - - # if we can't get the variant sequence to match this reference - # context then keep trimming it by coverage until either - for i in range(max_trimming_attempts + 1): - # check the reverse-complemented prefix if the reference context is - # on the negative strand since variant sequence is aligned to - # genomic DNA (positive strand) - variant_sequence_too_short = ( - (reference_context.strand == "+" and - len(variant_sequence.prefix) < min_transcript_prefix_length) or - (reference_context.strand == "-" and - len(variant_sequence.suffix) < min_transcript_prefix_length) - ) - if variant_sequence_too_short: - logger.info( - "Variant sequence %s shorter than min allowed %d (iter=%d)", - variant_sequence, - min_transcript_prefix_length, - i + 1) - return None - - variant_sequence_in_reading_frame = \ - VariantSequenceInReadingFrame.from_variant_sequence_and_reference_context( - variant_sequence=variant_sequence, - reference_context=reference_context) - - if variant_sequence_in_reading_frame is None: - return None - - n_mismatch_before_variant = ( - variant_sequence_in_reading_frame.number_mismatches_before_variant) - n_mismatch_after_variant = ( - variant_sequence_in_reading_frame.number_mismatches_after_variant) - - logger.info("Iter #%d/%d: %s" % ( - i + 1, - max_trimming_attempts + 1, - variant_sequence_in_reading_frame)) - - total_mismatches = n_mismatch_before_variant - if include_mismatches_after_variant: - total_mismatches += n_mismatch_after_variant - if total_mismatches <= max_transcript_mismatches: - # if we got a variant sequence + reading frame with sufficiently - # few mismatches then call it a day - return variant_sequence_in_reading_frame - - logger.info( - ("Too many mismatches (%d) between variant sequence %s and " - "reference context %s (attempt=%d/%d)"), - n_mismatch_before_variant, - variant_sequence, - reference_context, - i + 1, - max_trimming_attempts + 1) - # if portions of the sequence are supported by only 1 read - # then try trimming to 2 to see if the better supported - # subsequence can be better matched against the reference - current_min_coverage = variant_sequence.min_coverage() - logger.info( - "Trimming to subsequence covered by at least %d reads", - current_min_coverage + 1) - variant_sequence = variant_sequence.trim_by_coverage( - current_min_coverage + 1) - return None diff --git a/isovar/variant_orf_helpers.py b/isovar/variant_orf_helpers.py new file mode 100644 index 0000000..dfde566 --- /dev/null +++ b/isovar/variant_orf_helpers.py @@ -0,0 +1,130 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
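# Standalone arithmetic mirroring two codon-frame helpers from variant_orf.py
# above; these snippets are illustrative sketches, not isovar's API.

def trim_to_whole_codons(seq, offset_to_first_complete_codon):
    # mirrors VariantORF.in_frame_cdna_sequence: start at the first complete
    # codon and drop any trailing partial codon
    from_first_codon = seq[offset_to_first_complete_codon:]
    in_frame_length = len(from_first_codon) // 3 * 3
    return from_first_codon[:in_frame_length]

assert trim_to_whole_codons("CCATGGCCGCTA", 2) == "ATGGCCGCT"

# mirrors the tail of compute_offset_to_first_complete_codon: if k nucleotides
# were trimmed after the first complete reference codon, the next complete
# codon starts (3 - k % 3) % 3 bases into the trimmed sequence
assert [(3 - k % 3) % 3 for k in range(6)] == [0, 2, 1, 0, 2, 1]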
+ +from __future__ import print_function, division, absolute_import + +from .logging import get_logger +from .variant_orf import VariantORF + +logger = get_logger(__name__) + + +def match_variant_sequence_to_reference_context( + variant_sequence, + reference_context, + min_transcript_prefix_length, + max_transcript_mismatches, + count_mismatches_after_variant=False, + max_trimming_attempts=2): + """ + Iteratively trim low-coverage subsequences of a variant sequence + until it either matches the given reference context or there + are too few nucleotides left in the variant sequence. + + Parameters + ---------- + variant_sequence : VariantSequence + Assembled sequence from RNA reads, will need to be to be reverse + complemented if matching against a reference transcript on the + negative strand. + + reference_context : ReferenceContext + Sequence of reference transcript before the variant and associated + metadata. + + min_transcript_prefix_length : int + Minimum number of nucleotides we try to match against a reference + transcript. + + max_transcript_mismatches : int + Maximum number of nucleotide differences between reference transcript + sequence and the variant sequence. + + count_mismatches_after_variant : bool + Set to true if the number of mismatches after the variant locus should + count toward the total max_transcript_mismatches, which by default + only counts mismatches before the variant locus. + + max_trimming_attempts : int + How many times do we try trimming the VariantSequence to higher + levels of coverage before giving up? + + Returns VariantORF or None + """ + variant_orf = None + + # if we can't get the variant sequence to match this reference + # context then keep trimming it by coverage until either + for i in range(max_trimming_attempts + 1): + # check the reverse-complemented prefix if the reference context is + # on the negative strand since variant sequence is aligned to + # genomic DNA (positive strand) + variant_sequence_too_short = ( + (reference_context.strand == "+" and + len(variant_sequence.prefix) < min_transcript_prefix_length) or + (reference_context.strand == "-" and + len(variant_sequence.suffix) < min_transcript_prefix_length) + ) + if variant_sequence_too_short: + logger.info( + "Variant sequence %s shorter than min allowed %d (iter=%d)", + variant_sequence, + min_transcript_prefix_length, + i + 1) + return None + + variant_orf = \ + VariantORF.from_variant_sequence_and_reference_context( + variant_sequence=variant_sequence, + reference_context=reference_context) + + if variant_orf is None: + return None + + n_mismatch_before_variant = ( + variant_orf.num_mismatches_before_variant) + n_mismatch_after_variant = ( + variant_orf.num_mismatches_after_variant) + + logger.info("Iter #%d/%d: %s" % ( + i + 1, + max_trimming_attempts + 1, + variant_orf)) + + total_mismatches = n_mismatch_before_variant + if count_mismatches_after_variant: + total_mismatches += n_mismatch_after_variant + if total_mismatches <= max_transcript_mismatches: + # if we got a variant sequence + reading frame with sufficiently + # few mismatches then call it a day + return variant_orf + + logger.info( + ("Too many mismatches (%d) between variant sequence %s and " + "reference context %s (attempt=%d/%d)"), + n_mismatch_before_variant, + variant_sequence, + reference_context, + i + 1, + max_trimming_attempts + 1) + # if portions of the sequence are supported by only 1 read + # then try trimming to 2 to see if the better supported + # subsequence can be better matched against the reference 
+ current_min_coverage = variant_sequence.min_coverage() + logger.info( + "Trimming to subsequence covered by at least %d reads", + current_min_coverage + 1) + variant_sequence = variant_sequence.trim_by_coverage( + current_min_coverage + 1) + return None diff --git a/isovar/variant_reads.py b/isovar/variant_reads.py deleted file mode 100644 index ad1cc1f..0000000 --- a/isovar/variant_reads.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Functions for getting AlleleReads which support variant alleles. -""" - -from .variant_helpers import trim_variant -from .allele_reads import reads_overlapping_variant, reads_overlapping_variants - - -def filter_non_alt_reads_for_variant(variant, allele_reads): - _, _, alt = trim_variant(variant) - return [read for read in allele_reads if read.allele == alt] - - -def filter_non_alt_reads_for_variants(variants_and_allele_reads_sequence): - """ - Given a sequence of variants paired with all of their overlapping reads, - yields a sequence of variants paired only with reads which contain their - mutated nucleotide sequence. - """ - for variant, allele_reads in variants_and_allele_reads_sequence: - yield variant, filter_non_alt_reads_for_variant(variant, allele_reads) - - -def reads_supporting_variant(variant, samfile, **kwargs): - allele_reads = reads_overlapping_variant( - variant=variant, - samfile=samfile, - **kwargs) - return filter_non_alt_reads_for_variant( - variant=variant, - allele_reads=allele_reads) - - -def reads_supporting_variants(variants, samfile, **kwargs): - """ - Given a SAM/BAM file and a collection of variants, generates a sequence - of variants paired with reads which support each variant. - """ - for variant, allele_reads in reads_overlapping_variants( - variants=variants, - samfile=samfile, - **kwargs): - yield variant, filter_non_alt_reads_for_variant(variant, allele_reads) diff --git a/isovar/variant_sequence.py b/isovar/variant_sequence.py new file mode 100644 index 0000000..a2c6006 --- /dev/null +++ b/isovar/variant_sequence.py @@ -0,0 +1,255 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
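# A minimal standalone sketch of the control flow in
# match_variant_sequence_to_reference_context above: try to place the sequence
# in a reading frame and, if there are too many mismatches, re-trim to the
# better supported bases and try again. `attempt_match` and `retrim` are
# hypothetical stand-ins for the VariantORF construction and coverage trimming.
def match_with_retrimming(sequence, attempt_match, retrim,
                          max_mismatches, max_attempts=2):
    for _ in range(max_attempts + 1):
        candidate, n_mismatches = attempt_match(sequence)
        if candidate is None:
            return None
        if n_mismatches <= max_mismatches:
            return candidate
        sequence = retrim(sequence)
    return None

# toy usage: pretend every base mismatches and each re-trim drops one base
assert match_with_retrimming(
    "ACGT",
    attempt_match=lambda s: (s, len(s)),
    retrim=lambda s: s[1:],
    max_mismatches=2) == "GT"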
+ +from __future__ import print_function, division, absolute_import + +import numpy as np + + +from .value_object import ValueObject +from .logging import get_logger + +logger = get_logger(__name__) + + +class VariantSequence(ValueObject): + """ + Representation of a cDNA sequence containing a mutation + """ + + __slots__ = [ + # nucleotides before a variant + "prefix", + # nucleotide sequence of a variant + "alt", + # nucleotides after a variant + "suffix", + # since we often want to look at prefix+alt+suffix, let's cache it + "sequence", + # reads which were used to determine this sequences + "reads" + ] + + def __init__(self, prefix, alt, suffix, reads): + self.prefix = prefix + self.alt = alt + self.suffix = suffix + self.sequence = prefix + alt + suffix + self.reads = frozenset(reads) + + def __len__(self): + return len(self.sequence) + + @property + def read_names(self): + """ + Unique read names from which this VariantSequence was constructed. + + Returns set of str + """ + return {r.name for r in self.reads} + + def contains(self, other): + """ + Is the other VariantSequence a subsequence of this one? + + The two sequences must agree on the alt nucleotides, the prefix of the + longer must contain the prefix of the shorter, and the suffix of the + longer must contain the suffix of the shorter. + """ + return (self.alt == other.alt and + self.prefix.endswith(other.prefix) and + self.suffix.startswith(other.suffix)) + + def left_overlaps(self, other, min_overlap_size=1): + """ + Does this VariantSequence overlap another on the left side? + """ + + if self.alt != other.alt: + # allele must match! + return False + + if len(other.prefix) > len(self.prefix): + # only consider strings that overlap like: + # self: ppppAssss + # other: ppAsssssss + # which excludes cases where the other sequence has a longer + # prefix + return False + elif len(other.suffix) < len(self.suffix): + # similarly, we throw away cases where the other sequence is shorter + # after the alt nucleotides than this sequence + return False + + # is the other sequence a prefix of this sequence? + # Example: + # p1 a1 s1 = XXXXXXXX Y ZZZZZZ + # p2 a2 s2 = XX Y ZZZZZZZZZ + # ... + # then we can combine them into a longer sequence + sequence_overlaps = ( + self.prefix.endswith(other.prefix) and + other.suffix.startswith(self.suffix) + ) + prefix_overlap_size = min(len(self.prefix), len(other.prefix)) + suffix_overlap_size = min(len(other.suffix), len(self.suffix)) + overlap_size = ( + prefix_overlap_size + suffix_overlap_size + len(self.alt)) + + return sequence_overlaps and overlap_size >= min_overlap_size + + def add_reads(self, reads): + """ + Create another VariantSequence with more supporting reads. + """ + if len(reads) == 0: + return self + new_reads = self.reads.union(reads) + if len(new_reads) > len(self.reads): + return VariantSequence( + prefix=self.prefix, + alt=self.alt, + suffix=self.suffix, + reads=new_reads) + else: + return self + + def combine(self, other_sequence, min_overlap_size=1): + """ + If this sequence is the prefix of another sequence, combine + them into a single VariantSequence object. If the other sequence + is contained in this one, then add its reads to this VariantSequence. + Also tries to flip the order (e.g. this sequence is a suffix or + this sequence is a subsequence). If sequences can't be combined + then returns None. 
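# A small example of the overlap/merge behavior described above for
# VariantSequence.left_overlaps and VariantSequence.combine. Assumes isovar at
# this revision is importable; the read labels are stand-ins for AlleleReads.
from isovar.variant_sequence import VariantSequence

s1 = VariantSequence(prefix="CCTT", alt="A", suffix="GG", reads=["r1"])
s2 = VariantSequence(prefix="TT", alt="A", suffix="GGCA", reads=["r2"])

assert s1.left_overlaps(s2)            # s1's tail lines up with s2's head
merged = s1.combine(s2, min_overlap_size=1)
assert merged.sequence == "CCTTAGGCA"  # longer cDNA context, union of reads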
+ """ + if other_sequence.alt != self.alt: + logger.warn( + "Cannot combine %s and %s with mismatching alt sequences", + self, + other_sequence) + return None + elif self.contains(other_sequence): + if len(other_sequence) >= min_overlap_size: + return self.add_reads(other_sequence.reads) + else: + return None + elif other_sequence.contains(self): + if len(self) >= min_overlap_size: + return other_sequence.add_reads(self.reads) + else: + return None + elif self.left_overlaps(other_sequence, min_overlap_size=min_overlap_size): + # If sequences are like AABC and ABCC + return VariantSequence( + prefix=self.prefix, + alt=self.alt, + suffix=other_sequence.suffix, + reads=self.reads.union(other_sequence.reads)) + elif other_sequence.left_overlaps(self, min_overlap_size=min_overlap_size): + return VariantSequence( + prefix=other_sequence.prefix, + alt=self.alt, + suffix=self.suffix, + reads=self.reads.union(other_sequence.reads)) + else: + # sequences don't overlap + return None + + def variant_indices(self): + """ + When we combine prefix + alt + suffix into a single string, + what are is base-0 index interval which gets us back the alt + sequence? First returned index is inclusive, the second is exclusive. + """ + variant_start_index = len(self.prefix) + variant_len = len(self.alt) + variant_end_index = variant_start_index + variant_len + return variant_start_index, variant_end_index + + def coverage(self): + """ + Returns NumPy array indicating number of reads covering each + nucleotides of this sequence. + """ + variant_start_index, variant_end_index = self.variant_indices() + n_nucleotides = len(self) + coverage_array = np.zeros(n_nucleotides, dtype="int32") + for read in self.reads: + coverage_array[ + max(0, variant_start_index - len(read.prefix)): + min(n_nucleotides, variant_end_index + len(read.suffix))] += 1 + return coverage_array + + def min_coverage(self): + """ + Minimum number of reads covering any base in the cDNA sequence + + Returns int + """ + return np.min(self.coverage()) + + def mean_coverage(self): + """ + Average number of reads covering each base in the cDNA sequence. + + Returns float + """ + return np.mean(self.coverage()) + + def trim_by_coverage(self, min_reads): + """ + Given the min number of reads overlapping each nucleotide of + a variant sequence, trim this sequence by getting rid of positions + which are overlapped by fewer reads than specified. 
+ """ + read_count_array = self.coverage() + logger.info("Coverage: %s (len=%d)" % ( + read_count_array, len(read_count_array))) + sufficient_coverage_mask = read_count_array >= min_reads + sufficient_coverage_indices = np.argwhere(sufficient_coverage_mask) + if len(sufficient_coverage_indices) == 0: + logger.debug("No bases in %s have coverage >= %d" % (self, min_reads)) + return VariantSequence(prefix="", alt="", suffix="", reads=self.reads) + variant_start_index, variant_end_index = self.variant_indices() + # assuming that coverage drops off monotonically away from + # variant nucleotides + first_covered_index = sufficient_coverage_indices.min() + last_covered_index = sufficient_coverage_indices.max() + # adding 1 to last_covered_index since it's an inclusive index + # whereas variant_end_index is the end of a half-open interval + if (first_covered_index > variant_start_index or + last_covered_index + 1 < variant_end_index): + # Example: + # Nucleotide sequence: + # ACCCTTTT|AA|GGCGCGCC + # Coverage: + # 12222333|44|33333211 + # Then the mask for bases covered >= 4x would be: + # ________|**|________ + # with indices: + # first_covered_index = 9 + # last_covered_index = 10 + # variant_start_index = 9 + # variant_end_index = 11 + logger.debug("Some variant bases in %s don't have coverage >= %d" % ( + self, min_reads)) + return VariantSequence(prefix="", alt="", suffix="", reads=self.reads) + return VariantSequence( + prefix=self.prefix[first_covered_index:], + alt=self.alt, + suffix=self.suffix[:last_covered_index - variant_end_index + 1], + reads=self.reads) + diff --git a/isovar/variant_sequence_creator.py b/isovar/variant_sequence_creator.py new file mode 100644 index 0000000..ff2fc12 --- /dev/null +++ b/isovar/variant_sequence_creator.py @@ -0,0 +1,198 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division, absolute_import + +from .allele_read_helpers import get_single_allele_from_reads +from .assembly import iterative_overlap_assembly +from .default_parameters import ( + MIN_VARIANT_SEQUENCE_COVERAGE, + VARIANT_SEQUENCE_LENGTH, + VARIANT_SEQUENCE_ASSEMBLY, + MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE +) +from .logging import get_logger +from .variant_sequence_helpers import ( + filter_variant_sequences, + initial_variant_sequences_from_reads +) + +logger = get_logger(__name__) + + +class VariantSequenceCreator(object): + """ + Assembler is used to assemble a set of AlleleReads into a smaller set of + VariantSequence objects based on overlap of read sequences. 
+ """ + def __init__( + self, + min_variant_sequence_coverage=MIN_VARIANT_SEQUENCE_COVERAGE, + preferred_sequence_length=VARIANT_SEQUENCE_LENGTH, + variant_sequence_assembly=VARIANT_SEQUENCE_ASSEMBLY, + min_assembly_overlap_size=MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE): + """ + Parameters + ---------- + min_variant_sequence_coverage : int + Minimum number of RNA reads supporting each nucleotide of the + variant cDNA sequence + + preferred_sequence_length : int + Total number of nucleotides in the assembled sequences, including + variant nucleotides. + + variant_sequence_assembly : bool + Construct variant sequences by merging overlapping reads. If False + then variant sequences must be fully spanned by cDNA reads. + + min_assembly_overlap_size : int + Minimum number of nucleotides shared by two sequences before they + can be merged into a single VariantSequence object. + + """ + self.min_variant_sequence_coverage = min_variant_sequence_coverage + self.preferred_sequence_length = preferred_sequence_length + self.variant_sequence_assembly = variant_sequence_assembly + self.min_assembly_overlap_size = min_assembly_overlap_size + + def reads_to_variant_sequences( + self, + variant, + reads): + """ + Collapse variant-supporting RNA reads into consensus sequences of + approximately the preferred length (may differ at the ends of transcripts), + filter consensus sequences by length and number of supporting RNA reads. + + Parameters + ---------- + variant : varcode.Variant + + reads : list of AlleleRead objects + Reads which support the variant allele + + Returns + ------- + list of VariantSequence + """ + # convert to list in case it's a generator + variant_reads = list(reads) + + if len(variant_reads) == 0: + return [] + + alt_seq = get_single_allele_from_reads(variant_reads) + + # the number of context nucleotides on either side of the variant + # is half the desired length (minus the number of variant nucleotides) + n_alt_nucleotides = len(alt_seq) + + n_surrounding_nucleotides = self.preferred_sequence_length - n_alt_nucleotides + max_nucleotides_after_variant = n_surrounding_nucleotides // 2 + + # if the number of nucleotides we need isn't divisible by 2 then + # prefer to have one more *before* the variant since we need the + # prefix sequence to match against reference transcripts + max_nucleotides_before_variant = ( + n_surrounding_nucleotides - max_nucleotides_after_variant) + + variant_sequences = initial_variant_sequences_from_reads( + variant_reads=variant_reads, + max_nucleotides_before_variant=max_nucleotides_before_variant, + max_nucleotides_after_variant=max_nucleotides_after_variant) + + logger.info( + "Initial pool of %d variant sequences (min length=%d, max length=%d)", + len(variant_sequences), + min(len(s) for s in variant_sequences), + max(len(s) for s in variant_sequences)) + + if self.variant_sequence_assembly: + # this is a tricky parameter to set correctly: + # by how many bases should two sequences overlap before + # we merge, currently defaulting to either half the non-variant + # nucleotides or the specified min_assembly_overlap_size + # (whichever is smaller) + min_overlap_size = min( + self.min_assembly_overlap_size, + n_surrounding_nucleotides // 2) + variant_sequences = iterative_overlap_assembly( + variant_sequences, + min_overlap_size=min_overlap_size) + + if variant_sequences: + logger.info( + "After overlap assembly: %d variant sequences (min length=%d, max length=%d)", + len(variant_sequences), + min(len(s) for s in variant_sequences), + max(len(s) for s 
in variant_sequences)) + else: + logger.info("After overlap assembly: 0 variant sequences") + return [] + + variant_sequences = filter_variant_sequences( + variant_sequences=variant_sequences, + preferred_sequence_length=self.preferred_sequence_length, + min_variant_sequence_coverage=self.min_variant_sequence_coverage) + + if variant_sequences: + logger.info( + ("After coverage & length filtering: %d variant sequences " + "(min length=%d, max length=%d)"), + len(variant_sequences), + min(len(s) for s in variant_sequences), + max(len(s) for s in variant_sequences)) + else: + logger.info("After coverage & length filtering: 0 variant sequences") + return [] + + # sort VariantSequence objects by decreasing order of supporting read + # counts + variant_sequences.sort(key=lambda vs: -len(vs.reads)) + return variant_sequences + + def sequences_from_alt_reads_generator(self, variant_and_reads_generator): + """ + For each (variant, [AlleleRead]) pair in the input generator, + collapse the reads into a list of VariantSequence objects. + + Parameters + ---------- + variant_and_reads_generator : generator + Sequence of Variant objects paired with a list of reads which + support that variant. + + Yields pairs with the following fields: + - Variant + - list of VariantSequence objects + """ + for variant, reads in variant_and_reads_generator: + variant_sequences = self.reads_to_variant_sequences( + variant=variant, + reads=reads) + yield variant, variant_sequences + + def sequences_from_read_evidence_generator( + self, variant_and_read_evidence_generator): + """ + Given a generator of (Variant, ReadEvidence) pairs, generate a + sequence of (Variant, [VariantSequence]) pairs. + """ + reads_gen = ( + (variant, read_evidence.alt_reads) + for (variant, read_evidence) in + variant_and_read_evidence_generator + ) + return self.sequences_from_alt_reads_generator(reads_gen) diff --git a/isovar/variant_sequence_helpers.py b/isovar/variant_sequence_helpers.py new file mode 100644 index 0000000..e0291aa --- /dev/null +++ b/isovar/variant_sequence_helpers.py @@ -0,0 +1,197 @@ +# Copyright (c) 2016-2019. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Helper functions for constructing and filtering VariantSequence objects +from reads overlapping a variant locus. +""" + +from __future__ import print_function, division, absolute_import + +from .allele_read_helpers import group_unique_sequences +from .assembly import collapse_substrings +from .logging import get_logger +from .variant_sequence import VariantSequence + +logger = get_logger(__name__) + + +def initial_variant_sequences_from_reads( + variant_reads, + max_nucleotides_before_variant=None, + max_nucleotides_after_variant=None): + """ + Get all unique sequences from reads spanning a variant locus. This will + include partial sequences due to reads starting in the middle of the + sequence around around a variant. 
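# Standalone arithmetic mirroring how reads_to_variant_sequences (above) splits
# the preferred sequence length into context before and after the variant.
# The numbers are hypothetical; elsewhere in isovar the preferred cDNA length
# is derived as (protein_sequence_length + 1) * 3.
preferred_sequence_length = 93   # e.g. aiming for roughly 30 amino acids
n_alt_nucleotides = 0            # a small deletion: the alt allele is empty
n_surrounding_nucleotides = preferred_sequence_length - n_alt_nucleotides  # 93
max_nucleotides_after_variant = n_surrounding_nucleotides // 2             # 46
# when the split is uneven the extra base goes *before* the variant, since the
# prefix is what gets matched against reference transcripts
max_nucleotides_before_variant = (
    n_surrounding_nucleotides - max_nucleotides_after_variant)             # 47
assert (max_nucleotides_before_variant, max_nucleotides_after_variant) == (47, 46)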
+ + Parameters + ---------- + variant_reads : list of AlleleRead objects + + max_nucleotides_before_variant : int or None + + max_nucleotides_after_variant : int or None + + Returns + ------- + list of VariantSequence + """ + unique_sequence_groups = group_unique_sequences( + variant_reads, + max_prefix_size=max_nucleotides_before_variant, + max_suffix_size=max_nucleotides_after_variant) + + return [ + VariantSequence( + prefix=prefix, + alt=alt, + suffix=suffix, + reads=reads) + for ((prefix, alt, suffix), reads) + in unique_sequence_groups.items() + ] + + +def filter_variant_sequences_by_read_support( + variant_sequences, + min_variant_sequence_coverage): + """ + Filter VariantSequences to only keep those with at least the desired + level of coverage. + + Parameters + ---------- + variant_sequences : list of VariantSequence + + min_variant_sequence_coverage : int + Minimum number of reads which must cover each + base of a VariantSequence + + Returns + ------- + list of VariantSequence + """ + n_total = len(variant_sequences) + variant_sequences = [ + s + for s in variant_sequences + if s.min_coverage() >= min_variant_sequence_coverage + ] + n_dropped = n_total - len(variant_sequences) + if n_dropped > 0: + logger.info( + "Dropped %d/%d variant sequences less than %d supporting reads", + n_dropped, + n_total, + min_variant_sequence_coverage) + return variant_sequences + + +def filter_variant_sequences_by_length( + variant_sequences, + preferred_sequence_length): + """ + Parameters + ---------- + variant_sequences : list of VariantSequence + + preferred_sequence_length : int + If we get some sequences which are at least this long and others + which are shorter, then drop the shorter ones. + + Returns + ------- + list of VariantSequence + """ + n_total = len(variant_sequences) + if n_total == 0: + return [] + # since we might have gotten some shorter fragments, + # keep only the longest spanning sequence + max_observed_sequence_length = max(len(s) for s in variant_sequences) + + # if we get back a sequence that's longer than the preferred length + # then that doesn't mean we should necessarily drop the other sequences + min_required_sequence_length = min( + max_observed_sequence_length, + preferred_sequence_length) + + variant_sequences = [ + s for s in variant_sequences + if len(s.sequence) >= min_required_sequence_length + ] + n_dropped = n_total - len(variant_sequences) + if n_dropped > 0: + logger.info( + "Dropped %d/%d variant sequences shorter than %d", + n_dropped, + n_total, + min_required_sequence_length) + return variant_sequences + + +def trim_variant_sequences(variant_sequences, min_variant_sequence_coverage): + """ + Trim VariantSequences to desired coverage and then combine any + subsequences which get generated. 
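# Standalone sketch of the length threshold used by
# filter_variant_sequences_by_length above (toy numbers, not isovar's API):
# an assembled sequence longer than the preferred length does not raise the
# bar for the others.
preferred_sequence_length = 93
observed_lengths = [51, 75, 93, 101]
max_observed_sequence_length = max(observed_lengths)           # 101
min_required_sequence_length = min(
    max_observed_sequence_length, preferred_sequence_length)   # 93
kept = [n for n in observed_lengths if n >= min_required_sequence_length]
assert kept == [93, 101]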
+ + Parameters + ---------- + variant_sequences : list of VariantSequence + + min_variant_sequence_coverage : int + + Returns list of VariantSequence + """ + n_total = len(variant_sequences) + trimmed_variant_sequences = [ + variant_sequence.trim_by_coverage(min_variant_sequence_coverage) + for variant_sequence in variant_sequences + ] + collapsed_variant_sequences = collapse_substrings(trimmed_variant_sequences) + n_after_trimming = len(collapsed_variant_sequences) + logger.info( + "Kept %d/%d variant sequences after read coverage trimming to >=%dx", + n_after_trimming, + n_total, + min_variant_sequence_coverage) + return collapsed_variant_sequences + + +def filter_variant_sequences( + variant_sequences, + preferred_sequence_length, + min_variant_sequence_coverage): + """ + Drop variant sequences which are shorter than request or don't have + enough supporting reads. + + Parameters + ---------- + variant_sequences : list of VariantSequence + + preferred_sequence_length : int + + min_variant_sequence_coverage : int + + Returns list of VariantSequence + """ + variant_sequences = trim_variant_sequences( + variant_sequences, min_variant_sequence_coverage) + + return filter_variant_sequences_by_length( + variant_sequences=variant_sequences, + preferred_sequence_length=preferred_sequence_length) + diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py deleted file mode 100644 index f625b66..0000000 --- a/isovar/variant_sequences.py +++ /dev/null @@ -1,534 +0,0 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function, division, absolute_import - -import numpy as np - -from .read_helpers import ( - get_single_allele_from_reads, - group_unique_sequences, -) -from .variant_reads import filter_non_alt_reads_for_variant -from .default_parameters import ( - MIN_ALT_RNA_READS, - MIN_VARIANT_SEQUENCE_COVERAGE, - VARIANT_SEQUENCE_LENGTH, - VARIANT_SEQUENCE_ASSEMBLY, - MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE, -) -from .dataframe_builder import dataframe_from_generator -from .assembly import iterative_overlap_assembly, collapse_substrings -from .value_object import ValueObject -from .logging import get_logger - -logger = get_logger(__name__) - - -class VariantSequence(ValueObject): - __slots__ = [ - # nucleotides before a variant - "prefix", - # nucleotide sequence of a variant - "alt", - # nucleotides after a variant - "suffix", - # since we often want to look at prefix+alt+suffix, let's cache it - "sequence", - # reads which were used to determine this sequences - "reads" - ] - - def __init__(self, prefix, alt, suffix, reads): - self.prefix = prefix - self.alt = alt - self.suffix = suffix - self.sequence = prefix + alt + suffix - self.reads = frozenset(reads) - - def __len__(self): - return len(self.sequence) - - @property - def read_names(self): - return {r.name for r in self.reads} - - def contains(self, other): - """ - Is the other VariantSequence a subsequence of this one? 
- - The two sequences must agree on the alt nucleotides, the prefix of the - longer must contain the prefix of the shorter, and the suffix of the - longer must contain the suffix of the shorter. - """ - return (self.alt == other.alt and - self.prefix.endswith(other.prefix) and - self.suffix.startswith(other.suffix)) - - def left_overlaps(self, other, min_overlap_size=1): - """ - Does this VariantSequence overlap another on the left side? - """ - - if self.alt != other.alt: - # allele must match! - return False - - if len(other.prefix) > len(self.prefix): - # only consider strings that overlap like: - # self: ppppAssss - # other: ppAsssssss - # which excludes cases where the other sequence has a longer - # prefix - return False - elif len(other.suffix) < len(self.suffix): - # similarly, we throw away cases where the other sequence is shorter - # after the alt nucleotides than this sequence - return False - - # is the other sequence a prefix of this sequence? - # Example: - # p1 a1 s1 = XXXXXXXX Y ZZZZZZ - # p2 a2 s2 = XX Y ZZZZZZZZZ - # ... - # then we can combine them into a longer sequence - sequence_overlaps = ( - self.prefix.endswith(other.prefix) and - other.suffix.startswith(self.suffix) - ) - prefix_overlap_size = min(len(self.prefix), len(other.prefix)) - suffix_overlap_size = min(len(other.suffix), len(self.suffix)) - overlap_size = ( - prefix_overlap_size + suffix_overlap_size + len(self.alt)) - - return sequence_overlaps and overlap_size >= min_overlap_size - - def add_reads(self, reads): - """ - Create another VariantSequence with more supporting reads. - """ - if len(reads) == 0: - return self - new_reads = self.reads.union(reads) - if len(new_reads) > len(self.reads): - return VariantSequence( - prefix=self.prefix, - alt=self.alt, - suffix=self.suffix, - reads=new_reads) - else: - return self - - def combine(self, other_sequence): - """ - If this sequence is the prefix of another sequence, combine - them into a single VariantSequence object. If the other sequence - is contained in this one, then add its reads to this VariantSequence. - Also tries to flip the order (e.g. this sequence is a suffix or - this sequence is a subsequence). If sequences can't be combined - then returns None. - """ - if other_sequence.alt != self.alt: - logger.warn( - "Cannot combine %s and %s with mismatching alt sequences", - self, - other_sequence) - return None - elif self.contains(other_sequence): - return self.add_reads(other_sequence.reads) - elif other_sequence.contains(self): - return other_sequence.add_reads(self.reads) - elif self.left_overlaps(other_sequence): - # If sequences are like AABC and ABCC - return VariantSequence( - prefix=self.prefix, - alt=self.alt, - suffix=other_sequence.suffix, - reads=self.reads.union(other_sequence.reads)) - elif other_sequence.left_overlaps(self): - return VariantSequence( - prefix=other_sequence.prefix, - alt=self.alt, - suffix=self.suffix, - reads=self.reads.union(other_sequence.reads)) - else: - # sequences don't overlap - return None - - def variant_indices(self): - """ - When we combine prefix + alt + suffix into a single string, - what are is base-0 index interval which gets us back the alt - sequence? First returned index is inclusive, the second is exclusive. 
- """ - variant_start_index = len(self.prefix) - variant_len = len(self.alt) - variant_end_index = variant_start_index + variant_len - return variant_start_index, variant_end_index - - def coverage(self): - """ - Returns NumPy array indicating number of reads covering each - nucleotides of this sequence. - """ - variant_start_index, variant_end_index = self.variant_indices() - n_nucleotides = len(self) - coverage_array = np.zeros(n_nucleotides, dtype="int32") - for read in self.reads: - coverage_array[ - max(0, variant_start_index - len(read.prefix)): - min(n_nucleotides, variant_end_index + len(read.suffix))] += 1 - return coverage_array - - def min_coverage(self): - return np.min(self.coverage()) - - def mean_coverage(self): - return np.mean(self.coverage()) - - def trim_by_coverage(self, min_reads): - """ - Given the min number of reads overlapping each nucleotide of - a variant sequence, trim this sequence by getting rid of positions - which are overlapped by fewer reads than specified. - """ - read_count_array = self.coverage() - logger.info("Coverage: %s (len=%d)" % ( - read_count_array, len(read_count_array))) - sufficient_coverage_mask = read_count_array >= min_reads - sufficient_coverage_indices = np.argwhere(sufficient_coverage_mask) - if len(sufficient_coverage_indices) == 0: - logger.debug("No bases in %s have coverage >= %d" % (self, min_reads)) - return VariantSequence(prefix="", alt="", suffix="", reads=self.reads) - variant_start_index, variant_end_index = self.variant_indices() - # assuming that coverage drops off monotonically away from - # variant nucleotides - first_covered_index = sufficient_coverage_indices.min() - last_covered_index = sufficient_coverage_indices.max() - # adding 1 to last_covered_index since it's an inclusive index - # whereas variant_end_index is the end of a half-open interval - if (first_covered_index > variant_start_index or - last_covered_index + 1 < variant_end_index): - # Example: - # Nucleotide sequence: - # ACCCTTTT|AA|GGCGCGCC - # Coverage: - # 12222333|44|33333211 - # Then the mask for bases covered >= 4x would be: - # ________|**|________ - # with indices: - # first_covered_index = 9 - # last_covered_index = 10 - # variant_start_index = 9 - # variant_end_index = 11 - logger.debug("Some variant bases in %s don't have coverage >= %d" % ( - self, min_reads)) - return VariantSequence(prefix="", alt="", suffix="", reads=self.reads) - return VariantSequence( - prefix=self.prefix[first_covered_index:], - alt=self.alt, - suffix=self.suffix[:last_covered_index - variant_end_index + 1], - reads=self.reads) - - -def initial_variant_sequences_from_reads( - variant_reads, - max_nucleotides_before_variant=None, - max_nucleotides_after_variant=None): - """ - Get all unique sequences from reads spanning a variant locus. This will - include partial sequences due to reads starting in the middle of the - sequence around around a variant. 
- """ - unique_sequence_groups = group_unique_sequences( - variant_reads, - max_prefix_size=max_nucleotides_before_variant, - max_suffix_size=max_nucleotides_after_variant) - - return [ - VariantSequence( - prefix=prefix, - alt=alt, - suffix=suffix, - reads=reads) - for ((prefix, alt, suffix), reads) - in unique_sequence_groups.items() - ] - - -def filter_variant_sequences_by_read_support( - variant_sequences, - min_variant_sequence_coverage): - n_total = len(variant_sequences) - variant_sequences = [ - s - for s in variant_sequences - if s.min_coverage() >= min_variant_sequence_coverage - ] - n_dropped = n_total - len(variant_sequences) - if n_dropped > 0: - logger.info( - "Dropped %d/%d variant sequences less than %d supporting reads", - n_dropped, - n_total, - min_variant_sequence_coverage) - return variant_sequences - - -def filter_variant_sequences_by_length( - variant_sequences, - preferred_sequence_length): - n_total = len(variant_sequences) - if n_total == 0: - return [] - # since we might have gotten some shorter fragments, - # keep only the longest spanning sequence - max_observed_sequence_length = max(len(s) for s in variant_sequences) - - # if we get back a sequence that's longer than the preferred length - # then that doesn't mean we should necessarily drop the other sequences - min_required_sequence_length = min( - max_observed_sequence_length, - preferred_sequence_length) - - variant_sequences = [ - s for s in variant_sequences - if len(s.sequence) >= min_required_sequence_length - ] - n_dropped = n_total - len(variant_sequences) - if n_dropped > 0: - logger.info( - "Dropped %d/%d variant sequences shorter than %d", - n_dropped, - n_total, - min_required_sequence_length) - return variant_sequences - - -def trim_variant_sequences(variant_sequences, min_variant_sequence_coverage): - """ - Trim VariantSequences to desired coverage and then combine any - subsequences which get generated. - """ - n_total = len(variant_sequences) - trimmed_variant_sequences = [ - variant_sequence.trim_by_coverage(min_variant_sequence_coverage) - for variant_sequence in variant_sequences - ] - collapsed_variant_sequences = collapse_substrings(trimmed_variant_sequences) - n_after_trimming = len(collapsed_variant_sequences) - logger.info( - "Kept %d/%d variant sequences after read coverage trimming to >=%dx", - n_after_trimming, - n_total, - min_variant_sequence_coverage) - return collapsed_variant_sequences - - -def filter_variant_sequences( - variant_sequences, - preferred_sequence_length, - min_variant_sequence_coverage=MIN_VARIANT_SEQUENCE_COVERAGE,): - """ - Drop variant sequences which are shorter than request or don't have - enough supporting reads. - """ - variant_sequences = trim_variant_sequences( - variant_sequences, min_variant_sequence_coverage) - - return filter_variant_sequences_by_length( - variant_sequences=variant_sequences, - preferred_sequence_length=preferred_sequence_length) - - -def reads_to_variant_sequences( - variant, - reads, - preferred_sequence_length, - min_alt_rna_reads=MIN_ALT_RNA_READS, - min_variant_sequence_coverage=MIN_VARIANT_SEQUENCE_COVERAGE, - variant_sequence_assembly=VARIANT_SEQUENCE_ASSEMBLY): - """ - Collapse variant-supporting RNA reads into consensus sequences of - approximately the preferred length (may differ at the ends of transcripts), - filter consensus sequences by length and number of supporting RNA reads. 
- - Parameters - ---------- - variant : varcode.Variant - - reads : list of AlleleRead objects - Should all support the same variant allele nucleotides. - - preferred_sequence_length : int - Total number of nucleotides in the assembled sequences, including - variant nucleotides. - - min_alt_rna_reads : int - Drop sequences from loci which lack at least this number of reads - that agree with the variant allele. - - min_variant_sequence_coverage : int - Drop sequences which don't at least have this number of reads - covering each cDNA position. - - variant_sequence_assembly : bool - Construct variant sequences by merging overlapping reads. If False - then variant sequences must be fully spanned by cDNA reads. - - Returns a collection of VariantSequence objects - """ - # just in case variant_reads is a generator, convert it to a list - variant_reads = list(filter_non_alt_reads_for_variant(variant, reads)) - - if len(variant_reads) < min_alt_rna_reads: - logger.info( - "Skipping %s because only %d alt RNA reads (min=%d)", - variant, - len(variant_reads), - min_alt_rna_reads) - return [] - if len(variant_reads) == 0: - return [] - - alt_seq = get_single_allele_from_reads(variant_reads) - - # the number of context nucleotides on either side of the variant - # is half the desired length (minus the number of variant nucleotides) - n_alt_nucleotides = len(alt_seq) - - n_surrounding_nucleotides = preferred_sequence_length - n_alt_nucleotides - max_nucleotides_after_variant = n_surrounding_nucleotides // 2 - - # if the number of nucleotides we need isn't divisible by 2 then - # prefer to have one more *before* the variant since we need the - # prefix sequence to match against reference transcripts - max_nucleotides_before_variant = ( - n_surrounding_nucleotides - max_nucleotides_after_variant) - - variant_sequences = initial_variant_sequences_from_reads( - variant_reads=variant_reads, - max_nucleotides_before_variant=max_nucleotides_before_variant, - max_nucleotides_after_variant=max_nucleotides_after_variant) - - logger.info( - "Initial pool of %d variant sequences (min length=%d, max length=%d)", - len(variant_sequences), - min(len(s) for s in variant_sequences), - max(len(s) for s in variant_sequences)) - - if variant_sequence_assembly: - # this is a tricky parameter to set correctly: - # by how many bases should two sequences overlap before - # we merge, currently defaulting to either half the non-variant - # nucleotides or 30 (whichever is smaller) - variant_sequences = iterative_overlap_assembly( - variant_sequences, - min_overlap_size=min( - MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE, - n_surrounding_nucleotides // 2)) - - if variant_sequences: - logger.info( - "After overlap assembly: %d variant sequences (min length=%d, max length=%d)", - len(variant_sequences), - min(len(s) for s in variant_sequences), - max(len(s) for s in variant_sequences)) - else: - logger.info("After overlap assembly: 0 variant sequences") - return [] - - variant_sequences = filter_variant_sequences( - variant_sequences=variant_sequences, - preferred_sequence_length=preferred_sequence_length, - min_variant_sequence_coverage=min_variant_sequence_coverage) - - if variant_sequences: - logger.info( - ("After coverage & length filtering: %d variant sequences " - "(min length=%d, max length=%d)"), - len(variant_sequences), - min(len(s) for s in variant_sequences), - max(len(s) for s in variant_sequences)) - else: - logger.info("After coverage & length filtering: 0 variant sequences") - return [] - - # sort VariantSequence 
objects by decreasing order of supporting read - # counts - variant_sequences.sort(key=lambda vs: -len(vs.reads)) - return variant_sequences - - -def reads_generator_to_sequences_generator( - variant_and_reads_generator, - min_alt_rna_reads=MIN_ALT_RNA_READS, - min_variant_sequence_coverage=MIN_VARIANT_SEQUENCE_COVERAGE, - preferred_sequence_length=VARIANT_SEQUENCE_LENGTH, - variant_sequence_assembly=VARIANT_SEQUENCE_ASSEMBLY): - """ - For each variant, collect all possible sequence contexts around the - variant which are spanned by at least min_reads. - - Parameters - ---------- - variant_and_reads_generator : generator - Sequence of Variant objects paired with a list of reads which - overlap that variant. - - min_alt_rna_reads : int - Minimum number of RNA reads supporting variant allele - - min_variant_sequence_coverage : int - Minimum number of RNA reads supporting each nucleotide of the - variant cDNA sequence - - sequence_length : int - Desired sequence length, including variant nucleotides - - variant_sequence_assembly : bool - Construct variant sequences by merging overlapping reads. If False - then variant sequences must be fully spanned by cDNA reads. - - Yields pairs with the following fields: - - Variant - - list of VariantSequence objects - """ - for variant, variant_reads in variant_and_reads_generator: - variant_sequences = reads_to_variant_sequences( - variant=variant, - reads=variant_reads, - min_alt_rna_reads=min_alt_rna_reads, - min_variant_sequence_coverage=min_variant_sequence_coverage, - preferred_sequence_length=preferred_sequence_length, - variant_sequence_assembly=variant_sequence_assembly) - yield variant, variant_sequences - - -def variant_sequences_generator_to_dataframe(variant_sequences_generator): - """ - Creates a dataframe from a generator which yields - (Variant, [VariantSequence]) pairs. 
- 
- Returns pandas.DataFrame
- """
- # TODO: Change VariantSequence.alt to VariantSequence.alt_nucleotides
- # or something else that doesn't clash with a variant's `alt` field
- return dataframe_from_generator(
- VariantSequence,
- variant_sequences_generator,
- rename_dict={"alt": "allele"},
- extra_column_fns={
- "gene": lambda variant, _: ";".join(variant.gene_names),
- })
diff --git a/isovar_design.png b/isovar_design.png
new file mode 100644
index 0000000..f4ec98b
Binary files /dev/null and b/isovar_design.png differ
diff --git a/requirements.txt b/requirements.txt
index e26bf24..a0f2b80 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,8 @@
 six>=1.9.0
 pylint>=1.4.4
 pyensembl>=1.5.0
-varcode>=0.5.9
+varcode>=0.9.0
 pandas>=0.23.0
-pysam==0.9.0
+pysam>=0.15.2,<=0.16.0
 nose>=1.3.3
+cached_property>=1.5.1
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 61bb9fe..1f21a46 100644
--- a/setup.py
+++ b/setup.py
@@ -25,17 +25,9 @@
 with open(readme_path, 'r') as f:
 readme_markdown = f.read()
 except:
- logging.warn("Failed to load %s" % readme_path)
+ logging.warning("Failed to load %s" % readme_path)
 readme_markdown = ""
-try:
- import pypandoc
- readme_restructured = pypandoc.convert(readme_markdown, to='rst', format='md')
-except:
- readme_restructured = readme_markdown
- logging.warn("Conversion of long_description from MD to RST failed")
- pass
-
 with open('isovar/__init__.py', 'r') as f:
 version = re.search(
@@ -50,10 +42,10 @@
 setup(
 name='isovar',
 version=version,
- description="Assemble transcript sequences fragments around variants",
+ description="Determine mutant protein sequences from RNA using assembly around variants",
 author="Alex Rubinsteyn, Arman Aksoy, Julia Kodysh",
 author_email="alex.rubinsteyn@mssm.edu",
- url="https://github.com/hammerlab/isovar",
+ url="https://github.com/openvax/isovar",
 license="http://www.apache.org/licenses/LICENSE-2.0.html",
 classifiers=[
 'Development Status :: 4 - Beta',
@@ -66,16 +58,19 @@
 ],
 install_requires=[
 'six',
- 'pysam==0.9.0',
+ 'pysam>=0.15.2,<=0.16.0',
 'pandas',
- 'varcode>=0.5.9',
+ 'varcode>=0.9.0',
 'pyensembl>=1.5.0',
+ 'cached_property>=1.5.1',
 ],
- long_description=readme_restructured,
+ long_description=readme_markdown,
+ long_description_content_type='text/markdown',
 packages=find_packages(),
 package_data={'isovar': ['logging.conf']},
 entry_points={
 'console_scripts': [
+ 'isovar=isovar.cli.isovar_main:run',
 'isovar-protein-sequences=isovar.cli.isovar_protein_sequences:run',
 "isovar-translations=isovar.cli.isovar_translations:run",
 "isovar-reference-contexts=isovar.cli.isovar_reference_contexts:run",
diff --git a/test/genomes_for_testing.py b/test/genomes_for_testing.py
new file mode 100644
index 0000000..5b74c62
--- /dev/null
+++ b/test/genomes_for_testing.py
@@ -0,0 +1,4 @@
+from pyensembl import cached_release, genome_for_reference_name
+
+grcm38 = genome_for_reference_name("grcm38")
+grch38 = cached_release(87)
diff --git a/test/mock_objects.py b/test/mock_objects.py
new file mode 100644
index 0000000..62cdf3a
--- /dev/null
+++ b/test/mock_objects.py
@@ -0,0 +1,141 @@
+from __future__ import print_function, division, absolute_import
+
+import pysam
+
+from isovar.translation import Translation
+from isovar.protein_sequence import ProteinSequence
+from isovar.variant_orf import VariantORF
+
+class MockAlignmentFile(object):
+ """
+ Used instead of real AlignmentFile objects in tests.
+ """ + def __init__(self, references, reads): + self.references = tuple(references) + self.reads = reads + + def fetch(self, *args, **kwargs): + return self.reads + + @property + def filename(self): + return "MOCK-READS.bam" + +def make_pysam_read( + seq, + cigar, + mdtag=None, + name="dummy", + mapq=10, + baseq=30, + reference_start=0, + reference_id=0): + read = pysam.AlignedSegment() + read.seq = seq + read.cigarstring = cigar + if mdtag: + read.set_tag("MD", mdtag) + read.qname = name + read.mapq = mapq + read.reference_start = reference_start + read.reference_id = reference_id + qualities_string = pysam.qualities_to_qualitystring([baseq] * len(seq)) + read.qual = qualities_string.encode("ascii") + return read + + +class MockAlleleRead(object): + def __init__(self, name="mock-read"): + self.name = name + +class MockVariantSequence(object): + def __init__(self, n_reads): + self.n_reads = n_reads + + @property + def reads(self): + return [MockAlleleRead("mock-read-%d" % i) for i in range(self.n_reads)] + +class MockReferenceContext(object): + pass + +def make_dummy_translation( + amino_acids="MKHW", # ATG=M|AAA=K|CAC=H|TGG=W + cdna_sequence="CCCATGAAACACTGGTAG", + offset_to_first_complete_codon=3, + variant_cdna_interval_start=8, # assuming variant was AAC>AAA + variant_cdna_interval_end=9, + variant_aa_interval_start=1, + variant_aa_interval_end=2, + num_mismatches=1, + n_variant_reads=1): + """ + Create mock Translation object with minimal information needed to + get used successfully by ProteinSequence. + """ + varseq_in_orf = VariantORF( + cdna_sequence=cdna_sequence, + offset_to_first_complete_codon=offset_to_first_complete_codon, + variant_cdna_interval_start=variant_cdna_interval_start, + variant_cdna_interval_end=variant_cdna_interval_end, + reference_cdna_sequence_before_variant=cdna_sequence[:variant_cdna_interval_start], + reference_cdna_sequence_after_variant=cdna_sequence[variant_cdna_interval_end:], + num_mismatches_before_variant=num_mismatches, + num_mismatches_after_variant=0) + + return Translation( + variant_orf=varseq_in_orf, + amino_acids=amino_acids, + variant_aa_interval_start=variant_aa_interval_start, + variant_aa_interval_end=variant_aa_interval_end, + frameshift=False, + ends_with_stop_codon=False, + untrimmed_variant_sequence=MockVariantSequence(n_reads=n_variant_reads), + reference_context=MockReferenceContext()) + + +def make_dummy_protein_sequence( + n_supporting_variant_reads, + n_supporting_variant_sequences, + n_supporting_reference_transcripts, + n_total_variant_sequences=None, + n_total_variant_reads=None, + n_total_reference_transcripts=None, + amino_acids="MKHW", # ATG=M|AAA=K|CAC=H|TGG=W + cdna_sequence="CCCATGAAACACTGGTAG", + variant_cdna_interval_start=8, # assuming variant was AAC>AAA + variant_cdna_interval_end=9, + variant_aa_interval_start=1, + variant_aa_interval_end=2, + num_mismatches=1): + """ + Creates ProteinSequence object with None filled in for most fields + """ + if n_total_variant_reads is None: + n_total_variant_reads = n_supporting_variant_reads + + if n_total_variant_sequences is None: + n_total_variant_sequences = n_supporting_variant_sequences + + if n_total_reference_transcripts is None: + n_total_reference_transcripts = n_total_reference_transcripts + + assert n_supporting_variant_sequences <= n_supporting_variant_reads + assert n_supporting_variant_sequences <= n_total_variant_sequences + assert n_supporting_reference_transcripts <= n_total_reference_transcripts + + n_translations = n_total_reference_transcripts * 
n_total_variant_sequences + + translation = make_dummy_translation( + amino_acids=amino_acids, + cdna_sequence=cdna_sequence, + offset_to_first_complete_codon=3, + variant_cdna_interval_start=variant_cdna_interval_start, # assuming variant was AAC>AAA + variant_cdna_interval_end=variant_cdna_interval_end, + variant_aa_interval_start=variant_aa_interval_start, + variant_aa_interval_end=variant_aa_interval_end, + num_mismatches=num_mismatches, + n_variant_reads=n_total_variant_reads) + + return ProteinSequence( + translations=[translation] * n_translations) \ No newline at end of file diff --git a/test/mock_read_data.py b/test/mock_read_data.py deleted file mode 100644 index 8d48d09..0000000 --- a/test/mock_read_data.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function, division, absolute_import - -import pysam - -class DummyPileupElement(object): - def __init__(self, alignment, is_refskip, is_del): - self.alignment = alignment - self.is_del = is_del - self.is_refskip = is_refskip - - -class DummyPileupColumn(object): - def __init__(self, pos, reads, is_del=False, is_refskip=False): - self.pos = pos - self.pileups = [ - DummyPileupElement(read, is_del=is_del, is_refskip=is_refskip) - for read in reads] - - -class DummySamFile(object): - """ - Used instead of real AlignmentFile objects for test. 
- """ - def __init__(self, reads): - self.reads = reads - - def fetch(self, *args, **kwargs): - return self.reads - - def pileup(self, chromosome, start, end): - for i in range(start, end + 1): - yield DummyPileupColumn(pos=i, reads=self.reads) - - -def make_read(seq, cigar, mdtag=None, name="dummy", mapq=10, baseq=30): - read = pysam.AlignedSegment() - read.seq = seq - read.cigarstring = cigar - if mdtag: - read.set_tag("MD", mdtag) - read.qname = name - read.mapq = mapq - qualities_string = pysam.qualities_to_qualitystring([baseq] * len(seq)) - qualities_bytes = qualities_string.encode("ascii") - read.qual = qualities_bytes - return read diff --git a/test/test_alignment_score.py b/test/test_alignment_score.py new file mode 100644 index 0000000..4deca4c --- /dev/null +++ b/test/test_alignment_score.py @@ -0,0 +1,30 @@ +from isovar.alignment_score import alignment_score +from nose.tools import eq_ + +def test_alignment_score_same_strings(): + eq_(alignment_score("", ""), 0) + eq_(alignment_score("a", "a"), 0) + eq_(alignment_score("WUZZLE", "WUZZLE"), 0) + +def test_alignment_score_strings_of_different_length(): + eq_(alignment_score("ab", "a"), 1) + eq_(alignment_score("a", "ab"), 1) + eq_(alignment_score("WUZZLE", "WUZZLE?"), 1) + eq_(alignment_score("SNUZZLE", "UZZLE"), 2) + + +def test_alignment_score_totally_different_strings(): + eq_(alignment_score("", "a"), 1) + eq_(alignment_score("", "ab"), 2) + eq_(alignment_score("WUZZLE", "HEAVY"), 6) + eq_(alignment_score("DOG", "CATCATCAT"), 9) + eq_(alignment_score("a", ""), 1) + eq_(alignment_score("ab", ""), 2) + eq_(alignment_score("WUZZLE", "HEAVY"), 6) + eq_(alignment_score("CATCATCAT", "DOG"), 9) + + +def test_alignment_min_subsequence_length(): + # if matching subsequence isn't long enough then misalignment score + # will be sum of two sequence lengths + eq_(alignment_score("aaa", "aaa", min_subsequence_length=10), 6) diff --git a/test/test_allele_counts.py b/test/test_allele_counts.py index 5053284..35b64ce 100644 --- a/test/test_allele_counts.py +++ b/test/test_allele_counts.py @@ -1,22 +1,30 @@ -from isovar.allele_counts import allele_counts_dataframe -from isovar.allele_reads import AlleleRead +from isovar.dataframe_helpers import allele_counts_dataframe +from isovar.allele_read import AlleleRead +from isovar.read_evidence import ReadEvidence from varcode import Variant from nose.tools import eq_ def test_allele_count_dataframe(): variant = Variant("test_contig", 50, "C", "G") - reads = [ - AlleleRead(prefix="AAA", allele="C", suffix="TTT", name="C1"), - AlleleRead(prefix="AAC", allele="C", suffix="TTA", name="C2"), - AlleleRead(prefix="AAA", allele="G", suffix="TTT", name="G1"), - ] - df = allele_counts_dataframe([(variant, reads)]) + read_evidence = ReadEvidence( + trimmed_base1_start=50, + trimmed_ref="C", + trimmed_alt="G", + ref_reads=[ + AlleleRead(prefix="AAA", allele="C", suffix="TTT", name="C1"), + AlleleRead(prefix="AAC", allele="C", suffix="TTA", name="C2"), + ], + alt_reads=[ + AlleleRead(prefix="AAA", allele="G", suffix="TTT", name="G1") + ], + other_reads=[]) + df = allele_counts_dataframe([(variant, read_evidence)]) assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df,) row = df.iloc[0] - eq_(row.n_ref, 2) - eq_(row.n_alt, 1) - eq_(row.n_other, 0) + eq_(row.num_ref_reads, 2) + eq_(row.num_alt_reads, 1) + eq_(row.num_other_reads, 0) if __name__ == "__main__": diff --git a/test/test_allele_reads.py b/test/test_allele_reads.py index 1c4aab8..24fe0ff 100644 --- a/test/test_allele_reads.py +++ 
b/test/test_allele_reads.py @@ -1,8 +1,9 @@ -from isovar.allele_reads import AlleleRead -from isovar.locus_reads import LocusRead +from isovar.allele_read import AlleleRead +from isovar.locus_read import LocusRead from nose.tools import eq_ + def make_read_at_locus(prefix, alt, suffix, base_quality=30, name="dummy"): dummy_sequence = prefix + alt + suffix return LocusRead( @@ -10,13 +11,15 @@ def make_read_at_locus(prefix, alt, suffix, base_quality=30, name="dummy"): sequence=dummy_sequence, reference_positions=list(range(1, len(dummy_sequence) + 1)), quality_scores=[base_quality] * len(dummy_sequence), - base0_read_position_before_variant=len(prefix) - 1, - base0_read_position_after_variant=len(prefix) + len(alt), - ) + read_base0_start_inclusive=len(prefix), + read_base0_end_exclusive=len(prefix) + len(alt), + reference_base0_start_inclusive=len(prefix), + reference_base0_end_exclusive=len(prefix) + len(alt)) + def test_allele_read_from_single_read_at_locus_trim_N_nucleotides(): read_at_locus = make_read_at_locus(prefix="NCCN", alt="A", suffix="TNNA") - allele_read = AlleleRead.from_locus_read(read_at_locus, n_ref=1) + allele_read = AlleleRead.from_locus_read(read_at_locus) print(allele_read) expected = AlleleRead(prefix="", allele="A", suffix="T", name="dummy") eq_(allele_read, expected) diff --git a/test/test_assembly.py b/test/test_assembly.py index cdb0788..10d6fb6 100644 --- a/test/test_assembly.py +++ b/test/test_assembly.py @@ -1,26 +1,10 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- from __future__ import print_function, division, absolute_import from time import time -from isovar.variant_reads import reads_supporting_variant -from isovar.variant_sequences import ( - initial_variant_sequences_from_reads, - VariantSequence -) -from isovar.allele_reads import AlleleRead +from isovar.read_collector import ReadCollector +from isovar.variant_sequence import VariantSequence +from isovar.variant_sequence_helpers import initial_variant_sequences_from_reads +from isovar.allele_read import AlleleRead from isovar.assembly import ( iterative_overlap_assembly, greedy_merge, @@ -33,8 +17,9 @@ from testing_helpers import load_bam + def test_assemble_transcript_fragments_snv(): - samfile = load_bam("data/cancer-wgs-primary.chr12.bam") + alignment_file = load_bam("data/cancer-wgs-primary.chr12.bam") chromosome = "chr12" base1_location = 65857041 ref = "G" @@ -45,10 +30,10 @@ def test_assemble_transcript_fragments_snv(): ref=ref, alt=alt, ensembl=ensembl_grch38) - variant_reads = reads_supporting_variant( + read_creator = ReadCollector() + variant_reads = read_creator.allele_reads_supporting_variant( variant=variant, - samfile=samfile, - chromosome=chromosome,) + alignment_file=alignment_file) sequences = iterative_overlap_assembly( initial_variant_sequences_from_reads(variant_reads), @@ -71,6 +56,7 @@ def test_assemble_transcript_fragments_snv(): "Expected assembled sequences to be longer than read length (%d)" % ( max_read_length,) + def test_assembly_of_simple_sequence_from_mock_reads(): # Read sequences: # AAAAA|CC|TTTTT @@ -121,6 +107,7 @@ def test_assembly_of_simple_sequence_from_mock_reads(): expected_mean_coverage = (2 * 1 + 2 * 3 + 10 * 4) / 14 eq_(assembled_variant_sequence.mean_coverage(), expected_mean_coverage) + def test_collapse_substrings(): # AAA|C|GGG vs_longer = VariantSequence( @@ -175,6 +162,7 @@ def test_assembly_of_many_subsequences(): result_decoy = results[1] eq_(result_decoy.sequence, decoy.sequence) + def test_assembly_time(): original_prefix = "ACTGAACCTTGGAAACCCTTTGGG" original_allele = "CCCTTT" @@ -206,6 +194,7 @@ def test_assembly_time(): "Expected assembly of 400 sequences to take less than 100ms: %0.4fms" % ( t_elapsed * 1000,) + def test_assembly_unrelated_sequences(): # 2 overlapping sequences, 1 with a different suffix, # and 2 totally unrelated sequences @@ -252,9 +241,11 @@ def test_assembly_unrelated_sequences(): eq_(3, count_singleton) eq_(1, count_multiple) + def test_assembly_no_sequences(): eq_(iterative_overlap_assembly([]), []) + def test_assembly_1_sequence(): vs = VariantSequence( prefix="CCC", diff --git a/test/test_cli.py b/test/test_cli.py index 0a789c0..b67e8a4 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -11,7 +11,7 @@ from isovar.cli.isovar_reference_contexts import run as isovar_reference_contexts from isovar.cli.isovar_variant_reads import run as isovar_variant_reads from isovar.cli.isovar_variant_sequences import run as isovar_variant_sequences - +from isovar.cli.isovar_main import run as isovar_main vcf_args = [ "--vcf", @@ -64,3 +64,6 @@ def test_cli_variant_reads(): def test_cli_variant_sequences(): run_cli_fn(isovar_variant_sequences) + +def test_cli_main(): + run_cli_fn(isovar_main) \ No newline at end of file diff --git a/test/test_dataframe_builder.py b/test/test_dataframe_builder.py index 1793db4..f1c7e04 100644 --- a/test/test_dataframe_builder.py +++ b/test/test_dataframe_builder.py @@ -15,14 +15,16 @@ from __future__ import print_function, division, absolute_import from collections import namedtuple, 
OrderedDict from nose.tools import eq_ -from isovar.dataframe_builder import DataFrameBuilder from varcode import Variant import pandas as pd +from isovar.dataframe_builder import DataFrameBuilder + TestClass = namedtuple("TestClass", "a b c") test_obj = TestClass(a=1, b="s", c=3.0) test_variant = Variant("X", 10, "CC", "C") + def check_same_dataframes(df, expected): eq_(len(df.columns), len(expected.columns)) assert all(x == y for (x, y) in zip(df.columns, expected.columns)), \ @@ -47,6 +49,7 @@ def test_dataframe_builder(): ])) check_same_dataframes(df, expected) + def test_dataframe_builder_rename(): df_builder = DataFrameBuilder( TestClass, @@ -64,6 +67,7 @@ def test_dataframe_builder_rename(): ])) check_same_dataframes(df, expected) + def test_dataframe_rename_and_converters(): df_builder = DataFrameBuilder( TestClass, @@ -82,6 +86,7 @@ def test_dataframe_rename_and_converters(): ])) check_same_dataframes(df, expected) + def test_dataframe_rename_and_converters_and_exclude(): df_builder = DataFrameBuilder( TestClass, diff --git a/test/test_dna_helpers.py b/test/test_dna_helpers.py index 859f18a..890957c 100644 --- a/test/test_dna_helpers.py +++ b/test/test_dna_helpers.py @@ -1,9 +1,10 @@ from isovar.dna import reverse_complement_dna, complement_dna from nose.tools import eq_ + def test_reverse_complement_dna(): - eq_("ATGCAATTGGCC", - reverse_complement_dna("GGCCAATTGCAT")) + eq_("ATGCAATTGGCC", reverse_complement_dna("GGCCAATTGCAT")) + def test_complement_dna(): eq_("ATGC", complement_dna("TACG")) diff --git a/test/test_filtering.py b/test/test_filtering.py new file mode 100644 index 0000000..b435e50 --- /dev/null +++ b/test/test_filtering.py @@ -0,0 +1,56 @@ +from isovar.filtering import apply_filters + +class MockIsovarResult(object): + """ + Mock object which has whatever properties we give it but can only + be cloned with new filter_values passed into to clone_with_updates by + apply_filters. 
+ """ + def __init__(self, filter_values={}, **kwargs): + self._init_args = kwargs + self.filter_values = filter_values + for (k, v) in kwargs.items(): + setattr(self, k, v) + + def clone_with_updates(self, filter_values): + return MockIsovarResult(filter_values=filter_values, **self._init_args) + +def test_apply_filters_min_pass(): + obj = MockIsovarResult(x=1) + new_obj = apply_filters(obj, filter_thresholds={"min_x": 1}) + assert new_obj.filter_values["min_x"] + +def test_apply_filters_min_fail(): + obj = MockIsovarResult(x=1) + new_obj = apply_filters(obj, filter_thresholds={"min_x": 2}) + assert not new_obj.filter_values["min_x"] + +def test_apply_filters_max_pass(): + obj = MockIsovarResult(x=1) + new_obj = apply_filters(obj, filter_thresholds={"max_x": 1}) + assert new_obj.filter_values["max_x"] + +def test_apply_filters_max_fail(): + obj = MockIsovarResult(x=2) + new_obj = apply_filters(obj, filter_thresholds={"max_x": 1}) + assert not new_obj.filter_values["max_x"] + +def test_apply_filters_bool_pass(): + obj = MockIsovarResult(x=True) + new_obj = apply_filters(obj, filter_flags=["x"]) + assert new_obj.filter_values["x"] + +def test_apply_filters_bool_fail(): + obj = MockIsovarResult(x=False) + new_obj = apply_filters(obj, filter_flags=["x"]) + assert not new_obj.filter_values["x"] + +def test_apply_filters_negated_bool_pass(): + obj = MockIsovarResult(x=False) + new_obj = apply_filters(obj, filter_flags=["not_x"]) + assert new_obj.filter_values["not_x"] + +def test_apply_filters_negated_bool_fail(): + obj = MockIsovarResult(x=True) + new_obj = apply_filters(obj, filter_flags=["not_x"]) + assert not new_obj.filter_values["not_x"] \ No newline at end of file diff --git a/test/test_gather_variant_reads_wgs.py b/test/test_gather_variant_reads_wgs.py index 79c0874..08dee79 100644 --- a/test/test_gather_variant_reads_wgs.py +++ b/test/test_gather_variant_reads_wgs.py @@ -1,20 +1,5 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- from __future__ import print_function, division, absolute_import -from isovar.variant_reads import reads_supporting_variant from pyensembl import ensembl_grch38 from varcode import Variant @@ -22,8 +7,11 @@ from testing_helpers import load_bam +from isovar import ReadCollector + + def test_partition_variant_reads_snv(): - samfile = load_bam("data/cancer-wgs-primary.chr12.bam") + alignment_file = load_bam("data/cancer-wgs-primary.chr12.bam") chromosome = "chr12" base1_location = 65857041 ref = "G" @@ -34,16 +22,18 @@ def test_partition_variant_reads_snv(): ref=ref, alt=alt, ensembl=ensembl_grch38) - variant_reads = reads_supporting_variant( - samfile=samfile, - chromosome=chromosome, + read_collector = ReadCollector() + read_evidence = read_collector.read_evidence_for_variant( + alignment_file=alignment_file, variant=variant) - assert len(variant_reads) > 1 - for variant_read in variant_reads: + alt_reads = read_evidence.alt_reads + assert len(alt_reads) > 1 + for variant_read in alt_reads: eq_(variant_read.allele, alt) + def test_partition_variant_reads_deletion(): - samfile = load_bam("data/cancer-wgs-primary.chr12.bam") + alignment_file = load_bam("data/cancer-wgs-primary.chr12.bam") chromosome = "chr12" base1_location = 70091490 ref = "TTGTAGATGCTGCCTCTCC" @@ -54,12 +44,12 @@ def test_partition_variant_reads_deletion(): ref=ref, alt=alt, ensembl=ensembl_grch38) - variant_reads = reads_supporting_variant( - samfile=samfile, - chromosome=chromosome, + read_collector = ReadCollector() + read_evidence = read_collector.read_evidence_for_variant( + alignment_file=alignment_file, variant=variant) - assert len(variant_reads) > 1 - for variant_read in variant_reads: + assert len(read_evidence.alt_reads) > 1 + for variant_read in read_evidence.alt_reads: eq_(variant_read.allele, alt) if __name__ == "__main__": diff --git a/test/test_genetic_code.py b/test/test_genetic_code.py index bc1ceba..ad877ab 100644 --- a/test/test_genetic_code.py +++ b/test/test_genetic_code.py @@ -1,29 +1,36 @@ +from __future__ import print_function, division, absolute_import + from isovar.genetic_code import translate_cdna from nose.tools import eq_ -from pyensembl import ensembl_grch38 +from genomes_for_testing import grch38 def test_translate_cdna_no_stop_codon(): eq_(translate_cdna("ATGATG", first_codon_is_start=False), ("MM", False)) + def test_translate_cdna_stop_codon(): eq_(translate_cdna("ATGATGTAG", first_codon_is_start=False), ("MM", True)) + def test_translate_cdna_alternate_CTG_start(): eq_(translate_cdna("CTGCTG", first_codon_is_start=True), ("ML", False)) + def test_translate_cdna_CTG_after_start(): eq_(translate_cdna("CTGCTG", first_codon_is_start=False), ("LL", False)) + def test_TP53_translation_from_cdna(): - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] cdna = tp53_001.coding_sequence amino_acids, ends_with_stop_codon = translate_cdna(cdna, first_codon_is_start=True) assert ends_with_stop_codon eq_(amino_acids, tp53_001.protein_sequence) + def test_mitochondrial_MTND5_translation_from_cdna(): - mtnd5_001 = ensembl_grch38.transcripts_by_name("MT-ND5-201")[0] + mtnd5_001 = grch38.transcripts_by_name("MT-ND5-201")[0] cdna = mtnd5_001.coding_sequence amino_acids, ends_with_stop_codon = translate_cdna( cdna, diff --git a/test/test_isovar_result.py b/test/test_isovar_result.py new file mode 100644 index 0000000..84517ed --- /dev/null +++ b/test/test_isovar_result.py @@ -0,0 +1,56 @@ +from isovar import run_isovar +from isovar import 
ProteinSequence +from varcode import Variant +from testing_helpers import data_path + + +def test_isovar_result_property_types(): + for result in run_isovar( + variants=data_path("data/b16.f10/b16.vcf"), + alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam")): + # variant + assert type(result.variant) is Variant + + # counts of genes and transcripts from variant + assert type(result.num_overlapping_genes) is int + assert type(result.num_overlapping_coding_genes) is int + assert type(result.num_overlapping_transcripts) is int + assert type(result.num_overlapping_coding_transcripts) is int + + # protein sequence + assert type(result.top_protein_sequence) in (type(None), ProteinSequence) + + # counts of genes and transcripts from protein sequences + assert type(result.num_genes_from_protein_sequences) is int + assert type(result.num_genes_from_top_protein_sequence) is int + assert type(result.num_transcripts_from_protein_sequences) is int + assert type(result.num_transcripts_from_top_protein_sequence) is int + + # read and fragment counts + assert type(result.num_ref_reads) is int + assert type(result.num_alt_reads) is int + assert type(result.num_other_reads) is int + assert type(result.num_ref_fragments) is int + assert type(result.num_alt_fragments) is int + assert type(result.num_other_fragments) is int + + # read and fragment fractions + assert type(result.fraction_ref_reads) is float + assert type(result.fraction_alt_reads) is float + assert type(result.fraction_other_reads) is float + assert type(result.fraction_ref_fragments) is float + assert type(result.fraction_alt_fragments) is float + assert type(result.fraction_other_fragments) is float + + # read and fragment count ratios + assert type(result.ratio_alt_to_other_reads) is float + assert type(result.ratio_alt_to_other_fragments) is float + assert type(result.ratio_other_to_alt_fragments) is float + assert type(result.ratio_other_to_alt_reads) is float + assert type(result.ratio_ref_to_other_fragments) is float + assert type(result.ratio_other_to_ref_fragments) is float + assert type(result.ratio_other_to_ref_reads) is float + + # this property aggregates all filters + assert result.passes_all_filters in {True, False} + diff --git a/test/test_locus_reads.py b/test/test_locus_reads.py index 0e0266f..86856b7 100644 --- a/test/test_locus_reads.py +++ b/test/test_locus_reads.py @@ -1,30 +1,15 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- from __future__ import print_function, division, absolute_import from nose.tools import eq_ from varcode import Variant -from isovar.locus_reads import ( - LocusRead, - locus_read_generator, - locus_reads_dataframe, -) +from isovar.locus_read import LocusRead +from isovar.dataframe_helpers import locus_reads_dataframe +from isovar.read_collector import ReadCollector -from mock_read_data import DummySamFile, make_read +from mock_objects import MockAlignmentFile, make_pysam_read from testing_helpers import assert_equal_fields, load_bam, data_path + def test_locus_reads_snv(): """ test_partitioned_read_sequences_snv : Test that read gets correctly @@ -39,14 +24,16 @@ def test_locus_reads_snv(): alt="G", normalize_contig_name=False) - pysam_read = make_read(seq="ACCGTG", cigar="6M", mdtag="3G2") + pysam_read = make_pysam_read( + seq="ACCGTG", + cigar="6M", + mdtag="3G2") - samfile = DummySamFile(reads=[pysam_read]) - reads = list(locus_read_generator( - samfile=samfile, - chromosome="chromosome", - base1_position_before_variant=variant.start - 1, - base1_position_after_variant=variant.start + 1)) + samfile = MockAlignmentFile( + references=("chromosome",), + reads=[pysam_read]) + read_creator = ReadCollector() + reads = read_creator.get_locus_reads(samfile, "chromosome", variant.start - 1, variant.start) print(reads) assert len(reads) == 1, \ "Expected to get back one read but instead got %d" % ( @@ -57,10 +44,13 @@ def test_locus_reads_snv(): sequence=pysam_read.query_sequence, reference_positions=[0, 1, 2, 3, 4, 5], quality_scores=pysam_read.query_qualities, - base0_read_position_before_variant=2, - base0_read_position_after_variant=4) + reference_base0_start_inclusive=3, + reference_base0_end_exclusive=4, + read_base0_start_inclusive=3, + read_base0_end_exclusive=4) assert_equal_fields(read, expected) + def test_locus_reads_insertion(): """ test_partitioned_read_sequences_insertion : Test that read gets correctly @@ -71,14 +61,17 @@ def test_locus_reads_insertion(): variant = Variant( "chromosome", 4, ref="T", alt="TG", normalize_contig_name=False) - pysam_read = make_read(seq="ACCTGTG", cigar="4M1I2M", mdtag="6") + pysam_read = make_pysam_read(seq="ACCTGTG", cigar="4M1I2M", mdtag="6") - samfile = DummySamFile(reads=[pysam_read]) - reads = list(locus_read_generator( - samfile=samfile, - chromosome="chromosome", - base1_position_before_variant=variant.start, - base1_position_after_variant=variant.start + 1)) + samfile = MockAlignmentFile( + references={"chromosome"}, + reads=[pysam_read]) + read_creator = ReadCollector() + reads = read_creator.get_locus_reads( + samfile, + "chromosome", + variant.start, + variant.start) print(reads) assert len(reads) == 1, \ "Expected to get back one read but instead got %d" % ( @@ -91,10 +84,15 @@ def test_locus_reads_insertion(): # ref position reference_positions=[0, 1, 2, 3, None, 4, 5], quality_scores=pysam_read.query_qualities, - base0_read_position_before_variant=3, - base0_read_position_after_variant=5) + read_base0_start_inclusive=4, + read_base0_end_exclusive=5, + reference_base0_start_inclusive=4, + reference_base0_end_exclusive=4) + print("Actual: %s" % (read,)) + print("Expected: %s" % (expected,)) assert_equal_fields(read, expected) + def test_locus_reads_deletion(): """ test_partitioned_read_sequences_deletion : Test that read gets correctly @@ -106,14 +104,17 @@ def test_locus_reads_deletion(): variant = Variant( "chromosome", 4, ref="TT", alt="T", normalize_contig_name=False) print(variant) - pysam_read = make_read(seq="ACCTG", 
cigar="4M1D1M", mdtag="4^T1") - - samfile = DummySamFile(reads=[pysam_read]) - reads = list(locus_read_generator( - samfile=samfile, - chromosome="chromosome", - base1_position_before_variant=variant.start - 1, - base1_position_after_variant=variant.start + 1)) + pysam_read = make_pysam_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1") + + samfile = MockAlignmentFile( + references={"chromosome"}, + reads=[pysam_read]) + read_creator = ReadCollector() + reads = read_creator.get_locus_reads( + samfile, + "chromosome", + variant.start - 1, + variant.start) print(reads) assert len(reads) == 1, \ "Expected to get back one read but instead got %d" % ( @@ -124,10 +125,14 @@ def test_locus_reads_deletion(): sequence=pysam_read.query_sequence, reference_positions=[0, 1, 2, 3, 5], quality_scores=pysam_read.query_qualities, - base0_read_position_before_variant=3, - base0_read_position_after_variant=4) + # missing would have gone after 4th nucleotide in the read + read_base0_start_inclusive=4, + read_base0_end_exclusive=4, + reference_base0_start_inclusive=4, + reference_base0_end_exclusive=5) assert_equal_fields(read, expected) + def test_locus_reads_substitution_longer(): # test C>GG subsitution at second nucleotide of reference sequence "ACCTTG", # the alignment is interpreted as a C>G variant followed by an insertion of @@ -135,14 +140,17 @@ def test_locus_reads_substitution_longer(): variant = Variant( "chromosome", 2, ref="C", alt="GG", normalize_contig_name=False) print(variant) - pysam_read = make_read(seq="AGGCTTG", cigar="2M1I4M", mdtag="1C4") - - samfile = DummySamFile(reads=[pysam_read]) - reads = list(locus_read_generator( - samfile=samfile, - chromosome="chromosome", - base1_position_before_variant=1, - base1_position_after_variant=3)) + pysam_read = make_pysam_read(seq="AGGCTTG", cigar="2M1I4M", mdtag="1C4") + + samfile = MockAlignmentFile( + references={"chromosome"}, + reads=[pysam_read]) + read_creator = ReadCollector() + reads = read_creator.get_locus_reads( + samfile, + "chromosome", + 1, + 2) print(reads) assert len(reads) == 1, \ "Expected to get back one read but instead got %d" % ( @@ -153,10 +161,13 @@ def test_locus_reads_substitution_longer(): sequence=pysam_read.query_sequence, reference_positions=[0, 1, None, 2, 3, 4, 5], quality_scores=pysam_read.query_qualities, - base0_read_position_before_variant=0, - base0_read_position_after_variant=3) + read_base0_start_inclusive=1, + read_base0_end_exclusive=3, + reference_base0_start_inclusive=1, + reference_base0_end_exclusive=2) assert_equal_fields(read, expected) + def test_locus_reads_substitution_shorter(): # test CC>G subsitution at 2nd and 3rd nucleotides of reference sequence # "ACCTTG", for which the alignment is interpreted as a C>G variant @@ -164,14 +175,17 @@ def test_locus_reads_substitution_shorter(): variant = Variant( "chromosome", 2, ref="CC", alt="G", normalize_contig_name=False) print(variant) - pysam_read = make_read(seq="AGTTG", cigar="2M1D3M", mdtag="1C^C4") - - samfile = DummySamFile(reads=[pysam_read]) - reads = list(locus_read_generator( - samfile=samfile, - chromosome="chromosome", - base1_position_before_variant=1, - base1_position_after_variant=4)) + pysam_read = make_pysam_read(seq="AGTTG", cigar="2M1D3M", mdtag="1C^C4") + + samfile = MockAlignmentFile( + references={"chromosome"}, + reads=[pysam_read]) + read_creator = ReadCollector() + reads = read_creator.get_locus_reads( + samfile, + "chromosome", + 1, + 3) assert len(reads) == 1, \ "Expected to get back one read but instead got %d" % ( len(reads),) 
@@ -182,10 +196,13 @@ def test_locus_reads_substitution_shorter(): sequence=pysam_read.query_sequence, reference_positions=[0, 1, 3, 4, 5], quality_scores=pysam_read.query_qualities, - base0_read_position_before_variant=0, - base0_read_position_after_variant=2) + read_base0_start_inclusive=1, + read_base0_end_exclusive=2, + reference_base0_start_inclusive=1, + reference_base0_end_exclusive=3) assert_equal_fields(read, expected) + def test_locus_reads_dataframe(): sam_all_variants = load_bam("data/b16.f10/b16.combined.bam") @@ -208,9 +225,9 @@ def test_locus_reads_dataframe(): print("Found %d sequences in %s" % (n_reads_expected, sam_path_single_variant)) df = locus_reads_dataframe( - samfile=sam_all_variants, + alignments=sam_all_variants, chromosome="chr4", - base1_position_before_variant=45802538, - base1_position_after_variant=45802540) + base0_start=45802538, + base0_end=45802539) print(df) eq_(len(df), n_reads_expected) diff --git a/test/test_main.py b/test/test_main.py new file mode 100644 index 0000000..be35b48 --- /dev/null +++ b/test/test_main.py @@ -0,0 +1,15 @@ +from isovar import run_isovar, isovar_results_to_dataframe +from nose.tools import eq_ +from testing_helpers import data_path + +def test_isovar_main_to_dataframe(): + results = run_isovar( + variants=data_path("data/b16.f10/b16.vcf"), + alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam")) + df = isovar_results_to_dataframe(results) + print(df) + eq_(len(df), 4) + # B16 test data has 2/4 variants with enough coverage + # to translate protein sequences + eq_(df["pass"].sum(), 2) + diff --git a/test/test_mutant_amino_acids_in_protein_sequence.py b/test/test_mutant_amino_acids_in_protein_sequence.py index ef829ef..c7f4217 100644 --- a/test/test_mutant_amino_acids_in_protein_sequence.py +++ b/test/test_mutant_amino_acids_in_protein_sequence.py @@ -1,20 +1,5 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- from __future__ import absolute_import, print_function, division - from nose.tools import eq_ from isovar.cli.protein_sequence_args import ( make_protein_sequences_arg_parser, @@ -44,6 +29,7 @@ def check_mutant_amino_acids(variant, protein_sequence, expected_amino_acids=Non protein_sequence.variant_aa_interval_start, protein_sequence.variant_aa_interval_end)) + def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I(): # there are two co-occurring variants in the RNAseq data but since # they don't happen in the same codon then we're considering the Varcode @@ -61,6 +47,7 @@ def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I(): protein_sequence = protein_sequences[0] check_mutant_amino_acids(variant, protein_sequence) + def test_mutant_amino_acids_in_mm10_chr9_82927102_refG_altT_pT441H(): # the variant chr9:82927102 G>T occurs right next to T>G so the varcode # prediction for the protein sequence (Asparagine) will be wrong since diff --git a/test/test_nucleotide_counts.py b/test/test_nucleotide_counts.py deleted file mode 100644 index 15815cc..0000000 --- a/test/test_nucleotide_counts.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function, division, absolute_import - -from isovar.nucleotide_counts import most_common_nucleotides -from isovar.variant_reads import reads_supporting_variant - -from varcode import Variant -from pyensembl import ensembl_grch38 -from nose.tools import eq_ - -from testing_helpers import load_bam - -def test_most_common_nucleotides_for_chr12_deletion(): - samfile = load_bam("data/cancer-wgs-primary.chr12.bam") - chromosome = "chr12" - base1_location = 70091490 - ref = "TTGTAGATGCTGCCTCTCC" - alt = "" - variant = Variant( - chromosome, - base1_location, - ref, - alt, - ensembl=ensembl_grch38) - variant_reads = reads_supporting_variant( - samfile=samfile, - chromosome=chromosome, - variant=variant) - consensus_sequence, chosen_counts, other_counts = most_common_nucleotides( - variant_reads) - print(chosen_counts) - print(other_counts) - eq_(len(chosen_counts), len(consensus_sequence)) - eq_(len(other_counts), len(consensus_sequence)) - assert other_counts.sum() < chosen_counts.sum(), \ - "Counts for alternate nucleotides should not exceed the chosen sequence" - - number_matching_reads = 0 - for variant_read in variant_reads: - full_seq = variant_read.prefix + variant_read.allele + variant_read.suffix - number_matching_reads += (full_seq in consensus_sequence) - fraction_matching_reads = number_matching_reads / float(len(variant_reads)) - print("Fraction matching reads is %d/%d = %f" % ( - number_matching_reads, len(variant_reads), fraction_matching_reads)) - assert fraction_matching_reads > 0.5, \ - "Expected majority of reads to match consensus sequence" diff --git a/test/test_protein_sequences.py b/test/test_protein_sequences.py index 279040d..a98bc97 100644 --- a/test/test_protein_sequences.py +++ 
b/test/test_protein_sequences.py @@ -1,127 +1,22 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from __future__ import print_function, division, absolute_import from nose.tools import eq_ -from isovar.translation import Translation +from testing_helpers import load_bam, load_vcf, data_path +from mock_objects import make_dummy_protein_sequence + +from varcode import VariantCollection +from isovar.read_collector import ReadCollector from isovar.cli.protein_sequence_args import ( protein_sequences_dataframe_from_args, make_protein_sequences_arg_parser, ) -from isovar.protein_sequences import ( - ProteinSequence, - sort_protein_sequences, - reads_generator_to_protein_sequences_generator, - protein_sequences_generator_to_dataframe, -) -from isovar.allele_reads import reads_overlapping_variants -from isovar.variant_sequence_in_reading_frame import VariantSequenceInReadingFrame -from varcode import VariantCollection +from isovar.dataframe_helpers import protein_sequences_generator_to_dataframe +from isovar.main import ProteinSequenceCreator +from isovar.protein_sequence_helpers import sort_protein_sequences -from testing_helpers import load_bam, load_vcf, data_path -# fields of a ProteinSequence: -# translations -# supporting_variant_reads -# total_variant_reads -# supporting_transcripts -# total_transcripts -# gene - - -def make_dummy_translation( - amino_acids="MKHW", # ATG=M|AAA=K|CAC=H|TGG=W - cdna_sequence="CCCATGAAACACTGGTAG", - offset_to_first_complete_codon=3, - variant_cdna_interval_start=8, # assuming variant was AAC>AAA - variant_cdna_interval_end=9, - variant_aa_interval_start=1, - variant_aa_interval_end=2, - number_mismatches=1): - varseq_in_orf = VariantSequenceInReadingFrame( - cdna_sequence=cdna_sequence, - offset_to_first_complete_codon=offset_to_first_complete_codon, - variant_cdna_interval_start=variant_cdna_interval_start, - variant_cdna_interval_end=variant_cdna_interval_end, - reference_cdna_sequence_before_variant=cdna_sequence[:variant_cdna_interval_start], - reference_cdna_sequence_after_variant=cdna_sequence[variant_cdna_interval_end:], - number_mismatches_before_variant=number_mismatches, - number_mismatches_after_variant=0) - return Translation( - variant_sequence_in_reading_frame=varseq_in_orf, - amino_acids=amino_acids, - variant_aa_interval_start=variant_aa_interval_start, - variant_aa_interval_end=variant_aa_interval_end, - frameshift=False, - ends_with_stop_codon=False, - untrimmed_variant_sequence=None, - reference_context=None) - - -def make_dummy_protein_sequence( - n_supporting_variant_reads, - n_supporting_variant_sequences, - n_supporting_reference_transcripts, - n_total_variant_sequences=None, - n_total_variant_reads=None, - n_total_reference_transcripts=None, - gene=["TP53"], - amino_acids="MKHW", # ATG=M|AAA=K|CAC=H|TGG=W - cdna_sequence="CCCATGAAACACTGGTAG", - variant_cdna_interval_start=8, # assuming variant was AAC>AAA - variant_cdna_interval_end=9, - variant_aa_interval_start=1, - 
variant_aa_interval_end=2, - number_mismatches=1): - """ - Creates ProteinSequence object with None filled in for most fields - """ - if n_total_variant_reads is None: - n_total_variant_reads = n_supporting_variant_reads - - if n_total_variant_sequences is None: - n_total_variant_sequences = n_supporting_variant_sequences - - if n_total_reference_transcripts is None: - n_total_reference_transcripts = n_total_reference_transcripts - - assert n_supporting_variant_sequences <= n_supporting_variant_reads - assert n_supporting_variant_sequences <= n_total_variant_sequences - assert n_supporting_reference_transcripts <= n_total_reference_transcripts - - n_translations = n_total_reference_transcripts * n_total_variant_sequences - - translation = make_dummy_translation() - - return ProteinSequence( - translations=[translation] * n_translations, - overlapping_reads=[None] * n_total_variant_reads, - ref_reads=[], - alt_reads=[None] * n_total_variant_reads, - alt_reads_supporting_protein_sequence=[None] * n_supporting_variant_reads, - transcripts_supporting_protein_sequence=[None] * n_supporting_reference_transcripts, - transcripts_overlapping_variant=[None] * n_supporting_reference_transcripts, - gene=gene, - amino_acids=amino_acids, - variant_aa_interval_start=variant_aa_interval_start, - variant_aa_interval_end=variant_aa_interval_end, - ends_with_stop_codon=translation.ends_with_stop_codon, - frameshift=translation.frameshift) - def test_sort_protein_sequences(): protseq_most_reads = make_dummy_protein_sequence( @@ -130,32 +25,23 @@ def test_sort_protein_sequences(): n_supporting_reference_transcripts=2, n_total_variant_sequences=3, n_total_variant_reads=100, - n_total_reference_transcripts=5) + n_total_reference_transcripts=2) - protseq_most_reference_transcripts = make_dummy_protein_sequence( - n_supporting_variant_reads=40, - n_supporting_variant_sequences=1, - n_supporting_reference_transcripts=3, - n_total_variant_sequences=3, - n_total_variant_reads=100, - n_total_reference_transcripts=5) - protseq_fewest_reads_or_transcripts = make_dummy_protein_sequence( + protseq_fewest_reads = make_dummy_protein_sequence( n_supporting_variant_reads=10, n_supporting_variant_sequences=1, n_supporting_reference_transcripts=1, n_total_variant_sequences=3, n_total_variant_reads=100, - n_total_reference_transcripts=5) + n_total_reference_transcripts=1) unsorted_protein_sequences = [ - protseq_fewest_reads_or_transcripts, + protseq_fewest_reads, protseq_most_reads, - protseq_most_reference_transcripts ] expected_order = [ protseq_most_reads, - protseq_most_reference_transcripts, - protseq_fewest_reads_or_transcripts, + protseq_fewest_reads, ] eq_(sort_protein_sequences(unsorted_protein_sequences), expected_order) @@ -177,17 +63,17 @@ def variants_to_protein_sequences_dataframe( combined_variants = VariantCollection( list(expressed_variants) + list(not_expressed_variants)) - samfile = load_bam(tumor_rna_bam) - - allele_reads_generator = reads_overlapping_variants( + alignment_file = load_bam(tumor_rna_bam) + read_collector = ReadCollector(min_mapping_quality=min_mapping_quality) + read_evidence_gen = read_collector.read_evidence_generator( variants=combined_variants, - samfile=samfile, - min_mapping_quality=min_mapping_quality) + alignment_file=alignment_file) - protein_sequences_generator = reads_generator_to_protein_sequences_generator( - allele_reads_generator, + creator = ProteinSequenceCreator( max_protein_sequences_per_variant=max_protein_sequences_per_variant, 
variant_sequence_assembly=variant_sequence_assembly) + protein_sequences_generator = \ + creator.protein_sequences_from_read_evidence_generator(read_evidence_gen) df = protein_sequences_generator_to_dataframe(protein_sequences_generator) return df, expressed_variants, combined_variants @@ -220,15 +106,18 @@ def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_q # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped # if we set the minimum quality to 256 variants = load_vcf("data/b16.f10/b16.vcf") - samfile = load_bam("data/b16.f10/b16.combined.sorted.bam") - allele_reads_generator = reads_overlapping_variants( + alignment_file = load_bam("data/b16.f10/b16.combined.sorted.bam") + read_collector = ReadCollector(min_mapping_quality=256) + read_evidence_gen = read_collector.read_evidence_generator( variants=variants, - samfile=samfile, - min_mapping_quality=256) - protein_sequences_generator = reads_generator_to_protein_sequences_generator( - allele_reads_generator, - max_protein_sequences_per_variant=1) - df = protein_sequences_generator_to_dataframe(protein_sequences_generator) + alignment_file=alignment_file) + + creator = ProteinSequenceCreator( + max_protein_sequences_per_variant=1,) + protein_sequences_generator = creator.protein_sequences_from_read_evidence_generator( + read_evidence_gen) + df = protein_sequences_generator_to_dataframe( + protein_sequences_generator) print(df) eq_( len(df), diff --git a/test/test_read_helpers.py b/test/test_read_helpers.py index 9b73daa..ec3c9dd 100644 --- a/test/test_read_helpers.py +++ b/test/test_read_helpers.py @@ -1,25 +1,13 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
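
The rewritten `variants_to_protein_sequences_dataframe` helper and the MAPQ-filtering test above exercise the new object-based pipeline end to end. A minimal standalone sketch of that pipeline, assuming the repository's B16 test data and the test-only `testing_helpers` module:

```python
from isovar.read_collector import ReadCollector
from isovar.main import ProteinSequenceCreator
from isovar.dataframe_helpers import protein_sequences_generator_to_dataframe

from testing_helpers import load_bam, load_vcf  # test-only helpers

# variants and tumor RNA reads from the repository's test data
variants = load_vcf("data/b16.f10/b16.vcf")
alignment_file = load_bam("data/b16.f10/b16.combined.sorted.bam")

# collect read evidence at each variant locus; min_mapping_quality is the
# same knob the MAPQ-filtering test turns up to 256 to drop every read
read_collector = ReadCollector(min_mapping_quality=1)
read_evidence_gen = read_collector.read_evidence_generator(
    variants=variants,
    alignment_file=alignment_file)

# assemble and translate coding sequences, then flatten into a DataFrame
creator = ProteinSequenceCreator(
    max_protein_sequences_per_variant=1,
    variant_sequence_assembly=True)
protein_sequences_gen = \
    creator.protein_sequences_from_read_evidence_generator(read_evidence_gen)
df = protein_sequences_generator_to_dataframe(protein_sequences_gen)
print(df)
```
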
- from __future__ import print_function, division, absolute_import -from isovar.read_helpers import group_unique_sequences -from isovar.variant_reads import reads_supporting_variant +from isovar import ReadCollector +from isovar.allele_read_helpers import group_unique_sequences + from varcode import Variant -from pyensembl import ensembl_grch38 from testing_helpers import load_bam +from genomes_for_testing import grch38 + def test_group_unique_sequences(): samfile = load_bam("data/cancer-wgs-primary.chr12.bam") @@ -31,10 +19,10 @@ def test_group_unique_sequences(): contig=chromosome, start=base1_location, ref=ref, alt=alt, - ensembl=ensembl_grch38) - variant_reads = reads_supporting_variant( - samfile=samfile, - chromosome=chromosome, + ensembl=grch38) + read_collector = ReadCollector() + variant_reads = read_collector.allele_reads_supporting_variant( + alignment_file=samfile, variant=variant) print("%d variant reads: %s" % ( len(variant_reads), variant_reads)) diff --git a/test/test_reference_coding_sequence_key.py b/test/test_reference_coding_sequence_key.py index d5d5cda..51e3182 100644 --- a/test/test_reference_coding_sequence_key.py +++ b/test/test_reference_coding_sequence_key.py @@ -19,9 +19,10 @@ ReferenceCodingSequenceKey, ) from varcode import Variant -from pyensembl import ensembl_grch38 from nose.tools import eq_ +from genomes_for_testing import grch38 + def test_reading_frame_to_offset(): eq_(reading_frame_to_offset(0), 0) @@ -35,8 +36,8 @@ def test_sequence_key_with_reading_frame_substitution_with_five_prime_utr(): # strand we have to take the reverse complement of the variant which turns # it into CTC>GGG tp53_substitution = Variant( - "17", 7676589, "CTC", "GGG", ensembl_grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + "17", 7676589, "CTC", "GGG", grch38) + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around second codon with 10 context nucleotides: # In [51]: t.sequence[193-10:193+13] @@ -65,14 +66,15 @@ def test_sequence_key_with_reading_frame_substitution_with_five_prime_utr(): amino_acids_before_variant="M") eq_(result, expected) + def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr(): # Delete second codon of TP53-001, the surrounding context # includes nucleotides from the 5' UTR. 
Since TP53 is on the negative # strand we have to take the reverse complement of the variant which turns # it into 'CTC'>'' tp53_deletion = Variant( - "17", 7676589, "CTC", "", ensembl_grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + "17", 7676589, "CTC", "", grch38) + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around second codon with 10 context nucleotides: # In [51]: t.sequence[193-10:193+13] @@ -109,9 +111,9 @@ def test_sequence_key_with_reading_frame_insertion(): # the negative strand we have to take the reverse complement of the variant # which turns it into 'CTC'>'CTCA' tp53_insertion = Variant( - "17", 7676586, "CTC", "CTCA", ensembl_grch38) + "17", 7676586, "CTC", "CTCA", grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around boundary of 2nd/3rd codons # with 10 context nucleotides: # last 4 nt of 5' UTR: TGCC @@ -140,14 +142,15 @@ def test_sequence_key_with_reading_frame_insertion(): amino_acids_before_variant="ME") eq_(result, expected) + def test_reference_coding_sequence_key_insertion_inside_start_codon(): # insert nucleotide "C" in the middle of the start codon of TP53-001, # keeping only 1 nucleotide of context. In the reverse complement this # becomes 'T'>'TG' tp53_insertion = Variant( - "17", 7676592, "T", "TG", ensembl_grch38) + "17", 7676592, "T", "TG", grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] result = ReferenceCodingSequenceKey.from_variant_and_transcript( variant=tp53_insertion, @@ -155,11 +158,12 @@ def test_reference_coding_sequence_key_insertion_inside_start_codon(): context_size=1) assert result is None, "Expected result to be None when variant affects start codon" + def test_sequence_key_with_reading_frame_insertion_before_start_codon(): # insert nucleotide "T" before of the start codon of TP53-001, - tp53_insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38) + tp53_insertion = Variant("17", 7676593, "C", "CT", grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] result = ReferenceCodingSequenceKey.from_variant_and_transcript( variant=tp53_insertion, @@ -173,10 +177,9 @@ def test_sequence_key_with_reading_frame_insertion_context_6nt_contains_start(): # but in this test we're going to only keep enough context to see # the start codon but none of the 5' UTR. In the reverse complement this # variant becomes CTC>CTCA - tp53_insertion = Variant( - "17", 7676586, "CTC", "CTCA", ensembl_grch38) + tp53_insertion = Variant("17", 7676586, "CTC", "CTCA", grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around boundary of 2nd/3rd codons # with 6 context nucleotides: # start codon: ATG (translates to M) @@ -210,9 +213,9 @@ def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start(): # the start codon but does "overlap" it. 
In the reverse complement # this variant becomes CTC>CTCA tp53_insertion = Variant( - "17", 7676586, "CTC", "CTCA", ensembl_grch38) + "17", 7676586, "CTC", "CTCA", grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around boundary of 2nd/3rd codons # with 6 context nucleotides: # last two nt of start codon: TG @@ -246,9 +249,9 @@ def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start(): # complement this variant becomes CTC>CTCA. tp53_insertion = Variant( - "17", 7676586, "CTC", "CTCA", ensembl_grch38) + "17", 7676586, "CTC", "CTCA", grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around boundary of 2nd/3rd codons # with 6 context nucleotides: # 2nd codon: GAG (translates to E) @@ -300,6 +303,7 @@ def test_reference_sequence_key_hash_and_equality_same_objects(): eq_(repr(rcsk1), repr(rcsk2)) eq_(hash(rcsk1), hash(rcsk2)) + def test_reference_sequence_key_hash_and_equality_different_objects(): rcsk1 = ReferenceCodingSequenceKey( strand="-", @@ -327,6 +331,7 @@ def test_reference_sequence_key_hash_and_equality_different_objects(): assert repr(rcsk1) != repr(rcsk_different_strand) assert hash(rcsk1) != hash(rcsk_different_strand) + def test_reference_coding_sequence_key_around_TP53_201_variant(): # TP53-201 is an isoform of TP53 which seems to lack untranslated # regions so the sequence is: @@ -336,7 +341,7 @@ def test_reference_coding_sequence_key_around_TP53_201_variant(): # we're assuming a variant # chr17. 7,676,591 C>T which changes GAG (E) > AAG (K) - variant = Variant("chr17", 7676591, "C", "T", "GRCh38") + variant = Variant("chr17", 7676591, "C", "T", grch38) # TP53-201 transcript = variant.ensembl.transcripts_by_name("TP53-201")[0] diff --git a/test/test_reference_contexts.py b/test/test_reference_contexts.py index 5626ce2..d28525a 100644 --- a/test/test_reference_contexts.py +++ b/test/test_reference_contexts.py @@ -1,38 +1,25 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
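
The `test_reference_coding_sequence_key.py` cases above all go through `ReferenceCodingSequenceKey.from_variant_and_transcript`, which anchors the cDNA context around a variant in an annotated reading frame. A short sketch of that call, assuming the test-local `genomes_for_testing` module exposes a GRCh38 genome object:

```python
from varcode import Variant
from isovar.reference_coding_sequence_key import ReferenceCodingSequenceKey

from genomes_for_testing import grch38  # test-only genome wrapper

# TP53 is on the negative strand, so this CTC>GGG change corresponds to
# replacing the second codon (GAG) of TP53-001 with CCC
tp53_substitution = Variant("17", 7676589, "CTC", "GGG", grch38)
tp53_001 = grch38.transcripts_by_name("TP53-001")[0]

# reading-frame-aware reference context with 10 nucleotides on either side;
# returns None when no frame can be established, e.g. when the variant
# disrupts the start codon
rcsk = ReferenceCodingSequenceKey.from_variant_and_transcript(
    variant=tp53_substitution,
    transcript=tp53_001,
    context_size=10)
print(rcsk)
```
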
- from __future__ import print_function, division, absolute_import -from isovar.reference_context import ( - reference_contexts_for_variants, - variants_to_reference_contexts_dataframe, - ReferenceContext, -) from varcode import Variant, VariantCollection -from pyensembl import ensembl_grch38 + from nose.tools import eq_ +from isovar.reference_context import ReferenceContext +from isovar.reference_context_helpers import reference_contexts_generator + +from isovar.dataframe_helpers import variants_to_reference_contexts_dataframe + from testing_helpers import load_vcf +from genomes_for_testing import grch38 def test_sequence_key_with_reading_frame_substitution_on_negative_strand(): # replace second codon of TP53-001 with 'CCC' tp53_substitution = Variant( - "17", 7676589, "CTC", "GGG", ensembl_grch38) + "17", 7676589, "CTC", "GGG", grch38) variant_collection = VariantCollection([tp53_substitution]) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around second codon with 10 context nucleotides: # In [51]: t.sequence[193-10:193+13] @@ -48,17 +35,19 @@ def test_sequence_key_with_reading_frame_substitution_on_negative_strand(): # first calling without a transcript ID white to see if we get back # multiple contexts - reference_context_dict_many_transcripts = \ - reference_contexts_for_variants( + reference_contexts_gen = \ + reference_contexts_generator( variants=variant_collection, context_size=10, transcript_id_whitelist=None) - assert len(reference_context_dict_many_transcripts) == 1, \ + reference_contexts_dict = dict(reference_contexts_gen) + + assert len(reference_contexts_dict) == 1, \ "Dictionary should have only one variant but got %d keys" % ( - len(reference_context_dict_many_transcripts),) + len(reference_contexts_dict),) - reference_contexts = reference_context_dict_many_transcripts[tp53_substitution] + reference_contexts = reference_contexts_dict[tp53_substitution] assert len(reference_contexts) > 1, \ "Expected multiple reference contexts for %s but got %d: %s" % ( @@ -67,10 +56,10 @@ def test_sequence_key_with_reading_frame_substitution_on_negative_strand(): reference_contexts) reference_context_dict_single_transcript = \ - reference_contexts_for_variants( + dict(reference_contexts_generator( variants=variant_collection, context_size=10, - transcript_id_whitelist={tp53_001.id}) + transcript_id_whitelist={tp53_001.id})) # still only expect one variant key eq_(len(reference_context_dict_single_transcript), 1) @@ -101,7 +90,8 @@ def test_sequence_key_with_reading_frame_substitution_on_negative_strand(): def test_variants_to_reference_contexts_dataframe(): variants = load_vcf("data/b16.f10/b16.vcf") assert len(variants) > 0 - df = variants_to_reference_contexts_dataframe(variants, context_size=10) + gen = reference_contexts_generator(variants, context_size=10) + df = variants_to_reference_contexts_dataframe(gen) print(df) groups = df.groupby(["chr", "pos", "ref", "alt"]) # make sure we have at least one reference context for each diff --git a/test/test_reference_sequence_key.py b/test/test_reference_sequence_key.py index fbe15e1..246d97d 100644 --- a/test/test_reference_sequence_key.py +++ b/test/test_reference_sequence_key.py @@ -1,30 +1,18 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
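
For the reference-context tests, both the dict view and the DataFrame view are built from the same `reference_contexts_generator`. A sketch of both uses, again leaning on the test-only helpers:

```python
from isovar.reference_context_helpers import reference_contexts_generator
from isovar.dataframe_helpers import variants_to_reference_contexts_dataframe

from testing_helpers import load_vcf  # test-only helper

variants = load_vcf("data/b16.f10/b16.vcf")

# generator of (variant, reference contexts) pairs; a dict view makes it easy
# to look up every candidate reading frame for one variant, and
# transcript_id_whitelist can restrict which transcripts are considered
contexts_by_variant = dict(
    reference_contexts_generator(
        variants=variants,
        context_size=10,
        transcript_id_whitelist=None))

# the same generator flattens into one row per (variant, reference context)
df = variants_to_reference_contexts_dataframe(
    reference_contexts_generator(variants, context_size=10))
print(df)
```
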
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from __future__ import print_function, division, absolute_import -from isovar.reference_sequence_key import ReferenceSequenceKey from varcode import Variant -from pyensembl import ensembl_grch38, genome_for_reference_name from nose.tools import eq_ +from isovar.reference_sequence_key import ReferenceSequenceKey + +from genomes_for_testing import grch38, grcm38 + + def test_sequence_key_for_variant_on_transcript_substitution(): # rs769125639 is a simple T>A substitution in the 6th nucleotide of # BRCA2-001's 5' UTR - brca2_variant_rs769125639 = Variant( - "13", 32315479, "T", "A", ensembl_grch38) - brca2_001 = ensembl_grch38.transcripts_by_name("BRCA2-001")[0] + brca2_variant_rs769125639 = Variant("13", 32315479, "T", "A", grch38) + brca2_001 = grch38.transcripts_by_name("BRCA2-001")[0] # first 50 characters of BRCA2-001: # "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG" brca2_ref_seq = brca2_001.sequence[:50] @@ -45,9 +33,8 @@ def test_sequence_key_for_variant_on_transcript_substitution(): def test_sequence_key_for_variant_on_transcript_deletion(): # Delete the 6th nucleotide of BRCA2-001's 5' UTR - brca2_variant_deletion = Variant( - "13", 32315479, "T", "", ensembl_grch38) - brca2_001 = ensembl_grch38.transcripts_by_name("BRCA2-001")[0] + brca2_variant_deletion = Variant("13", 32315479, "T", "", grch38) + brca2_001 = grch38.transcripts_by_name("BRCA2-001")[0] # first 50 characters of BRCA2-001: # "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG" brca2_ref_seq = brca2_001.sequence[:50] @@ -65,11 +52,12 @@ def test_sequence_key_for_variant_on_transcript_deletion(): sequence_after_variant_locus=brca2_ref_seq[6:16]) eq_(sequence_key, expected_sequence_key) + def test_sequence_key_for_variant_on_transcript_insertion(): # Insert 'CCC' after the 6th nucleotide of BRCA2-001's 5' UTR brca2_variant_insertion = Variant( - "13", 32315479, "T", "TCCC", ensembl_grch38) - brca2_001 = ensembl_grch38.transcripts_by_name("BRCA2-001")[0] + "13", 32315479, "T", "TCCC", grch38) + brca2_001 = grch38.transcripts_by_name("BRCA2-001")[0] # first 50 characters of BRCA2-001: # "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG" brca2_ref_seq = brca2_001.sequence[:50] @@ -94,9 +82,8 @@ def test_sequence_key_for_variant_on_transcript_insertion(): def test_sequence_key_for_variant_on_transcript_substitution_reverse_strand(): # Replace start codon of TP53-001 with 'CCC', however since this is on # reverse strand the variant becomes "CAT">"GGG" - tp53_substitution = Variant( - "17", 7676592, "CAT", "GGG", ensembl_grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_substitution = Variant("17", 7676592, "CAT", "GGG", grch38) + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around start codon with 10 context nucleotides: # In [51]: t.sequence[190-10:190+13] # Out[51]: 'GGTCACTGCC_ATG_GAGGAGCCGC' @@ -115,12 +102,12 @@ def test_sequence_key_for_variant_on_transcript_substitution_reverse_strand(): sequence_after_variant_locus="GAGGAGCCGC") eq_(sequence_key, expected_sequence_key) + def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand(): 
# delete start codon of TP53-001, which in reverse complement means # deleting the sequence "CAT" - tp53_deletion = Variant( - "17", 7676592, "CAT", "", ensembl_grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_deletion = Variant("17", 7676592, "CAT", "", grch38) + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around start codon with 10 context nucleotides: # In [51]: t.sequence[190-10:190+13] # Out[51]: 'GGTCACTGCC_ATG_GAGGAGCCGC' @@ -139,12 +126,12 @@ def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand(): sequence_after_variant_locus="GAGGAGCCGC") eq_(sequence_key, expected_sequence_key) + def test_sequence_key_for_variant_on_transcript_insertion_reverse_strand(): # insert 'CCC' after start codon of TP53-001, which on the reverse # complement means inserting "GGG" between "CTC_CAT" - tp53_insertion = Variant( - "17", 7676589, "CTC", "CTCGGG", ensembl_grch38) - tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0] + tp53_insertion = Variant("17", 7676589, "CTC", "CTCGGG", grch38) + tp53_001 = grch38.transcripts_by_name("TP53-001")[0] # Sequence of TP53 around start codon with 10 context nucleotides: # In [51]: t.sequence[190-10:190+13] # Out[51]: 'GGTCACTGCC_ATG_GAGGAGCCGC' @@ -184,6 +171,7 @@ def test_reference_sequence_key_hash_and_equality_same_objects(): eq_(str(rsk1), str(rsk2)) eq_(repr(rsk1), repr(rsk2)) + def test_reference_sequence_key_hash_and_equality_different_objects(): rsk1 = ReferenceSequenceKey( strand="+", @@ -200,12 +188,12 @@ def test_reference_sequence_key_hash_and_equality_different_objects(): assert str(rsk1) != str(rsk_different_strand) assert repr(rsk1) != repr(rsk_different_strand) + def test_reference_sequence_key_from_weird_deletion(): # variant reads into the intron; want to make sure isovar skips over such cases - mouse_genome = genome_for_reference_name("grcm38") - variant = Variant( - "11", 106262686, "GTGAAGG", "", mouse_genome) - transcript = mouse_genome.transcript_by_id("ENSMUST00000021049") + + variant = Variant("11", 106262686, "GTGAAGG", "", grcm38) + transcript = grcm38.transcript_by_id("ENSMUST00000021049") sequence_key = ReferenceSequenceKey.from_variant_and_transcript( variant=variant, transcript=transcript, diff --git a/test/test_somatic_variant_with_0_supporting_rna_reads.py b/test/test_somatic_variant_with_0_supporting_rna_reads.py index 7a2c40a..acf3ea4 100644 --- a/test/test_somatic_variant_with_0_supporting_rna_reads.py +++ b/test/test_somatic_variant_with_0_supporting_rna_reads.py @@ -1,30 +1,36 @@ -from isovar.variant_reads import reads_supporting_variant +from __future__ import print_function, division, absolute_import + from varcode import Variant from testing_helpers import load_bam from nose.tools import eq_ +from isovar.read_collector import ReadCollector + +from genomes_for_testing import grch38 + + def test_somatic_variant_with_0_supporting_rna_reads(): - variant = Variant("6", 90411765, "G", "A") + variant = Variant("6", 90411765, "G", "A", grch38) base_dir = "data/somatic-variant-with-0-supporting-rna-reads/" normal_reads = load_bam(base_dir + "normal.6.90411765.G.A.sorted.bam") tumor_reads = load_bam(base_dir + "tumor.6.90411765.G.A.sorted.bam") rna_reads = load_bam(base_dir + "rna.6.90411765.G.A.sorted.bam") - - normal_sample_variant_reads = reads_supporting_variant( + read_creator = ReadCollector() + normal_sample_variant_reads = read_creator.allele_reads_supporting_variant( variant=variant, - samfile=normal_reads) + alignment_file=normal_reads) 
     eq_(len(normal_sample_variant_reads), 0)
     print(normal_sample_variant_reads)

-    tumor_sample_variant_reads = reads_supporting_variant(
+    tumor_sample_variant_reads = read_creator.allele_reads_supporting_variant(
         variant=variant,
-        samfile=tumor_reads)
+        alignment_file=tumor_reads)
     print(tumor_sample_variant_reads)
     eq_(len(tumor_sample_variant_reads), 5)

-    rna_sample_variant_reads = reads_supporting_variant(
+    rna_sample_variant_reads = read_creator.allele_reads_supporting_variant(
         variant=variant,
-        samfile=rna_reads)
+        alignment_file=rna_reads)
     print(rna_sample_variant_reads)
     eq_(len(rna_sample_variant_reads), 0)
diff --git a/test/test_somatic_variant_with_2_supporting_rna_reads.py b/test/test_somatic_variant_with_2_supporting_rna_reads.py
index dfc73b3..8c45d32 100644
--- a/test/test_somatic_variant_with_2_supporting_rna_reads.py
+++ b/test/test_somatic_variant_with_2_supporting_rna_reads.py
@@ -1,30 +1,36 @@
-from isovar.variant_reads import reads_supporting_variant
+from __future__ import print_function, division, absolute_import
+
 from varcode import Variant
 from testing_helpers import load_bam
 from nose.tools import eq_
+from isovar.read_collector import ReadCollector
+
+from genomes_for_testing import grch38
+
+
 def test_somatic_variant_with_2_supporting_rna_reads():
-    variant = Variant("14", 105849746, "G", "A")
+    variant = Variant("14", 105849746, "G", "A", grch38)
     base_dir = "data/somatic-variant-with-2-supporting-rna-reads/"
     normal_reads = load_bam(base_dir + "normal.14.105849746.G.A.no-alt.sorted.bam")
     tumor_reads = load_bam(base_dir + "tumor.14.105849746.G.A.many-alt.sorted.bam")
     rna_reads = load_bam(base_dir + "rna.14.105849746.G.A.2-alt.sorted.bam")
-
-    normal_sample_variant_reads = reads_supporting_variant(
+    read_creator = ReadCollector()
+    normal_sample_variant_reads = read_creator.allele_reads_supporting_variant(
         variant=variant,
-        samfile=normal_reads)
+        alignment_file=normal_reads)
     eq_(len(normal_sample_variant_reads), 0)
     print(normal_sample_variant_reads)

-    tumor_sample_variant_reads = reads_supporting_variant(
+    tumor_sample_variant_reads = read_creator.allele_reads_supporting_variant(
         variant=variant,
-        samfile=tumor_reads)
+        alignment_file=tumor_reads)
     print(tumor_sample_variant_reads)
     eq_(len(tumor_sample_variant_reads), 8)

-    rna_sample_variant_reads = reads_supporting_variant(
+    rna_sample_variant_reads = read_creator.allele_reads_supporting_variant(
         variant=variant,
-        samfile=rna_reads)
+        alignment_file=rna_reads)
     print(rna_sample_variant_reads)
     eq_(len(rna_sample_variant_reads), 2)
     # Arun went through the hassle of pulling out the exact read names
diff --git a/test/test_translation.py b/test/test_translation.py
index a91457e..483d39f 100644
--- a/test/test_translation.py
+++ b/test/test_translation.py
@@ -1,22 +1,7 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- from __future__ import print_function, division, absolute_import -from isovar.translation import translate_variants -from isovar.variant_reads import reads_supporting_variants +from isovar import ProteinSequenceCreator, ReadCollector from nose.tools import eq_ @@ -26,12 +11,15 @@ def test_translate_variant_collection(): variants = load_vcf("data/b16.f10/b16.vcf") samfile = load_bam("data/b16.f10/b16.combined.sorted.bam") - - result = list(translate_variants(reads_supporting_variants(variants, samfile))) + read_evidence_gen = ReadCollector().read_evidence_generator( + variants, + samfile) + translation_gen = ProteinSequenceCreator().translate_variants(read_evidence_gen) + translations = list(translation_gen) eq_( - len(result), + len(translations), 4, "Expected %d translated variants but got %d: %s" % ( len(variants), - len(result), - result)) + len(translations), + translations)) diff --git a/test/test_value_object.py b/test/test_value_object.py index b16e1f4..b541173 100644 --- a/test/test_value_object.py +++ b/test/test_value_object.py @@ -1,24 +1,31 @@ +from __future__ import print_function, division, absolute_import + from isovar.value_object import ValueObject from nose.tools import eq_ + def test_no_fields_unless_specified(): v = ValueObject() eq_(v._fields, ()) eq_(v._values, ()) + def test_default_string_repr(): v = ValueObject() eq_(str(v), "ValueObject()") eq_(repr(v), "ValueObject()") + class DerivedWithoutInit(ValueObject): __slots__ = ["a", "b"] + def test_default_init(): obj = DerivedWithoutInit(a=1, b=2) eq_(obj.a, 1) eq_(obj.b, 2) + class DerivedWithInit(ValueObject): __slots__ = ["a", "b"] @@ -26,6 +33,7 @@ def __init__(self, a, b): self.a = a self.b = b + def test_equality_checks_class(): # two objects of different classes should not be equal # even if their fields are the same @@ -35,6 +43,7 @@ def test_equality_checks_class(): eq_(hash(x), hash(y)) assert x != y, "Expected %s != %s" % (x, y) + def test_derived_string_repr(): x = DerivedWithInit(a=1, b=2) eq_(str(x), "DerivedWithInit(a=1, b=2)") diff --git a/test/test_variant_helpers.py b/test/test_variant_helpers.py index f27573f..dc4e0b4 100644 --- a/test/test_variant_helpers.py +++ b/test/test_variant_helpers.py @@ -1,17 +1,3 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
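
The `test_value_object.py` changes are only whitespace, but the tests document the contract that `ValueObject` subclasses rely on: `__slots__` doubles as the field list, a keyword `__init__` is generated when none is defined, and `str`/`repr` are derived from the class name plus field values. A small sketch of that contract; the `Interval` class here is hypothetical, not part of isovar:

```python
from isovar.value_object import ValueObject

class Interval(ValueObject):
    # __slots__ doubles as the ordered field list; since no __init__ is
    # defined, ValueObject supplies a keyword-argument constructor
    __slots__ = ["start", "end"]

interval = Interval(start=3, end=9)
print(interval.start, interval.end)   # 3 9
print(interval)                       # expected: Interval(start=3, end=9)

# equality takes the class into account, so objects of different
# ValueObject subclasses are not considered equal
assert interval != ValueObject()
```
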
- from __future__ import print_function, division, absolute_import from isovar.variant_helpers import ( @@ -22,11 +8,11 @@ ) from nose.tools import eq_ from varcode import Variant -from pyensembl import ensembl_grch38 +from genomes_for_testing import grch38 def test_trim_variant_substitution(): - loc, ref, alt = trim_variant(Variant("chr1", 10, "C", "G")) + loc, ref, alt = trim_variant(Variant("chr1", 10, "C", "G", grch38)) eq_(loc, 10) eq_(ref, "C") eq_(alt, "G") @@ -38,6 +24,7 @@ def test_trim_variant_fields_substitution(): eq_(ref, "C") eq_(alt, "G") + def test_trim_variant_insertion(): loc, ref, alt = trim_variant(Variant("chr1", 10, "C", "CG")) eq_(loc, 10) @@ -58,53 +45,59 @@ def test_trim_variant_deletion(): eq_(ref, "G") eq_(alt, "") + def test_trim_variant_fields_deletion(): loc, ref, alt = trim_variant_fields(10, "CG", "C") eq_(loc, 11) eq_(ref, "G") eq_(alt, "") + def test_base0_interval_for_variant_substitution(): (start, end) = base0_interval_for_variant(Variant("chr1", 10, "C", "G")) eq_(start, 9) eq_(end, 10) + def test_base0_interval_for_variant_insertion(): (start, end) = base0_interval_for_variant(Variant("chr1", 10, "C", "CG")) eq_(start, 10) eq_(end, 10) + def test_base0_interval_for_variant_deletion(): (start, end) = base0_interval_for_variant(Variant("chr1", 10, "CG", "C")) eq_(start, 10) eq_(end, 11) + def test_interbase_range_for_brca2_utr_substitution(): # rs769125639 is a simple T>A substitution in the 6th nucleotide of # BRCA2-001's 5' UTR - brca2_variant_rs769125639 = Variant( - "13", 32315479, "T", "A", ensembl_grch38) - brca2_001 = ensembl_grch38.transcripts_by_name("BRCA2-001")[0] + brca2_variant_rs769125639 = Variant("13", 32315479, "T", "A", grch38) + brca2_001 = grch38.transcripts_by_name("BRCA2-001")[0] interbase_range = interbase_range_affected_by_variant_on_transcript( variant=brca2_variant_rs769125639, transcript=brca2_001) print(interbase_range) eq_(interbase_range, (5, 6)) + def test_interbase_range_for_brca2_utr_insertion(): # T>TC insertion after the 6th nucleotide of BRCA2-001's 5' UTR - brca2_insertion = Variant("13", 32315479, "T", "TC", ensembl_grch38) - brca2_001 = ensembl_grch38.transcripts_by_name("BRCA2-001")[0] + brca2_insertion = Variant("13", 32315479, "T", "TC", grch38) + brca2_001 = grch38.transcripts_by_name("BRCA2-001")[0] interbase_range = interbase_range_affected_by_variant_on_transcript( variant=brca2_insertion, transcript=brca2_001) print(interbase_range) eq_(interbase_range, (6, 6)) + def test_interbase_range_for_brca2_utr_deletion(): # Deletion of the 6th nucleotide of BRCA2-001's 5' UTR - brca2_deletion = Variant("13", 32315479, "T", "", ensembl_grch38) - brca2_001 = ensembl_grch38.transcripts_by_name("BRCA2-001")[0] + brca2_deletion = Variant("13", 32315479, "T", "", grch38) + brca2_001 = grch38.transcripts_by_name("BRCA2-001")[0] interbase_range = interbase_range_affected_by_variant_on_transcript( variant=brca2_deletion, transcript=brca2_001) diff --git a/test/test_variant_sequence_in_reading_frame.py b/test/test_variant_orf.py similarity index 95% rename from test/test_variant_sequence_in_reading_frame.py rename to test/test_variant_orf.py index 7d2587d..5cb037c 100644 --- a/test/test_variant_sequence_in_reading_frame.py +++ b/test/test_variant_orf.py @@ -1,17 +1,19 @@ +from __future__ import print_function, division, absolute_import + from nose.tools import eq_ from varcode import Variant -from isovar.variant_sequence_in_reading_frame import ( + +from isovar.variant_orf import ( compute_offset_to_first_complete_codon, - 
match_variant_sequence_to_reference_context, - VariantSequenceInReadingFrame, + VariantORF, ) -from isovar.variant_sequences import VariantSequence +from isovar.variant_orf_helpers import match_variant_sequence_to_reference_context +from isovar.variant_sequence import VariantSequence from isovar.reference_coding_sequence_key import ReferenceCodingSequenceKey from isovar.reference_context import ReferenceContext -from isovar.allele_reads import AlleleRead +from isovar.allele_read import AlleleRead from isovar.dna import reverse_complement_dna - def test_compute_offset_to_first_complete_codon_no_trimming(): # if nothing gets trimmed from the reference sequence, then # the offset to the first codon shouldn't change @@ -161,16 +163,16 @@ def make_inputs_for_tp53_201_variant( transcripts=[transcript]) assert isinstance(reference_context, ReferenceContext) - expected = VariantSequenceInReadingFrame( + expected = VariantORF( cdna_sequence=cdna_prefix[-prefix_length:] + cdna_alt + cdna_suffix, offset_to_first_complete_codon=prefix_length % 3, variant_cdna_interval_start=prefix_length, variant_cdna_interval_end=prefix_length + 1, reference_cdna_sequence_before_variant="ATG"[-prefix_length:], reference_cdna_sequence_after_variant="AGGAGCCGCAGTCAGAT"[:reference_context_size], - number_mismatches_before_variant=mismatches_before_variant, - number_mismatches_after_variant=mismatches_after_variant) - assert isinstance(expected, VariantSequenceInReadingFrame) + num_mismatches_before_variant=mismatches_before_variant, + num_mismatches_after_variant=mismatches_after_variant) + assert isinstance(expected, VariantORF) return variant_sequence, reference_context, expected @@ -308,7 +310,7 @@ def test_match_variant_sequence_to_reference_context_include_mismatches_after_va reference_context=reference_context, min_transcript_prefix_length=3, max_transcript_mismatches=0, - include_mismatches_after_variant=False) + count_mismatches_after_variant=False) # should have a result, since we're not counting mismatches after the variant eq_(expected, result) @@ -318,5 +320,5 @@ def test_match_variant_sequence_to_reference_context_include_mismatches_after_va reference_context=reference_context, min_transcript_prefix_length=3, max_transcript_mismatches=0, - include_mismatches_after_variant=True) + count_mismatches_after_variant=True) eq_(None, result) diff --git a/test/test_variant_reads_with_dummy_samfile.py b/test/test_variant_reads_with_dummy_samfile.py index c5c1271..182b44f 100644 --- a/test/test_variant_reads_with_dummy_samfile.py +++ b/test/test_variant_reads_with_dummy_samfile.py @@ -1,25 +1,13 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- from __future__ import print_function, division, absolute_import +from nose.tools import eq_ from varcode import Variant -from isovar.variant_reads import reads_supporting_variant -from isovar.allele_reads import AlleleRead -from mock_read_data import DummySamFile, make_read -from nose.tools import eq_ +from isovar.allele_read import AlleleRead +from isovar.read_collector import ReadCollector + +from mock_objects import MockAlignmentFile, make_pysam_read +from genomes_for_testing import grch38 def test_partitioned_read_sequences_snv(): @@ -35,15 +23,22 @@ def test_partitioned_read_sequences_snv(): alt = "G" variant = Variant( - chromosome, location, ref, alt, normalize_contig_name=False) - - read = make_read(seq="ACCGTG", cigar="6M", mdtag="3G2") - - samfile = DummySamFile(reads=[read]) - variant_reads = reads_supporting_variant( - samfile=samfile, - chromosome=chromosome, - variant=variant) + chromosome, + location, + ref, + alt, + grch38, + normalize_contig_name=False) + + read = make_pysam_read(seq="ACCGTG", cigar="6M", mdtag="3G2") + + samfile = MockAlignmentFile( + references=(chromosome,), + reads=[read]) + read_creator = ReadCollector() + variant_reads = read_creator.allele_reads_supporting_variant( + variant=variant, + alignment_file=samfile) print(variant_reads) assert len(variant_reads) == 1 variant_read = variant_reads[0] @@ -68,14 +63,20 @@ def test_partitioned_read_sequences_insertion(): ref = "T" alt = "TG" variant = Variant( - chromosome, location, ref, alt, normalize_contig_name=False) + chromosome, location, ref, alt, grch38, normalize_contig_name=False) + + read = make_pysam_read( + seq=b"ACCTGTG", + cigar="4M1I2M", + mdtag="6") - read = make_read(seq=b"ACCTGTG", cigar="4M1I2M", mdtag="6") + samfile = MockAlignmentFile( + references=(chromosome,), + reads=[read]) + read_creator = ReadCollector() - samfile = DummySamFile(reads=[read]) - variant_reads = reads_supporting_variant( - samfile=samfile, - chromosome=chromosome, + variant_reads = read_creator.allele_reads_supporting_variant( + alignment_file=samfile, variant=variant) print(variant_reads) assert len(variant_reads) == 1 @@ -100,14 +101,18 @@ def test_partitioned_read_sequences_deletion(): ref = "TT" alt = "T" variant = Variant( - chromosome, location, ref, alt, normalize_contig_name=False) - - read = make_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1") - - samfile = DummySamFile(reads=[read]) - variant_reads = reads_supporting_variant( - samfile=samfile, - chromosome=chromosome, + chromosome, location, ref, alt, grch38, normalize_contig_name=False) + + read = make_pysam_read( + seq="ACCTG", + cigar="4M1D1M", + mdtag="4^T1") + samfile = MockAlignmentFile( + references=(chromosome,), + reads=[read]) + read_creator = ReadCollector() + variant_reads = read_creator.allele_reads_supporting_variant( + alignment_file=samfile, variant=variant) print(variant_reads) assert len(variant_reads) == 1 diff --git a/test/test_variant_sequences.py b/test/test_variant_sequences.py index e3d5c9e..dbc7393 100644 --- a/test/test_variant_sequences.py +++ b/test/test_variant_sequences.py @@ -1,29 +1,17 @@ -# Copyright (c) 2016-2018. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from __future__ import print_function, division, absolute_import from nose.tools import eq_ from varcode import Variant -from isovar.variant_sequences import ( - reads_to_variant_sequences, + +from isovar import ( + VariantSequenceCreator, VariantSequence ) -from isovar.variant_reads import reads_supporting_variant -from isovar.allele_reads import AlleleRead +from isovar.allele_read import AlleleRead +from isovar.read_collector import ReadCollector from testing_helpers import load_bam +from genomes_for_testing import grch38 def test_sequence_counts_snv(): @@ -32,17 +20,15 @@ def test_sequence_counts_snv(): base1_location = 65857041 ref = "G" alt = "C" - variant = Variant(chromosome, base1_location, ref, alt) - - variant_reads = reads_supporting_variant( - samfile=samfile, - chromosome=chromosome, + variant = Variant(chromosome, base1_location, ref, alt, grch38) + read_creator = ReadCollector() + variant_reads = read_creator.allele_reads_supporting_variant( + alignment_file=samfile, variant=variant) - - variant_sequences = reads_to_variant_sequences( + variant_sequence_creator = VariantSequenceCreator(preferred_sequence_length=61) + variant_sequences = variant_sequence_creator.reads_to_variant_sequences( variant=variant, - reads=variant_reads, - preferred_sequence_length=61) + reads=variant_reads) assert len(variant_sequences) == 1 for variant_sequence in variant_sequences: print(variant_sequence)
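
The rewritten `test_sequence_counts_snv` above splits read collection from sequence assembly. A condensed sketch of that flow with the new objects, assuming the same test data and test-only helper modules:

```python
from varcode import Variant
from isovar import VariantSequenceCreator
from isovar.read_collector import ReadCollector

from testing_helpers import load_bam        # test-only helper
from genomes_for_testing import grch38      # test-only genome wrapper

samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
variant = Variant("chr12", 65857041, "G", "C", grch38)

# reads whose aligned bases support the alt allele at this locus
variant_reads = ReadCollector().allele_reads_supporting_variant(
    alignment_file=samfile,
    variant=variant)

# assemble those reads into cDNA sequences around the variant;
# preferred_sequence_length is the total assembled length aimed for
variant_sequence_creator = VariantSequenceCreator(preferred_sequence_length=61)
variant_sequences = variant_sequence_creator.reads_to_variant_sequences(
    variant=variant,
    reads=variant_reads)
for variant_sequence in variant_sequences:
    print(variant_sequence)
```
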