Skip to content

Commit

Permalink
info about variants (#76)
Browse files Browse the repository at this point in the history
* info about variants

* notebook update

* notebook update

* new tests

Co-authored-by: Kalin Nonchev <[email protected]>
Co-authored-by: Kalin Nonchev <[email protected]>
Co-authored-by: Kalin Nonchev <[email protected]>
  • Loading branch information
4 people authored May 26, 2020
1 parent 2a58b40 commit 4635665
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 87 deletions.
7 changes: 4 additions & 3 deletions kipoiseq/dataloaders/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,21 @@ def _extractor(self):
for transcript_id, seqs in self.protein_vcf_extractor.extract_all():
# reference sequence
ref_seq = self.transcript_extractor.get_protein_seq(transcript_id)
for alt_seq in seqs:
for (alt_seq, variant) in seqs:
yield {
'input': {
'ref_seq': ref_seq,
'alt_seq': alt_seq,
},
'metadata': self.get_metadata(transcript_id)
'metadata': self.get_metadata(transcript_id, variant)
}

def get_metadata(self, transcript_id: str):
def get_metadata(self, transcript_id: str, variant: dict):
"""
get metadata for given transcript_id
"""
row = self.metadatas.loc[transcript_id]
metadata = self.metadatas.loc[transcript_id].to_dict()
metadata['transcript_id'] = row.name
metadata['variants'] = variant
return metadata
29 changes: 21 additions & 8 deletions kipoiseq/extractors/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def _get_cds_from_gtf(df):
.query("{} == 'protein_coding'".format(biotype_str))
.query("(Feature == 'CDS') | (Feature == 'CCDS')")
)
df = df[df['tag'].notna()] #grch37 have ccds without tags
df = df[df['tag'].notna()] # grch37 have ccds without tags
return df[df["tag"].str.contains("basic|CCDS")].set_index('transcript_id')

@staticmethod
Expand Down Expand Up @@ -233,7 +233,7 @@ def __init__(self, gtf_file, fasta_file, vcf_file):
# match variant with transcript_id
self.single_variant_matcher = SingleVariantMatcher(
self.vcf_file, pranges=pr_cds)

self.fasta = FastaStringExtractor(self.fasta_file)
self.multi_sample_VCF = MultiSampleVCF(self.vcf_file)
self.variant_seq_extractor = VariantSeqExtractor(self.fasta_file)
Expand All @@ -245,6 +245,18 @@ def _unstrand(intervals: List[Interval]):
"""
return [i.unstrand() for i in intervals]

@staticmethod
def _prepare_variants(variants: 'List of variants'):
variants_dict = dict()
# fill dict with variants (as dict)
for index, v in enumerate(variants):
variants_dict[index] = dict(
(key.replace('_', ''), value) for key, value in v.__dict__.items())
# if single varint, unpack dict
if len(variants_dict) == 1:
variants_dict = variants_dict[0]
return variants_dict

def extract_cds(self, cds: List[Interval], sample_id=None):
"""
Extract cds with variants in their dna sequence. It depends on the
Expand All @@ -261,9 +273,9 @@ def extract_cds(self, cds: List[Interval], sample_id=None):

iter_seqs = self.extract_query(variant_interval_queryable,
sample_id=sample_id)

for seqs in iter_seqs:
yield ProteinSeqExtractor._prepare_seq(seqs, cds[0].strand, cds[0].attrs['tag'])
# 1st seq, 2nd variant info
yield ProteinSeqExtractor._prepare_seq(seqs[0], cds[0].strand, cds[0].attrs['tag']), seqs[1]

def extract_all(self):
"""
Expand All @@ -277,7 +289,6 @@ def extract_all(self):
yield transcript_id, self.extract(transcript_id)
else:
print('No matched variants with transcript_ids.')


def extract_list(self, list_with_transcript_id: List[str]):
"""
Expand Down Expand Up @@ -328,15 +339,18 @@ def _extract_query(self, variant_interval_queryable, sample_id=None):
"""
seqs = []
flag = True
variants_info = list()
for variants, interval in variant_interval_queryable.variant_intervals:
variants = list(self._filter_snv(variants))
if len(variants) > 0:
flag = False
variants_info.extend(variants)
seqs.append(self.variant_seq_extractor.extract(
interval, variants, anchor=0))
if flag:
seqs = []
yield "".join(seqs)
variants_info = []
yield "".join(seqs), self._prepare_variants(variants_info)

def extract_query(self, variant_interval_queryable, sample_id=None):
"""
Expand Down Expand Up @@ -369,10 +383,9 @@ def extract_query(self, variant_interval_queryable, sample_id=None):
variant_interval_queryable.variant_intervals):
variants = self._filter_snv(variants)
for variant in variants:

yield [
*ref_cds_seq[:i],
self.variant_seq_extractor.extract(
interval, [variant], anchor=0),
*ref_cds_seq[(i+1):],
]
], self._prepare_variants([variant])
46 changes: 18 additions & 28 deletions notebooks/getting-started-with-VariantExtractors.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,18 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/modules/i12g/anaconda/envs/nonchevFromBrechtmann/lib/python3.6/site-packages/sklearn/utils/linear_assignment_.py:22: FutureWarning: The linear_assignment_ module is deprecated in 0.21 and will be removed from 0.23. Use scipy.optimize.linear_sum_assignment instead.\n",
" FutureWarning)\n"
]
}
],
"source": [
"from kipoiseq.extractors.protein import SingleVariantProteinVCFSeqExtractor, TranscriptSeqExtractor, SingleSeqProteinVCFSeqExtractor\n",
"from kipoiseq.transforms.functional import rc_dna, translate\n",
Expand All @@ -30,14 +39,6 @@
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"03/30/2020 14:07:39 - INFO - numexpr.utils - Note: detected 96 virtual cores but NumExpr set to maximum of 64, check \"NUMEXPR_MAX_THREADS\" environment variable.\n",
"03/30/2020 14:07:39 - INFO - numexpr.utils - Note: NumExpr detected 96 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand All @@ -56,36 +57,25 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'PTSCGQQSLNVLAVLFSLLFSAVLSAHFRVCEPYTDHKGRYHFGFHCPRLSDNKTFILCCHHNNTVFKYCCNETEFQAVMQANLTASSEGYMHNNYTALLGVWIYGFFVLMLLVLDLLYYSAMNYDICKVYLARWGIQGRWMKQDPRRWGNPARAPRPGQRAPQPQPPPGPLPQAPQAVHTLRGDAHSPPLMTFQSSS'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"ssp.extract('enst_test2') # extract seq with variants for given transcript_id"
"name, info_var = ssp.extract('enst_test2') # extract seq with variants for given transcript_id"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"assert ssp.extract('enst_test1') == '' # seq for transcript_id withouth variant shoudl be empty string"
"assert ssp.extract('enst_test1')[0] == '' # seq for transcript_id withouth variant shoudl be empty string"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -97,7 +87,7 @@
}
],
"source": [
"for seq in ssp.extract_all(): # extract all transcript_ids with variant\n",
"for name, (seq, info_var) in ssp.extract_all(): # extract all transcript_ids with variant\n",
" print(seq)"
]
},
Expand Down
47 changes: 17 additions & 30 deletions notebooks/getting_started_with_SeqVec.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,16 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/modules/i12g/anaconda/envs/nonchevFromBrechtmann/lib/python3.6/site-packages/sklearn/utils/linear_assignment_.py:22: FutureWarning: The linear_assignment_ module is deprecated in 0.21 and will be removed from 0.23. Use scipy.optimize.linear_sum_assignment instead.\n",
" FutureWarning)\n"
]
}
],
"source": [
"from kipoiseq.extractors.protein import SingleVariantProteinVCFSeqExtractor, TranscriptSeqExtractor, SingleSeqProteinVCFSeqExtractor\n",
"from kipoiseq.transforms.functional import rc_dna, translate\n",
Expand All @@ -30,14 +39,6 @@
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"03/30/2020 14:01:50 - INFO - numexpr.utils - Note: detected 96 virtual cores but NumExpr set to maximum of 64, check \"NUMEXPR_MAX_THREADS\" environment variable.\n",
"03/30/2020 14:01:50 - INFO - numexpr.utils - Note: NumExpr detected 96 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand All @@ -63,9 +64,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
"03/30/2020 14:02:22 - INFO - kipoi.sources - Update /data/nasif12/home_if12/nonchev/.kipoi/models/\n",
"03/30/2020 14:02:49 - INFO - kipoi.data - successfully loaded the dataloader SeqVec/embedding/. from /data/nasif12/home_if12/nonchev/.kipoi/models/SeqVec/embedding/dataloader.SeqDataloader\n",
"03/30/2020 14:02:49 - INFO - kipoi.model - Downloading model arguments weights from https://rostlab.org/~deepppi/embedding_repo/embedding_models/seqvec/weights.hdf5\n"
"05/19/2020 15:09:25 - INFO - kipoi.sources - Update /data/nasif12/home_if12/nonchev/.kipoi/models/\n",
"05/19/2020 15:09:27 - INFO - kipoi.data - successfully loaded the dataloader SeqVec/embedding/. from /data/nasif12/home_if12/nonchev/.kipoi/models/SeqVec/embedding/dataloader.SeqDataloader\n",
"05/19/2020 15:09:27 - INFO - kipoi.model - Downloading model arguments weights from https://rostlab.org/~deepppi/embedding_repo/embedding_models/seqvec/weights.hdf5\n"
]
},
{
Expand All @@ -79,7 +80,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"03/30/2020 14:02:53 - INFO - kipoi.model - Downloading model arguments options from https://rostlab.org/~deepppi/embedding_repo/embedding_models/seqvec/options.json\n"
"05/19/2020 15:09:30 - INFO - kipoi.model - Downloading model arguments options from https://rostlab.org/~deepppi/embedding_repo/embedding_models/seqvec/options.json\n"
]
},
{
Expand All @@ -93,8 +94,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"03/30/2020 14:02:53 - INFO - allennlp.commands.elmo - Initializing ELMo.\n",
"03/30/2020 14:03:18 - INFO - kipoi.pipeline - dataloader.output_schema is compatible with model.schema\n"
"05/19/2020 15:09:30 - INFO - allennlp.commands.elmo - Initializing ELMo.\n",
"05/19/2020 15:09:40 - INFO - kipoi.pipeline - dataloader.output_schema is compatible with model.schema\n"
]
}
],
Expand Down Expand Up @@ -128,7 +129,7 @@
"outputs": [],
"source": [
"seqvecs = []\n",
"for seq in seqs:\n",
"for seq, info_var in seqs:\n",
" seqvecs.append(model.predict_single_sample(seq))\n",
"assert len(seqvecs) == 3"
]
Expand All @@ -152,20 +153,6 @@
"for seqvec in seqvecs:\n",
" print(seqvec.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Binary file modified tests/data/singleVar_vcf_enst_test2.vcf.gz
Binary file not shown.
Binary file modified tests/data/singleVar_vcf_enst_test2.vcf.gz.tbi
Binary file not shown.
17 changes: 16 additions & 1 deletion tests/dataloaders/test_protein_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,19 @@ def test_single_variant_protein_dataLoader(single_variant_protein_dataLoader):
assert type(units[2]['metadata']) == dict
assert len(units[2]) == 2
assert len(units[2]['input']) == 2
assert len(units[2]['metadata']) == 17 # number of columns
assert len(units[2]['metadata']) == 18 # number of columns

assert units[0]['metadata']['variants']['ref'] == "C"
assert units[0]['metadata']['variants']['alt'] == "G"
assert units[0]['metadata']['variants']['pos'] == 596
assert units[0]['metadata']['variants']['id'] == "9"

assert units[1]['metadata']['variants']['ref'] == "A"
assert units[1]['metadata']['variants']['alt'] == "G"
assert units[1]['metadata']['variants']['pos'] == 597
assert units[1]['metadata']['variants']['id'] == "10"

assert units[2]['metadata']['variants']['ref'] == "T"
assert units[2]['metadata']['variants']['alt'] == "G"
assert units[2]['metadata']['variants']['pos'] == 598
assert units[2]['metadata']['variants']['id'] == "11"
Loading

0 comments on commit 4635665

Please sign in to comment.