From d03b53137663c8ecb6072b12cf735bd004f8a1ec Mon Sep 17 00:00:00 2001 From: anna shcherbina Date: Tue, 5 Mar 2019 11:56:20 -0800 Subject: [PATCH 1/6] dinuc_shuffle can accept both a list of fasta strings and a one-hot-encoded array. Support for one-hot-encoded inputs to get_shuffle_seq_ref_function --- deeplift/dinuc_shuffle.py | 14 +++++++++----- deeplift/util.py | 23 ++++++++++++++--------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/deeplift/dinuc_shuffle.py b/deeplift/dinuc_shuffle.py index f3a3cf9..f9edf4b 100644 --- a/deeplift/dinuc_shuffle.py +++ b/deeplift/dinuc_shuffle.py @@ -6,7 +6,7 @@ def prepare_edges(s): edges = defaultdict(list) for i in range(len(s)-1): - edges[s[i]].append(s[i+1]) + edges[tuple(s[i])].append(s[i+1]) return edges @@ -26,11 +26,15 @@ def traverse_edges(s, edges): edges_queue_pointers = defaultdict(lambda: 0) for i in range(len(s)-1): last_char = generated[-1] - generated.append(edges[last_char][edges_queue_pointers[last_char]]) - edges_queue_pointers[last_char] += 1 - return "".join(generated) + generated.append(edges[tuple(last_char)][edges_queue_pointers[tuple(last_char)]]) + edges_queue_pointers[tuple(last_char)] += 1 + if isinstance(generated[0],str): + return "".join(generated) + import numpy as np + return np.asarray(generated) def dinuc_shuffle(s): - s = s.upper() + if isinstance(s, str): + s=s.upper() return traverse_edges(s, shuffle_edges(prepare_edges(s))) diff --git a/deeplift/util.py b/deeplift/util.py index fa4ec86..910fbfc 100644 --- a/deeplift/util.py +++ b/deeplift/util.py @@ -343,7 +343,8 @@ def hypothetical_contribs_func(task_idx, def get_shuffle_seq_ref_function(score_computation_function, - shuffle_func, one_hot_func): + shuffle_func, one_hot_func=None): + def compute_scores_with_shuffle_seq_refs( task_idx, input_data_sequences, num_refs_per_seq, batch_size, seed=1, progress_update=None): @@ -364,14 +365,18 @@ def compute_scores_with_shuffle_seq_refs( print(str(references_generated) +" reference seqs generated") to_run_input_data_seqs.append(seq) - to_run_input_data_refs.append(shuffle_func(seq)) - if (progress_update is not None): - print("One hot encoding sequences...") - input_data_list = [one_hot_func(to_run_input_data_seqs)] - input_references_list = [one_hot_func(to_run_input_data_refs)] - if (progress_update is not None): - print("One hot encoding done...") - + to_run_input_data_refs.append(shuffle_func(seq)) + if one_hot_func is not None: + if (progress_update is not None): + print("One hot encoding sequences...") + input_data_list = [one_hot_func(to_run_input_data_seqs)] + input_references_list = [one_hot_func(to_run_input_data_refs)] + if (progress_update is not None): + print("One hot encoding done...") + else: + #the data is already one-hot encoded + input_data_list = [to_run_input_data_seqs] + input_references_list = [to_run_input_data_refs] computed_scores = np.array(score_computation_function( task_idx=task_idx, input_data_list=input_data_list, From 752dbce4d78e2b2c8e326c1865bcae100aee5a45 Mon Sep 17 00:00:00 2001 From: anna shcherbina Date: Thu, 7 Mar 2019 19:41:39 -0800 Subject: [PATCH 2/6] functionality added for using one-hot-encoded sequences as inputs for deeplift with shuffled ref --- deeplift/dinuc_shuffle.py | 5 ++-- deeplift/util.py | 14 ++++++--- tests/shuffling/test_dinuc_shuffle.py | 30 +++++++++++++++++++ .../shuffling/test_dinuc_shuffle_on_onehot.py | 23 ++++++++++++++ 4 files changed, 66 insertions(+), 6 deletions(-) create mode 100644 tests/shuffling/test_dinuc_shuffle.py create mode 100644 tests/shuffling/test_dinuc_shuffle_on_onehot.py diff --git a/deeplift/dinuc_shuffle.py b/deeplift/dinuc_shuffle.py index f9edf4b..0febcc5 100644 --- a/deeplift/dinuc_shuffle.py +++ b/deeplift/dinuc_shuffle.py @@ -30,8 +30,9 @@ def traverse_edges(s, edges): edges_queue_pointers[tuple(last_char)] += 1 if isinstance(generated[0],str): return "".join(generated) - import numpy as np - return np.asarray(generated) + else: + import numpy as np + return np.asarray(generated) def dinuc_shuffle(s): diff --git a/deeplift/util.py b/deeplift/util.py index 910fbfc..0ee7fbf 100644 --- a/deeplift/util.py +++ b/deeplift/util.py @@ -364,6 +364,9 @@ def compute_scores_with_shuffle_seq_refs( references_generated%progress_update==0): print(str(references_generated) +" reference seqs generated") + if isinstance(seq,np.ndarray): + seq_shape=seq.shape + seq=seq.squeeze() to_run_input_data_seqs.append(seq) to_run_input_data_refs.append(shuffle_func(seq)) if one_hot_func is not None: @@ -375,20 +378,23 @@ def compute_scores_with_shuffle_seq_refs( print("One hot encoding done...") else: #the data is already one-hot encoded - input_data_list = [to_run_input_data_seqs] - input_references_list = [to_run_input_data_refs] + input_shape=list(input_data_sequences.shape) + input_shape[0]=input_shape[0]*num_refs_per_seq + input_shape=tuple(input_shape) + input_data_list = [np.reshape(np.asarray(to_run_input_data_seqs),input_shape)] + input_references_list = [np.reshape(np.asarray(to_run_input_data_refs),input_shape)] computed_scores = np.array(score_computation_function( task_idx=task_idx, input_data_list=input_data_list, input_references_list=input_references_list, batch_size=batch_size, progress_update=progress_update)) - computed_scores = np.reshape( computed_scores, [len(input_data_sequences), num_refs_per_seq] - +list(input_data_list[0].shape[1:])) + +list(input_data_list[0].shape[1:])) + #take the mean over all the refs mean_scores = np.mean(computed_scores,axis=1) return mean_scores diff --git a/tests/shuffling/test_dinuc_shuffle.py b/tests/shuffling/test_dinuc_shuffle.py new file mode 100644 index 0000000..c9c4ed7 --- /dev/null +++ b/tests/shuffling/test_dinuc_shuffle.py @@ -0,0 +1,30 @@ +import unittest +from deeplift.dinuc_shuffle import dinuc_shuffle +from itertools import permutations +from collections import defaultdict +import random + +def dinuc_count(seq): + count = defaultdict(lambda: 0) + for i in range(len(seq)-2): + count[seq[i:i+2]] += 1 + return count + +class TestDinucShuffle(unittest.TestCase): + + def test_dinuc_shuffle(self): + for i in range(1000): + random_sequence = "".join([['A','C','G','T'][int(random.random()*4)] + for i in range(200)]) + shuffled_seq = dinuc_shuffle(random_sequence) + print("sequences") + print(random_sequence) + print(shuffled_seq) + orig_count = dinuc_count(random_sequence) + shuffled_count = dinuc_count(shuffled_seq) + print("counts") + print(orig_count) + print(shuffled_count) + assert len(orig_count.keys())==len(shuffled_count.keys()) + for key in orig_count: + assert orig_count[key]==shuffled_count[key] diff --git a/tests/shuffling/test_dinuc_shuffle_on_onehot.py b/tests/shuffling/test_dinuc_shuffle_on_onehot.py new file mode 100644 index 0000000..a7c3bdb --- /dev/null +++ b/tests/shuffling/test_dinuc_shuffle_on_onehot.py @@ -0,0 +1,23 @@ +from deeplift.dinuc_shuffle import dinuc_shuffle +from dragonn.utils import get_sequence_strings +import random +import numpy as np + +import wget +url="http://mitra.stanford.edu/kundaje/projects/dragonn/deep_lift_input_classification_spi1.npy" +wget.download(url) +deep_lift_input_classification_spi1=np.load("deep_lift_input_classification_spi1.npy") +print(deep_lift_input_classification_spi1.shape) +deep_lift_input_classification_spi1_strings=get_sequence_strings(deep_lift_input_classification_spi1) + +for i in range(len(deep_lift_input_classification_spi1)): + random.seed(1234) + shuffled_strings=dinuc_shuffle(deep_lift_input_classification_spi1_strings[i]) + random.seed(1234) + shuffled_array=dinuc_shuffle(deep_lift_input_classification_spi1[i].squeeze()) + #decode the array + shuffled_array=''.join(get_sequence_strings(np.expand_dims(np.expand_dims(shuffled_array,axis=1),axis=1))) + #make sure shuffling the string and numpy array gave same shuffle output + if (shuffled_strings != shuffled_array): + print("FAILED!") +print("TEST PASSED!") From 093c25e163293bc02116b5f0693ec296c8df89fd Mon Sep 17 00:00:00 2001 From: Av Shrikumar Date: Thu, 7 Mar 2019 19:59:51 -0800 Subject: [PATCH 3/6] Delete test_dinuc_shuffle_on_onehot.py Because of the dragonn dependency, this particular test might make the travis build unhappy (and would also fail if we ever change the path of the wget'd files on mitra), so rather than go through the pain of making it compatible, I'm happy to just take your word for it that it works. --- .../shuffling/test_dinuc_shuffle_on_onehot.py | 23 ------------------- 1 file changed, 23 deletions(-) delete mode 100644 tests/shuffling/test_dinuc_shuffle_on_onehot.py diff --git a/tests/shuffling/test_dinuc_shuffle_on_onehot.py b/tests/shuffling/test_dinuc_shuffle_on_onehot.py deleted file mode 100644 index a7c3bdb..0000000 --- a/tests/shuffling/test_dinuc_shuffle_on_onehot.py +++ /dev/null @@ -1,23 +0,0 @@ -from deeplift.dinuc_shuffle import dinuc_shuffle -from dragonn.utils import get_sequence_strings -import random -import numpy as np - -import wget -url="http://mitra.stanford.edu/kundaje/projects/dragonn/deep_lift_input_classification_spi1.npy" -wget.download(url) -deep_lift_input_classification_spi1=np.load("deep_lift_input_classification_spi1.npy") -print(deep_lift_input_classification_spi1.shape) -deep_lift_input_classification_spi1_strings=get_sequence_strings(deep_lift_input_classification_spi1) - -for i in range(len(deep_lift_input_classification_spi1)): - random.seed(1234) - shuffled_strings=dinuc_shuffle(deep_lift_input_classification_spi1_strings[i]) - random.seed(1234) - shuffled_array=dinuc_shuffle(deep_lift_input_classification_spi1[i].squeeze()) - #decode the array - shuffled_array=''.join(get_sequence_strings(np.expand_dims(np.expand_dims(shuffled_array,axis=1),axis=1))) - #make sure shuffling the string and numpy array gave same shuffle output - if (shuffled_strings != shuffled_array): - print("FAILED!") -print("TEST PASSED!") From 4ece98a9c0ee5cb9caa7c2e6b7b4d4509affe5e8 Mon Sep 17 00:00:00 2001 From: Av Shrikumar Date: Thu, 7 Mar 2019 20:03:52 -0800 Subject: [PATCH 4/6] Removed seq_shape since it wasn't used anywhere --- deeplift/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deeplift/util.py b/deeplift/util.py index 0ee7fbf..59a5649 100644 --- a/deeplift/util.py +++ b/deeplift/util.py @@ -365,7 +365,6 @@ def compute_scores_with_shuffle_seq_refs( print(str(references_generated) +" reference seqs generated") if isinstance(seq,np.ndarray): - seq_shape=seq.shape seq=seq.squeeze() to_run_input_data_seqs.append(seq) to_run_input_data_refs.append(shuffle_func(seq)) From ab636c1ccb59e026f045184a90677b5b37b05d25 Mon Sep 17 00:00:00 2001 From: Av Shrikumar Date: Thu, 7 Mar 2019 20:04:33 -0800 Subject: [PATCH 5/6] Version bump for anna's dinuc shuff functionality --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4ce4e69..4d7efb4 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ Implements the methods in "Learning Important Features Through Propagating Activation Differences" by Shrikumar, Greenside & Kundaje, as well as other commonly-used methods such as gradients, guided backprop and integrated gradients. See https://github.com/kundajelab/deeplift for documentation and FAQ. """, url='https://github.com/kundajelab/deeplift', - version='0.6.8.1', + version='0.6.9.0', packages=['deeplift', 'deeplift.layers', 'deeplift.visualization', 'deeplift.conversion'], From e148939fd8d2cf2c1f873a024199f06b9c0f0a4d Mon Sep 17 00:00:00 2001 From: Av Shrikumar Date: Thu, 7 Mar 2019 20:05:03 -0800 Subject: [PATCH 6/6] Version bump for anna's dinuc shuff functionality --- deeplift.egg-info/PKG-INFO | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplift.egg-info/PKG-INFO b/deeplift.egg-info/PKG-INFO index bd2e22c..1f728c9 100644 --- a/deeplift.egg-info/PKG-INFO +++ b/deeplift.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: deeplift -Version: 0.6.8.1 +Version: 0.6.9.0 Summary: DeepLIFT (Deep Learning Important FeaTures) Home-page: https://github.com/kundajelab/deeplift License: UNKNOWN