Skip to content

Commit

Permalink
Merge pull request #78 from annashcherbina/master
Browse files Browse the repository at this point in the history
Functionality to dinucleotide-shuffle one-hot inputs and to provide one-hot-encoded inputs to DeepLIFT with shuffled references
  • Loading branch information
AvantiShri authored Mar 8, 2019
2 parents 0ecfe7c + e148939 commit f143790
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 18 deletions.
2 changes: 1 addition & 1 deletion deeplift.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: deeplift
Version: 0.6.8.1
Version: 0.6.9.0
Summary: DeepLIFT (Deep Learning Important FeaTures)
Home-page: https://github.com/kundajelab/deeplift
License: UNKNOWN
Expand Down
15 changes: 10 additions & 5 deletions deeplift/dinuc_shuffle.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
def prepare_edges(s):
    """Build the dinucleotide edge multigraph for sequence `s`.

    Maps each symbol (keyed as a tuple so that both single characters and
    one-hot numpy rows are hashable) to the list of symbols that follow it,
    with multiplicity preserved.  The diff's pre-change line keyed on the raw
    symbol, which fails for unhashable ndarray rows; only the tuple-keyed
    form is kept here.

    Parameters
    ----------
    s : str or sequence of one-hot rows (e.g. 2-D numpy array)

    Returns
    -------
    defaultdict mapping tuple(symbol) -> list of successor symbols
    """
    edges = defaultdict(list)
    for i in range(len(s)-1):
        edges[tuple(s[i])].append(s[i+1])
    return edges


Expand All @@ -26,11 +26,16 @@ def traverse_edges(s, edges):
edges_queue_pointers = defaultdict(lambda: 0)
for i in range(len(s)-1):
last_char = generated[-1]
generated.append(edges[last_char][edges_queue_pointers[last_char]])
edges_queue_pointers[last_char] += 1
return "".join(generated)
generated.append(edges[tuple(last_char)][edges_queue_pointers[tuple(last_char)]])
edges_queue_pointers[tuple(last_char)] += 1
if isinstance(generated[0],str):
return "".join(generated)
else:
import numpy as np
return np.asarray(generated)


def dinuc_shuffle(s):
    """Return a random shuffle of `s` that preserves dinucleotide counts.

    Accepts either a string sequence (e.g. "ACGT...") or an already
    one-hot-encoded numpy array of per-position rows.  Upper-casing is only
    applied to strings — the diff's pre-change unconditional `s.upper()`
    would raise AttributeError on ndarray input, so the guard is essential.

    Parameters
    ----------
    s : str or 2-D numpy array of one-hot rows

    Returns
    -------
    Same type as the input: a shuffled string, or an ndarray of the
    shuffled rows, with all dinucleotide (adjacent-pair) counts preserved.
    """
    if isinstance(s, str):
        s = s.upper()
    return traverse_edges(s, shuffle_edges(prepare_edges(s)))
32 changes: 21 additions & 11 deletions deeplift/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,8 @@ def hypothetical_contribs_func(task_idx,


def get_shuffle_seq_ref_function(score_computation_function,
shuffle_func, one_hot_func):
shuffle_func, one_hot_func=None):

def compute_scores_with_shuffle_seq_refs(
task_idx, input_data_sequences, num_refs_per_seq,
batch_size, seed=1, progress_update=None):
Expand All @@ -363,27 +364,36 @@ def compute_scores_with_shuffle_seq_refs(
references_generated%progress_update==0):
print(str(references_generated)
+" reference seqs generated")
if isinstance(seq,np.ndarray):
seq=seq.squeeze()
to_run_input_data_seqs.append(seq)
to_run_input_data_refs.append(shuffle_func(seq))
if (progress_update is not None):
print("One hot encoding sequences...")
input_data_list = [one_hot_func(to_run_input_data_seqs)]
input_references_list = [one_hot_func(to_run_input_data_refs)]
if (progress_update is not None):
print("One hot encoding done...")

to_run_input_data_refs.append(shuffle_func(seq))
if one_hot_func is not None:
if (progress_update is not None):
print("One hot encoding sequences...")
input_data_list = [one_hot_func(to_run_input_data_seqs)]
input_references_list = [one_hot_func(to_run_input_data_refs)]
if (progress_update is not None):
print("One hot encoding done...")
else:
#the data is already one-hot encoded
input_shape=list(input_data_sequences.shape)
input_shape[0]=input_shape[0]*num_refs_per_seq
input_shape=tuple(input_shape)
input_data_list = [np.reshape(np.asarray(to_run_input_data_seqs),input_shape)]
input_references_list = [np.reshape(np.asarray(to_run_input_data_refs),input_shape)]
computed_scores = np.array(score_computation_function(
task_idx=task_idx,
input_data_list=input_data_list,
input_references_list=input_references_list,
batch_size=batch_size,
progress_update=progress_update))

computed_scores = np.reshape(
computed_scores,
[len(input_data_sequences),
num_refs_per_seq]
+list(input_data_list[0].shape[1:]))
+list(input_data_list[0].shape[1:]))

#take the mean over all the refs
mean_scores = np.mean(computed_scores,axis=1)
return mean_scores
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
Implements the methods in "Learning Important Features Through Propagating Activation Differences" by Shrikumar, Greenside & Kundaje, as well as other commonly-used methods such as gradients, guided backprop and integrated gradients. See https://github.com/kundajelab/deeplift for documentation and FAQ.
""",
url='https://github.com/kundajelab/deeplift',
version='0.6.8.1',
version='0.6.9.0',
packages=['deeplift',
'deeplift.layers', 'deeplift.visualization',
'deeplift.conversion'],
Expand Down
30 changes: 30 additions & 0 deletions tests/shuffling/test_dinuc_shuffle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import unittest
from deeplift.dinuc_shuffle import dinuc_shuffle
from itertools import permutations
from collections import defaultdict
import random

def dinuc_count(seq):
    """Count the overlapping dinucleotides of `seq`.

    A sequence of length n has n-1 overlapping dinucleotides, at start
    indices 0..n-2, so the loop must run over range(len(seq)-1).  The
    original used range(len(seq)-2), silently dropping the final
    dinucleotide from every count (off-by-one).

    Returns
    -------
    defaultdict mapping each 2-character slice to its occurrence count.
    """
    count = defaultdict(lambda: 0)
    for i in range(len(seq)-1):
        count[seq[i:i+2]] += 1
    return count

class TestDinucShuffle(unittest.TestCase):
    """Property test: dinuc_shuffle must preserve dinucleotide counts."""

    def test_dinuc_shuffle(self):
        """Shuffle many random sequences and compare dinucleotide counts.

        Uses self.assertEqual (instead of bare assert) so unittest reports
        the differing values on failure, which also makes the original
        per-iteration debug prints unnecessary.
        """
        for _ in range(1000):
            # random.choice is the idiomatic replacement for
            # ['A','C','G','T'][int(random.random()*4)]
            random_sequence = "".join(
                random.choice("ACGT") for _ in range(200))
            shuffled_seq = dinuc_shuffle(random_sequence)
            orig_count = dinuc_count(random_sequence)
            shuffled_count = dinuc_count(shuffled_seq)
            # Same set of dinucleotides must appear...
            self.assertEqual(sorted(orig_count.keys()),
                             sorted(shuffled_count.keys()))
            # ...each with exactly the same multiplicity.
            for key in orig_count:
                self.assertEqual(orig_count[key], shuffled_count[key])

0 comments on commit f143790

Please sign in to comment.