
loop_function


Summary

  • At each decoding step, we choose the top k (= beam_size) best symbols (see the sketch after this list).
  • Questions
    • Who eventually chooses the best hypothesis?
    • Is this necessary for training?
    • emb_prev naturally comes out with shape [beam_size, emb_vec].
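
A minimal NumPy sketch of the "top k symbols" step from the first bullet: scores for beam_size hypotheses over num_symbols words are flattened, the top beam_size entries are picked, and each flat index is split into (which word, which parent hypothesis). The variable names here (scores, top, parents) are illustrative only, not from the code below.

```python
import numpy as np

beam_size, num_symbols = 2, 5

# log-probs of each symbol under each of the beam_size current hypotheses
scores = np.array([[-0.9, -2.1, -1.2, -3.0, -2.5],   # hypothesis 0
                   [-1.1, -0.7, -2.4, -1.9, -3.3]])  # hypothesis 1

flat = scores.reshape(-1)                 # shape (beam_size * num_symbols,)
top = np.argsort(flat)[::-1][:beam_size]  # flat indices of the top beam_size scores

symbols = top % num_symbols    # which word in the vocabulary
parents = top // num_symbols   # which hypothesis each word extends

print(symbols, parents)        # [1 0] [1 0]
```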

Code with comments

# Say beam_size = 2
# i = 1
#     probs = [0.4, 0.2, 0.3, ..., 0.11] shape=(1, num_symbols)
#     best_probs = [0.4, 0.3] shape=(1, beam_size)
#     indices = [0, 2] shape=(1, beam_size)
#     indices = [0; 2] shape=(2, 1)
#     best_probs = [0.4; 0.3] shape=(2, 1)
#     log_beam_probs = [[0.4; 0.3]] list of shape(2, 1)
#
# i = 2
#     probs = [0.01, 0.9, 0.4, ..., 0.11] shape=(1, num_symbols)
#     probs = [probs + log_beam_probs[-1]]
#       probs = [[0.01, 0.9, 0.4, ..., 0.11] + [0.4; 0.3]]
#       probs = [[0.01 + 0.4, 0.9 + 0.4 , 0.4 + 0.4, ..., 0.11 + 0.4];
#                [0.01 + 0.3, 0.9 + 0.3, 0.4 + 0.3, ..., 0.11 + 0.3]]
#       reshape
#       probs = [0.01 + 0.4, 0.9 + 0.4 , 0.4 + 0.4, ..., 0.11 + 0.4, 0.01 + 0.3, 0.9 + 0.3 , 0.3 + 0.3, ..., 0.11 + 0.3]
#             = [0.41, 1.3, 0.8, ..., 0.51, 0.31, 1.2, 0.7, ..., 0.41] shape=(1, num_symbols * beam_size)
#     best_probs = [1.3, 1.2]
#       reshape = [1.3; 1.2]
#     indices = [1, 102]
#       
def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(
            prev, output_projection[0], output_projection[1])

    probs  = tf.log(tf.nn.softmax(prev))

    if i > 1:

        probs = tf.reshape(probs + log_beam_probs[-1],
                           [-1, beam_size * num_symbols])

    best_probs, indices = tf.nn.top_k(probs, beam_size)
    indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1])))
    best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1]))

    symbols = indices % num_symbols # Which word in vocabulary.
    beam_parent = indices // num_symbols # Which hypothesis it came from.


    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(best_probs)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.

    emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    emb_prev = tf.reshape(emb_prev, [beam_size, embedding_size])
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
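
The broadcasting inside the `i > 1` branch can be checked with plain NumPy; the sketch below reuses the made-up numbers from the comment walkthrough (real values would be negative, since they are log probabilities):

```python
import numpy as np

beam_size, num_symbols = 2, 4

probs = np.array([[0.01, 0.9, 0.4, 0.11]])   # current step, shape (1, num_symbols)
log_beam_probs = np.array([[0.4], [0.3]])    # accumulated scores, shape (beam_size, 1)

# (1, num_symbols) + (beam_size, 1) broadcasts to (beam_size, num_symbols),
# then reshape flattens it to (1, beam_size * num_symbols)
flat = (probs + log_beam_probs).reshape(1, beam_size * num_symbols)
print(flat)   # [[0.41 1.3  0.8  0.51 0.31 1.2  0.7  0.41]]  (approximately)

# top_k over the flattened scores, as in loop_function above
top = np.argsort(flat[0])[::-1][:beam_size]
print(flat[0][top], top)   # best_probs ≈ [1.3 1.2], indices = [1 5]
```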


--- /Users/higepon/Desktop/a.py	2017-07-09 15:05:16.000000000 +0900
+++ /Users/higepon/Desktop/b.py	2017-07-09 15:05:30.000000000 +0900
@@ -1,11 +1,53 @@
-    def loop_function(prev, _):
+    def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
         if output_projection is not None:
             prev = nn_ops.xw_plus_b(
                 prev, output_projection[0], output_projection[1])
-        prev_symbol = math_ops.argmax(prev, 1)
+        # prev= prev.get_shape().with_rank(2)[1]
+
+        # Say k = beam_size = 2
+        # convert prev output to probability(?)
+        # i = 1
+        #  probs = [0.3, 0.2, 0.4, ..., 0.12] len = num_symbols
+        #  best_probs = [0.3, 0.4] =>reshape=> [0.3, 0.4]
+        #  indices = [0, 2] =>reshape=> [0, 2]
+        #  log_beam_probs = [] => [[0.3, 0.4]]
+        # i = 2
+        #  probs = [0.99, 0.89, 0.4, ..., 0.001] len = num_symbols
+        #  probs = [[0.99, 0.89, 0.4, ..., 0.001] + [0.3, 0.4]]
+        # best_probs = [0.99, 0.89]
+        # indices = [0, 1]
+        #  probs = [[current probs],
+        #           [top k of prev]]
+        #  top_k returns top k for each row
+   
+        probs  = tf.log(tf.nn.softmax(prev))
+
+        if i > 1:
+
+            # reshape current_probs + previous_probs
+            # we can add the probabilities, because they are logs!!!
+            probs = tf.reshape(probs + log_beam_probs[-1],
+                               [-1, beam_size * num_symbols])
+
+        # Pick top k probs and indices, but why are we searching in the past probs?
+        best_probs, indices = tf.nn.top_k(probs, beam_size)
+        indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1])))
+        best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1]))
+
+        symbols = indices % num_symbols # Which word in vocabulary.
+        beam_parent = indices // num_symbols # Which hypothesis it came from.
+
+
+        beam_symbols.append(symbols)
+        beam_path.append(beam_parent)
+        log_beam_probs.append(best_probs)
+
         # Note that gradients will not propagate through the second parameter of
         # embedding_lookup.
-        emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
+
+        emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
+        emb_prev  = tf.reshape(emb_prev ,[beam_size ,embedding_size])
+        # emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
         if not update_embedding:
             emb_prev = array_ops.stop_gradient(emb_prev)
         return emb_prev
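
On the open question above of who eventually chooses the best hypothesis: this loop_function only records the per-step choices, and one common way to finish (not shown in this page's code) is to backtrack through beam_path and beam_symbols after the final step. A rough sketch, assuming the recorded tensors have been evaluated into plain Python lists of length-beam_size integer arrays:

```python
def backtrack(beam_path, beam_symbols, beam_size):
    """Recover one token sequence per beam slot by following parent pointers.

    beam_path[t][b]    : parent hypothesis index at step t for beam slot b
    beam_symbols[t][b] : symbol chosen at step t for beam slot b
    """
    num_steps = len(beam_symbols)
    sequences = []
    for b in range(beam_size):
        tokens = []
        cur = b                       # start from beam slot b at the last step
        for t in reversed(range(num_steps)):
            tokens.append(beam_symbols[t][cur])
            cur = beam_path[t][cur]   # move to the slot this token extended
        sequences.append(list(reversed(tokens)))
    return sequences
```

The sequence whose final entry in log_beam_probs is largest would then be the overall best hypothesis.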