pyparseface.py

import os
import sys
import time
import asciitree
import collections

from concurrent.futures import ThreadPoolExecutor

PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))

runfiles_path = os.path.join(PROJECT_ROOT, 'models', 'syntaxnet', 'bazel-bin', 'syntaxnet', 'parser_eval.runfiles')
tensorflow_path = os.path.join(PROJECT_ROOT, 'models', 'syntaxnet', 'bazel-bin', 'syntaxnet', 'parser_eval.runfiles',
                               'external', 'tf')

sys.path.append(runfiles_path)
sys.path.append(tensorflow_path)

import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from syntaxnet import sentence_pb2, structured_graph_builder
from syntaxnet.ops import gen_parser_ops


input_file_path = os.path.join(PROJECT_ROOT, "input-file.txt")
output_file_path = os.path.join(PROJECT_ROOT, "output-file.txt")
parser_path = os.path.join(PROJECT_ROOT, 'models', 'syntaxnet', 'bazel-bin', 'syntaxnet', 'parser_eval')
mcparseface_path = os.path.join(PROJECT_ROOT, 'models', 'syntaxnet', 'syntaxnet', 'models', 'parsey_mcparseface')
tagger_params_path = os.path.join(mcparseface_path, 'tagger-params')
parser_params_path = os.path.join(mcparseface_path, 'parser-params')
task_context_path = os.path.join(PROJECT_ROOT, "custom_context.pbtxt")

"""
PARSER_EVAL=bazel-bin/syntaxnet/parser_eval
MODEL_DIR=syntaxnet/models/parsey_mcparseface
[[ "$1" == "--conll" ]] && INPUT_FORMAT=stdin-conll || INPUT_FORMAT=stdin

$PARSER_EVAL \
  --input=$INPUT_FORMAT \
  --output=stdout-conll \
  --hidden_layer_sizes=64 \
  --arg_prefix=brain_tagger \
  --graph_builder=structured \
  --task_context=$MODEL_DIR/context.pbtxt \
  --model_path=$MODEL_DIR/tagger-params \
  --slim_model \
  --batch_size=1024 \
  --alsologtostderr \
   | \
  $PARSER_EVAL \
  --input=stdin-conll \
  --output=stdout-conll \
  --hidden_layer_sizes=512,512 \
  --arg_prefix=brain_parser \
  --graph_builder=structured \
  --task_context=$MODEL_DIR/context.pbtxt \
  --model_path=$MODEL_DIR/parser-params \
  --slim_model \
  --batch_size=1024 \
  --alsologtostderr \
  | \
  bazel-bin/syntaxnet/conll2tree \
  --task_context=$MODEL_DIR/context.pbtxt \
  --alsologtostderr
"""


def parse_sentence(sentence):
  tagged_sentence = _perform_action_in_thread(action="brain_tagger", sentence=sentence)

  parsed_sentence = _perform_action_in_thread(action="brain_parser", sentence=tagged_sentence)

  sentence_dict = _get_sentence_dict_in_thread(parsed_sentence)

  return sentence_dict


def to_dict(sentence):
  """Builds a dictionary representing the parse tree of a sentence.

  Args:
    sentence: Sentence protocol buffer to represent.
  Returns:
    Dictionary mapping tokens to children.
  """
  token_str = ['%s %s %s' % (token.word, token.tag, token.label)
               for token in sentence.token]
  children = [[] for token in sentence.token]
  root = -1
  for i in range(0, len(sentence.token)):
    token = sentence.token[i]
    if token.head == -1:
      root = i
    else:
      children[token.head].append(i)

  def _get_dict(i):
    d = collections.OrderedDict()
    for c in children[i]:
      d[token_str[c]] = _get_dict(c)
    return d

  tree = collections.OrderedDict()
  tree[token_str[root]] = _get_dict(root)
  return tree


def _get_sentence_dict_in_thread(sentence_input):
  # The file doesn't get written unless the calling thread dies.
  # So this is a hack to be able to read from the file we are writing to.
  input_file = open(input_file_path, mode="w")
  input_file.write(sentence_input)
  input_file.close()

  executor = ThreadPoolExecutor(max_workers=1)
  future = executor.submit(_get_sentence_dict)
  return future.result()


def _get_sentence_dict():
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name='input-from-file-conll',
                                         task_context=task_context_path)
    sentence = sentence_pb2.Sentence()
    result_dict = None
    while True:
      documents, finished = sess.run(src)
      for d in documents:
        sentence.ParseFromString(d)
        d = to_dict(sentence)
        result_dict = d

      if finished:
        break

  return result_dict


def _perform_action_in_thread(action=None, sentence=None):
  # The file doesn't get written unless the calling thread dies.
  # So this is a hack to be able to read from the file we are writing to.
  input_file = open(input_file_path, mode="w")
  input_file.write(sentence)
  input_file.close()

  executor = ThreadPoolExecutor(max_workers=1)
  future = executor.submit(_perform_action, action)
  future.result()

  output_file = open(output_file_path, mode="r")
  result = output_file.read()
  output_file.close()

  return result


def _perform_action(action=None):
  arg_prefix = action
  task_context = task_context_path

  if action == "brain_tagger":
    hidden_layer_sizes = [64]
    model_path = tagger_params_path
    output = 'output-to-file'
    input = 'input-from-file'
  elif action == "brain_parser":
    hidden_layer_sizes = [512, 512]
    model_path = parser_params_path
    output = 'output-to-file-conll'
    input = 'input-from-file-conll'
  else:
    raise Exception("Do not recognize action %s" % action)

  with tf.Session() as sess:
    feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run(
      gen_parser_ops.feature_size(task_context=task_context,
                                  arg_prefix=arg_prefix))

  beam_size = 8
  max_steps = 1000
  batch_size = 1024
  slim_model = True

  parser = structured_graph_builder.StructuredGraphBuilder(
    num_actions,
    feature_sizes,
    domain_sizes,
    embedding_dims,
    hidden_layer_sizes,
    gate_gradients=True,
    arg_prefix=arg_prefix,
    beam_size=beam_size,
    max_steps=max_steps)

  parser.AddEvaluation(task_context,
                       batch_size,
                       corpus_name=input,
                       evaluation_max_steps=max_steps)

  with tf.Session() as sess:

    parser.AddSaver(slim_model)
    sess.run(parser.inits.values())
    parser.saver.restore(sess, model_path)

    sink_documents = tf.placeholder(tf.string)
    sink = gen_parser_ops.document_sink(sink_documents,
                                        task_context=task_context,
                                        corpus_name=output)
    t = time.time()
    num_epochs = None
    num_tokens = 0
    num_correct = 0
    num_documents = 0
    while True:
      tf_eval_epochs, tf_eval_metrics, tf_documents = sess.run([
        parser.evaluation['epochs'],
        parser.evaluation['eval_metrics'],
        parser.evaluation['documents'],
      ])

      if len(tf_documents):
        logging.info('Processed %d documents', len(tf_documents))
        num_documents += len(tf_documents)
        sess.run(sink, feed_dict={sink_documents: tf_documents})

      num_tokens += tf_eval_metrics[0]
      num_correct += tf_eval_metrics[1]
      if num_epochs is None:
        num_epochs = tf_eval_epochs
      elif num_epochs < tf_eval_epochs:
        break

    logging.info('Total processed documents: %d', num_documents)
    if num_tokens > 0:
      eval_metric = 100.0 * num_correct / num_tokens
      logging.info('num correct tokens: %d', num_correct)
      logging.info('total tokens: %d', num_tokens)
      logging.info('Seconds elapsed in evaluation: %.2f, '
                   'eval metric: %.2f%%', time.time() - t, eval_metric)

"""
/usr/local/bin/python /Users/plowman/projects/parseface/models/syntaxnet/bazel-bin/syntaxnet/parser_eval.runfiles/syntaxnet/parser_eval.py
--input=stdin --output=stdout-conll --hidden_layer_sizes=64 --arg_prefix=brain_tagger
--graph_builder=structured --task_context=syntaxnet/models/parsey_mcparseface/context.pbtxt
--model_path=syntaxnet/models/parsey_mcparseface/tagger-params --slim_model --batch_size=1024
--alsologtostderr
"""


def pretty_print_dict(input_dict):
  tr = asciitree.LeftAligned()
  print tr(input_dict)


if __name__ == "__main__":
  parsed_sentence_dict = parse_sentence("Maybe there was once a human who looked like you, and somewhere along the "
                                        "line you killed him and took his place. And your superiors don't know.")
  print("OrderedDict: %s" % parsed_sentence_dict)
  pretty_print_dict(parsed_sentence_dict)