Commit

Initial commit
rizwan09 committed Apr 5, 2017
0 parents commit b41e879
Showing 10 changed files with 307 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .idea/LanModeledProgramGeneartion-master.iml


14 changes: 14 additions & 0 deletions .idea/misc.xml


8 changes: 8 additions & 0 deletions .idea/modules.xml


9 changes: 9 additions & 0 deletions README.md
@@ -0,0 +1,9 @@
The task is to predict the next token based on the context we have seen so far.

Initial corpus: each sentence is a method.

For example:

public double_type method_name(int_array int_array_var) { double_type double_var = constant; for(int_type int_var = constant; int_var < int_array_var.size(); int_var++) { double_var += int_array_var[int_var]; } return double_var; }

Now we need to generate the next token from any given context.
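
For illustration only (not part of this commit), a toy bigram counter over the tokenized method above makes the next-token setup concrete; the hypothetical snippet below assumes the method has already been split on whitespace:

    from collections import Counter, defaultdict

    # hand-tokenized prefix of the example method above
    tokens = ("public double_type method_name ( int_array int_array_var ) { "
              "double_type double_var = constant ; for ( int_type int_var = constant ; "
              "int_var < int_array_var . size ( ) ; int_var ++ )").split()

    bigrams = defaultdict(Counter)
    for prev, nxt in zip(tokens, tokens[1:]):
        bigrams[prev][nxt] += 1

    # most frequent continuation after 'int_var' under this toy model
    print(bigrams['int_var'].most_common(1))

The actual model files (model.py, train.py) are still empty in this commit.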
105 changes: 105 additions & 0 deletions data.py
@@ -0,0 +1,105 @@
###############################################################################
# Author: Md Rizwan Parvez
# Project: LanModeledProgramGeneration
# Date Created: 4/1/2017
# Some code is adapted from Wasi Ahmad's data.py
# File Description: This script provides a definition of the corpus, each
# example in the corpus and the dictionary.
###############################################################################
from nltk.tokenize import word_tokenize
import numpy as np
import json, os
from util import sepearte_operator  # needed by Instance.add_sentence below


# # import util

# #### fix this: duplicated from util
# def sepearte_operator(x):
# x = x.replace('++', ' ++')
# x = x.replace('--', ' --')
# return x
# def tokenize(s):
# """Tokenize string."""
# token_list = []
# tokens = word_tokenize(s.lower())
# token_list.extend([x for x in tokens])
# return token_list


class Dictionary(object):
def __init__(self):
self.word2idx = {}
self.idx2word = []
# Create and store three special tokens
self.start_token = '<SOS>'
self.end_token = '<EOS>'
self.unknown_token = '<UNKNOWN>'
self.pad_token = '<PAD>'
self.idx2word.append(self.start_token)
self.word2idx[self.start_token] = len(self.idx2word) - 1
self.idx2word.append(self.end_token)
self.word2idx[self.end_token] = len(self.idx2word) - 1
self.idx2word.append(self.unknown_token)
self.word2idx[self.unknown_token] = len(self.idx2word) - 1
self.idx2word.append(self.pad_token)
self.word2idx[self.pad_token] = len(self.idx2word) - 1

def add_word(self, word):
word = word.lower()
if word not in self.word2idx:
self.idx2word.append(word)
self.word2idx[word] = len(self.idx2word) - 1
return self.word2idx[word]

    def contains(self, word):
        return word.lower() in self.word2idx

def __len__(self):
return len(self.idx2word)
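
# Illustrative usage (hypothetical example): the four special tokens occupy
# indices 0-3, so the first real word gets index 4, and lookups are
# case-insensitive.
#
#   d = Dictionary()
#   d.add_word('Return')   # -> 4
#   d.contains('RETURN')   # -> True
#   len(d)                 # -> 5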


class Instance(object):
def __init__(self):
self.sentence1 = []

def add_sentence(self, sentence, dictionary, is_test_instance=False):
        # sepearte_operator splits '++'/'--' off identifiers so that
        # word_tokenize keeps them as separate tokens
        words = [dictionary.start_token] + word_tokenize(sepearte_operator(sentence.lower())) + [dictionary.end_token]
if is_test_instance:
for i in range(len(words)):
                if not dictionary.contains(words[i]):
words[i] = dictionary.unknown_token
else:
for word in words:
dictionary.add_word(word.lower())

self.sentence1 = words


class Corpus(object):
def __init__(self, path):
self.dictionary = Dictionary()
self.max_sent_length = 0
        #### fix this: dev and test currently reuse the training corpus
self.train = self.parse(os.path.join(path, 'train_corpus.txt'))
self.dev = self.parse(os.path.join(path, 'train_corpus.txt'))
self.test = self.parse(os.path.join(path, 'train_corpus.txt'), True)

def parse(self, path, is_test_instance=False):
"""Parses the content of a file."""
assert os.path.exists(path)

samples = []
with open(path, 'r') as f:
for line in f:
instance = Instance()
                instance.add_sentence(line, self.dictionary, is_test_instance)
if self.max_sent_length < len(instance.sentence1):
self.max_sent_length = len(instance.sentence1)
samples.append(instance)

return samples
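
A quick way to exercise this file end to end (a hypothetical snippet, assuming a small train_corpus.txt with one method per line next to the script, and that NLTK's punkt tokenizer data is installed):

    from data import Corpus

    corpus = Corpus('./')                 # reads ./train_corpus.txt for train/dev/test
    print(len(corpus.dictionary))         # vocabulary size, incl. the 4 special tokens
    print(corpus.max_sent_length)         # longest method, counting <SOS>/<EOS>
    print(corpus.train[0].sentence1[:8])  # first tokens of the first method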
Empty file added encoder.py
Empty file.
Empty file added main.py
Empty file.
Empty file added model.py
Empty file.
Empty file added train.py
Empty file.
160 changes: 160 additions & 0 deletions util.py
@@ -0,0 +1,160 @@
from argparse import ArgumentParser
from numpy.linalg import norm
from nltk.tokenize import word_tokenize
import pickle
import sys
import os
import numpy as np

def get_args():
parser = ArgumentParser(description='attend_analyze_aggregate_nli')
parser.add_argument('--data', type=str, default='../data/snli_1.0/',
help='location of the training data')
parser.add_argument('--model', type=str, default='LSTM',
help='type of recurrent net (RNN_Tanh, RNN_RELU, LSTM, GRU)')
parser.add_argument('--bidirection', action='store_true',
help='use bidirectional recurrent unit')
parser.add_argument('--emsize', type=int, default=300,
help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=300,
help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=1,
help='number of layers')
parser.add_argument('--lr', type=float, default=.001,
help='initial learning rate')
parser.add_argument('--lr_decay', type=float, default=.95,
help='decay ratio for learning rate')
parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
#### fix this

parser.add_argument('--epochs', type=int, default=1,
help='upper limit of epoch')
parser.add_argument('--train_data', type=str, default='train_corpus.txt',
help='train corpus path')
#### fix this

parser.add_argument('--batch_size', type=int, default=2, metavar='N',
help='batch size')
    parser.add_argument('--bptt', type=int, default=10,
help='sequence length')
parser.add_argument('--dropout', type=float, default=0.25,
help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
help='tie the word embedding and softmax weights')
parser.add_argument('--sos_token', type=int, default=0,
help='index of the start of a sentence token')
parser.add_argument('--eos_token', type=int, default=1,
help='index of the end of a sentence token')
parser.add_argument('--max_length', type=int, default=10,
help='maximum length of a query')
parser.add_argument('--min_length', type=int, default=3,
help='minimum length of a query')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=1.0,
help='use the real target outputs as each next input, instead of using '
'the decoder\'s guess as the next input')
    parser.add_argument('--reverse_seq', action='store_true',
                        help='allow reverse sequence for seq2seq model')
parser.add_argument('--seed', type=int, default=1111,
help='random seed for reproducibility')
parser.add_argument('--cuda', action='store_true',
help='use CUDA for computation')
parser.add_argument('--gpu', type=int, default=1,
                        help='number of GPUs to use for computation')
parser.add_argument('--print_every', type=int, default=2000, metavar='N',
help='training report interval')
parser.add_argument('--plot_every', type=int, default=2000,
help='plotting interval')
parser.add_argument('--dev_every', type=int, default=500,
help='development report interval')
parser.add_argument('--save_every', type=int, default=500,
help='saving model interval')
parser.add_argument('--resume_snapshot', action='store_true',
help='resume previous execution')
parser.add_argument('--save_path', type=str, default='../output/',
help='path to save the final model')
parser.add_argument('--word_vectors_file', type=str, default='glove.840B.300d.txt',
help='GloVe word embedding version')
parser.add_argument('--word_vectors_directory', type=str, default='../data/glove/',
help='Path of GloVe word embeddings')
parser.add_argument('--data_path', default='./')

args = parser.parse_args()
return args
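
# Hypothetical invocation once main.py wires get_args() in (comment only;
# the flags shown are the ones defined above):
#   python main.py --data ./ --model LSTM --emsize 300 --nhid 300 --epochs 1 --cuda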


def save_object(obj, filename):
"""Save an object into file."""
with open(filename, 'wb') as output:
pickle.dump(obj, output)


def load_object(filename):
"""Load object from file."""
    with open(filename, 'rb') as f:
        obj = pickle.load(f)
    return obj


def initialize_out_of_vocab_words(dimension):
"""Returns a random vector of size dimension where mean is 0 and standard deviation is 1."""
return np.random.normal(size=dimension)


def sepearte_operator(x):
x = x.replace('++', ' ++')
x = x.replace('--', ' --')
return x
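
# Example (comment only): sepearte_operator('int_var++') returns 'int_var ++',
# so word_tokenize sees the increment operator as its own token.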





def normalize_word_embedding(v):
return np.array(v) / norm(np.array(v))


def load_word_embeddings(directory, file, dic):
# print (os.path.join(directory, file))
embeddings_index = {}
f = open(os.path.join(directory, file))
for line in f:
try:
values = line.split()
word = values[0]
            #### fix this: only keep vectors for words already in the dictionary
            if word in dic.word2idx:
embeddings_index[word] = normalize_word_embedding([float(x) for x in values[1:]])
except ValueError as e:
print(e)
f.close()
return embeddings_index
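
# For reference (comment only): each line of a GloVe text file is a token
# followed by its embedding values separated by spaces, e.g. (values made up)
#   sum 0.1375 -0.4820 0.0291 ... (300 floats for the 840B.300d release)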





def get_initial_embeddings(file_name, directory, file, dic):
    if os.path.isfile(file_name):
        print('========================== loading input matrix', file=sys.stderr)
        embeddings_index = pickle.load(open(file_name, 'rb'))
        print('========================== loading complete', file=sys.stderr)
    else:
        print('========================== no cached file!!! starting to generate now', file=sys.stderr)
        embeddings_index = load_word_embeddings(directory, file, dic)
        print('========================== generation complete, dumping now', file=sys.stderr)
        save_object(embeddings_index, file_name)
        print('========================== saving complete', file=sys.stderr)
    return embeddings_index

def batchify(data, bsz, cuda_true=True):
nbatch = len(data) // bsz
# Trim off any extra elements that wouldn't cleanly fit (remainders).
# data = data[0:nbatch * bsz]
# Evenly divide the data across the bsz batches.
# print (bsz)
# batched_data = [[data[bsz * i + j] for j in range(bsz)] for i in range(nbatch)]
batched_data = [data[bsz * i: bsz * (i + 1)] for i in range(nbatch)]
    if bsz * nbatch != len(data):
        # keep the leftover instances as a final, smaller batch
        batched_data.append(data[bsz * nbatch:])
    # print (batched_data)
    # NOTE: batched_data is a plain Python list of Instance objects; it cannot be
    # moved to the GPU directly. Convert batches to tensors first and call .cuda()
    # on those tensors when cuda_true is set.
    return batched_data  # num_batch x batch_size x instance
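
For reference, a minimal sketch of how batchify groups parsed instances (hypothetical usage; cuda_true is left False because the batches are plain Python lists rather than tensors):

    from data import Corpus
    from util import batchify

    corpus = Corpus('./')
    batches = batchify(corpus.train, bsz=2, cuda_true=False)
    print(len(batches), 'batches')
    print(len(batches[0]), 'instances in the first batch')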
