
Commit

Add files via upload
aylai authored Jan 11, 2021
0 parents commit 4bf98fa
Showing 18 changed files with 2,533 additions and 0 deletions.
108 changes: 108 additions & 0 deletions DocumentWithCliques.py
@@ -0,0 +1,108 @@
import torch
from nltk import word_tokenize
import random

is_cuda = torch.cuda.is_available()


class DocumentWithCliques(object):

def __init__(self, orig_sentences, clique_size, permutation_indices=None, id='', label=None):
self.id = id
self.clique_size = clique_size
self.orig_sentences = []
for sent in orig_sentences:
sent = sent.strip()
if sent == "":
continue
if sent == "<d>" or sent == "</d>":
self.orig_sentences.append([sent])
else:
self.orig_sentences.append(word_tokenize(sent))
self.permutation_indices = []
if permutation_indices is not None:
self.permutation_indices = permutation_indices # index into orig_sentences
self.index_sentences = [] # token-indexed version of self.orig_sentences
self.orig_full_sequence = None
self.perm_doc_sentences = None # sentence-list version of each permutation
self.perm_full_sequences = None
self.label = label

# turn full doc into flat sequence of word indices
def get_orig_full_sequence(self):
if self.orig_full_sequence is not None:
return self.orig_full_sequence
self.orig_full_sequence = []
for sent in self.index_sentences:
self.orig_full_sequence.extend(sent)
return self.orig_full_sequence

# turn all doc permutations into sentence lists of word indices
def get_perm_index_sentences(self):
if self.perm_doc_sentences is not None:
return self.perm_doc_sentences
self.perm_doc_sentences = []
for perm in self.permutation_indices:
doc_temp = []
for sent_idx in perm:
doc_temp.append(self.index_sentences[sent_idx])
self.perm_doc_sentences.append(doc_temp)
return self.perm_doc_sentences

# turn all doc permutations into flat sequences of word indices
def get_perm_full_sequences(self):
if self.perm_full_sequences is not None:
return self.perm_full_sequences
self.perm_full_sequences = []
for perm in self.permutation_indices:
doc_temp = []
for sent_idx in perm:
doc_temp.extend(self.index_sentences[sent_idx])
self.perm_full_sequences.append(doc_temp)
return self.perm_full_sequences

def create_cliques_orig(self): # assume self.index_sentences is non-empty
self.orig_cliques = []
self.orig_cliques_index = []
for i in range(len(self.index_sentences) - self.clique_size + 1):
clique = []
clique_index = []
for j in range(self.clique_size):
clique.append(self.index_sentences[i + j])
clique_index.append(i+j)
self.orig_cliques.append(clique)
self.orig_cliques_index.append(clique_index)

# randomly create negative cliques from the original document sentences
def create_cliques_neg(self):
self.neg_cliques = []
for orig_clique in self.orig_cliques_index: # negative example for each window: replace center sentence
if len(self.orig_cliques_index) == 1:
break # no possible negative cliques for this doc
valid_sentences = {}
for sent_idx in orig_clique:
valid_sentences[sent_idx] = 1
valid_sentences[0] = 1 # don't allow <d> pad
valid_sentences[len(self.index_sentences) - 1] = 1 # don't allow </d> pad
if len(valid_sentences) == len(self.index_sentences):
continue # no possible negative cliques for this positive clique
center_idx = int(len(orig_clique) / 2)
new_sent = random.randrange(len(self.index_sentences))
while new_sent in valid_sentences:
new_sent = random.randrange(len(self.index_sentences))
neg_clique = []
for sent_idx in orig_clique:
neg_clique.append(self.index_sentences[sent_idx])
neg_clique[center_idx] = self.index_sentences[new_sent]
self.neg_cliques.append(neg_clique)

# create cliques for predefined permutations of this document
def create_cliques_perm(self):
self.perm_cliques = []
for perm in self.permutation_indices:
cliques = []
for i in range(len(perm) - self.clique_size + 1):
clique = []
for j in range(self.clique_size):
clique.append(self.index_sentences[perm[i + j]])
cliques.append(clique)
self.perm_cliques.append(cliques)
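
Note (not part of the diff): a minimal usage sketch for DocumentWithCliques. The toy sentences, the throwaway vocab dict, and manually filling index_sentences are assumptions standing in for the repo's data loader; NLTK's punkt tokenizer data must be installed for word_tokenize.

from DocumentWithCliques import DocumentWithCliques

# Toy document with the <d>/</d> boundary sentences this class treats specially.
sentences = ["<d>", "The cat sat down.", "It fell asleep.", "The dog barked.", "</d>"]
doc = DocumentWithCliques(sentences, clique_size=3, id='toy_doc')

# index_sentences is normally filled by the data pipeline; a throwaway vocab
# (hypothetical) is enough to exercise the clique construction here.
vocab = {}
doc.index_sentences = [[vocab.setdefault(tok, len(vocab)) for tok in sent]
                       for sent in doc.orig_sentences]

doc.create_cliques_orig()  # sliding windows of 3 consecutive sentences
doc.create_cliques_neg()   # same windows with the center sentence swapped at random
print(len(doc.orig_cliques), len(doc.neg_cliques))  # 3 positive cliques, 2 negatives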
44 changes: 44 additions & 0 deletions DocumentWithParagraphs.py
@@ -0,0 +1,44 @@
from nltk import sent_tokenize, word_tokenize


class DocumentWithParagraphs(object):

def __init__(self, text_with_line_breaks, label=None, orig_sentences=None, permutation_indices=None, id=''):
self.id = id
self.text = []
self.text_indexed = []
self.label = label
lines = text_with_line_breaks.splitlines()
for line in lines:
line = line.strip()
if line != "": # this is a paragraph
paragraph = []
sents = sent_tokenize(line)
for sent in sents:
words = word_tokenize(sent)
paragraph.append(words)
self.text.append(paragraph)
self.orig_sentences = []
if orig_sentences is not None:
self.orig_sentences = orig_sentences
self.permutation_indices = []
if permutation_indices is not None:
self.permutation_indices = permutation_indices

def get_paragraphs(self):
return self.text_indexed

def get_sentences(self):
sentences = []
for paragraph in self.text_indexed:
for sent in paragraph:
sentences.append(sent)
return sentences

def get_words(self):
words = []
for paragraph in self.text_indexed:
for sent in paragraph:
for word in sent:
words.append(word)
return words
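
Note (not part of the diff): a minimal usage sketch for DocumentWithParagraphs. The raw text and throwaway vocab are assumptions; text_indexed is normally populated by the data pipeline with word ids mirroring the paragraph/sentence nesting of self.text.

from DocumentWithParagraphs import DocumentWithParagraphs

raw = "The cat sat down. It fell asleep.\n\nThe dog barked."
doc = DocumentWithParagraphs(raw, label=1)
print(len(doc.text), len(doc.text[0]))  # 2 paragraphs, 2 sentences in the first

# Fill text_indexed with a throwaway vocabulary so the accessors have data to return.
vocab = {}
doc.text_indexed = [[[vocab.setdefault(w, len(vocab)) for w in sent] for sent in par]
                    for par in doc.text]
print(len(doc.get_sentences()), len(doc.get_words()))  # 3 sentences, 13 tokens in total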
69 changes: 69 additions & 0 deletions LSTMClique.py
@@ -0,0 +1,69 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor


class LSTMClique(nn.Module):

def __init__(self, params, data_obj):
super(LSTMClique, self).__init__()
self.embedding_dim = params['embedding_dim']
self.hidden_dim = params['hidden_dim']
self.lstm_dim = params['lstm_dim']
self.dropout = params['dropout']
self.clique_size = params['clique_size']
self.embeddings = data_obj.word_embeds
self.lstm = nn.LSTM(self.embedding_dim, self.lstm_dim)
self.hidden = None
self.clique_layer = nn.Linear(params['clique_size'] * self.lstm_dim, self.hidden_dim)
nn.init.xavier_uniform(self.clique_layer.weight, gain=nn.init.calculate_gain('tanh'))
self.task = params['task']
if params['task'] == 'perm':
num_labels = 2
elif params['task'] == 'minority':
num_labels = 2
elif params['task'] == 'class':
num_labels = 3
elif params['task'] == 'score_pred':
num_labels = 1
self.predict_layer = nn.Linear(self.hidden_dim, num_labels)
nn.init.xavier_uniform(self.predict_layer.weight, gain=nn.init.calculate_gain('sigmoid'))
if USE_CUDA:
self.clique_layer = self.clique_layer.cuda()
self.predict_layer = self.predict_layer.cuda()

def init_hidden(self, batch_size):
if USE_CUDA:
return (Variable(torch.zeros(1, batch_size, self.lstm_dim).cuda()),
Variable(torch.zeros(1, batch_size, self.lstm_dim)).cuda())
else:
return (Variable(torch.zeros(1, batch_size, self.lstm_dim)),
Variable(torch.zeros(1, batch_size, self.lstm_dim)))

def forward(self, inputs, input_lengths, original_index): # now with cliques
lstm_out = None
for i in range(self.clique_size): # send each sentence x batch through LSTM
self.hidden = self.init_hidden(len(input_lengths[i]))
seq_tensor = self.embeddings(inputs[i])
packed_input = pack_padded_sequence(seq_tensor, input_lengths[i], batch_first=True)
packed_output, (ht, ct) = self.lstm(packed_input, self.hidden)
# reorder
final_output = ht[-1]
odx = original_index[i].view(-1, 1).expand(len(input_lengths[i]), final_output.size(-1))
output_unsorted = torch.gather(final_output, 0, Variable(odx))
if lstm_out is None:
lstm_out = output_unsorted
else:
lstm_out = torch.cat([lstm_out, output_unsorted], dim=1)
clique_vector = F.tanh(self.clique_layer(lstm_out))
clique_vector = F.dropout(clique_vector, p=self.dropout, training=self.training)
coherence_pred = self.predict_layer(clique_vector)
if self.task != 'score_pred':
coherence_pred = F.softmax(coherence_pred, dim=0)
return coherence_pred
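
Note (not part of the diff): a minimal forward-pass sketch for LSTMClique. The params values, the toy embedding table in ToyData, and the hand-built batch of two 3-sentence cliques are assumptions about how the training script drives this module; the sketch also assumes a CPU run, since the inputs are not moved to the GPU here.

import torch
import torch.nn as nn
from LSTMClique import LSTMClique

class ToyData(object):
    def __init__(self):
        self.word_embeds = nn.Embedding(100, 50)  # stand-in vocabulary of 100 word ids

params = {'embedding_dim': 50, 'hidden_dim': 32, 'lstm_dim': 64,
          'dropout': 0.3, 'clique_size': 3, 'task': 'perm'}
model = LSTMClique(params, ToyData())

# One entry per clique position: a padded (batch x max_len) id tensor sorted by
# descending sentence length, the matching lengths, and the index that restores
# the original batch order after sorting.
inputs = [torch.randint(0, 100, (2, 5)) for _ in range(3)]
input_lengths = [[5, 4] for _ in range(3)]
original_index = [torch.tensor([0, 1]) for _ in range(3)]

scores = model(inputs, input_lengths, original_index)
print(scores.size())  # (2, 2): class scores for the two cliques in the batch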
96 changes: 96 additions & 0 deletions LSTMParSeq.py
@@ -0,0 +1,96 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor

# todo this whole class
class LSTMParSeq(nn.Module):

def __init__(self, params, data_obj):
super(LSTMParSeq, self).__init__()
self.data_obj = data_obj
self.task = params['task']
self.embedding_dim = params['embedding_dim']
self.hidden_dim = params['hidden_dim']
self.lstm_dim = params['lstm_dim']
self.dropout = params['dropout']
self.embeddings = data_obj.word_embeds
self.word_lstm = nn.LSTM(self.embedding_dim, self.lstm_dim)
self.word_lstm_hidden = None
self.sent_lstm = nn.LSTM(self.lstm_dim, self.lstm_dim)
self.sent_lstm_hidden = None
self.par_lstm = nn.LSTM(self.lstm_dim, self.lstm_dim)
self.par_lstm_hidden = None
self.hidden_layer = nn.Linear(self.lstm_dim, self.hidden_dim)
if params['task'] == 'perm':
num_labels = 2
elif params['task'] == 'minority':
num_labels = 2
elif params['task'] == 'class':
num_labels = 3
elif params['task'] == 'score_pred':
num_labels = 1
self.predict_layer = nn.Linear(self.hidden_dim, num_labels)
self.bn = nn.BatchNorm1d(self.hidden_dim)
# weight initialization
for m in self.modules():
if isinstance(m, nn.Linear):
m.bias.data.zero_()
nn.init.xavier_uniform(m.weight)
if USE_CUDA:
self.hidden_layer = self.hidden_layer.cuda()
self.predict_layer = self.predict_layer.cuda()

def init_hidden(self, batch_size):
if USE_CUDA:
return (Variable(torch.zeros(1, batch_size, self.lstm_dim).cuda()),
Variable(torch.zeros(1, batch_size, self.lstm_dim)).cuda())
else:
return (Variable(torch.zeros(1, batch_size, self.lstm_dim)),
Variable(torch.zeros(1, batch_size, self.lstm_dim)))

def forward(self, inputs, input_lengths, original_index):
doc_vecs = None
for i in range(len(inputs)): # loop over docs
par_vecs = None
for j in range(len(inputs[i])): # loop over paragraphs
doc_batch_size = len(inputs[i][j]) # number of sents
self.word_lstm_hidden = self.init_hidden(doc_batch_size)
seq_tensor = self.embeddings(inputs[i][j])
# pack
packed_input = pack_padded_sequence(seq_tensor, input_lengths[i][j], batch_first=True)
packed_output, (ht, ct) = self.word_lstm(packed_input, self.word_lstm_hidden)
# reorder
final_output = ht[-1]
odx = original_index[i][j].view(-1, 1).expand(len(input_lengths[i][j]), final_output.size(-1))
output_unsorted = torch.gather(final_output, 0, Variable(odx))
# LSTM to produce paragraph vector from sentence vectors
output_unsorted = output_unsorted.unsqueeze(1)
self.sent_lstm_hidden = self.init_hidden(output_unsorted.size(1)) # batch size 1
output_pars, (ht, ct) = self.sent_lstm(output_unsorted, self.sent_lstm_hidden)
final_output = ht[-1]
# append paragraph vector to batch
if par_vecs is None:
par_vecs = final_output
else:
par_vecs = torch.cat([par_vecs, final_output], dim=0)
# LSTM over paragraph vectors to create document vector
par_vecs = par_vecs.unsqueeze(1)
self.par_lstm_hidden = self.init_hidden(par_vecs.size(1)) # batch size 1
output_doc, (ht, ct) = self.par_lstm(par_vecs, self.par_lstm_hidden)
final_output = ht[-1]
# append doc vector to batch
if doc_vecs is None:
doc_vecs = final_output
else:
doc_vecs = torch.cat([doc_vecs, final_output], dim=0)
doc_vectors = F.dropout(self.bn(F.relu(self.hidden_layer(doc_vecs))), p=self.dropout, training=self.training)
coherence_pred = self.predict_layer(doc_vectors)
if self.task != 'score_pred':
coherence_pred = F.softmax(coherence_pred, dim=0)
return coherence_pred
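
Note (not part of the diff): a minimal forward-pass sketch for LSTMParSeq. Inputs are nested per document and per paragraph, each leaf being a padded (num_sentences x max_len) id tensor sorted by descending length. The toy embedding table, the two-document batch (so BatchNorm1d sees more than one example in training mode), and CPU execution are assumptions.

import torch
import torch.nn as nn
from LSTMParSeq import LSTMParSeq

class ToyData(object):
    def __init__(self):
        self.word_embeds = nn.Embedding(100, 50)

params = {'embedding_dim': 50, 'hidden_dim': 32, 'lstm_dim': 64,
          'dropout': 0.3, 'task': 'class'}
model = LSTMParSeq(params, ToyData())

def paragraph(num_sents, max_len):
    # padded sentence batch, lengths already sorted descending, and the unsort index
    ids = torch.randint(0, 100, (num_sents, max_len))
    lengths = list(range(max_len, max_len - num_sents, -1))
    return ids, lengths, torch.arange(num_sents)

docs = [[paragraph(2, 6), paragraph(3, 5)], [paragraph(2, 4)]]  # 2 documents
inputs = [[p[0] for p in doc] for doc in docs]
input_lengths = [[p[1] for p in doc] for doc in docs]
original_index = [[p[2] for p in doc] for doc in docs]

scores = model(inputs, input_lengths, original_index)
print(scores.size())  # (2, 3): one 3-class score vector per document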
76 changes: 76 additions & 0 deletions LSTMSentAvg.py
@@ -0,0 +1,76 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor


class LSTMSentAvg(nn.Module):

def __init__(self, params, data_obj):
super(LSTMSentAvg, self).__init__()
self.data_obj = data_obj
self.task = params['task']
self.embedding_dim = params['embedding_dim']
self.hidden_dim = params['hidden_dim']
self.lstm_dim = params['lstm_dim']
self.dropout = params['dropout']
self.embeddings = data_obj.word_embeds
self.lstm = nn.LSTM(self.embedding_dim, self.lstm_dim)
self.hidden = None
self.hidden_layer = nn.Linear(self.lstm_dim, self.hidden_dim)
if params['task'] == 'perm':
num_labels = 2
elif params['task'] == 'minority':
num_labels = 2
elif params['task'] == 'class':
num_labels = 3
elif params['task'] == 'score_pred':
num_labels = 1
self.predict_layer = nn.Linear(self.hidden_dim, num_labels)
self.bn = nn.BatchNorm1d(self.hidden_dim)
# weight initialization
for m in self.modules():
if isinstance(m, nn.Linear):
m.bias.data.zero_()
nn.init.xavier_uniform(m.weight)
if USE_CUDA:
self.hidden_layer = self.hidden_layer.cuda()
self.predict_layer = self.predict_layer.cuda()

def init_hidden(self, batch_size):
if USE_CUDA:
return (Variable(torch.zeros(1, batch_size, self.lstm_dim).cuda()),
Variable(torch.zeros(1, batch_size, self.lstm_dim)).cuda())
else:
return (Variable(torch.zeros(1, batch_size, self.lstm_dim)),
Variable(torch.zeros(1, batch_size, self.lstm_dim)))

def forward(self, inputs, input_lengths, original_index):
lstm_out = None # document vectors
for i in range(len(inputs)): # loop over docs
doc_batch_size = len(inputs[i]) # number of sents
self.hidden = self.init_hidden(doc_batch_size)
seq_tensor = self.embeddings(inputs[i])
# pack
packed_input = pack_padded_sequence(seq_tensor, input_lengths[i], batch_first=True)
packed_output, (ht, ct) = self.lstm(packed_input, self.hidden)
# reorder
final_output = ht[-1]
odx = original_index[i].view(-1, 1).expand(len(input_lengths[i]), final_output.size(-1))
output_unsorted = torch.gather(final_output, 0, Variable(odx))
# sum sentence vectors
output_sum = torch.sum(output_unsorted, 0).unsqueeze(0)
if lstm_out is None:
lstm_out = output_sum
else:
lstm_out = torch.cat([lstm_out, output_sum], dim=0)
doc_vectors = F.dropout(self.bn(F.relu(self.hidden_layer(lstm_out))), p=self.dropout, training=self.training)
coherence_pred = self.predict_layer(doc_vectors)
if self.task != 'score_pred':
coherence_pred = F.softmax(coherence_pred, dim=0)
return coherence_pred
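
Note (not part of the diff): a minimal forward-pass sketch for LSTMSentAvg, which sums the per-sentence LSTM states into one document vector. The toy embedding table, the two-document batch, and CPU execution are assumptions.

import torch
import torch.nn as nn
from LSTMSentAvg import LSTMSentAvg

class ToyData(object):
    def __init__(self):
        self.word_embeds = nn.Embedding(100, 50)

params = {'embedding_dim': 50, 'hidden_dim': 32, 'lstm_dim': 64,
          'dropout': 0.3, 'task': 'score_pred'}
model = LSTMSentAvg(params, ToyData())

# Two documents of 3 sentences each, padded to length 5, lengths sorted descending.
inputs = [torch.randint(0, 100, (3, 5)) for _ in range(2)]
input_lengths = [[5, 4, 3] for _ in range(2)]
original_index = [torch.arange(3) for _ in range(2)]

scores = model(inputs, input_lengths, original_index)
print(scores.size())  # (2, 1): one predicted coherence score per document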