Commit 4bf98fa (0 parents)
Showing 18 changed files with 2,533 additions and 0 deletions.
@@ -0,0 +1,108 @@
import torch
from nltk import word_tokenize
import random

is_cuda = torch.cuda.is_available()


class DocumentWithCliques(object):

    def __init__(self, orig_sentences, clique_size, permutation_indices=None, id='', label=None):
        self.id = id
        self.clique_size = clique_size
        self.orig_sentences = []
        for sent in orig_sentences:
            sent = sent.strip()
            if sent == "":
                continue
            if sent == "<d>" or sent == "</d>":
                self.orig_sentences.append([sent])
            else:
                self.orig_sentences.append(word_tokenize(sent))
        self.permutation_indices = []
        if permutation_indices is not None:
            self.permutation_indices = permutation_indices  # index into orig_sentences
        self.index_sentences = []  # token-indexed version of self.orig_sentences
        self.orig_full_sequence = None
        self.perm_full_sequences = None
        self.perm_doc_sentences = None  # cache for get_perm_index_sentences
        self.label = label

    # turn full doc into flat sequence of word indices
    def get_orig_full_sequence(self):
        if self.orig_full_sequence is not None:
            return self.orig_full_sequence
        self.orig_full_sequence = []
        for sent in self.index_sentences:
            self.orig_full_sequence.extend(sent)
        return self.orig_full_sequence

    # turn all doc permutations into sentence lists of word indices
    def get_perm_index_sentences(self):
        if self.perm_doc_sentences is not None:
            return self.perm_doc_sentences
        self.perm_doc_sentences = []
        for perm in self.permutation_indices:
            doc_temp = []
            for sent_idx in perm:
                doc_temp.append(self.index_sentences[sent_idx])
            self.perm_doc_sentences.append(doc_temp)
        return self.perm_doc_sentences

    # turn all doc permutations into flat sequences of word indices
    def get_perm_full_sequences(self):
        if self.perm_full_sequences is not None:
            return self.perm_full_sequences
        self.perm_full_sequences = []
        for perm in self.permutation_indices:
            doc_temp = []
            for sent_idx in perm:
                doc_temp.extend(self.index_sentences[sent_idx])
            self.perm_full_sequences.append(doc_temp)
        return self.perm_full_sequences

    def create_cliques_orig(self):  # assume self.index_sentences is non-empty
        self.orig_cliques = []
        self.orig_cliques_index = []
        for i in range(len(self.index_sentences) - self.clique_size + 1):
            clique = []
            clique_index = []
            for j in range(self.clique_size):
                clique.append(self.index_sentences[i + j])
                clique_index.append(i + j)
            self.orig_cliques.append(clique)
            self.orig_cliques_index.append(clique_index)

    # randomly create negative cliques from the original document sentences
    def create_cliques_neg(self):
        self.neg_cliques = []
        for orig_clique in self.orig_cliques_index:  # negative example for each window: replace center sentence
            if len(self.orig_cliques_index) == 1:
                break  # no possible negative cliques for this doc
            valid_sentences = {}
            for sent_idx in orig_clique:
                valid_sentences[sent_idx] = 1
            valid_sentences[0] = 1  # don't allow <d> pad
            valid_sentences[len(self.index_sentences) - 1] = 1  # don't allow </d> pad
            if len(valid_sentences) == len(self.index_sentences):
                continue  # no possible negative cliques for this positive clique
            center_idx = int(len(orig_clique) / 2)
            new_sent = random.randrange(len(self.index_sentences))
            while new_sent in valid_sentences:
                new_sent = random.randrange(len(self.index_sentences))
            neg_clique = []
            for sent_idx in orig_clique:
                neg_clique.append(self.index_sentences[sent_idx])
            neg_clique[center_idx] = self.index_sentences[new_sent]
            self.neg_cliques.append(neg_clique)

    # create cliques for predefined permutations of this document
    def create_cliques_perm(self):
        self.perm_cliques = []
        for perm in self.permutation_indices:
            cliques = []
            for i in range(len(perm) - self.clique_size + 1):
                clique = []
                for j in range(self.clique_size):
                    clique.append(self.index_sentences[perm[i + j]])
                cliques.append(clique)
            self.perm_cliques.append(cliques)
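A minimal usage sketch for DocumentWithCliques, not part of the commit: the sentences and integer word ids are invented, index_sentences is normally filled by a vocabulary lookup elsewhere in the pipeline, word_tokenize assumes the NLTK punkt data is installed, and the class above is assumed to be in scope.

doc = DocumentWithCliques(
    ["<d>", "The cat sat on the mat.", "It purred.", "Then it fell asleep.", "</d>"],
    clique_size=3)
# fake word-index sentences, one list per entry of orig_sentences
doc.index_sentences = [[0], [4, 5, 6, 7, 8, 9], [10, 11, 9], [12, 10, 13, 14, 9], [1]]
doc.create_cliques_orig()  # sliding windows of clique_size consecutive sentences
doc.create_cliques_neg()   # same windows with the center sentence randomly replaced
print(len(doc.orig_cliques), len(doc.neg_cliques))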
@@ -0,0 +1,44 @@
from nltk import sent_tokenize, word_tokenize


class DocumentWithParagraphs(object):

    def __init__(self, text_with_line_breaks, label=None, orig_sentences=None, permutation_indices=None, id=''):
        self.id = id
        self.text = []
        self.text_indexed = []
        self.label = label
        lines = text_with_line_breaks.splitlines()
        for line in lines:
            line = line.strip()
            if line != "":  # this is a paragraph
                paragraph = []
                sents = sent_tokenize(line)
                for sent in sents:
                    words = word_tokenize(sent)
                    paragraph.append(words)
                self.text.append(paragraph)
        self.orig_sentences = []
        if orig_sentences is not None:
            self.orig_sentences = orig_sentences
        self.permutation_indices = []
        if permutation_indices is not None:
            self.permutation_indices = permutation_indices

    def get_paragraphs(self):
        return self.text_indexed

    def get_sentences(self):
        sentences = []
        for paragraph in self.text_indexed:
            for sent in paragraph:
                sentences.append(sent)
        return sentences

    def get_words(self):
        words = []
        for paragraph in self.text_indexed:
            for sent in paragraph:
                for word in sent:
                    words.append(word)
        return words
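A small usage sketch for DocumentWithParagraphs, not part of the commit: the text is invented and sentence/word tokenization assumes the NLTK punkt data is available. The getters read from text_indexed, which this class leaves empty, so it is presumably populated with word indices elsewhere in the pipeline; only the tokenized text is inspected here.

raw = "The cat sat on the mat. It purred.\n\nThen it fell asleep."
doc = DocumentWithParagraphs(raw, label=1, id='toy')
print(len(doc.text))   # 2 paragraphs (a blank line separates paragraphs)
print(doc.text[0][0])  # tokens of the first sentence of the first paragraph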
@@ -0,0 +1,69 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor


class LSTMClique(nn.Module):

    def __init__(self, params, data_obj):
        super(LSTMClique, self).__init__()
        self.embedding_dim = params['embedding_dim']
        self.hidden_dim = params['hidden_dim']
        self.lstm_dim = params['lstm_dim']
        self.dropout = params['dropout']
        self.clique_size = params['clique_size']
        self.embeddings = data_obj.word_embeds
        self.lstm = nn.LSTM(self.embedding_dim, self.lstm_dim)
        self.hidden = None
        self.clique_layer = nn.Linear(params['clique_size'] * self.lstm_dim, self.hidden_dim)
        nn.init.xavier_uniform(self.clique_layer.weight, gain=nn.init.calculate_gain('tanh'))
        self.task = params['task']
        if params['task'] == 'perm':
            num_labels = 2
        elif params['task'] == 'minority':
            num_labels = 2
        elif params['task'] == 'class':
            num_labels = 3
        elif params['task'] == 'score_pred':
            num_labels = 1
        self.predict_layer = nn.Linear(self.hidden_dim, num_labels)
        nn.init.xavier_uniform(self.predict_layer.weight, gain=nn.init.calculate_gain('sigmoid'))
        if USE_CUDA:
            self.clique_layer = self.clique_layer.cuda()
            self.predict_layer = self.predict_layer.cuda()

    def init_hidden(self, batch_size):
        if USE_CUDA:
            return (Variable(torch.zeros(1, batch_size, self.lstm_dim).cuda()),
                    Variable(torch.zeros(1, batch_size, self.lstm_dim)).cuda())
        else:
            return (Variable(torch.zeros(1, batch_size, self.lstm_dim)),
                    Variable(torch.zeros(1, batch_size, self.lstm_dim)))
    def forward(self, inputs, input_lengths, original_index):  # now with cliques
        # inputs[i]: padded LongTensor of word indices (batch x max_len) for the i-th
        # sentence position of each clique, sorted by decreasing sentence length;
        # input_lengths[i] holds the true lengths and original_index[i] restores the
        # original batch order after the length-sorted LSTM pass.
        lstm_out = None
        for i in range(self.clique_size):  # send each sentence x batch through LSTM
            self.hidden = self.init_hidden(len(input_lengths[i]))
            seq_tensor = self.embeddings(inputs[i])
            packed_input = pack_padded_sequence(seq_tensor, input_lengths[i], batch_first=True)
            packed_output, (ht, ct) = self.lstm(packed_input, self.hidden)
            # reorder
            final_output = ht[-1]
            odx = original_index[i].view(-1, 1).expand(len(input_lengths[i]), final_output.size(-1))
            output_unsorted = torch.gather(final_output, 0, Variable(odx))
            if lstm_out is None:
                lstm_out = output_unsorted
            else:
                lstm_out = torch.cat([lstm_out, output_unsorted], dim=1)
        clique_vector = F.tanh(self.clique_layer(lstm_out))
        clique_vector = F.dropout(clique_vector, p=self.dropout, training=self.training)
        coherence_pred = self.predict_layer(clique_vector)
        if self.task != 'score_pred':
            coherence_pred = F.softmax(coherence_pred, dim=0)
        return coherence_pred
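A rough CPU-only smoke test for LSTMClique, not part of the commit. The params values, the toy data_obj (the real one presumably carries pretrained embeddings in word_embeds), and all tensors are invented; each clique position gets a length-sorted padded batch plus the index tensor that restores the original order. On a GPU machine the embeddings, LSTM and inputs would also need to be moved to CUDA.

import torch
import torch.nn as nn
from torch.autograd import Variable
from types import SimpleNamespace

data_obj = SimpleNamespace(word_embeds=nn.Embedding(100, 50))  # vocab of 100, 50-dim vectors
params = {'embedding_dim': 50, 'hidden_dim': 32, 'lstm_dim': 64,
          'dropout': 0.3, 'clique_size': 3, 'task': 'perm'}
model = LSTMClique(params, data_obj)
model.eval()  # turn off dropout for the toy forward pass

inputs, lengths, orig_index = [], [], []
for _ in range(params['clique_size']):
    # two sentences per clique position, padded to length 4, sorted by descending length
    inputs.append(Variable(torch.LongTensor([[5, 6, 7, 8], [9, 10, 0, 0]])))
    lengths.append([4, 2])
    orig_index.append(torch.LongTensor([0, 1]))  # already in original order

pred = model(inputs, lengths, orig_index)
print(pred.size())  # (2, 2): two cliques, two classes (original vs. permuted)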
@@ -0,0 +1,96 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor


# todo this whole class
class LSTMParSeq(nn.Module):

    def __init__(self, params, data_obj):
        super(LSTMParSeq, self).__init__()
        self.data_obj = data_obj
        self.task = params['task']
        self.embedding_dim = params['embedding_dim']
        self.hidden_dim = params['hidden_dim']
        self.lstm_dim = params['lstm_dim']
        self.dropout = params['dropout']
        self.embeddings = data_obj.word_embeds
        self.word_lstm = nn.LSTM(self.embedding_dim, self.lstm_dim)
        self.word_lstm_hidden = None
        self.sent_lstm = nn.LSTM(self.lstm_dim, self.lstm_dim)
        self.sent_lstm_hidden = None
        self.par_lstm = nn.LSTM(self.lstm_dim, self.lstm_dim)
        self.par_lstm_hidden = None
        self.hidden_layer = nn.Linear(self.lstm_dim, self.hidden_dim)
        if params['task'] == 'perm':
            num_labels = 2
        elif params['task'] == 'minority':
            num_labels = 2
        elif params['task'] == 'class':
            num_labels = 3
        elif params['task'] == 'score_pred':
            num_labels = 1
        self.predict_layer = nn.Linear(self.hidden_dim, num_labels)
        self.bn = nn.BatchNorm1d(self.hidden_dim)
        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Linear):
                m.bias.data.zero_()
                nn.init.xavier_uniform(m.weight)
        if USE_CUDA:
            self.hidden_layer = self.hidden_layer.cuda()
            self.predict_layer = self.predict_layer.cuda()

    def init_hidden(self, batch_size):
        if USE_CUDA:
            return (Variable(torch.zeros(1, batch_size, self.lstm_dim).cuda()),
                    Variable(torch.zeros(1, batch_size, self.lstm_dim)).cuda())
        else:
            return (Variable(torch.zeros(1, batch_size, self.lstm_dim)),
                    Variable(torch.zeros(1, batch_size, self.lstm_dim)))
    def forward(self, inputs, input_lengths, original_index):
        # inputs[i][j]: padded LongTensor of word indices (num_sents x max_len) for
        # paragraph j of document i, with sentences sorted by decreasing length;
        # input_lengths[i][j] and original_index[i][j] parallel that sorting.
        doc_vecs = None
        for i in range(len(inputs)):  # loop over docs
            par_vecs = None
            for j in range(len(inputs[i])):  # loop over paragraphs
                doc_batch_size = len(inputs[i][j])  # number of sents
                self.word_lstm_hidden = self.init_hidden(doc_batch_size)
                seq_tensor = self.embeddings(inputs[i][j])
                # pack
                packed_input = pack_padded_sequence(seq_tensor, input_lengths[i][j], batch_first=True)
                packed_output, (ht, ct) = self.word_lstm(packed_input, self.word_lstm_hidden)
                # reorder
                final_output = ht[-1]
                odx = original_index[i][j].view(-1, 1).expand(len(input_lengths[i][j]), final_output.size(-1))
                output_unsorted = torch.gather(final_output, 0, Variable(odx))
                # LSTM to produce paragraph vector from sentence vectors
                output_unsorted = output_unsorted.unsqueeze(1)
                self.sent_lstm_hidden = self.init_hidden(output_unsorted.size(1))  # batch size 1
                output_pars, (ht, ct) = self.sent_lstm(output_unsorted, self.sent_lstm_hidden)
                final_output = ht[-1]
                # append paragraph vector to batch
                if par_vecs is None:
                    par_vecs = final_output
                else:
                    par_vecs = torch.cat([par_vecs, final_output], dim=0)
            # LSTM over paragraph vectors to create document vector
            par_vecs = par_vecs.unsqueeze(1)
            self.par_lstm_hidden = self.init_hidden(par_vecs.size(1))  # batch size 1
            output_doc, (ht, ct) = self.par_lstm(par_vecs, self.par_lstm_hidden)
            final_output = ht[-1]
            # append doc vector to batch
            if doc_vecs is None:
                doc_vecs = final_output
            else:
                doc_vecs = torch.cat([doc_vecs, final_output], dim=0)
        doc_vectors = F.dropout(self.bn(F.relu(self.hidden_layer(doc_vecs))), p=self.dropout, training=self.training)
        coherence_pred = self.predict_layer(doc_vectors)
        if self.task != 'score_pred':
            coherence_pred = F.softmax(coherence_pred, dim=0)
        return coherence_pred
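A CPU-only smoke test sketch for LSTMParSeq, not part of the commit; all values are invented and the toy data_obj only stands in for the real embedding holder. The nesting mirrors what forward() indexes: one entry per document, one padded sentence batch per paragraph (sentences sorted by decreasing length), with parallel length lists and reorder indices.

import torch
import torch.nn as nn
from torch.autograd import Variable
from types import SimpleNamespace

data_obj = SimpleNamespace(word_embeds=nn.Embedding(100, 50))
params = {'task': 'score_pred', 'embedding_dim': 50, 'hidden_dim': 32,
          'lstm_dim': 64, 'dropout': 0.3}
model = LSTMParSeq(params, data_obj)
model.eval()  # BatchNorm/dropout in eval mode for the tiny batch

# one document with two paragraphs: a 2-sentence paragraph and a 1-sentence paragraph
par1 = Variable(torch.LongTensor([[5, 6, 7, 8], [9, 10, 0, 0]]))
par2 = Variable(torch.LongTensor([[11, 12, 13, 0]]))
inputs = [[par1, par2]]
lengths = [[[4, 2], [3]]]
orig_index = [[torch.LongTensor([0, 1]), torch.LongTensor([0])]]

pred = model(inputs, lengths, orig_index)
print(pred.size())  # (1, 1): one document, one predicted coherence score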
@@ -0,0 +1,76 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor


class LSTMSentAvg(nn.Module):

    def __init__(self, params, data_obj):
        super(LSTMSentAvg, self).__init__()
        self.data_obj = data_obj
        self.task = params['task']
        self.embedding_dim = params['embedding_dim']
        self.hidden_dim = params['hidden_dim']
        self.lstm_dim = params['lstm_dim']
        self.dropout = params['dropout']
        self.embeddings = data_obj.word_embeds
        self.lstm = nn.LSTM(self.embedding_dim, self.lstm_dim)
        self.hidden = None
        self.hidden_layer = nn.Linear(self.lstm_dim, self.hidden_dim)
        if params['task'] == 'perm':
            num_labels = 2
        elif params['task'] == 'minority':
            num_labels = 2
        elif params['task'] == 'class':
            num_labels = 3
        elif params['task'] == 'score_pred':
            num_labels = 1
        self.predict_layer = nn.Linear(self.hidden_dim, num_labels)
        self.bn = nn.BatchNorm1d(self.hidden_dim)
        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Linear):
                m.bias.data.zero_()
                nn.init.xavier_uniform(m.weight)
        if USE_CUDA:
            self.hidden_layer = self.hidden_layer.cuda()
            self.predict_layer = self.predict_layer.cuda()

    def init_hidden(self, batch_size):
        if USE_CUDA:
            return (Variable(torch.zeros(1, batch_size, self.lstm_dim).cuda()),
                    Variable(torch.zeros(1, batch_size, self.lstm_dim)).cuda())
        else:
            return (Variable(torch.zeros(1, batch_size, self.lstm_dim)),
                    Variable(torch.zeros(1, batch_size, self.lstm_dim)))
    def forward(self, inputs, input_lengths, original_index):
        # inputs[i]: padded LongTensor of word indices (num_sents x max_len) for
        # document i, sorted by decreasing sentence length; input_lengths[i] and
        # original_index[i] parallel that sorting.
        lstm_out = None  # document vectors
        for i in range(len(inputs)):  # loop over docs
            doc_batch_size = len(inputs[i])  # number of sents
            self.hidden = self.init_hidden(doc_batch_size)
            seq_tensor = self.embeddings(inputs[i])
            # pack
            packed_input = pack_padded_sequence(seq_tensor, input_lengths[i], batch_first=True)
            packed_output, (ht, ct) = self.lstm(packed_input, self.hidden)
            # reorder
            final_output = ht[-1]
            odx = original_index[i].view(-1, 1).expand(len(input_lengths[i]), final_output.size(-1))
            output_unsorted = torch.gather(final_output, 0, Variable(odx))
            # sum sentence vectors
            output_sum = torch.sum(output_unsorted, 0).unsqueeze(0)
            if lstm_out is None:
                lstm_out = output_sum
            else:
                lstm_out = torch.cat([lstm_out, output_sum], dim=0)
        doc_vectors = F.dropout(self.bn(F.relu(self.hidden_layer(lstm_out))), p=self.dropout, training=self.training)
        coherence_pred = self.predict_layer(doc_vectors)
        if self.task != 'score_pred':
            coherence_pred = F.softmax(coherence_pred, dim=0)
        return coherence_pred
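A CPU-only smoke test sketch for LSTMSentAvg along the same lines, not part of the commit; values are invented and the toy data_obj only supplies word_embeds. Each document is one padded sentence batch sorted by decreasing length, with a length list and a reorder index tensor.

import torch
import torch.nn as nn
from torch.autograd import Variable
from types import SimpleNamespace

data_obj = SimpleNamespace(word_embeds=nn.Embedding(100, 50))
params = {'task': 'class', 'embedding_dim': 50, 'hidden_dim': 32,
          'lstm_dim': 64, 'dropout': 0.3}
model = LSTMSentAvg(params, data_obj)
model.eval()  # eval mode so BatchNorm works on a single toy document

# one document with three sentences, padded to length 4, sorted by descending length
doc = Variable(torch.LongTensor([[5, 6, 7, 8], [9, 10, 11, 0], [12, 13, 0, 0]]))
lengths = [4, 3, 2]
orig_index = torch.LongTensor([0, 1, 2])  # sentences already in original order

pred = model([doc], [lengths], [orig_index])
print(pred.size())  # (1, 3): one document, three coherence classes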