From 4bf98facc60bd9ddc1889b8f55deb28ec6194c80 Mon Sep 17 00:00:00 2001 From: Alice Lai Date: Mon, 11 Jan 2021 12:20:39 -0500 Subject: [PATCH] Add files via upload --- DocumentWithCliques.py | 108 +++++++ DocumentWithParagraphs.py | 44 +++ LSTMClique.py | 69 ++++ LSTMParSeq.py | 96 ++++++ LSTMSentAvg.py | 76 +++++ README.md | 142 +++++++++ csv_to_text_files.py | 19 ++ data_loader.py | 505 ++++++++++++++++++++++++++++++ entity_graph.py | 154 +++++++++ entity_grid.py | 153 +++++++++ evaluation.py | 286 +++++++++++++++++ extract_entity_grid.py | 125 ++++++++ extract_entity_grid_perm.py | 153 +++++++++ extract_features_from_grid.py | 104 ++++++ extract_graph_from_grid.py | 115 +++++++ generate_high_coh_permutations.py | 109 +++++++ main.py | 126 ++++++++ train_neural_models.py | 149 +++++++++ 18 files changed, 2533 insertions(+) create mode 100644 DocumentWithCliques.py create mode 100644 DocumentWithParagraphs.py create mode 100644 LSTMClique.py create mode 100644 LSTMParSeq.py create mode 100644 LSTMSentAvg.py create mode 100644 README.md create mode 100644 csv_to_text_files.py create mode 100644 data_loader.py create mode 100644 entity_graph.py create mode 100644 entity_grid.py create mode 100644 evaluation.py create mode 100644 extract_entity_grid.py create mode 100644 extract_entity_grid_perm.py create mode 100644 extract_features_from_grid.py create mode 100644 extract_graph_from_grid.py create mode 100644 generate_high_coh_permutations.py create mode 100644 main.py create mode 100644 train_neural_models.py diff --git a/DocumentWithCliques.py b/DocumentWithCliques.py new file mode 100644 index 0000000..1ad9668 --- /dev/null +++ b/DocumentWithCliques.py @@ -0,0 +1,108 @@ +import torch +from nltk import word_tokenize +import random + +is_cuda = torch.cuda.is_available() + + +class DocumentWithCliques(object): + + def __init__(self, orig_sentences, clique_size, permutation_indices=None, id = '', label=None): + self.id = id + self.clique_size = clique_size + self.orig_sentences = [] + for sent in orig_sentences: + sent = sent.strip() + if sent == "": + continue + if sent == "" or sent == "": + self.orig_sentences.append([sent]) + else: + self.orig_sentences.append(word_tokenize(sent)) + self.permutation_indices = [] + if permutation_indices is not None: + self.permutation_indices = permutation_indices # index into orig_sentences + self.index_sentences = [] # token-indexed version of self.orig_sentences + self.orig_full_sequence = None + self.perm_full_sequences = None + self.label = label + + # turn full doc into flat sequence of word indices + def get_orig_full_sequence(self): + if self.orig_full_sequence is not None: + return self.orig_full_sequence + self.orig_full_sequence = [] + for sent in self.index_sentences: + self.orig_full_sequence.extend(sent) + return self.orig_full_sequence + + # turn all doc permutations into sentence lists of word indices + def get_perm_index_sentences(self): + if self.perm_full_sequences is not None: + return self.get_perm_full_sequences + self.perm_doc_sentences = [] + for perm in self.permutation_indices: + doc_temp = [] + for sent_idx in perm: + doc_temp.append(self.index_sentences[sent_idx]) + self.perm_doc_sentences.append(doc_temp) + return self.perm_doc_sentences + + # turn all doc permutations into flat sequences of word indices + def get_perm_full_sequences(self): + if self.perm_full_sequences is not None: + return self.get_perm_full_sequences + self.perm_full_sequences = [] + for perm in self.permutation_indices: + doc_temp = [] + for sent_idx in 
perm: + doc_temp.extend(self.index_sentences[sent_idx]) + self.perm_full_sequences.append(doc_temp) + return self.perm_full_sequences + + def create_cliques_orig(self): # assume self.index_sentences is non-empty + self.orig_cliques = [] + self.orig_cliques_index = [] + for i in range(len(self.index_sentences) - self.clique_size + 1): + clique = [] + clique_index = [] + for j in range(self.clique_size): + clique.append(self.index_sentences[i + j]) + clique_index.append(i+j) + self.orig_cliques.append(clique) + self.orig_cliques_index.append(clique_index) + + # randomly create negative cliques from the original document sentences + def create_cliques_neg(self): + self.neg_cliques = [] + for orig_clique in self.orig_cliques_index: # negative example for each window: replace center sentence + if len(self.orig_cliques_index) == 1: + break # no possible negative cliques for this doc + valid_sentences = {} + for sent_idx in orig_clique: + valid_sentences[sent_idx] = 1 + valid_sentences[0] = 1 # don't allow pad + valid_sentences[len(self.index_sentences) - 1] = 1 # don't allow pad + if len(valid_sentences) == len(self.index_sentences): + continue # no possible negative cliques for this positive clique + center_idx = int(len(orig_clique) / 2) + new_sent = random.randrange(len(self.index_sentences)) + while new_sent in valid_sentences: + new_sent = random.randrange(len(self.index_sentences)) + neg_clique = [] + for sent_idx in orig_clique: + neg_clique.append(self.index_sentences[sent_idx]) + neg_clique[center_idx] = self.index_sentences[new_sent] + self.neg_cliques.append(neg_clique) + + # create cliques for predefined permutations of this document + def create_cliques_perm(self): + self.perm_cliques = [] + for perm in self.permutation_indices: + cliques = [] + for i in range(len(perm) - self.clique_size + 1): + clique = [] + for j in range(self.clique_size): + clique.append(self.index_sentences[perm[i + j]]) + cliques.append(clique) + self.perm_cliques.append(cliques) diff --git a/DocumentWithParagraphs.py b/DocumentWithParagraphs.py new file mode 100644 index 0000000..9585c84 --- /dev/null +++ b/DocumentWithParagraphs.py @@ -0,0 +1,44 @@ +from nltk import sent_tokenize, word_tokenize + + +class DocumentWithParagraphs(object): + + def __init__(self, text_with_line_breaks, label=None, orig_sentences=None, permutation_indices=None, id=''): + self.id = id + self.text = [] + self.text_indexed = [] + self.label = label + lines = text_with_line_breaks.splitlines() + for line in lines: + line = line.strip() + if line != "": # this is a paragraph + paragraph = [] + sents = sent_tokenize(line) + for sent in sents: + words = word_tokenize(sent) + paragraph.append(words) + self.text.append(paragraph) + self.orig_sentences = [] + if orig_sentences is not None: + self.orig_sentences = orig_sentences + self.permutation_indices = [] + if permutation_indices is not None: + self.permutation_indices = permutation_indices + + def get_paragraphs(self): + return self.text_indexed + + def get_sentences(self): + sentences = [] + for paragraph in self.text_indexed: + for sent in paragraph: + sentences.append(sent) + return sentences + + def get_words(self): + words = [] + for paragraph in self.text_indexed: + for sent in paragraph: + for word in sent: + words.append(word) + return words \ No newline at end of file diff --git a/LSTMClique.py b/LSTMClique.py new file mode 100644 index 0000000..899416b --- /dev/null +++ b/LSTMClique.py @@ -0,0 +1,69 @@ +import torch +import torch.nn as nn +from torch.autograd import 
Variable +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence + +USE_CUDA = torch.cuda.is_available() +FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor +LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor + + +class LSTMClique(nn.Module): + + def __init__(self, params, data_obj): + super(LSTMClique, self).__init__() + self.embedding_dim = params['embedding_dim'] + self.hidden_dim = params['hidden_dim'] + self.lstm_dim = params['lstm_dim'] + self.dropout = params['dropout'] + self.clique_size = params['clique_size'] + self.embeddings = data_obj.word_embeds + self.lstm = nn.LSTM(self.embedding_dim, self.lstm_dim) + self.hidden = None + self.clique_layer = nn.Linear(params['clique_size'] * self.lstm_dim, self.hidden_dim) + nn.init.xavier_uniform(self.clique_layer.weight, gain=nn.init.calculate_gain('tanh')) + self.task = params['task'] + if params['task'] == 'perm': + num_labels = 2 + elif params['task'] == 'minority': + num_labels = 2 + elif params['task'] == 'class': + num_labels = 3 + elif params['task'] == 'score_pred': + num_labels = 1 + self.predict_layer = nn.Linear(self.hidden_dim, num_labels) + nn.init.xavier_uniform(self.predict_layer.weight, gain=nn.init.calculate_gain('sigmoid')) + if USE_CUDA: + self.clique_layer = self.clique_layer.cuda() + self.predict_layer = self.predict_layer.cuda() + + def init_hidden(self, batch_size): + if USE_CUDA: + return (Variable(torch.zeros(1, batch_size, self.lstm_dim).cuda()), + Variable(torch.zeros(1, batch_size, self.lstm_dim)).cuda()) + else: + return (Variable(torch.zeros(1, batch_size, self.lstm_dim)), + Variable(torch.zeros(1, batch_size, self.lstm_dim))) + + def forward(self, inputs, input_lengths, original_index): # now with cliques + lstm_out = None + for i in range(self.clique_size): # send each sentence x batch through LSTM + self.hidden = self.init_hidden(len(input_lengths[i])) + seq_tensor = self.embeddings(inputs[i]) + packed_input = pack_padded_sequence(seq_tensor, input_lengths[i], batch_first=True) + packed_output, (ht, ct) = self.lstm(packed_input, self.hidden) + # reorder + final_output = ht[-1] + odx = original_index[i].view(-1, 1).expand(len(input_lengths[i]), final_output.size(-1)) + output_unsorted = torch.gather(final_output, 0, Variable(odx)) + if lstm_out is None: + lstm_out = output_unsorted + else: + lstm_out = torch.cat([lstm_out, output_unsorted], dim=1) + clique_vector = F.tanh(self.clique_layer(lstm_out)) + clique_vector = F.dropout(clique_vector, p=self.dropout, training=self.training) + coherence_pred = self.predict_layer(clique_vector) + if self.task != 'score_pred': + coherence_pred = F.softmax(coherence_pred, dim=0) + return coherence_pred diff --git a/LSTMParSeq.py b/LSTMParSeq.py new file mode 100644 index 0000000..7c8dc63 --- /dev/null +++ b/LSTMParSeq.py @@ -0,0 +1,96 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence + +USE_CUDA = torch.cuda.is_available() +FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor +LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor + +# todo this whole class +class LSTMParSeq(nn.Module): + + def __init__(self, params, data_obj): + super(LSTMParSeq, self).__init__() + self.data_obj = data_obj + self.task = params['task'] + self.embedding_dim = params['embedding_dim'] + self.hidden_dim = params['hidden_dim'] + self.lstm_dim = params['lstm_dim'] + self.dropout = 
params['dropout'] + self.embeddings = data_obj.word_embeds + self.word_lstm = nn.LSTM(self.embedding_dim, self.lstm_dim) + self.word_lstm_hidden = None + self.sent_lstm = nn.LSTM(self.lstm_dim, self.lstm_dim) + self.sent_lstm_hidden = None + self.par_lstm = nn.LSTM(self.lstm_dim, self.lstm_dim) + self.par_lstm_hidden = None + self.hidden_layer = nn.Linear(self.lstm_dim, self.hidden_dim) + if params['task'] == 'perm': + num_labels = 2 + elif params['task'] == 'minority': + num_labels = 2 + elif params['task'] == 'class': + num_labels = 3 + elif params['task'] == 'score_pred': + num_labels = 1 + self.predict_layer = nn.Linear(self.hidden_dim, num_labels) + self.bn = nn.BatchNorm1d(self.hidden_dim) + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Linear): + m.bias.data.zero_() + nn.init.xavier_uniform(m.weight) + if USE_CUDA: + self.hidden_layer = self.hidden_layer.cuda() + self.predict_layer = self.predict_layer.cuda() + + def init_hidden(self, batch_size): + if USE_CUDA: + return (Variable(torch.zeros(1, batch_size, self.lstm_dim).cuda()), + Variable(torch.zeros(1, batch_size, self.lstm_dim)).cuda()) + else: + return (Variable(torch.zeros(1, batch_size, self.lstm_dim)), + Variable(torch.zeros(1, batch_size, self.lstm_dim))) + + def forward(self, inputs, input_lengths, original_index): + doc_vecs = None + for i in range(len(inputs)): # loop over docs + par_vecs = None + for j in range(len(inputs[i])): # loop over paragraphs + doc_batch_size = len(inputs[i][j]) # number of sents + self.word_lstm_hidden = self.init_hidden(doc_batch_size) + seq_tensor = self.embeddings(inputs[i][j]) + # pack + packed_input = pack_padded_sequence(seq_tensor, input_lengths[i][j], batch_first=True) + packed_output, (ht, ct) = self.word_lstm(packed_input, self.word_lstm_hidden) + # reorder + final_output = ht[-1] + odx = original_index[i][j].view(-1, 1).expand(len(input_lengths[i][j]), final_output.size(-1)) + output_unsorted = torch.gather(final_output, 0, Variable(odx)) + # LSTM to produce paragraph vector from sentence vectors + output_unsorted = output_unsorted.unsqueeze(1) + self.sent_lstm_hidden = self.init_hidden(output_unsorted.size(1)) # batch size 1 + output_pars, (ht, ct) = self.sent_lstm(output_unsorted, self.sent_lstm_hidden) + final_output = ht[-1] + # append paragraph vector to batch + if par_vecs is None: + par_vecs = final_output + else: + par_vecs = torch.cat([par_vecs, final_output], dim=0) + # LSTM over paragraph vectors to create document vector + par_vecs = par_vecs.unsqueeze(1) + self.par_lstm_hidden = self.init_hidden(par_vecs.size(1)) # batch size 1 + output_doc, (ht, ct) = self.par_lstm(par_vecs, self.par_lstm_hidden) + final_output = ht[-1] + # append doc vector to batch + if doc_vecs is None: + doc_vecs = final_output + else: + doc_vecs = torch.cat([doc_vecs, final_output], dim=0) + doc_vectors = F.dropout(self.bn(F.relu(self.hidden_layer(doc_vecs))), p=self.dropout, training=self.training) + coherence_pred = self.predict_layer(doc_vectors) + if self.task != 'score_pred': + coherence_pred = F.softmax(coherence_pred, dim=0) + return coherence_pred diff --git a/LSTMSentAvg.py b/LSTMSentAvg.py new file mode 100644 index 0000000..a7c0d34 --- /dev/null +++ b/LSTMSentAvg.py @@ -0,0 +1,76 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence + +USE_CUDA = torch.cuda.is_available() +FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor 
+LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor + + +class LSTMSentAvg(nn.Module): + + def __init__(self, params, data_obj): + super(LSTMSentAvg, self).__init__() + self.data_obj = data_obj + self.task = params['task'] + self.embedding_dim = params['embedding_dim'] + self.hidden_dim = params['hidden_dim'] + self.lstm_dim = params['lstm_dim'] + self.dropout = params['dropout'] + self.embeddings = data_obj.word_embeds + self.lstm = nn.LSTM(self.embedding_dim, self.lstm_dim) + self.hidden = None + self.hidden_layer = nn.Linear(self.lstm_dim, self.hidden_dim) + if params['task'] == 'perm': + num_labels = 2 + elif params['task'] == 'minority': + num_labels = 2 + elif params['task'] == 'class': + num_labels = 3 + elif params['task'] == 'score_pred': + num_labels = 1 + self.predict_layer = nn.Linear(self.hidden_dim, num_labels) + self.bn = nn.BatchNorm1d(self.hidden_dim) + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Linear): + m.bias.data.zero_() + nn.init.xavier_uniform(m.weight) + if USE_CUDA: + self.hidden_layer = self.hidden_layer.cuda() + self.predict_layer = self.predict_layer.cuda() + + def init_hidden(self, batch_size): + if USE_CUDA: + return (Variable(torch.zeros(1, batch_size, self.lstm_dim).cuda()), + Variable(torch.zeros(1, batch_size, self.lstm_dim)).cuda()) + else: + return (Variable(torch.zeros(1, batch_size, self.lstm_dim)), + Variable(torch.zeros(1, batch_size, self.lstm_dim))) + + def forward(self, inputs, input_lengths, original_index): + lstm_out = None # document vectors + for i in range(len(inputs)): # loop over docs + doc_batch_size = len(inputs[i]) # number of sents + self.hidden = self.init_hidden(doc_batch_size) + seq_tensor = self.embeddings(inputs[i]) + # pack + packed_input = pack_padded_sequence(seq_tensor, input_lengths[i], batch_first=True) + packed_output, (ht, ct) = self.lstm(packed_input, self.hidden) + # reorder + final_output = ht[-1] + odx = original_index[i].view(-1, 1).expand(len(input_lengths[i]), final_output.size(-1)) + output_unsorted = torch.gather(final_output, 0, Variable(odx)) + # sum sentence vectors + output_sum = torch.sum(output_unsorted, 0).unsqueeze(0) + if lstm_out is None: + lstm_out = output_sum + else: + lstm_out = torch.cat([lstm_out, output_sum], dim=0) + doc_vectors = F.dropout(self.bn(F.relu(self.hidden_layer(lstm_out))), p=self.dropout, training=self.training) + coherence_pred = self.predict_layer(doc_vectors) + if self.task != 'score_pred': + coherence_pred = F.softmax(coherence_pred, dim=0) + return coherence_pred diff --git a/README.md b/README.md new file mode 100644 index 0000000..937f7ca --- /dev/null +++ b/README.md @@ -0,0 +1,142 @@ +# DiscourseCoherenceDev + +## Dependencies + +This code is written in Python. The dependencies are: + +* Python3 (with recent versions of [NumPy](http://www.numpy.org/) and [SciPy](http://www.scipy.org/)) +* [Pytorch](http://pytorch.org/) (tested on Pytorch 0.3.1) +* [scikit-learn](http://scikit-learn.org/stable/) +* NLTK >= 3 +* [pycorenlp](https://github.com/smilli/py-corenlp) +* [progressbar2](https://pypi.org/project/progressbar2/) + +## Evaluation + +All models can be trained for 4 different evaluation tasks: +- 'class': 3-class classification (low, medium, high coherence) +- 'score_pred': mean score prediction +- 'perm': binary ranking of original vs. permuted texts (requires text permutation files) +- 'minority': binary classification of low coherence vs. 
all other texts + + ## Data Directory Structure + + The GCDC data is available by request (see https://github.com/aylai/GCDC-corpus for details). To run the preprocessing scripts, you will have to create a directory for each corpus in 'data/' containing the train and test csv files. For the Yelp data, you will need to download the data separately (https://www.yelp.com/dataset) and add the corresponding review titles and texts to the incomplete csv file (the CSV header should match the fields in the Clinton and Enron CSVs). + + + ## Preprocessing + + 'corpus' refers to the corpus name: {Yahoo, Clinton, Enron, Yelp} + + *1) Extract texts from CSV to separate files.* Required for entity grid and entity graph models, as well as generating text permutations for evaluation. + + Input: data/[corpus]/[corpus]_train.csv and data/[corpus]/[corpus]_test.csv files. Output: data/[corpus]/text/ directory containing all individual text files. + + ``` + python3 csv_to_text_files.py [corpus] + ``` + + *2) Generate permutation text files (20 per text).* Only generates permutations for high-coherence texts (label = 3). Required for evaluating any model on the binary permutation ranking task (can skip this step for all other experiments). + + Input: data/[corpus]/[corpus]_train.csv, data/[corpus]/[corpus]_test.csv, and data/[corpus]/text/ files. Output: data/[corpus]/text_permute directory containing original and permuted text files for all high-coherence texts. + + ``` + python3 generate_high_coh_permutations.py [corpus] + ``` + + *3) Extract entity grid files (requires Stanford CoreNLP for parsing).* Required for entity grid and entity graph models. + + This step requires running the Stanford CoreNLP server (with Java 8, not Java 9). More details here: https://github.com/smilli/py-corenlp and here: https://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started. You will probably need to run the server with -timeout 50000 (or possibly higher) instead of -timeout 15000 in order to process the longest documents in this dataset. + + **Original files only:** + + Input: data/[corpus]/text/ files. Output: data/[corpus]/parsed/ and data/[corpus]/grid/ files. + + + ``` + python3 extract_entity_grid.py [corpus] + ``` + + **Permuted files:** + + Input: data/[corpus]/text_permute/ files. Output: data/[corpus]/parsed_permute/ and data/[corpus]/grid_permute/ files. + + + ``` + python3 extract_entity_grid_perm.py [corpus] + ``` + + *4) Extract entity graph files from entity grid files.* Extracts 6 different types of entity graphs: {unweighted, weighted, and syntax-sensitive} with or without distance discounting. Specify 'true' or 'false' for the 'is_permutation' argument. Required for entity graph model. + + Input: data/[corpus]/grid[_permute]/ files. Output: data/[corpus]/graph[_permute]/ files. + + ``` + python3 extract_graph_from_grid.py [corpus] [is_permutation] + ``` + + *5) Extract features from entity grid files.* Required for entity grid model. Must specify: +- 'seq_len': the number of sequential sentences over which to compute features (e.g. 2, 3, 4) +- 'salience_threshold': the threshold for salient vs. non-salient entities (e.g. 2, 3, 4 occurrences); specify '1' for only one salience class +- 'syntax_opt': 1 to use syntactic roles (s, o, x, -); 0 to ignore syntactic roles (x, -) +- 'is_permutation': 'true' if using permuted text files, 'false' if using original text files only + + Input: data/[corpus]/grid[_permute]/ files. 
Output: data/[corpus]/features[_permute]/[feature_set] + +``` +python3 extract_features_from_grid.py [corpus] [seq_len] [salience_threshold] [syntax_opt] [is_permutation] +``` + +## Models + +### Entity grid + +Train a random forest classifier on entity grid features. 'feature_set' specifies the name of the feature directory in data/[corpus]/features[_permute]. 'evaluation' specifies the task: 'class', 'score_pred', 'minority', 'perm'. + +``` +python3 entity_grid.py [corpus] [feature_set] [evaluation] +``` + +### Entity graph + +Use entity graph outdegree values to evaluate on different tasks. Must specify graph type: [u, u_dist, w, w_dist, syn, syn_dist]. + +Thresholds (any real numbers): +- 'class': must specify 'threshold1' and 'threshold2' +- 'minority': must specify 'threshold1' +- 'perm': no threshold +- 'score_pred': no threshold + +``` +python3 entity_graph.py [corpus] [evaluation] [graph_type] [opt:threshold1] [opt:threshold2] +``` + +### Neural clique + +Train 3-class classification model on Yahoo data with clique size = 7 sentences: +``` +python3 main.py --model_name yahoo_class_model --train_corpus Yahoo --model_type clique --task class --clique 7 +``` + +See main.py for other parameters. + +### Neural SentAvg + +Train 3-class classification model on Yahoo data: +``` +python3 main.py --model_name yahoo_class_model --train_corpus Yahoo --model_type sent_avg --task class +``` + +See main.py for other parameters. + +**Note:** the SentAvg model cannot be trained for the binary permutation ranking task (because all sentence order permutations have the same score). + +### Neural ParSeq + +Train 3-class classification model on Yahoo data: +``` +python3 main.py --model_name yahoo_class_model --train_corpus Yahoo --model_type par_seq --task class +``` + +See main.py for other parameters. + +**Note:** the ParSeq model currently cannot be trained for the binary permutation ranking task. 
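**Binary permutation ranking:** for the 'perm' task, a test pair counts as correct when the model assigns the original document a higher coherence score than its permuted version (this is how evaluation.py scores it). A minimal sketch of that accuracy computation is below; `score_fn` stands in for whatever scoring function a trained model exposes and is not a function defined in this repository.

```python
def permutation_ranking_accuracy(doc_pairs, score_fn):
    """doc_pairs: iterable of (original_doc, [permuted_docs]);
    score_fn maps a document to a coherence score."""
    num_correct, num_total = 0, 0
    for original, permutations in doc_pairs:
        orig_score = score_fn(original)
        for permuted in permutations:
            # a pair counts as correct only when the original strictly outscores its permutation
            if orig_score > score_fn(permuted):
                num_correct += 1
            num_total += 1
    return num_correct / num_total if num_total else 0.0
```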
\ No newline at end of file diff --git a/csv_to_text_files.py b/csv_to_text_files.py new file mode 100644 index 0000000..3c36e87 --- /dev/null +++ b/csv_to_text_files.py @@ -0,0 +1,19 @@ +import csv, os, sys + +corpus = sys.argv[1] +corpus_dir = 'data/' + corpus + '/' +text_dir = corpus_dir + 'text/' +if not os.path.exists(text_dir): + os.makedirs(text_dir) +splits = ['train', 'test'] +for split in splits: + with open(corpus_dir + corpus + '_' + split + '.csv','r') as in_file: + reader = csv.DictReader(in_file) + for row in reader: + text_id = row['text_id'] + filename = text_id + '.txt' + if os.path.exists(text_dir + filename): + continue + out_file = open(text_dir + filename, 'w') + out_file.write(row['text']) + out_file.close() diff --git a/data_loader.py b/data_loader.py new file mode 100644 index 0000000..6412768 --- /dev/null +++ b/data_loader.py @@ -0,0 +1,505 @@ +import torch +import torch.nn as nn +import numpy as np +import os +from DocumentWithCliques import DocumentWithCliques +from DocumentWithParagraphs import DocumentWithParagraphs +import random +from torch.autograd import Variable +from nltk import word_tokenize +from nltk import sent_tokenize +import csv + +USE_CUDA = torch.cuda.is_available() +FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor +LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor +is_cuda = torch.cuda.is_available() + + +class Data(object): + + def __init__(self, params): + self.params = params + self.word_to_idx = {} + self.idx_to_word = {} + self.word_to_idx[''] = 0 + self.idx_to_word[0] = '' + self.word_embeds = None + + def read_orig_doc(self, filename, data_type, for_clique): + sentences = [] + with open(filename, "r") as in_file: + for line in in_file: + line = line.strip() + if data_type == "BL": + line = line.split(None, 1)[1] # remove sent ID + if not self.params['case_sensitive']: + line = line.lower() + sentences.append(line) + if for_clique: + for i in range(int(self.params['clique_size'] / 2)): + sentences.insert(0, "") + sentences.append("") + return sentences + + def read_perm_doc(self, filename, sentences, data_type, for_clique): + sentence_indices = [] + with open(filename, "r") as in_file: + for line in in_file: + line = line.strip() + if data_type == "BL": + line = line.split(None, 1)[1] + if not self.params['case_sensitive']: + line = line.lower() + sentence_indices.append(sentences.index(line)) + if for_clique: + for i in range(int(self.params['clique_size'] / 2)): + sentence_indices.insert(0, 0) # start pad + sentence_indices.append(len(sentences) - 1) + return sentence_indices + + # read my Yahoo/Clinton/Enron data for 3-way classification (full train/test) + def read_data_class(self, params, split): + # corpus = params['data_dir'].rsplit('/', 2)[1] + if split == 'train' or split == 'train_nodev': + corpus = params['train_corpus'] + elif split == 'test': + corpus = params['test_corpus'] + documents = [] + add_new_words = False + if self.word_embeds is None and split == "train": + add_new_words = True + filename = corpus + '_' + split + '.csv' + with open(params['data_dir'] + corpus + '/' + filename,'r') as in_file: + reader = csv.DictReader(in_file) + for row in reader: + text = row['text'] + if not self.params['case_sensitive']: + text = text.lower() + text_id = row['text_id'] + if params['task'] == 'score_pred': + labels = [int(row['ratingA1']), int(row['ratingA2']), int(row['ratingA3'])] + label = np.mean(labels) + # elif params['eval_minority']: + elif params['task'] == 'minority': + 
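# 'minority' label: 1 when at least two of the three annotators gave the lowest coherence rating (1), else 0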
num_low_judgments = 0 + if row['ratingA1'] == '1': + num_low_judgments += 1 + if row['ratingA2'] == '1': + num_low_judgments += 1 + if row['ratingA3'] == '1': + num_low_judgments += 1 + if num_low_judgments >= 2: + label = 1 + else: + label = 0 + else: + label = int(row['labelA']) + label = label - 1 # zero-indexing + if params['model_type'] == 'clique': + orig_sentences = [] + for par in text.splitlines(): + par = par.strip() + if par == "": + continue + orig_sentences.extend(sent_tokenize(par)) + for i in range(int(self.params['clique_size'] / 2)): + orig_sentences.insert(0, "") + orig_sentences.append("") + doc = DocumentWithCliques(orig_sentences, self.params['clique_size'], None, text_id, label) + for sent in doc.orig_sentences: + sent_idx = [] + for token in sent: + idx = self.add_token_to_index(token, add_new_words) + sent_idx.append(idx) + doc.index_sentences.append(sent_idx) + elif params['model_type'] == 'sent_avg' or params['model_type'] == 'par_seq': + doc = DocumentWithParagraphs(text, label, id=text_id) + # index words + doc_indexed = [] + for para in doc.text: + para_indexed = [] + for sent in para: + sent_indexed = [] + for word in sent: + sent_indexed.append(self.add_token_to_index(word, add_new_words)) + para_indexed.append(sent_indexed) + doc_indexed.append(para_indexed) + doc.text_indexed = doc_indexed + documents.append(doc) + return documents + + # read my Yahoo/Clinton/Enron data for binary ranking permutation task (cross-validation fold) + def read_data_perm(self, params, split): + # corpus = params['data_dir'].rsplit('/', 2)[1] + if split == 'train' or split == 'train_nodev': + corpus = params['train_corpus'] + elif split == 'dev': + corpus = params['train_corpus'] + elif split == 'test': + corpus = params['test_corpus'] + documents = [] + add_new_words = False + if self.word_embeds is None and split == "train": + add_new_words = True + # get list of files in this split + filename = corpus + '_' + split + '_perm.csv' + text_ids = [] + with open(params['data_dir'] + corpus + '/' + filename, 'r') as in_file: + reader = csv.DictReader(in_file) + for row in reader: + text_ids.append(row['text_id']) + for text_id in text_ids: + # read orig file + if not os.path.exists(params['data_dir'] + corpus + '/text_permute/' + text_id + '_sent.txt'): + print(text_id + " not found in permutation data.") + continue + orig_sentences = self.read_orig_doc(params['data_dir'] + corpus + '/text_permute/' + text_id + '_sent.txt', "mine", params['model_type']=='clique') + perm_docs = [] + for i in range(1,21): + filename_perm = params['data_dir'] + corpus + '/text_permute/' + text_id + '.perm-' + str(i) + '.txt' + if not os.path.exists(filename_perm): + continue + perm_docs.append(self.read_perm_doc(filename_perm, orig_sentences, "mine", params['model_type']=='clique')) + if len(perm_docs) == 0: + continue # document has no permutations (is only a single sentence) -- remove from data + if params['model_type'] == 'clique': + doc = DocumentWithCliques(orig_sentences, self.params['clique_size'], perm_docs, text_id) + for sent in doc.orig_sentences: + sent_idx = [] + for token in sent: + idx = self.add_token_to_index(token, add_new_words) + sent_idx.append(idx) + doc.index_sentences.append(sent_idx) + elif params['model_type'] == 'sent_avg' or params['model_type'] == 'par_seq': + # note this loses paragraph info (not useful for permutations task) + doc = DocumentWithParagraphs("\n".join(orig_sentences), None, orig_sentences, perm_docs, text_id) + # index words + doc_indexed = [] + for para in 
doc.text: + para_indexed = [] + for sent in para: + sent_indexed = [] + for word in sent: + sent_indexed.append(self.add_token_to_index(word, add_new_words)) + para_indexed.append(sent_indexed) + doc_indexed.append(para_indexed) + doc.text_indexed = doc_indexed + documents.append(doc) + return documents + + def add_token_to_index(self, token, add_new_words): + if token not in self.word_to_idx and add_new_words: # add to vocab + idx = len(self.word_to_idx) + self.word_to_idx[token] = idx + self.idx_to_word[idx] = token + elif token not in self.word_to_idx and not add_new_words: # replace with UNK token + if 'unk' not in self.word_to_idx: + idx = len(self.word_to_idx) + self.word_to_idx['unk'] = idx + self.idx_to_word[idx] = 'unk' + return self.word_to_idx['unk'] + return self.word_to_idx[token] + + def create_cliques(self, documents, task, limit=None): # create cliques of k sentences + items = [] + labels = [] + for doc in documents: + doc.create_cliques_orig() + for clique in doc.orig_cliques: + temp_item = [] + for sent in clique: + # temp_item.append(Variable(LongTensor(list(sent))).view(1, -1)) + temp_item.append(list(sent)) + items.append(temp_item) + if task == 'perm': + labels.append(1) # coherent clique + elif task == 'class' or task == 'score_pred' or task == 'minority': + labels.append(doc.label) + if task == 'perm': + doc.create_cliques_neg() + for clique in doc.neg_cliques: + temp_item = [] + for sent in clique: + temp_item.append(list(sent)) + items.append(temp_item) + labels.append(0) # incoherent clique + doc.create_cliques_perm() + if limit is not None and limit < len(items): + indices = list(range(len(items))) + random.shuffle(indices) + indices = indices[:limit] + new_items = [] + new_labels = [] + for i in indices: + new_items.append(items[i]) + new_labels.append(labels[i]) + items = new_items + labels = new_labels + return items, labels + + def retrieve_doc_cliques_by_label(self, document, task, limit=None): # create cliques of k sentences + items_pos = [] + items_neg = [] + document.create_cliques_orig() + document.create_cliques_neg() + for clique in document.orig_cliques: + temp_item = [] + for sent in clique: + # temp_item.append(Variable(LongTensor(list(sent))).view(1, -1)) + temp_item.append(list(sent)) + items_pos.append(temp_item) + if task == 'perm': + for perm_doc in document.perm_cliques: + perm_temp = [] + for clique in perm_doc: + temp_item = [] + for sent in clique: + # temp_item.append(Variable(LongTensor(list(sent))).view(1, -1)) + temp_item.append(list(sent)) + perm_temp.append(temp_item) + items_neg.append(perm_temp) + return items_pos, items_neg + + def retrieve_doc_sents_by_label(self, document, limit=None): # create cliques of k sentences + items_pos = [] + items_neg = [] + orig_sentences = document.get_sentences() + for sent in orig_sentences: + # items_pos.append(Variable(LongTensor(list(sent))).view(1, -1)) + items_pos.append(list(sent)) + for perm_doc in document.permutation_indices: + doc_neg = [] + for sent_idx in perm_doc: + # doc_neg.append(Variable(LongTensor(list(orig_sentences[sent_idx]))).view(1, -1)) + doc_neg.append(list(orig_sentences[sent_idx])) + items_neg.append(doc_neg) + return [items_pos], items_neg + + def create_doc_sents(self, documents, split_type, task, limit=-1): + items = [] + labels = [] + ids = [] + for doc in documents: + doc_items = [] + if split_type == 'paragraph': + for paragraph in doc.get_paragraphs(): + par_sentences = [] + for sent in paragraph: + par_sentences.append(sent) + doc_items.append(par_sentences) + if 
split_type == 'sentence': + if task == 'class' or task == 'score_pred' or task == 'minority': + for sent in doc.get_sentences(): + doc_items.append(sent) + elif task == 'perm': + orig_sentences = doc.get_sentences() + perm_count = 1 + for perm in doc.permutation_indices: + # create permuted doc + doc_items = [] + for sent_idx in perm: + doc_items.append(orig_sentences[sent_idx]) + items.append(doc_items) + labels.append(0) # permuted + ids.append(doc.id+".0") + # create orig doc for each permuted doc + doc_items = [] + for sent in orig_sentences: + doc_items.append(sent) + items.append(doc_items) + labels.append(1) + ids.append(doc.id+"."+str(perm_count)) + perm_count += 1 + if task != "perm": + items.append(doc_items) + labels.append(doc.label) + ids.append(doc.id) + if -1 < limit < len(items): + indices = list(range(len(items))) + random.shuffle(indices) + indices = indices[:limit] + new_items = [] + new_labels = [] + new_ids = [] + for i in indices: + new_items.append(items[i]) + new_labels.append(labels[i]) + new_ids.append(ids[i]) + items = new_items + labels = new_labels + ids = new_ids + return items, labels, ids + + def load_vectors(self): + print("\nLoading vectors:") + if self.params['vector_type'] == 'glove': + data = [] + for line in open(self.params['vector_path']): + tokens = line.split() + if len(tokens) != 301: + continue + word = tokens[0] + vector_len = len(tokens) - 1 + for t in tokens[1:]: + data.append(float(t)) + idx = len(self.word_to_idx) + self.word_to_idx[word] = idx + self.idx_to_word[idx] = word + data_arr = np.reshape(data, newshape=(int(len(data)/vector_len), vector_len)) + # add pad array at index 0 + data_arr = np.concatenate((np.random.rand(1, vector_len), data_arr), 0) + # add OOV array + data_arr = np.concatenate((data_arr, np.random.rand(1, vector_len)), 0) + idx = len(self.word_to_idx) + self.word_to_idx['unk'] = idx + self.idx_to_word[idx] = 'unk' + # add doc start pad array + data_arr = np.concatenate((data_arr, np.random.rand(1, vector_len)), 0) + idx = len(self.word_to_idx) + self.word_to_idx[''] = idx + self.idx_to_word[idx] = '' + # add doc end pad array + data_arr = np.concatenate((data_arr, np.random.rand(1, vector_len)), 0) + idx = len(self.word_to_idx) + self.word_to_idx[''] = idx + self.idx_to_word[idx] = '' + self.word_embeds = nn.Embedding(data_arr.shape[0], data_arr.shape[1]) + if USE_CUDA: + self.word_embeds = self.word_embeds.cuda() + self.word_embeds.weight.data.copy_(torch.from_numpy(data_arr)) + self.word_embeds.weight.requires_grad = False + print("loading: done") + return self.word_embeds, vector_len + else: + print("unrecognized vector type") + + def rand_vectors(self, vocab_size): + if 'unk' not in self.word_to_idx: + idx = len(self.word_to_idx) + self.word_to_idx['unk'] = idx + self.idx_to_word[idx] = 'unk' + if '' not in self.word_to_idx: + # add doc start pad + idx = len(self.word_to_idx) + self.word_to_idx[''] = idx + self.idx_to_word[idx] = '' + if '' not in self.word_to_idx: + # add doc end pad array + idx = len(self.word_to_idx) + self.word_to_idx[''] = idx + self.idx_to_word[idx] = '' + self.word_embeds = nn.Embedding(len(self.word_to_idx), self.params['embedding_dim']) + if is_cuda: + self.word_embeds = self.word_embeds.cuda() + return self.word_embeds + + def get_batch(self, data, labels, indices, model_type, clique_size=0): + batch = [] + batch_labels = [] + if model_type == 'clique': + for i in range(clique_size): + batch.append([]) + for idx in indices: + batch_labels.append(labels[idx]) + if model_type == 'sent_avg' 
or model_type == 'par_seq': + batch.append(data[idx]) + elif model_type == 'clique': + for i in range(clique_size): + batch[i].append(data[idx][i]) + return batch, batch_labels + + def reverse_index(self, sorted_index): + rev_index = [] + for val in sorted_index: + rev_index.append(0) + for idx, val in enumerate(sorted_index): + rev_index[val] = idx + return rev_index + + def reorder_list(self, data_list, reorder_idx): + new_data_list = [] + for idx in reorder_idx: + new_data_list.append(data_list[idx]) + return new_data_list + + def pad_to_batch(self, batch, word_to_idx, model_type, clique_size=0): # batch is list of (sequence, label) + if model_type == 'par_seq': + input_var = [] + input_len = [] + reverse_index = [] + for doc in batch: + doc_var = [] + doc_len = [] + doc_index = [] + for par in doc: + # batch_lengths = LongTensor([seq[0].size(0) for seq in par]) + batch_lengths = LongTensor([len(seq) for seq in par]) + sorted_lengths, original_index = torch.sort(batch_lengths, 0, descending=True) + doc_index.append(LongTensor(self.reverse_index(original_index))) + sorted_batch = sorted(par, key=lambda b: len(b), reverse=True) + x = sorted_batch + max_x = max([len(s) for s in x]) + x_p = [] + for i in range(len(par)): + if len(x[i]) < max_x: + x_p.append(torch.cat([Variable(LongTensor(x[i])).view(1,-1), + Variable( + LongTensor([word_to_idx['']] * (max_x - len(x[i])))).view( + 1, -1)], 1)) + else: + x_p.append(Variable(LongTensor(x[i])).view(1,-1)) + input_var_temp = torch.cat(x_p) + doc_var.append(input_var_temp) + doc_len.append([list(map(lambda s: s == 0, t.data)).count(False) for t in input_var_temp]) + input_var.append(doc_var) + input_len.append(doc_len) + reverse_index.append(doc_index) + if model_type == 'sent_avg': + input_var = [] + input_len = [] + reverse_index = [] + for doc in batch: + batch_lengths = LongTensor([len(seq) for seq in doc]) + sorted_lengths, original_index = torch.sort(batch_lengths, 0, descending=True) + reverse_index.append(LongTensor(self.reverse_index(original_index))) + sorted_batch = sorted(doc, key=lambda b: len(b), reverse=True) + x = sorted_batch + max_x = max([len(s) for s in x]) + x_p = [] + for i in range(len(doc)): + if len(x[i]) < max_x: + x_p.append( + torch.cat([Variable(LongTensor(x[i])).view(1,-1), + Variable(LongTensor([word_to_idx['']] * (max_x - len(x[i])))).view(1, + -1)], + 1)) + else: + x_p.append(Variable(LongTensor(x[i])).view(1,-1)) + input_var_temp = torch.cat(x_p) + input_var.append(input_var_temp) + input_len.append([list(map(lambda s: s == 0, t.data)).count(False) for t in input_var_temp]) + elif model_type == 'clique': + # list of lists for each sentence-batch in a clique + input_var = [] + input_len = [] + reverse_index = [] + for i in range(clique_size): + batch_lengths = LongTensor([len(seq) for seq in batch[i]]) + sorted_lengths, original_index = torch.sort(batch_lengths, 0, descending=True) + + reverse_index.append(LongTensor(self.reverse_index(original_index))) + x = sorted(batch[i], key=lambda b: len(b), reverse=True) + max_x = max([len(s) for s in x]) + x_p = [] + for i in range(len(batch[i])): + if len(x[i]) < max_x: + x_p.append( + torch.cat( + [Variable(LongTensor(x[i])).view(1, -1), Variable(LongTensor([word_to_idx['']] * (max_x - len(x[i])))).view(1, -1)], + 1)) + else: + x_p.append(Variable(LongTensor(x[i])).view(1, -1)) + input_var.append(torch.cat(x_p)) + input_len.append(list(sorted_lengths)) + return input_var, input_len, reverse_index \ No newline at end of file diff --git a/entity_graph.py b/entity_graph.py 
new file mode 100644 index 0000000..bc28135 --- /dev/null +++ b/entity_graph.py @@ -0,0 +1,154 @@ +import os, csv, sys +import numpy as np +from sklearn.metrics import mean_squared_error +from scipy.stats import spearmanr + +corpus = sys.argv[1] +evaluation = sys.argv[2] +graph_type = sys.argv[3] +if evaluation == 'class': + threshold1 = float(sys.argv[4]) + threshold2 = float(sys.argv[5]) +if evaluation == 'minority': + threshold1 = float(sys.argv[4]) + + +def compute_corr(test_labels, test_scores): + all_labels = [] + all_scores = [] + for test_id in test_labels: + all_labels.append(test_labels[test_id]) + all_scores.append(test_scores[test_id]) + mse = mean_squared_error(all_labels, all_scores) + corr = spearmanr(all_labels, all_scores)[0] + return mse, corr + + +def compute_fscore(threshold, train_labels, train_scores): + tp = 0 + fp = 0 + fn = 0 + for train_id in train_labels: + label = train_labels[train_id] + score = train_scores[train_id] + if score < threshold: + pred = 1 + else: + pred = 0 + if pred == label: + if label == 1: + tp += 1 + else: # incorrect prediction + if pred == 1: + fp += 1 + else: + fn += 1 + precision = 0 + if (tp + fp) > 0: + precision = tp / (tp + fp) + recall = 0 + if (tp + fn) > 0: + recall = tp / (tp + fn) + f05 = 0 # compute F0.5 score + if (precision + recall) > 0: + f05 = (1.25 * precision * recall) / (1.25 * precision + recall) + return precision, recall, f05 + +def evaluate_perm(test_scores_orig, test_scores_perm): + num_correct = 0 + num_total = 0 + for test_id in test_scores_orig: + orig_score = test_scores_orig[test_id] + for perm_id in test_scores_perm[test_id]: + perm_score = test_scores_perm[test_id][perm_id] + if orig_score > perm_score: + num_correct += 1 + num_total += 1 + return num_correct, num_total + + +in_dir = 'data/' + corpus + '/' +# read all test data +test_ids = [] +test_labels = {} +if evaluation == 'perm': + in_filename = in_dir + corpus + '_test_perm.csv' +else: + in_filename = in_dir + corpus + '_test.csv' +with open(in_filename,'r') as in_file: + reader = csv.DictReader(in_file) + for row in reader: + test_ids.append(row['text_id']) + label = None + if evaluation == 'class': + label = int(row['labelA']) + elif evaluation == 'score_pred': + labels = [int(row['ratingA1']), int(row['ratingA2']), int(row['ratingA3'])] + label = np.mean(labels) + elif evaluation == 'minority': + num_low_judgments = 0 + if row['ratingA1'] == '1': + num_low_judgments += 1 + if row['ratingA2'] == '1': + num_low_judgments += 1 + if row['ratingA3'] == '1': + num_low_judgments += 1 + if num_low_judgments >= 2: + label = 1 + else: + label = 0 + test_labels[row['text_id']] = label +test_scores = {} +test_scores_perm = {} +test_scores_orig = {} +num_correct = 0 +num_total = 0 +for test_id in test_ids: + if evaluation == 'perm': + orig_filename = in_dir + 'graph_permute/' + test_id + '.0.graph_' + graph_type + if not os.path.exists(orig_filename): + continue # no valid permutations + with open(in_dir + 'graph_permute/' + test_id + '.0.graph_' + graph_type, 'r') as in_file: + for line in in_file: + score = float(line.strip()) + test_scores_orig[test_id] = score + test_scores_perm[test_id] = {} + break + # read permutations + for i in range(1, 21): + perm_filename = in_dir + 'graph_permute/' + test_id + '.perm-' + str(i) + '.graph_' + graph_type + if not os.path.exists(perm_filename): + continue + with open(perm_filename, 'r') as in_file: + for line in in_file: + score = float(line.strip()) + test_scores_perm[test_id][i] = score + break + else: + with 
open(in_dir + 'graph/' + test_id + '.graph_' + graph_type, 'r') as in_file: + for line in in_file: + score = float(line.strip()) + test_scores[test_id] = score + if evaluation == 'class': + if score < threshold1: + pred_label = 1 + elif score < threshold2: + pred_label = 2 + else: + pred_label = 3 + gold_label = test_labels[test_id] + if gold_label == pred_label: + num_correct += 1 + num_total += 1 + break +if evaluation == 'class': + print("Results on test:\nAccuracy: %0.2f" % (100 * (num_correct / num_total))) +elif evaluation == 'minority': + precision, recall, fscore = compute_fscore(threshold1, test_labels, test_scores) + print("Results on test:\nPrecision: %0.2f Recall: %0.2f F0.5: %0.2f" % (precision, recall, fscore)) +elif evaluation == 'score_pred': + mse, corr = compute_corr(test_labels, test_scores) + print("Results on test:\nSpearman corr: %0.3f MSE: %0.3f" % (corr, mse)) +elif evaluation == 'perm': + num_correct, num_total = evaluate_perm(test_scores_orig, test_scores_perm) + print("Results on test:\nAccuracy: %0.2f" % (100 * (num_correct / num_total))) diff --git a/entity_grid.py b/entity_grid.py new file mode 100644 index 0000000..30a75d6 --- /dev/null +++ b/entity_grid.py @@ -0,0 +1,153 @@ +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestRegressor +import os, csv, random, sys +import numpy as np +from sklearn.metrics import mean_squared_error +from scipy.stats import spearmanr + +corpus = sys.argv[1] +feature_dirname = sys.argv[2] +evaluation = sys.argv[3] + + +def evaluate_fscore(labels, predictions): + tp = 0 + fp = 0 + fn = 0 + for idx, label in enumerate(labels): + pred = predictions[idx] + if pred == label: + if label == 1: + tp += 1 + else: # incorrect prediction + if pred == 1: + fp += 1 + else: + fn += 1 + precision = 0 + if (tp + fp) > 0: + precision = tp / (tp + fp) + recall = 0 + if (tp + fn) > 0: + recall = tp / (tp + fn) + f05 = 0 # compute F0.5 score + if (precision + recall) > 0: + f05 = (1.25 * precision * recall) / (1.25 * precision + recall) + return precision, recall, f05 + +def read_features(text_ids, labels_dict): + instances = [] + labels = [] + for text_id in text_ids: + if evaluation == 'perm': + orig_instance = [] + orig_filename = in_dir + 'features_permute/' + feature_dirname + '/' + text_id + '.0.feat' + if not os.path.exists(orig_filename): + continue # file without valid permutations + with open(in_dir + 'features_permute/' + feature_dirname + '/' + text_id + '.0.feat', 'r') as in_file: + for line in in_file: + line = line.strip().split() + for val in line: + orig_instance.append(float(val)) + for j in range(1, 21): + other_doc_instance = [] + filename = in_dir + 'features_permute/' + feature_dirname + '/' + text_id + '.perm-' + str(j) + '.feat' + if not os.path.exists(filename): + continue + with open(in_dir + 'features_permute/' + feature_dirname + '/' + text_id + '.perm-' + str(j) + '.feat', + 'r') as in_file: + for line in in_file: + line = line.strip().split() + for val in line: + other_doc_instance.append(float(val)) + # randomly order documents + doc_order = random.randint(1, 2) + if doc_order == 1: # doc1 = orig document + feat = np.asarray(orig_instance) - np.asarray(other_doc_instance) + label = 1 + else: + feat = np.asarray(other_doc_instance) - np.asarray(orig_instance) + label = 2 + instances.append(feat) + labels.append(label) + else: + instance = [] + with open(in_dir + 'features/' + feature_dirname + '/' + text_id + '.feat','r') as in_file: + for line in in_file: + line = 
line.strip().split() + for val in line: + instance.append(float(val)) + labels.append(labels_dict[text_id]) + instances.append(instance) + return instances, labels + + +in_dir = 'data/' + corpus + '/' +train_ids = [] +train_labels_dict = {} +eval_ids = [] +eval_labels_dict = {} +splits = ['train', 'test'] +for split in splits: + if evaluation == 'perm': + in_filename = in_dir + corpus + '_' + split + '_perm.csv' + else: + in_filename = in_dir + corpus + '_' + split + '.csv' + with open(in_filename, 'r') as in_file: + reader = csv.DictReader(in_file) + for row in reader: + text_id = row['text_id'] + label = None + if evaluation == 'class': + label = int(row['labelA']) + elif evaluation == 'score_pred': + labels = [int(row['ratingA1']), int(row['ratingA2']), int(row['ratingA3'])] + label = np.mean(labels) + elif evaluation == 'minority': + num_low_judgments = 0 + if row['ratingA1'] == '1': + num_low_judgments += 1 + if row['ratingA2'] == '1': + num_low_judgments += 1 + if row['ratingA3'] == '1': + num_low_judgments += 1 + if num_low_judgments >= 2: + label = 1 + else: + label = 0 + if split == 'train': + train_ids.append(text_id) + train_labels_dict[text_id] = label + elif split == 'test': + eval_ids.append(text_id) + eval_labels_dict[text_id] = label +# read features +train_instances, train_labels = read_features(train_ids, train_labels_dict) +train_arr = np.array(train_instances) +eval_instances, eval_labels = read_features(eval_ids, eval_labels_dict) +eval_arr = np.array(eval_instances) +# shuffle training data +indices = [idx for idx in range(len(train_instances))] +random.shuffle(indices) +shuffle_train_instances = [train_instances[idx] for idx in indices] +shuffle_train_labels = [train_labels[idx] for idx in indices] +# train and evaluate model +if evaluation == 'class' or evaluation == 'minority' or evaluation == 'perm': + clf = RandomForestClassifier() +elif evaluation == 'score_pred': + clf = RandomForestRegressor() +clf.fit(np.array(shuffle_train_instances), np.array(shuffle_train_labels)) +# predictions = clf.predict(np.array(eval_instances)) +if evaluation == 'class' or evaluation == 'perm': + accuracy = clf.score(np.array(eval_instances), np.array(eval_labels)) + print("Results on test:\nAccuracy: %0.2f" % (accuracy * 100)) +elif evaluation == 'score_pred': + predictions = clf.predict(np.array(eval_instances)) + mse = mean_squared_error(eval_labels, predictions) + corr = spearmanr(eval_labels, predictions)[0] + print("Results on test:\nSpearman corr: %0.3f MSE: %0.3f" % (corr, mse)) +if evaluation == 'minority': + predictions = clf.predict(np.array(eval_instances)) + eval_precision, eval_recall, eval_fscore = evaluate_fscore(eval_labels, predictions) + print("Results on test:\nPrecision: %0.2f Recall: %0.2f F0.5: %0.2f" % (eval_precision, eval_recall, eval_fscore)) + diff --git a/evaluation.py b/evaluation.py new file mode 100644 index 0000000..b6ee57d --- /dev/null +++ b/evaluation.py @@ -0,0 +1,286 @@ +import numpy as np +from torch.autograd import Variable +import torch +USE_CUDA = torch.cuda.is_available() +FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor +LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor +from scipy.stats import spearmanr +import csv + +def eval_docs(model, loss_fn, eval_data, labels, data_obj, params): + steps = int(len(eval_data) / params['batch_size']) + if len(eval_data) % params['batch_size'] != 0: + steps += 1 + eval_indices = list(range(len(eval_data))) + eval_pred = [] + eval_labels = [] + loss = 0 + 
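# switch the model to evaluation mode so dropout is disabled while scoring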
model.eval() + for step in range(steps): + end_idx = (step + 1) * params['batch_size'] + if end_idx > len(eval_data): + end_idx = len(eval_data) + batch_ind = eval_indices[(step * params['batch_size']):end_idx] + sentences, orig_batch_labels = data_obj.get_batch(eval_data, labels, batch_ind, params['model_type']) + batch_padded, batch_lengths, original_index = data_obj.pad_to_batch( + sentences, data_obj.word_to_idx, params['model_type']) + batch_pred = model(batch_padded, batch_lengths, original_index) + if params['task'] == 'score_pred': + loss += loss_fn(batch_pred, Variable(FloatTensor(orig_batch_labels))).cpu().data.numpy() + eval_pred.extend(list(batch_pred.cpu().data.numpy())) + else: + loss += loss_fn(batch_pred, Variable(LongTensor(orig_batch_labels))).cpu().data.numpy() + eval_pred.extend(list(np.argmax(batch_pred.cpu().data.numpy(), axis=1))) + eval_labels.extend(orig_batch_labels) + if params['task'] == 'score_pred': + mse = np.square(np.subtract(np.array(eval_pred), np.expand_dims(np.array(eval_labels), 1))).mean() + corr = spearmanr(np.array(eval_pred), np.expand_dims(np.array(eval_labels), 1))[0] + accuracy = corr + elif params['task'] == 'minority': + f05, precision, recall = evaluate(eval_pred, eval_labels, "f05") + else: + accuracy, num_correct, num_total = evaluate(eval_pred, eval_labels, "accuracy") + if params['task'] == 'minority': + return f05, precision, recall, loss + else: + return accuracy, loss + + +def eval_docs_rank(model, eval_docs, data_obj, params): + num_correct = 0 + num_total = 0 + loss = 0 + model.eval() + eval_pred = [] + eval_ids_perm = [] + for doc in eval_docs: + orig_doc, perm_docs = data_obj.retrieve_doc_sents_by_label(doc) + batch_padded_orig, batch_lengths_orig, original_index_orig = data_obj.pad_to_batch(orig_doc, data_obj.word_to_idx, params['model_type']) + orig_pred = model(batch_padded_orig, batch_lengths_orig, original_index_orig) + orig_coh_score = orig_pred.cpu().data.numpy()[0][1] # probability that doc is coherent + for idx, perm_doc in enumerate(perm_docs): + perm_doc = [perm_doc] + batch_padded_perm, batch_lengths_perm, original_index_perm = data_obj.pad_to_batch(perm_doc, data_obj.word_to_idx, params['model_type']) + perm_pred = model(batch_padded_perm, batch_lengths_perm, original_index_perm) + pred_coh_score = perm_pred.cpu().data.numpy()[0][1] # probability that doc is coherent + if orig_coh_score > pred_coh_score: + num_correct += 1 + eval_pred.append(1) + else: + eval_pred.append(0) + eval_ids_perm.append(doc.id + "#" + str(idx+1)) + num_total += 1 + accuracy = num_correct / num_total + return accuracy, loss + + +def evaluate(pred_labels, labels, type): + num_correct = 0 + num_total = 0 + tp = 0 + fp = 0 + fn = 0 + for index, pred_val in enumerate(pred_labels): + gold_val = labels[index] + if type == "accuracy": + if pred_val == gold_val: + num_correct += 1 + elif type == "f05": + if pred_val == gold_val: + if gold_val == 1: + tp += 1 + else: + if pred_val == 1: + fp += 1 + else: + fn += 1 + num_total += 1 + if type == "f05": + precision = 0 + if (tp + fp) > 0: + precision = tp / (tp + fp) + recall = 0 + if (tp + fn) > 0: + recall = tp / (tp + fn) + f05 = 0 + if (precision + recall) > 0: + f05 = (1.25 * precision * recall) / (1.25 * precision + recall) + return f05, precision, recall + return np.sum(np.array(pred_labels) == np.array(labels)) / float( + len(pred_labels)), num_correct, num_total + + +def eval_cliques(model, loss_fn, clique_data, clique_labels, batch_size, clique_size, data_obj, model_type, task): + steps = 
int(len(clique_data) / batch_size) + if len(clique_data) % batch_size != 0: + steps += 1 + dev_indices = list(range(len(clique_data))) + eval_pred = [] + eval_labels = [] + loss = 0 + model.eval() + for step in range(steps): + end_idx = (step + 1) * batch_size + if end_idx > len(clique_data): + end_idx = len(clique_data) + batch_ind = dev_indices[(step * batch_size):end_idx] + sentences, orig_batch_labels = data_obj.get_batch(clique_data, clique_labels, batch_ind, model_type, clique_size) + batch_padded, batch_lengths, original_index = data_obj.pad_to_batch(sentences, data_obj.word_to_idx, model_type, clique_size) + batch_pred = model(batch_padded, batch_lengths, original_index) + if task == 'score_pred': + loss += loss_fn(batch_pred, Variable(FloatTensor(orig_batch_labels))).cpu().data.numpy() + eval_pred.extend(list(batch_pred.cpu().data.numpy())) + else: + loss += loss_fn(batch_pred, Variable(LongTensor(orig_batch_labels))).cpu().data.numpy() + eval_pred.extend(list(np.argmax(batch_pred.cpu().data.numpy(), axis=1))) + eval_labels.extend(orig_batch_labels) + if task == 'score_pred': + mse = np.square(np.subtract(np.array(eval_pred), np.expand_dims(np.array(eval_labels), 1))).mean() + corr = spearmanr(np.array(eval_pred), np.expand_dims(np.array(eval_labels), 1))[0] + accuracy = corr + else: + accuracy, num_correct, num_total = evaluate(eval_pred, eval_labels, "accuracy") + return accuracy, loss + + +def eval_doc_cliques(model, docs, data_obj, params): + num_correct = 0 + num_total = 0 + tp = 0 + fp = 0 + fn = 0 + model.eval() + eval_ids = [] + eval_pred = [] + eval_labels = [] + for doc in docs: + if params['task'] == 'perm': + orig_doc_cliques, perm_doc_cliques = data_obj.retrieve_doc_cliques_by_label(doc, params['task']) + orig_doc_score = score_doc(model, orig_doc_cliques, params['batch_size'], params['clique_size'], data_obj, params['model_type']) + for perm_count, cliques in enumerate(perm_doc_cliques): + perm_doc_score = score_doc(model, cliques, params['batch_size'], params['clique_size'], data_obj, params['model_type']) + eval_ids.append(doc.id + "#" + str(perm_count)) + if orig_doc_score > perm_doc_score: + num_correct += 1 + eval_pred.append(1) + else: + eval_pred.append(0) + num_total += 1 + elif params['task'] == 'class': + orig_doc_cliques, _ = data_obj.retrieve_doc_cliques_by_label(doc, params['task']) + pred_label = label_doc(model, orig_doc_cliques, params['batch_size'], params['clique_size'], data_obj, params['model_type']) + eval_pred.append(pred_label) + if pred_label == doc.label: + num_correct += 1 + num_total += 1 + elif params['task'] == 'minority': + orig_doc_cliques, _ = data_obj.retrieve_doc_cliques_by_label(doc, params['task']) + pred_label = label_doc(model, orig_doc_cliques, params['batch_size'], params['clique_size'], data_obj, + params['model_type']) + eval_pred.append(pred_label) + if pred_label == doc.label: + num_correct += 1 + if pred_label == doc.label: + if doc.label == 1: + tp = 1 + else: + if pred_label == 1: + fp += 1 + else: + fn += 1 + num_total += 1 + elif params['task'] == 'score_pred': + orig_doc_cliques, _ = data_obj.retrieve_doc_cliques_by_label(doc, params['task']) + pred_score = score_doc_regression(model, orig_doc_cliques, params['batch_size'], params['clique_size'], data_obj, params['model_type']) + eval_pred.append(pred_score) + eval_labels.append(doc.label) + precision = 0 + recall = 0 + f05 = 0 + if params['task'] == 'score_pred': + mse = np.square(np.subtract(eval_pred, eval_labels)).mean() + corr = spearmanr(eval_pred, 
eval_labels)[0] + accuracy = corr + else: + accuracy = num_correct / num_total + if (tp + fp) > 0: + precision = tp / (tp + fp) + if (tp + fn) > 0: + recall = tp / (tp + fn) + if (precision + recall) > 0: + f05 = (1.25 * precision * recall) / (1.25 * precision + recall) + return accuracy, precision, recall, f05 + + +# average scores of all cliques for a single document (3-class task) +def label_doc(model, doc_cliques, batch_size, clique_size, data_obj, model_type): + steps = int(len(doc_cliques) / batch_size) + labels = [-1 for clique in doc_cliques] + if len(doc_cliques) % batch_size != 0: + steps += 1 + clique_indices = list(range(len(doc_cliques))) + pred_distributions = None + model.eval() + for step in range(steps): + end_idx = (step + 1) * batch_size + if end_idx > len(doc_cliques): + end_idx = len(doc_cliques) + batch_ind = clique_indices[(step * batch_size):end_idx] + sentences, orig_batch_labels = data_obj.get_batch(doc_cliques, labels, batch_ind, model_type, clique_size) + batch_padded, batch_lengths, original_index = data_obj.pad_to_batch(sentences, data_obj.word_to_idx, model_type, clique_size) + batch_pred = model(batch_padded, batch_lengths, original_index) + batch_data = batch_pred.cpu().data.numpy() + if pred_distributions is None: + pred_distributions = batch_data + else: + pred_distributions = np.concatenate([pred_distributions, batch_data]) + pred_label = np.argmax(np.mean(pred_distributions, axis=0)) + return pred_label + + +# average scores of all cliques for a single document (binary task) +def score_doc(model, doc_cliques, batch_size, clique_size, data_obj, model_type): + steps = int(len(doc_cliques) / batch_size) + labels = [-1 for clique in doc_cliques] + if len(doc_cliques) % batch_size != 0: + steps += 1 + clique_indices = list(range(len(doc_cliques))) + prob_list = [] + model.eval() + for step in range(steps): + end_idx = (step + 1) * batch_size + if end_idx > len(doc_cliques): + end_idx = len(doc_cliques) + batch_ind = clique_indices[(step * batch_size):end_idx] + sentences, orig_batch_labels = data_obj.get_batch(doc_cliques, labels, batch_ind, model_type, clique_size) + batch_padded, batch_lengths, original_index = data_obj.pad_to_batch(sentences, data_obj.word_to_idx, model_type, clique_size) + batch_pred = model(batch_padded, batch_lengths, original_index) + batch_data = batch_pred.cpu().data.numpy() + for row in batch_data: + prob_list.append(row[1]) # probability that the clique is coherent + score = np.mean(prob_list) + return score + + +# average scores of all cliques for a single document (score prediction task) +def score_doc_regression(model, doc_cliques, batch_size, clique_size, data_obj, model_type): + steps = int(len(doc_cliques) / batch_size) + labels = [-1 for clique in doc_cliques] + if len(doc_cliques) % batch_size != 0: + steps += 1 + clique_indices = list(range(len(doc_cliques))) + prob_list = [] + model.eval() + for step in range(steps): + end_idx = (step + 1) * batch_size + if end_idx > len(doc_cliques): + end_idx = len(doc_cliques) + batch_ind = clique_indices[(step * batch_size):end_idx] + sentences, orig_batch_labels = data_obj.get_batch(doc_cliques, labels, batch_ind, model_type, clique_size) + batch_padded, batch_lengths, original_index = data_obj.pad_to_batch(sentences, data_obj.word_to_idx, model_type, clique_size) + batch_pred = model(batch_padded, batch_lengths, original_index) + batch_data = batch_pred.cpu().data.numpy() + for row in batch_data: + prob_list.append(row[0]) # regression score + score = np.mean(prob_list) + return 
score \ No newline at end of file diff --git a/extract_entity_grid.py b/extract_entity_grid.py new file mode 100644 index 0000000..9339b13 --- /dev/null +++ b/extract_entity_grid.py @@ -0,0 +1,125 @@ +# takes csv files, parses them, and extracts entity grid +from pycorenlp import StanfordCoreNLP +import os, json, sys + +corpus = sys.argv[1] +in_dir = 'data/' + corpus + '/' +nlp = StanfordCoreNLP('http://localhost:9000') # requires you have the Stanford CoreNLP server running: https://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started + +if not os.path.exists(in_dir + 'parsed/'): + os.makedirs(in_dir + 'parsed/') +if not os.path.exists(in_dir + 'grid/'): + os.makedirs(in_dir + 'grid/') + + +def update_noun_types(dep_type, np_words, curr_nouns_type): + for word in np_words: + if word not in curr_nouns_type: + curr_nouns_type[word] = dep_type + if curr_nouns_type[word] == "x" or curr_nouns_type[word] == "o": + curr_nouns_type[word] = dep_type + return curr_nouns_type + + +def get_np(dependency, const_parse): + target_id = dependency['dependent'] + index = 0 + nouns = [] + for line in const_parse.splitlines(): + if ")" not in line: + continue + tokens = line.strip().split(") (") + num_tokens = len(tokens) # remove phrase label + phrase_start_idx = index + 1 + index += num_tokens + phrase_end_idx = index + 1 + if target_id <= index and tokens[0].startswith("(NP"): + for token in tokens: + if token.startswith("(NP"): + token = token[3:].strip() + while token.startswith("("): + token = token[1:] + while token.endswith(")"): + token = token[:-1].strip() + word = token.split(None, 1)[1] # remove POS tag + if token.startswith("NN"): + nouns.append(word.lower()) + elif token.startswith("PRP "): + nouns.append(word.lower()) + elif token.startswith("DT") and len(tokens) == 1: + nouns.append(word.lower()) # is noun phrase, only one DT word (this, all) in the phrase + break + return nouns, phrase_start_idx, phrase_end_idx + + +# read all text files, parse and extract entity grid +for filename in os.listdir(in_dir + "text/"): + with open(in_dir + "text/" + filename,'r') as in_file: + if not filename.endswith(".txt"): + continue + nouns_list = [] + nouns_dict = {} + sent_annotations = [] + text_id = filename.rsplit(".", 1)[0] + const_out = open(in_dir + "parsed/" + text_id + ".const_parse", "w") + dep_out = open(in_dir + "parsed/" + text_id + ".dep_parse", "w") + grid_out = open(in_dir + "grid/" + text_id + ".grid", "w") + # read text document + document_lines = [] + for line in in_file: + line = line.strip() + if line == "": + continue + if isinstance(line, str): + document_lines.append(line) + document = " ".join(document_lines) + try: + output = nlp.annotate(document, properties={ + 'annotators': 'tokenize,ssplit,pos,depparse,parse', + 'outputFormat': 'json' + }) + except: + print('Failed to parse file %s' % filename) + continue + if output == 'CoreNLP request timed out. 
Your document may be too long.': + print('Timed out when attempting to parse file %s' % filename) + continue + for sent in output['sentences']: + sent_idx = sent['index'] + 1 + const_out.write(sent['parse'] + "\n") + json.dump(sent['basicDependencies'], dep_out) + dep_out.write("\n") + curr_nouns_type = {} + for token in sent['tokens']: + # collect all nouns and pronouns + if token['pos'].startswith("NN") or token['pos'] == 'PRP': + token_str = token['word'].lower() + curr_nouns_type[token_str] = "x" + if token_str not in nouns_dict: + nouns_list.append(token_str) + nouns_dict[token_str] = 0 + nouns_dict[token_str] += 1 + # find highest-ranked role of entity in this sentence (subj > obj > other) + for dep in sent['basicDependencies']: + dep_type = "" + if dep['dep'] == 'nsubj' or dep['dep'] == 'nsubjpass': + dep_type = "s" + elif dep['dep'] == 'dobj': + dep_type = "o" + if dep_type != "": + np, phrase_start_idx, phrase_end_idx = get_np(dep, sent['parse']) + curr_nouns_type = update_noun_types(dep_type, np, curr_nouns_type) + sent_annotations.append(curr_nouns_type) + + # output entity grid + for noun in nouns_list: + grid_out.write(noun + " ") + for sent_ann in sent_annotations: + if noun in sent_ann: + grid_out.write(sent_ann[noun] + " ") + else: + grid_out.write("- ") + grid_out.write(str(nouns_dict[noun]) + "\n") # entity frequency (salience count) + grid_out.close() + const_out.close() + dep_out.close() diff --git a/extract_entity_grid_perm.py b/extract_entity_grid_perm.py new file mode 100644 index 0000000..4b4d61d --- /dev/null +++ b/extract_entity_grid_perm.py @@ -0,0 +1,153 @@ +# takes csv files, parses them, and extracts entity grid +from pycorenlp import StanfordCoreNLP +import os, json, sys + +nlp = StanfordCoreNLP('http://localhost:9000') +corpus = sys.argv[1] + +in_dir = 'data/' + corpus + '/' + +if not os.path.exists(in_dir + 'parsed_permute/'): + os.makedirs(in_dir + 'parsed_permute/') +if not os.path.exists(in_dir + 'grid_permute/'): + os.makedirs(in_dir + 'grid_permute/') + +def update_noun_types(dep_type, np_words, curr_nouns_type): + for word in np_words: + if word not in curr_nouns_type: + curr_nouns_type[word] = dep_type + if curr_nouns_type[word] == "x" or curr_nouns_type[word] == "o": + curr_nouns_type[word] = dep_type + return curr_nouns_type + + +def get_np(dependency, const_parse): + target_id = dependency['dependent'] + index = 0 + nouns = [] + for line in const_parse.splitlines(): + if ")" not in line: + continue + tokens = line.strip().split(") (") + num_tokens = len(tokens) # remove phrase label + index += num_tokens + if target_id <= index and tokens[0].startswith("(NP"): + for token in tokens: + if token.startswith("(NP"): + token = token[3:].strip() + while token.startswith("("): + token = token[1:] + while token.endswith(")"): + token = token[:-1].strip() + word = token.split(None, 1)[1] # remove POS tag + if token.startswith("NN"): + nouns.append(word.lower()) + elif token.startswith("PRP "): + nouns.append(word.lower()) + elif token.startswith("DT") and len(tokens) == 1: + nouns.append(word.lower()) # is noun phrase, only one DT word (this, all) in the phrase + break + return nouns + +# read all text files, parse and extract entity grid +for filename in os.listdir(in_dir + "text_permute/"): + if not filename.endswith("_sent.txt"): + continue # original files only + with open(in_dir + "text_permute/" + filename, 'r') as in_file: + # process original sentence order file + nouns_list = [] + nouns_dict = {} + sent_annotations = [] + text_id = 
filename.rsplit("_", 1)[0] + const_out_filename = in_dir + "parsed_permute/" + text_id + ".0.const_parse" + dep_out_filename = in_dir + "parsed_permute/" + text_id + ".0.dep_parse" + grid_out_filename = in_dir + "parsed_permute/" + text_id + ".0.grid" + if os.path.exists(const_out_filename) and os.path.exists(dep_out_filename) and os.path.exists( + grid_out_filename): + continue + const_out = open(in_dir + "parsed_permute/" + text_id + ".0.const_parse", "w") + const_lines = {} + dep_out = open(in_dir + "parsed_permute/" + text_id + ".0.dep_parse", "w") + dep_lines = {} + grid_out = open(in_dir + "grid_permute/" + text_id + ".0.grid", "w") + grid_lines = {} + for line in in_file: # sentences in original order + line = line.strip() + const_lines[line] = [] + dep_lines[line] = [] + grid_lines[line] = [] + if line.strip() == "": # not sure if this ever fires (I might have removed line breaks in these files -- for entity grid only) + const_out.write("\n\n") + dep_out.write("\n\n") + continue + output = nlp.annotate(line, properties={ + 'annotators': 'tokenize,ssplit,pos,depparse,parse', + 'outputFormat': 'json' + }) + for sent in output['sentences']: + const_out.write(sent['parse'] + "\n") + const_lines[line].append(sent['parse']) + json.dump(sent['basicDependencies'], dep_out) + dep_out.write("\n") + dep_lines[line].append(sent['basicDependencies']) + curr_nouns_type = {} + for token in sent['tokens']: + if token['pos'].startswith("NN") or token['pos'] == 'PRP': + token_str = token['word'].lower() + curr_nouns_type[token_str] = "x" + if token_str not in nouns_dict: + nouns_list.append(token_str) + nouns_dict[token_str] = 0 + nouns_dict[token_str] += 1 + for dep in sent['basicDependencies']: + dep_type = "" + if dep['dep'] == 'nsubj' or dep['dep'] == 'nsubjpass': + dep_type = "s" + elif dep['dep'] == 'dobj': + dep_type = "o" + if dep_type != "": + np = get_np(dep, sent['parse']) + curr_nouns_type = update_noun_types(dep_type, np, curr_nouns_type) + sent_annotations.append(curr_nouns_type) + grid_lines[line].append(curr_nouns_type) + + for noun in nouns_list: + grid_out.write(noun + " ") + for sent_ann in sent_annotations: + if noun in sent_ann: + grid_out.write(sent_ann[noun] + " ") + else: + grid_out.write("- ") + grid_out.write(str(nouns_dict[noun]) + "\n") # frequency for salience feature + grid_out.close() + const_out.close() + dep_out.close() + for i in range(1, 21): + filename_perm = text_id + ".perm-" + str(i) + if not os.path.exists(in_dir + "text_permute/" + filename_perm + ".txt"): + continue + const_out = open(in_dir + "parsed_permute/" + filename_perm + ".const_parse", "w") + dep_out = open(in_dir + "parsed_permute/" + filename_perm + ".dep_parse", "w") + grid_out = open(in_dir + "grid_permute/" + filename_perm + ".grid", "w") + sent_annotations = [] + with open(in_dir + "text_permute/" + filename_perm + ".txt", "r") as in_file: + for line in in_file: + line = line.strip() + for parse in const_lines[line]: + const_out.write(parse + "\n") + for parse in dep_lines[line]: + json.dump(parse, dep_out) + dep_out.write("\n") + for grid_line in grid_lines[line]: + sent_annotations.append(grid_line) + for noun in nouns_list: + grid_out.write(noun + " ") + for sent_ann in sent_annotations: + if noun in sent_ann: + grid_out.write(sent_ann[noun] + " ") + else: + grid_out.write("- ") + grid_out.write(str(nouns_dict[noun]) + "\n") # saliance frequency feature + grid_out.close() + const_out.close() + dep_out.close() \ No newline at end of file diff --git a/extract_features_from_grid.py 
b/extract_features_from_grid.py new file mode 100644 index 0000000..6e73a39 --- /dev/null +++ b/extract_features_from_grid.py @@ -0,0 +1,104 @@ +import itertools +import os, sys + +corpus = sys.argv[1] +seq_len = int(sys.argv[2]) +salience_threshold = int(sys.argv[3]) +syntax_opt = int(sys.argv[4]) +is_permute_arg = sys.argv[5] +is_permute = False +if is_permute_arg == 'true': + is_permute = True +append_str = '' +if is_permute: + append_str = '_permute' + +in_dir = 'data/'+corpus+'/' +if not os.path.isdir(in_dir + 'features' + append_str + '/'): + os.mkdir(in_dir + 'features' + append_str + '/') +feat_dir = in_dir + 'features' + append_str + '/seq_' + str(seq_len) + '_sal_' + str(salience_threshold) + '_syn_' + str(syntax_opt) + '/' +if not os.path.isdir(feat_dir): + os.mkdir(feat_dir) +print(feat_dir) +for filename in os.listdir(in_dir + 'grid' + append_str + '/'): + if not filename.endswith("grid"): + continue + filename_base = filename.rsplit(".", 1)[0] + out_file = open(feat_dir + filename_base + ".feat", "w") + with open(in_dir + 'grid' + append_str + '/' + filename, "r") as in_file: + # read grid + sequences = [] + frequencies = [] + for line in in_file: + line = line.strip() + tokens = line.split() + try: + frequency = int(tokens[-1]) + except ValueError: + print(line) + frequency = 0 + frequencies.append(frequency) + sequence = "".join(tokens[1:-1]) + sequence = "<" + sequence + ">" # add start and end tokens + sequences.append(sequence) + in_file.close() + + # compute feature vector + if syntax_opt == 1: # syntax on + labels = ['s', 'o', 'x', '-'] + else: # syntax off (ignore entity roles) + labels = ['x', '-'] + feature_vector = [] + for salience_class in [0, 1]: + if salience_threshold == 1 and salience_class == 1: # only one salience class + break + for i in range(seq_len): # over possible sequence lengths + seq_len = i + 1 # shortest seq is length 2 + num_total_sequences = 0 + for sent_index, sentence in enumerate(sequences): + if salience_class == 0 and frequencies[sent_index] >= salience_threshold: + num_total_sequences += len(sentence) - seq_len + 1 + elif salience_class == 1 and frequencies[sent_index] < salience_threshold: + num_total_sequences += len(sentence) - seq_len + 1 + total_prob = 0 + seq_minus_one = {} + for possible_seq in itertools.product(labels, repeat=seq_len): + possible_seq_tok = "".join(possible_seq) + seq_minus_one[possible_seq_tok[:-1]] = 1 + num_occurrences = 0 + for sent_index, sentence in enumerate(sequences): + sentence_temp = sentence + if syntax_opt == 0: + sentence_temp = sentence_temp.replace('s', 'x') + sentence_temp = sentence_temp.replace('o', 'x') + if salience_class == 0 and frequencies[sent_index] >= salience_threshold: + num_occurrences += sum(sentence_temp[j:].startswith(possible_seq_tok) for j in range(len(sentence_temp))) + elif salience_class == 1 and frequencies[sent_index] < salience_threshold: + num_occurrences += sum(sentence_temp[j:].startswith(possible_seq_tok) for j in range(len(sentence_temp))) + feature_prob = 0 + if num_total_sequences > 0: + feature_prob = float(num_occurrences) / num_total_sequences + feature_vector.append(feature_prob) + total_prob += feature_prob + # add start and end tokens + for shorter_seq in seq_minus_one: + possible_seq_toks = ["<" + shorter_seq, shorter_seq + ">"] + for possible_seq_tok in possible_seq_toks: + num_occurrences = 0 + for sent_index, sentence in enumerate(sequences): + sentence_temp = sentence + if syntax_opt == 0: + sentence_temp = sentence_temp.replace('s', 'x') + sentence_temp = 
sentence_temp.replace('o', 'x')
+                        if salience_class == 0 and frequencies[sent_index] >= salience_threshold:
+                            num_occurrences += sum(sentence_temp[j:].startswith(possible_seq_tok) for j in range(len(sentence_temp)))
+                        elif salience_class == 1 and frequencies[sent_index] < salience_threshold:
+                            num_occurrences += sum(sentence_temp[j:].startswith(possible_seq_tok) for j in range(len(sentence_temp)))
+                    feature_prob = 0
+                    if num_total_sequences > 0:
+                        feature_prob = float(num_occurrences) / num_total_sequences
+                    feature_vector.append(feature_prob)
+                    total_prob += feature_prob
+    for val in feature_vector:
+        out_file.write(str(val) + " ")
+    out_file.close()
diff --git a/extract_graph_from_grid.py b/extract_graph_from_grid.py
new file mode 100644
index 0000000..d0daeb6
--- /dev/null
+++ b/extract_graph_from_grid.py
@@ -0,0 +1,115 @@
+import itertools
+import os, sys
+import numpy as np
+
+role_weights = {'s': 3, 'o': 2, 'x': 1}
+
+
+def compute_avg_outdeg(matrix):
+    out_degree_list = []
+    for sent in matrix:
+        out_degree = 0
+        for weight in sent:
+            out_degree += weight
+        out_degree_list.append(out_degree)
+    return np.mean(out_degree_list)
+
+
+corpus = sys.argv[1]
+is_permute_arg = sys.argv[2]
+is_permute = False
+if is_permute_arg == 'true':
+    is_permute = True
+append_str = ''
+if is_permute:
+    append_str = '_permute'
+
+root_dir = 'data/'+corpus+'/'
+in_dir = root_dir + 'grid' + append_str + '/'
+out_dir = root_dir + 'graph' + append_str + '/'
+if not os.path.exists(out_dir):
+    os.makedirs(out_dir)
+
+# process all grid files (*.grid)
+for filename in os.listdir(in_dir):
+    if not filename.endswith("grid"):
+        continue
+    filename_base = filename.rsplit(".", 1)[0]  # assumes no periods '.' in grid name
+    out_file_u = open(out_dir + filename_base + ".graph_u", "w")
+    out_file_u_dist = open(out_dir + filename_base + ".graph_u_dist", "w")
+    out_file_w = open(out_dir + filename_base + ".graph_w", "w")
+    out_file_w_dist = open(out_dir + filename_base + ".graph_w_dist", "w")
+    out_file_syn = open(out_dir + filename_base + ".graph_syn", "w")
+    out_file_syn_dist = open(out_dir + filename_base + ".graph_syn_dist", "w")
+    with open(in_dir + filename, "r") as in_file:
+        matrix_u = []
+        matrix_u_dist = []
+        matrix_w = []
+        matrix_w_dist = []
+        matrix_syn = []
+        matrix_syn_dist = []
+        for line in in_file:  # for all entities in text
+            line = line.strip()
+            tokens = line.split()
+            try:
+                count = int(tokens[-1])
+                sentence_roles = tokens[1:-1]
+            except ValueError:
+                sentence_roles = tokens[1:]  # remove frequency count and word
+            while sentence_roles[0] not in {'-', 'x', 's', 'o'}:
+                sentence_roles = sentence_roles[1:]
+            num_sentences = len(sentence_roles)
+            if matrix_u == []:
+                # initialize adjacency matrices
+                for i in range(num_sentences):
+                    list_i = []
+                    for j in range(num_sentences):
+                        list_i.append(0)
+                    matrix_u.append(list(list_i))  # copy list
+                    matrix_u_dist.append(list(list_i))
+                    matrix_w.append(list(list_i))
+                    matrix_w_dist.append(list(list_i))
+                    matrix_syn.append(list(list_i))
+                    matrix_syn_dist.append(list(list_i))
+            # find sentences that contain this entity
+            sentence_indices = []
+            for index, role in enumerate(sentence_roles):
+                if role != "-":
+                    sentence_indices.append(index)
+            for pair in itertools.combinations(sentence_indices, 2):  # get all sentence pairs
+                first_sent = min(pair)
+                second_sent = max(pair)
+                matrix_u[first_sent][second_sent] = 1  # binary
+                matrix_u_dist[first_sent][second_sent] = 1 / (second_sent - first_sent)
+                matrix_w[first_sent][second_sent] += 1  # count
matrix_w_dist[first_sent][second_sent] += 1 / (second_sent - first_sent) + matrix_syn[first_sent][second_sent] += role_weights[sentence_roles[first_sent]] * role_weights[sentence_roles[second_sent]] + matrix_syn_dist[first_sent][second_sent] += role_weights[sentence_roles[first_sent]] * role_weights[sentence_roles[second_sent]] / (second_sent - first_sent) + # print graph score to files + out_file_u.write(str(compute_avg_outdeg(matrix_u)) + "\n") + out_file_u_dist.write(str(compute_avg_outdeg(matrix_u_dist)) + "\n") + out_file_w.write(str(compute_avg_outdeg(matrix_w)) + "\n") + out_file_w_dist.write(str(compute_avg_outdeg(matrix_w_dist)) + "\n") + out_file_syn.write(str(compute_avg_outdeg(matrix_syn)) + "\n") + out_file_syn_dist.write(str(compute_avg_outdeg(matrix_syn_dist)) + "\n") + # print graph adjacency matrix + for i in range(num_sentences): + for j in range(num_sentences): + out_file_u.write(str(matrix_u[i][j]) + " ") + out_file_u_dist.write(str(matrix_u_dist[i][j]) + " ") + out_file_w.write(str(matrix_w[i][j]) + " ") + out_file_w_dist.write(str(matrix_w_dist[i][j]) + " ") + out_file_syn.write(str(matrix_syn[i][j]) + " ") + out_file_syn_dist.write(str(matrix_syn_dist[i][j]) + " ") + out_file_u.write("\n") + out_file_u_dist.write("\n") + out_file_w.write("\n") + out_file_w_dist.write("\n") + out_file_syn.write("\n") + out_file_syn_dist.write("\n") + out_file_u.close() + out_file_u_dist.close() + out_file_w.close() + out_file_w_dist.close() + out_file_syn.close() + out_file_syn_dist.close() diff --git a/generate_high_coh_permutations.py b/generate_high_coh_permutations.py new file mode 100644 index 0000000..fcc1dc8 --- /dev/null +++ b/generate_high_coh_permutations.py @@ -0,0 +1,109 @@ +import os, random, csv, sys +from nltk.tokenize import sent_tokenize +import itertools +import numpy as np + +corpus = sys.argv[1] +root_dir = 'data/' + corpus + '/' +in_dir = root_dir + 'text/' +out_dir = root_dir + 'text_permute/' + +if not os.path.exists(out_dir): + os.makedirs(out_dir) + +def is_orig_permutation(orig_sents, perm_sents): + for index, sent in enumerate(orig_sents): + if sent != perm_sents[index]: + return False + return True + +# which texts to permute +if corpus == 'Clinton' or corpus == 'Enron' or corpus == 'Yelp' or corpus == 'Dummy': + title_row = ["text_id","subject","text","ratingA1","ratingA2","ratingA3","labelA","ratingM1","ratingM2","ratingM3","ratingM4","ratingM5","labelM"] +elif corpus == 'Yahoo': + title_row = ["text_id","question_title","question","text","ratingA1","ratingA2","ratingA3","labelA","ratingM1","ratingM2","ratingM3","ratingM4","ratingM5","labelM"] +splits = ['train','test'] +high_coh_texts = {} +total = 0 +for split in splits: + in_file = open(root_dir + corpus + '_' + split + '.csv','r') + out_file = open(root_dir + corpus + '_' + split + '_perm.csv', 'w') + writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting = csv.QUOTE_ALL) + writer.writerow(title_row) + reader = csv.DictReader(in_file) + for row in reader: + if row['labelA'] == '3': + high_coh_texts[row['text_id']] = 1 + # print(row) + writer.writerow([row[key] for key in row]) + total += 1 + out_file.close() +print(len(high_coh_texts)) +print("total %d" %total) + +# read orig texts +count = 0 +num_files = 0 +for filename in os.listdir(in_dir): + if not filename.endswith(".txt"): + continue + # read sentences and tokenize at sentence boundaries + sentences = [] + text_id = filename.split(".")[0] + if text_id not in high_coh_texts: + continue + with open(in_dir + filename, 'r') as in_file: 
+ orig_lines = in_file.readlines() + for line in orig_lines: + sentences.extend(sent_tokenize(line)) + # remove empty lines (don't matter for permutations) + new_sentences = [] + for sent in sentences: + sent = sent.strip() + if sent != "": + new_sentences.append(sent) + sentences = new_sentences + if len(sentences) == 1: # no possible permutations + continue + out_file_orig = open(out_dir + text_id + "_sent.txt", "w") + num_files += 1 + count += 1 + for sent in sentences: + out_file_orig.write(sent + "\n") + out_file_orig.close() + # create 20 permutations + num_permutations = 0 + used_permutations = {} + found_duplicate = False + if len(sentences) < 6: # generate all permutations + all_permutations = list(itertools.permutations(sentences)) + random.shuffle(all_permutations) + for perm in all_permutations: + if num_permutations >= 20: + break + if not found_duplicate: + if is_orig_permutation(sentences, perm): + found_duplicate = True + continue + out_file_perm = open(out_dir + text_id + ".perm-" + str(num_permutations+1) + ".txt", "w") + num_files += 1 + for sent in perm: + out_file_perm.write(sent + "\n") + num_permutations += 1 + out_file_perm.close() + else: # need to sample permutations + while num_permutations < 20: + permutation = np.random.permutation(len(sentences)) + permutation_str = [str(num) for num in permutation] + permutation_idx_str = ",".join(permutation_str) + if permutation_idx_str not in used_permutations: + out_file_perm = open(out_dir + text_id + ".perm-" + str(num_permutations+1) + ".txt", "w") + num_files += 1 + for sent_idx in permutation: + out_file_perm.write(sentences[sent_idx] + "\n") + out_file_perm.close() + num_permutations += 1 + used_permutations[permutation_idx_str] = 1 + +print(count) +print(num_files) diff --git a/main.py b/main.py new file mode 100644 index 0000000..491786a --- /dev/null +++ b/main.py @@ -0,0 +1,126 @@ +import argparse +import sys +from data_loader import * +from LSTMClique import LSTMClique +from LSTMSentAvg import LSTMSentAvg +from LSTMParSeq import LSTMParSeq +from train_neural_models import * + +sys.path.insert(0,os.getcwd()) + +dirname, filename = os.path.split(os.path.abspath(__file__)) +root_dir = "/".join(dirname.split("/")[:-1]) + +run_dir = os.path.join(root_dir, "runs") + +parser = argparse.ArgumentParser() + +# data +parser.add_argument("--task", type=str, default="class") # class [classification], perm [binary permutation], score_pred [mean score prediction], minority [minority binary classification] + +# model params +parser.add_argument("--model_type", type=str, default="clique") # clique, doc_seq +parser.add_argument("--learning_rate", type=float, default=0.001) +parser.add_argument("--dropout", type=float, default=0) +parser.add_argument("--lstm_dim", type=int, default=100) +parser.add_argument("--hidden_dim", type=int, default=200, help="hidden layer dimension") +parser.add_argument("--clique", type=int, default=3) # number of sentences in each clique (clique model only) +parser.add_argument("--l2_reg", type=float, default=0) + +# training +parser.add_argument("--batch_size", type=int, default=32) +parser.add_argument("--num_epochs", type=int, default=10) +parser.add_argument("--train_data_limit", type=int, default=-1) # for debugging with subset of data +parser.add_argument("--lr_decay", type=str, default="none") + +# vectors +parser.add_argument("--vector_type", default="glove", help="specify vector type glove/word2vec/none") +parser.add_argument("--glove_path", type=str, default="data/GloVe/glove.840B.300d.txt") 
+parser.add_argument("--embedding_dim", type=int, default=300, help="vector dimension") +parser.add_argument("--case_sensitive", action="store_true", help="activate this flag if vectors are case-sensitive (don't lower-case the data)") + +# per-experiment settings +parser.add_argument("--model_name", type=str) +parser.add_argument("--data_dir", default="data/", help="path to the data directory") +parser.add_argument("--train_corpus", type=str) +parser.add_argument("--test_corpus", type=str) + + +args = parser.parse_args() +if args.model_name is None: + print("Specify name of experiment") + sys.exit(0) +if args.train_corpus is None: + print("Specify train corpus") + sys.exit(0) +if args.test_corpus is None: + args.test_corpus = args.train_corpus + +params = { + 'top_dir': root_dir, + 'run_dir': run_dir, + 'model_name': args.model_name, + 'data_dir': args.data_dir, + 'train_corpus': args.train_corpus, + 'test_corpus': args.test_corpus, + 'task': args.task, + 'train_data_limit': args.train_data_limit, + 'lr_decay': args.lr_decay, + 'model_type': args.model_type, + 'glove_file': args.glove_path, + 'vector_type': args.vector_type, + 'embedding_dim': args.embedding_dim, # word embedding dim + 'case_sensitive': args.case_sensitive, + 'learning_rate': args.learning_rate, + 'dropout': args.dropout, # 1 = no dropout, 0.5 = dropout + 'hidden_dim': args.hidden_dim, + 'lstm_dim': args.lstm_dim, + 'clique_size': args.clique, + 'l2_reg': args.l2_reg, + 'batch_size': args.batch_size, + 'num_epochs': args.num_epochs, +} + +if not os.path.exists(params['run_dir']): + os.mkdir(params['run_dir']) +model_dir = os.path.join(params['run_dir'], params["model_name"]) +if not os.path.exists(model_dir): + os.mkdir(model_dir) +params['model_dir'] = model_dir + +# save parameters +with open(os.path.join(model_dir, params['model_name'] + '.params'), 'w') as param_file: + for key, parameter in params.items(): + param_file.write("{}: {}".format(key, parameter) + "\n") + print((key, parameter)) + +start = time.time() +if params['vector_type'] == 'glove': + params['vector_path'] = params['glove_file'] + +# load data +data = Data(params) +vectors = None +if params['vector_type'] != 'none': + vectors, vector_dim = data.load_vectors() + params['embedding_dim'] = vector_dim + +if params['task'] == 'class' or params['task'] == 'score_pred' or params['task'] == 'minority': + training_docs = data.read_data_class(params, 'train') + test_docs = data.read_data_class(params, 'test') +else: + training_docs = data.read_data_perm(params, 'train') + test_docs = data.read_data_perm(params, 'test') +# dev_docs = None +if params['vector_type'] == 'none': # init random vectors + vectors = data.rand_vectors(len(data.word_to_idx)) + +if params['model_type'] == 'clique': + model = LSTMClique(params, data) + train(params, training_docs, test_docs, data, model) +elif params['model_type'] == 'sent_avg': + model = LSTMSentAvg(params, data) + train(params, training_docs, test_docs, data, model) +elif params['model_type'] == 'par_seq': + model = LSTMParSeq(params, data) + train(params, training_docs, test_docs, data, model) diff --git a/train_neural_models.py b/train_neural_models.py new file mode 100644 index 0000000..03c2b15 --- /dev/null +++ b/train_neural_models.py @@ -0,0 +1,149 @@ +import torch +import torch.optim as optim +import time +import random +from torch.autograd import Variable +from evaluation import * +import progressbar +from torch.optim.lr_scheduler import LambdaLR +from torch.optim.lr_scheduler import StepLR +USE_CUDA = 
torch.cuda.is_available() +FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor +LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor + + +def train(params, training_docs, test_docs, data, model): + if params['model_type'] == 'clique': + training_data, training_labels = data.create_cliques(training_docs, params['task'], params['train_data_limit']) + test_data, test_labels = data.create_cliques(test_docs, params['task'], params['train_data_limit']) + elif params['model_type'] == 'sent_avg': + training_data, training_labels, train_ids = data.create_doc_sents(training_docs, 'sentence', params['task'], params['train_data_limit']) + test_data, test_labels, test_ids = data.create_doc_sents(test_docs, 'sentence', params['task'], params['train_data_limit']) + elif params['model_type'] == 'par_seq': + training_data, training_labels, train_ids = data.create_doc_sents(training_docs, 'paragraph', params['task'], + params['train_data_limit']) + test_data, test_labels, test_ids = data.create_doc_sents(test_docs, 'paragraph', params['task'], params['train_data_limit']) + if USE_CUDA: + model.cuda() + if params['train_data_limit'] != -1: + training_docs = training_docs[:10] + test_docs = test_docs[:10] + parameters = filter(lambda p: p.requires_grad, model.parameters()) + optimizer = optim.Adam(parameters, weight_decay=params['l2_reg']) + scheduler = None + if params['lr_decay'] == 'step': + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + elif params['lr_decay'] == 'lambda': + lambda1 = lambda epoch: 0.95 ** epoch + scheduler = LambdaLR(optimizer, lr_lambda=[lambda1]) + if params['task'] == 'class' or params['task'] == 'perm' or params['task'] == 'minority': + loss_fn = torch.nn.CrossEntropyLoss() + elif params['task'] == 'score_pred': + loss_fn = torch.nn.MSELoss() + timestamp = time.time() + best_test_acc = 0 + for epoch in range(params['num_epochs']): + if params['lr_decay'] == 'lambda' or params['lr_decay'] == 'step': + scheduler.step() + print(optimizer.param_groups[0]['lr']) + print("EPOCH "+str(epoch)) + total_loss = 0 + steps = int(len(training_data) / params['batch_size']) + indices = list(range(len(training_data))) + random.shuffle(indices) + bar = progressbar.ProgressBar() + model.train() + for step in bar(range(steps)): + batch_ind = indices[(step * params["batch_size"]):((step + 1) * params["batch_size"])] + sentences, orig_batch_labels = data.get_batch(training_data, training_labels, batch_ind, params['model_type'], params['clique_size']) + batch_padded, batch_lengths, original_index = data.pad_to_batch(sentences, data.word_to_idx, params['model_type'], params['clique_size']) + model.zero_grad() + pred_coherence = model(batch_padded, batch_lengths, original_index) + if params['task'] == 'score_pred': + loss = loss_fn(pred_coherence, Variable(FloatTensor(orig_batch_labels))) + else: + loss = loss_fn(pred_coherence, Variable(LongTensor(orig_batch_labels))) + mean_loss = loss / params["batch_size"] + mean_loss.backward() + total_loss += loss.cpu().data.numpy() + optimizer.step() + current_time = time.time() + print("Time %-5.2f min" % ((current_time - timestamp) / 60.0)) + print("Train loss: " + str(total_loss[0])) + output_name = params['model_name'] + '_epoch' + str(epoch) + if params['model_type'] == 'sent_avg' or params['model_type'] == 'par_seq': + if params['task'] == 'minority': + test_f05, test_precision, test_recall, test_loss = eval_docs(model, loss_fn, test_data, test_labels, + data, params) + elif params['task'] == 'class' or 
params['task'] == 'score_pred':
+                test_accuracy, test_loss = eval_docs(model, loss_fn, test_data, test_labels, data, params)
+            elif params['task'] == 'perm':
+                test_accuracy, test_loss = eval_docs_rank(model, test_docs, data, params)
+            print("Test loss: %0.3f" % test_loss)
+            if params['task'] == 'score_pred':
+                print("Test correlation: %0.5f" % (test_accuracy))
+            elif params['task'] == 'minority':
+                print("Test F0.5: %0.2f Precision: %0.2f Recall: %0.2f" % (test_f05, test_precision, test_recall))
+            else:
+                print("Test accuracy: %0.2f%%" % (test_accuracy * 100))
+        elif params['model_type'] == 'clique':
+            train_accuracy, train_loss = eval_cliques(model, loss_fn, training_data,
+                                                      training_labels,
+                                                      params['batch_size'],
+                                                      params['clique_size'], data,
+                                                      params['model_type'], params['task'])
+            if params['task'] == 'score_pred':
+                print("Train clique corr: %0.5f" % (train_accuracy))
+            else:
+                print("Train clique accuracy: %0.2f%%" % (train_accuracy * 100))
+            test_clique_accuracy, test_loss = eval_cliques(model, loss_fn, test_data,
+                                                           test_labels,
+                                                           params['batch_size'],
+                                                           params['clique_size'], data, params['model_type'], params['task'])
+            print("Test loss: %0.3f" % test_loss)
+            if params['task'] == 'score_pred':
+                print("Test clique corr: %0.5f" % (test_clique_accuracy))
+            else:
+                print("Test clique accuracy: %0.2f%%" % (test_clique_accuracy * 100))
+            doc_accuracy, test_precision, test_recall, test_f05 = eval_doc_cliques(model, test_docs, data, params)
+            if params['task'] == 'score_pred':
+                print("Test document corr: %0.5f" % (doc_accuracy))
+            elif params['task'] == 'minority':
+                print("Test F0.5: %0.2f Precision: %0.2f Recall: %0.2f" % (test_f05, test_precision, test_recall))
+            else:
+                print("Test document ranking accuracy: %0.2f%%" % (doc_accuracy * 100))
+            test_accuracy = doc_accuracy
+        if params['task'] == 'minority':
+            if test_f05 > best_test_acc:
+                best_test_acc = test_f05
+                # save best model
+                torch.save(model.state_dict(), params['model_dir'] + '/' + params['model_name'] + '_best')
+                print('saved model ' + params['model_dir'] + '/' + params['model_name'] + '_best')
+        else:
+            if test_accuracy > best_test_acc:
+                best_test_acc = test_accuracy
+                # save best model
+                torch.save(model.state_dict(), params['model_dir'] + '/' + params['model_name'] + '_best')
+                print('saved model ' + params['model_dir'] + '/' + params['model_name'] + '_best')
+        print()
+    return best_test_acc
+
+
+def test(params, test_docs, data, model):
+    if params['model_type'] == 'clique':
+        test_data, test_labels = data.create_cliques(test_docs, params['task'])
+    elif params['model_type'] == 'sent_avg':
+        test_data, test_labels, test_ids = data.create_doc_sents(test_docs, 'sentence', params['task'], params['train_data_limit'])
+    elif params['model_type'] == 'par_seq':
+        test_data, test_labels, test_ids = data.create_doc_sents(test_docs, 'paragraph', params['task'], params['train_data_limit'])
+
+    if USE_CUDA:
+        model.cuda()
+    loss_fn = torch.nn.CrossEntropyLoss()
+    # output_name = params['model_name'] + '_test'
+    if params['model_type'] == 'par_seq' or params['model_type'] == 'sent_avg':
+        test_accuracy, test_loss = eval_docs(model, loss_fn, test_data, test_labels, data, params)
+        print("Test accuracy: %0.2f%%" % (test_accuracy * 100))
+    elif params['model_type'] == 'clique':
+        # eval_doc_cliques returns (accuracy, precision, recall, f05); only accuracy is reported here
+        doc_accuracy, _, _, _ = eval_doc_cliques(model, test_docs, data, params)
+        print("Test document ranking accuracy: %0.2f%%" % (doc_accuracy * 100))
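
A note on the entity-grid features above: the vector written by extract_features_from_grid.py is, at its core, the relative frequency of each role subsequence of a fixed length over the rows of an entity grid ('s' subject, 'o' object, 'x' other mention, '-' absent), optionally split by a salience threshold and with s/o collapsed to x when syntax is off. The snippet below is a minimal, self-contained sketch of that core computation on a toy grid; the grid values and the transition_probs helper are illustrative only and not part of this patch, and the salience split, syntax option, and "<"/">" boundary markers are deliberately omitted.

from itertools import product

# Toy grid: one role string per entity, one character per sentence.
grid_rows = ["s-o", "xx-", "-so"]

def transition_probs(rows, seq_len=2, labels=("s", "o", "x", "-")):
    # Count every contiguous role subsequence of length seq_len across all
    # rows and normalize by the total number of such subsequences.
    total = sum(len(row) - seq_len + 1 for row in rows if len(row) >= seq_len)
    probs = {}
    for seq in product(labels, repeat=seq_len):
        pattern = "".join(seq)
        count = sum(row[i:i + seq_len] == pattern
                    for row in rows
                    for i in range(len(row) - seq_len + 1))
        probs[pattern] = count / total if total else 0.0
    return probs

print(transition_probs(grid_rows)["s-"])  # 1/6: one "s-" among the six length-2 subsequences

With seq_len and the role alphabet left as parameters, the same counting generalizes to the longer subsequences, reduced x/- alphabet, and boundary-marked variants that the script iterates over.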