data.py
###############################################################################
# Author: Md Rizwan Parvez
# Project: LanModeledProgramGeneration
# Date Created: 4/1/2017
# Much of this code is adapted from Wasi Ahmad's data.py
# File Description: This script defines the corpus, the examples it
# contains, and the dictionary (vocabulary).
###############################################################################
from nltk.tokenize import word_tokenize
import numpy as np
import json, os, torch
import util

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        # Create and store the four special tokens
        self.start_token = '<SOS>'
        self.end_token = '<EOS>'
        self.unknown_token = '<UNKNOWN>'
        self.pad_token = '<PAD>'
        self.idx2word.append(self.pad_token)
        self.word2idx[self.pad_token] = len(self.idx2word) - 1
        self.padding_id = self.word2idx[self.pad_token]
        self.idx2word.append(self.start_token)
        self.word2idx[self.start_token] = len(self.idx2word) - 1
        self.idx2word.append(self.end_token)
        self.word2idx[self.end_token] = len(self.idx2word) - 1
        self.idx2word.append(self.unknown_token)
        self.word2idx[self.unknown_token] = len(self.idx2word) - 1

    def add_word(self, word):
        # word = word.lower()
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def contains(self, word):
        word = word.lower()
        return word in self.word2idx

    def __len__(self):
        return len(self.idx2word)
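
# Usage sketch: the four special tokens occupy ids 0-3 at construction time,
# so the first word added afterwards receives id 4.
#
#   >>> d = Dictionary()
#   >>> d.padding_id
#   0
#   >>> d.add_word('print')
#   4
#   >>> len(d)
#   5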

class Instance(object):
    def __init__(self):
        self.sentence1 = []
        self.target = []

    def add_sentence(self, sentence, dictionary, is_test_instance=False):
        #### fix this
        words = [dictionary.start_token] + word_tokenize(util.sepearte_operator(sentence.lower())) + [dictionary.end_token]
        # removes <, > : drops two tokens near the start and two near the end
        # of the token list
        words.pop(1)
        words.pop(2)
        words.pop(len(words) - 2)
        words.pop(len(words) - 3)
        if is_test_instance:
            # Replace out-of-vocabulary words with the unknown token
            for i in range(len(words)):
                if not dictionary.contains(words[i].lower()):
                    words[i] = dictionary.unknown_token
        else:
            # Grow the vocabulary while reading training/validation data
            for word in words:
                dictionary.add_word(word.lower())
        # Keep all but the last token as the input sequence
        self.sentence1 = words[:-1]
        # self.target = words[1:]

class Corpus(object):
    def __init2__(self, args):
        # Alternate constructor (not a special method, so it is never
        # called automatically).
        path = args.data_path
        self.dictionary = Dictionary()
        self.max_sent_length = 0
        #### fix this
        self.train = self.parse(os.path.join(path, args.train_data))
        self.valid = self.parse(os.path.join(path, args.valid_data))
        self.test = self.parse(os.path.join(path, args.test_data), True)

    def parse(self, path, is_test_instance=False):
        """Parses the content of a file."""
        assert os.path.exists(path)
        samples = []
        with open(path, 'r') as f:
            for line in f:
                instance = Instance()
                if is_test_instance:
                    instance.add_sentence(line, self.dictionary, is_test_instance)
                else:
                    instance.add_sentence(line, self.dictionary)
                if self.max_sent_length < len(instance.sentence1):
                    self.max_sent_length = len(instance.sentence1)
                samples.append(instance)
        return samples
    ### from example
    def __init__(self, args):
        path = args.data_path
        self.dictionary = Dictionary()
        self.max_sent_length = 0
        self.max_length = args.max_length
        self.train_data, self.train_label = self.tokenize(os.path.join(path, args.train_data))
        self.valid_data, self.valid_label = self.tokenize(os.path.join(path, args.valid_data))
        self.test_data, self.test_label = self.tokenize(os.path.join(path, args.test_data))
    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # First pass: add words to the dictionary and track sentence lengths
        with open(path, 'r') as f:
            tokens = 0
            lines_c = 0
            for line in f:
                words = ['<start>'] + line.split() + ['<eos>']
                len_ = len(words)
                tokens += len_
                if self.max_sent_length < len_:
                    self.max_sent_length = len_
                lines_c += 1
                for word in words:
                    self.dictionary.add_word(word)
        # Second pass: convert the file content to id sequences
        with open(path, 'r') as f:
            # print('Creating tensor of size: ', lines_c, self.max_sent_length)
            print('Reading files: ', path)
            ids = []          # torch.LongTensor(lines_c, self.max_sent_length); inputs = all tokens but the last
            target_vecs = []  # torch.LongTensor(lines_c, self.max_sent_length); targets = all tokens but the first
            line_c = 0
            count = 0
            for line in f:
                words = ['<start>'] + line.split() + ['<eos>']
                sentence_len = len(words)
                if sentence_len > self.max_length:
                    # print("sen len: ", sentence_len, ' exceed limit: ', self.max_length, ' skipped!!', count)
                    count += 1
                    continue
                ids.append([])
                target_vecs.append([])
                # if(self.max_sent_length < sentence_len): self.max_sent_length = sentence_len
                token = 0
                for word in words:
                    if token < sentence_len - 1:
                        ids[line_c].append(self.dictionary.word2idx[word])
                    if token > 0:
                        target_vecs[line_c].append(self.dictionary.word2idx[word])
                    token += 1
                line_c += 1
        return ids, target_vecs
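

# Minimal usage sketch (illustrative): the argument names mirror what
# Corpus.__init__ reads (data_path, train_data, valid_data, test_data,
# max_length). The default file names below are placeholders and assume
# plain-text files under data_path with one whitespace-tokenized example
# per line.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Build a Corpus from text files.')
    parser.add_argument('--data_path', default='./data')
    parser.add_argument('--train_data', default='train.txt')
    parser.add_argument('--valid_data', default='valid.txt')
    parser.add_argument('--test_data', default='test.txt')
    parser.add_argument('--max_length', type=int, default=100)
    args = parser.parse_args()

    corpus = Corpus(args)
    print('vocabulary size:', len(corpus.dictionary))
    print('training examples kept:', len(corpus.train_data))
    print('longest sentence seen:', corpus.max_sent_length)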