"""
This module is a template containing basic utilities to read raw dataset
and generate appropriately processed files.
"""
import os
import sys
import json
import random
from tqdm import tqdm
from pprint import pprint


def read_dataset(data_path):
    """
    Reads the raw dataset and returns a list of (line, label) tuples,
    where label is 1 for positive lines and 0 for negative ones.
    """
    # os.path can be used for seamless path construction across different
    # operating systems.
    with open(os.path.join(data_path, 'pos.txt')) as f:
        pos_lines = f.readlines()
    with open(os.path.join(data_path, 'neg.txt')) as f:
        neg_lines = f.readlines()
    all_lines = pos_lines + neg_lines
    return list(zip(all_lines, [1] * len(pos_lines) + [0] * len(neg_lines)))
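
# A minimal usage sketch for read_dataset. The path and file contents below
# are illustrative; actual output depends on your pos.txt and neg.txt.
# Positive lines come first because the lists are concatenated before zipping:
#
#   >>> read_dataset('data/raw')
#   [('a great movie\n', 1), ('loved it\n', 1), ..., ('terrible acting\n', 0), ...]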


def main(data_path):
    """
    Reads the raw dataset from data_path, creates a vocab dictionary, and
    "tokenizes" the sentences in the dataset.
    NOTE: In our case tokenization simply refers to splitting a sentence at ' '
    """
    # Read raw data
    all_lines = read_dataset(data_path)
    total_lines = len(all_lines)
    vocab = {}
    csv_data = ''
    train_data = ''
    val_data = ''
    test_data = ''
    labels_data = ''
    # 80/10/10 train/val/test split
    train_size = int(0.8 * total_lines)
    val_size = int(0.1 * total_lines)
    # shuffle so positive and negative examples are mixed across the splits
    random.shuffle(all_lines)
    # pass total= so tqdm can show progress for the enumerate generator
    for idx, line in tqdm(enumerate(all_lines), total=total_lines):
        sentence = line[0].strip().split()
        label = line[1]
        # construct the entry to be added to the csv files
        csv_line = '{}\n'.format(','.join(sentence))
        csv_data += csv_line
        # label order matches tokenized.csv
        labels_data += '{}\n'.format(label)
        # decide whether to add the entry to train/val/test set based on idx
        if idx < train_size:
            train_data += csv_line
        elif idx < train_size + val_size:
            val_data += csv_line
        else:
            test_data += csv_line
        # construct vocab dictionary (word -> lower-cased occurrence count)
        for word in sentence:
            word = word.lower()
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1
    # save processed files (create the output directory if needed)
    os.makedirs('data/processed', exist_ok=True)
    with open('data/processed/tokenized.csv', 'w') as f:
        f.write(csv_data)
    with open('data/processed/train.csv', 'w') as f:
        f.write(train_data)
    with open('data/processed/val.csv', 'w') as f:
        f.write(val_data)
    with open('data/processed/test.csv', 'w') as f:
        f.write(test_data)
    with open('data/processed/labels.csv', 'w') as f:
        f.write(labels_data)
    with open('data/processed/vocab.json', 'w') as f:
        json.dump(vocab, f)


if __name__ == '__main__':
    # use a command-line argument to input the raw data path,
    # e.g. python py_tut.py <path-to-raw-data>
    if len(sys.argv) < 2:
        sys.exit('usage: python py_tut.py <path-to-raw-data>')
    main(sys.argv[1])
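
# A minimal sketch of consuming the processed files downstream, assuming the
# script above has already been run (paths match the open() calls in main):
#
#   import json
#   with open('data/processed/vocab.json') as f:
#       vocab = json.load(f)
#   with open('data/processed/train.csv') as f:
#       train_sentences = [line.strip().split(',') for line in f]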