-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset.py
166 lines (128 loc) · 6.69 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
TODO:
1. Expand the dataset by randomizing professions by adding new datapoints !!!
2. Add noise to resemble google search items more
TODO: generally the dataset is not really good, lots of false positives and unlabeled data...
1. Try out another dataset WikiData
"""
import random
import re
# finding consecutive ints in a list
from itertools import groupby
from operator import itemgetter
# refactor and read
# Gather every row of the two dump files whose tag column mentions a
# profession. Each kept row is stored as [domain, tags, sentence].
dataset = []
for file in ('EWNERTC_TC_Fine Grained NER_No_NoiseReduction.DUMP',
             'EWNERTC_TC_Fine Grained NER_DomainDependent_NoiseReduction.DUMP'):
    with open(file, 'rb') as f:
        for line in f:
            # decode the raw bytes and drop any CR / LF characters in one go
            line = line.decode("utf-8").replace('\r', '').replace('\n', '')
            # every row must consist of exactly three tab-separated fields
            domain, tags, sentence = line.split('\t')
            if 'profession' not in tags:
                continue
            dataset.append([domain, tags, sentence])
# read and clean additional professions
# The titles file holds one title per line. Entries written with a comma
# (e.g. "manager, general") are flipped into natural order ("general manager").
# Every title ends up lower-cased.
with open('dataset/titles_combined.txt') as titles_file:
    # the context manager closes the handle instead of leaking it
    other_titles = titles_file.read().splitlines()

# largest number of ', '-separated parts seen in any comma-containing title
max_parts = 0
for idx, title in enumerate(other_titles):
    if ',' in title:
        title_parts = title.split(', ')
        other_titles[idx] = ' '.join(reversed(title_parts)).lower()
        max_parts = max(max_parts, len(title_parts))
        # NOTE(review): indentation was lost in the reviewed copy; this assumes
        # only the re-ordered titles are echoed -- confirm against the original.
        print(other_titles[idx])

# lower-case the remaining titles (already-flipped ones are unaffected)
other_titles = [title.lower() for title in other_titles]
# refactor and write
# Shuffle the dataset, cut it into 70 % / 15 % / 15 % slices and write each
# slice to dataset/<name>.txt in a three-column, one-token-per-line format.
# Every sentence is written twice: once as-is (or, for some training
# sentences, with an extra random profession appended) and once with all of
# its professions swapped for random titles.
# NOTE(review): indentation was lost in the reviewed copy; this assumes the
# "replace all professions" pass runs for every sentence of every split --
# confirm against the original file.


def _write_sentence(f, words, tags):
    """Write one sentence to *f*, one token per line, then a blank line.

    Each token line is ``word<TAB>P<TAB>label`` where ``label`` is ``P`` when
    the token's tag contains 'profession' and ``O`` otherwise.
    Column layout follows
    https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md
    """
    for tag, word in zip(tags.split(' '), words.split(' ')):
        label = 'P' if 'profession' in tag else 'O'
        f.write(word + '\t' + 'P' + '\t' + label + '\n')
    # A bare blank line ends the sentence. The previous code wrote the last
    # token line a second time here, duplicating the final token.
    f.write('\n')


def _append_random_profession(words, tags, titles):
    """Return (words, tags) with one random title added after the last profession.

    A connector token (',' or 'and', tagged 'O') is inserted first, followed
    by the words of a title drawn from *titles* (hyphens turned into spaces),
    each tagged 'randomAdd-profession'.
    Raises IndexError when *tags* contains no profession tag.
    """
    words_list = words.split(' ')
    tags_list = tags.split(' ')
    # pick last one because else can insert in the middle of two word profession
    target_index = [idx for idx, s in enumerate(tags_list) if 'profession' in s][-1] + 1
    random_title = random.sample(titles, 1)[0]
    connector = random.choice([',', 'and'])
    title_parts = random_title.replace('-', ' ').lower().split(' ')
    new_words_list = (words_list[:target_index] + [connector] + title_parts
                      + words_list[target_index:])
    new_tags_list = (tags_list[:target_index] + ['O']
                     + ['randomAdd-profession'] * len(title_parts)
                     + tags_list[target_index:])
    return ' '.join(new_words_list), ' '.join(new_tags_list)


def _replace_professions(words, tags, titles):
    """Return (words, tags) with every profession span swapped for a random title.

    A span is a run of consecutive tokens whose tag contains 'profession'.
    Each span is replaced, at the position it occupied, by the words of a
    title drawn from *titles*, each tagged 'expandRandom-profession'.
    """
    words_list = words.split(' ')
    tags_list = tags.split(' ')
    targets_indices = [idx for idx, s in enumerate(tags_list) if 'profession' in s]
    # find consecutive tags of professions: within a run of consecutive
    # indices, (position - index) stays constant
    profession_groups = [
        list(map(itemgetter(1), g))
        for _, g in groupby(enumerate(targets_indices), lambda ix: ix[0] - ix[1])
    ]
    # strip the sentence of the professions (will add in the next step)
    # this way it is simpler, due to different lengths of professions
    # e.g. might strip off profession of 3 words and add profession only one word
    profession_idx = set(targets_indices)
    new_words_list = [w for j, w in enumerate(words_list) if j not in profession_idx]
    new_tags_list = [t for j, t in enumerate(tags_list) if j not in profession_idx]
    # Insert position in the stripped list = original start of the span,
    # minus profession tokens removed before it, plus title words already
    # inserted. The earlier formula (start + 1 - len(span)) misplaced the
    # title for any multi-word profession.
    removed = 0
    inserted = 0
    for group in profession_groups:
        idx = group[0] - removed + inserted
        random_title = random.sample(titles, 1)[0].split(' ')
        n_words = len(random_title)
        new_words_list[idx:idx] = random_title
        new_tags_list[idx:idx] = ['expandRandom-profession'] * n_words
        removed += len(group)
        inserted += n_words
    return ' '.join(new_words_list), ' '.join(new_tags_list)


splits = [0, int(len(dataset) * 0.70)], \
         [int(len(dataset) * 0.70), int(len(dataset) * 0.85)], \
         [int(len(dataset) * 0.85), int(len(dataset) * 1)]
random.shuffle(dataset)
for dataset_name, split in zip(['train', 'test', 'dev'], splits):
    # mode 'w' truncates any previous content, so no separate "clear" step is needed
    with open('dataset/{}.txt'.format(dataset_name), 'w') as f:
        for sentence in dataset[split[0]:split[1]]:
            _, tags, words = sentence
            # -------------------------------------------- #
            # random modification by concatenating more professions in a sentence
            # (randint is inclusive on both ends, so this fires for 30 of 101 values)
            add_random = random.randint(0, 100) < 30
            if add_random and dataset_name == 'train':
                words, tags = _append_random_profession(words, tags, other_titles)
            _write_sentence(f, words, tags)
            # -------------------------------------------- #
            # replace all existing professions with new ones
            words, tags = _replace_professions(words, tags, other_titles)
            _write_sentence(f, words, tags)
# TODO: I-person_profession vs B-person_profession vs B-profession_specialization_of etc
# Print every dataset row whose tag column mentions a profession, then list
# the distinct profession tags that occur, to compare the tag variants.
unique_set = set()
for item in dataset:
    if 'profession' in item[1]:
        print(item)
        # item[1] is the space-separated tag column; collect its tags so the
        # listing below has something to show (the set was never filled before)
        unique_set.update(item[1].split(' '))
# sorted() gives a stable order; plain set iteration order is arbitrary
for tag in sorted(unique_set):
    if 'profession' in tag:
        print(tag)
# TODO: check inputs one by one
# Spot check: echo every line of the written training file that mentions
# "analyst" in any letter case.
with open('dataset/train.txt', 'r') as f:
    for line in f:
        if 'analyst' not in line.lower():
            continue
        print(line)