"""
Usage:
python active_learning_curate.py curated_filename unlabeled_filename save_to_filename batch_size sample_size
PARAMETERS
curated_filename: name of csv file with curated data
unlabeled_filename: name of csv file with curated data
save_to_filename: filename to save new curated data to
batch_size: number of tweets to curate before retraining
smaller number improves faster but is more computionally expensive
sample_size: number of samples to take from unlabeled pool when
picking next tweet to curate. bigger improves faster but
is more expensive
DESCRIPTION
Train a multinomial naive bayes classifier on tweets with active learning, using uncertainty sampling.
"""
import re
import sys
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
## emoticons
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
## words
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:@[\w_]+)', # @-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
## compile regex
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(string):
return tokens_re.findall(string)
def removable(token):
    # drop emoticons and bare punctuation tokens before vectorizing
    isEmoticon = bool(emoticon_re.search(token))
    isPunctuation = token in [',', '.', ':', ';']
    return isEmoticon or isPunctuation
# pre-processor: tokenize, optionally lowercase, and drop removable tokens
def pre_process(string, lowercase=False):
    tokens = tokenize(string)
    if lowercase:
        tokens = [token.lower() for token in tokens]
    return [token for token in tokens if not removable(token)]
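# Illustrative example (not executed): pre_process("@user check http://example.com :)")
# returns ['@user', 'check', 'http://example.com']; the trailing emoticon is
# dropped by removable().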
def load_curated_tweets(csv_file, encoding='latin-1'):
    df_curated = pd.read_csv(csv_file, encoding=encoding)
    return df_curated[['id', 'tweet', 'class']]
def load_unlabeled_tweets(csv_file, encoding='latin-1'):
    df_unlabeled = pd.read_csv(csv_file, encoding=encoding)
    return df_unlabeled[['id', 'tweet']]
def build_classifier(df_curated, df_all):
    # fit the vocabulary on the full tweet pool passed as df_all so that later
    # transforms of sampled tweets share a consistent feature space
    vec = CountVectorizer(tokenizer=pre_process)
    vec.fit(df_all.tweet)
    bagofwords = vec.transform(df_curated.tweet)
    clf = MultinomialNB().fit(bagofwords, df_curated['class'])
    return vec, clf
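# Comment-only sketch of how the pair is used downstream (assuming a toy string):
#   vec, clf = build_classifier(df_curated, df_all)
#   log_probs = clf.predict_log_proba(vec.transform(["some new tweet"]))
# log_probs then holds one row of per-class log-probabilities for that tweet.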
def update_classifier(vec, clf, df_new_curated):
    # incrementally update the classifier with the newly curated batch;
    # partial_fit assumes the class labels were already seen in the initial fit
    bagofwords = vec.transform(df_new_curated.tweet)
    clf = clf.partial_fit(bagofwords, df_new_curated['class'])
    return clf
def pick_uncertain_samples(sample_size, batch_size, vec, clf, df_unlabeled):
    # take a random sample of the unlabeled pool
    df_unlabeled = df_unlabeled.reindex(np.random.permutation(df_unlabeled.index))
    df_sample = df_unlabeled.iloc[:sample_size].copy()
    sample = vec.transform(df_sample.tweet)
    # score uncertainty as the sum of per-class log-probabilities: a confident
    # prediction has one log-probability near zero and very negative values
    # elsewhere, so its sum is far lower than that of an uncertain prediction
    uncert_score = clf.predict_log_proba(sample).sum(axis=1)
    # partition so the batch_size highest (most uncertain) scores come first
    i = np.argpartition(-uncert_score, batch_size)
    df_sample = df_sample.iloc[i]
    df_picked_sample = df_sample.iloc[:batch_size]
    return df_picked_sample
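# Worked example of the scoring heuristic (illustrative numbers only): for a
# binary problem, a confident tweet with log-probabilities [-0.0001, -9.2]
# scores about -9.2, while an ambiguous tweet with [-0.69, -0.69] scores about
# -1.39, so the ambiguous tweet sorts to the front and is curated first.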
def curate_tweets(sample_df, curated_df, fn_out, i_end):
    assert 'uncert_score' not in sample_df.columns
    assert 'class' not in sample_df.columns
    sample_df['class'] = np.nan
    print("[0-9] for classes, b for back, s for save, x for exit")
    i = 0
    while i < i_end:
        _id = sample_df.index[i]
        print(sample_df['tweet'].iloc[i])
        resp = input("class --> ")
        if resp == "x":
            tosave = input("Save? (y/n): ")
            if tosave.lower() in ("y", "yes"):
                pd.concat([curated_df, sample_df.dropna(subset=['class'])]).to_csv(fn_out + '.csv')
            return None, None
        elif resp == 'b':
            # step back to the previous tweet and clear its label
            i = max(i - 1, 0)
            sample_df.loc[sample_df.index[i], 'class'] = np.nan
        elif resp == 's':
            pd.concat([curated_df, sample_df.dropna(subset=['class'])]).to_csv(fn_out + '.csv')
        elif resp.isdigit() and 0 <= int(resp) <= 9:
            sample_df.loc[_id, 'class'] = int(resp)
            i += 1
        else:
            print('*** INVALID ENTRY, TRY AGAIN ***')
    # merge the freshly labeled batch into the curated set
    curated_df = pd.concat([curated_df, sample_df])
    return sample_df, curated_df
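# Example curation exchange (hypothetical transcript): after the legend is
# printed, each tweet is shown and "class --> 2" labels it as class 2 and moves
# on; 'b' revisits the previous tweet, 's' writes the curated set to disk, and
# 'x' exits with an optional save.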
def main(argv):
    try:
        curated_csv = argv[1]
        unlabeled_csv = argv[2]
        new_curated_fn = argv[3]
        batch_size = int(argv[4])
        sample_size = int(argv[5])
    except (IndexError, ValueError):
        print('Missing or invalid command line argument; see the module docstring for usage.')
        sys.exit(1)
    unlabeled_df = load_unlabeled_tweets(unlabeled_csv)
    curated_df = load_curated_tweets(curated_csv)
    vec, clf = build_classifier(curated_df, unlabeled_df)
    while True:
        sample_df = pick_uncertain_samples(sample_size, batch_size, vec, clf, unlabeled_df)
        picked_df, curated_df = curate_tweets(sample_df, curated_df, new_curated_fn, batch_size)
        if picked_df is None:
            break
        print(picked_df)
        # note: curated tweets are not removed from unlabeled_df here, so a tweet
        # may occasionally be offered again in a later batch
        clf = update_classifier(vec, clf, picked_df)
if __name__ == '__main__':
    main(sys.argv)