data_match.py
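# data_match.py: cluster news headlines stored in the "huginn-events"
# CouchDB database by bag-of-words overlap. Headlines are POS-tagged with
# the Stanford tagger, lemmatized (nouns and verbs) or Porter-stemmed, and
# counted into word bags; headlines whose bags share enough words are
# merged into one cluster. Written for Python 2 (string.maketrans,
# string.uppercase, print statement).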
import couchdb, string, nltk, os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tag import StanfordPOSTagger
from nltk.corpus import sentiwordnet as swn
db = couchdb.Server()['huginn-events']
stop = stopwords.words('english')
wnl = WordNetLemmatizer().lemmatize
stem = PorterStemmer().stem
os.environ['CLASSPATH'] = ':'.join(["/usr/share/stanford-ner-2015-12-09/",
                                    "/usr/share/stanford-postagger-2015-12-09/",
                                    "/usr/share/stanford-parser-full-2015-12-09/"])
os.environ['STANFORD_MODELS'] = ':'.join(["/usr/share/stanford-ner-2015-12-09/classifiers",
                                          "/usr/share/stanford-postagger-2015-12-09/models",
                                          "/usr/share/stanford-parser-full-2015-12-09/"])
NewsKeys = [
    "last_updated"
]
ScannerKeys = [
    "time",
    "address"
]
table = []
def parse_event(e):
    # Every parsed document starts as its own one-headline cluster.
    table.append({
        "original": [e['orig']],
        "bag": e['text'],
        "id": [e['id']],
        "date": e['date'],
    })
def bag_compare(b1, b2, limit=3):
    # Two word bags match when they share more than `limit` distinct words:
    # the union of their keys is then smaller than the sum of their sizes
    # by more than `limit`.
    l1 = len(set(list(iter(b1)) + list(iter(b2))))
    l2 = len(b2) + len(b1)
    return l1 < l2 - limit
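# For example, bags keyed {a, b, c, d} and {a, b, c, d, e} share four
# distinct words, so with the default limit of 3 they match: the union has
# 5 keys, which is less than 4 + 5 - 3 = 6.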
def table_parse():
    # Fold every matching row's id and headline into i1's cluster.
    for i1 in table:
        for i2 in table:
            if i1 is i2:
                continue  # skip self-comparison, which would match trivially
            if bag_compare(i1['bag'], i2['bag']):
                i1['id'].append(i2['id'][0])
                i1['original'].append(i2['original'][0])
def pretty_table(t):
    # Keep only clusters that matched more than `minimum` documents.
    o = []
    minimum = 3
    for row in t:
        if len(row['id']) > minimum:
            o.append({
                "headlines": row['original'],
                "bags": row['bag'],
                "ids": row['id']
            })
    return o
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
def destem_stanford(txt):
    tag = st.tag(txt)
    o = []
    sentiment = 0  # placeholder: the sentiwordnet lookup below is disabled
    for word in tag:
        pos = word[1][:1]
        if pos == "N" or pos == "V":
            # Lemmatize nouns and verbs with WordNet...
            w = wnl(word[0], pos=pos.lower())
            o.append(w)
            #synset=list(swn.senti_synsets(word, "v"))[0]
        else:
            # ...and fall back to Porter stemming for every other tag.
            o.append(stem(word[0]))
    return (o, sentiment)
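# For example, "threw" (tagged VBD) lemmatizes to "throw", while "quickly"
# (tagged RB) falls through to the Porter stemmer and becomes "quickli".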
def vectorize(l):
    # Build a word -> count bag from a token list.
    k = set(l)
    o = {}
    for item in k:
        o[item] = 0
    for item in l:
        o[item] += 1
    return o
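# For example, vectorize(["fire", "fire", "oak"]) returns {"fire": 2, "oak": 1}.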
def clean_text(txt, id, date):
    # Replace punctuation with spaces, drop stopwords, then normalize the
    # remaining words before bagging them.
    tbl = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
    txt = string.translate(txt.encode('ascii'), tbl)
    rem = [i for i in txt.lower().split() if i not in stop]
    de = destem_stanford(rem)
    return {"orig": txt, "text": vectorize(de[0]), "id": id, "date": date, "sentiment": de[1]}
def clean_scanner(txt):
    # Drop the leading token, then trim the trailing three characters if the
    # remainder starts uppercase, otherwise the first and last two.
    x = ' '.join(txt.strip().split(' ')[1:])
    if x[0] in string.uppercase:
        return x[:-3]
    else:
        return x[1:-2]
def data():
    for row in db.view('_all_docs'):
        doc = db[row.id]
        doc_keys = list(iter(doc))
        # News documents carry "last_updated"; scanner documents carry
        # "time" and "address".
        if len(set(doc_keys + NewsKeys)) == len(doc_keys):
            parse_event(clean_text(doc['title'], doc.id, doc['date_published']))
        elif len(set(doc_keys + ScannerKeys)) == len(doc_keys):
            pass  # scanner events are not handled yet
def handle(dat):
    for row in dat:
        doc = db[row['id']]
        doc_keys = list(iter(doc))
        if len(set(doc_keys + NewsKeys)) == len(doc_keys):
            parse_event(clean_text(doc['title'], doc.id, doc['date_published']))
        elif len(set(doc_keys + ScannerKeys)) == len(doc_keys):
            pass  # scanner events are not handled yet
    return table
if __name__ == "__main__":
    data()
    table_parse()
    print pretty_table(table)
#clean_text("He threw the thrown object")
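# A minimal sketch of driving handle() directly (the ids below are
# hypothetical; each input row only needs an "id" key naming a CouchDB doc):
#
#   rows = [{"id": "3fa2c0"}, {"id": "9b17de"}]
#   print pretty_table(handle(rows))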