-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfiltering.py
33 lines (25 loc) · 1.24 KB
/
filtering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import json
import dask.dataframe as dd
from nltk.stem.snowball import SnowballStemmer
from extraction import extract_word
stemmer = SnowballStemmer("english")


def stem(concept):
    """Return the English Snowball stem of the word extracted from *concept*.

    *concept* is expected to be a ConceptNet concept URI; `extract_word`
    pulls the surface word out of it before stemming.
    """
    return stemmer.stem(extract_word(concept))
def filter(cn_path='conceptnet-assertions-5.7.0.csv'):
    """Filter a raw ConceptNet assertions dump down to useful English edges.

    Pipeline (progress is printed to stdout):
      1. Keep only rows whose 'from' and 'to' concepts start with '/c/en/'.
      2. Keep rows whose JSON 'extra' metadata has weight >= 1.
      3. Drop rows whose two endpoints stem to the same word (trivial edges,
         e.g. singular/plural pairs).
      4. Write the result to 'conceptnet_en_filtered.csv'.
      5. Take the 16 most frequent relations, drop Synonym/Antonym/DerivedFrom
         (13 remain), and write the matching rows to
         'conceptnet_en_filtered_13.csv'.

    Args:
        cn_path: path to the tab-separated ConceptNet assertions CSV.

    Side effects:
        Writes two CSV files to the current working directory and prints
        progress messages plus the derived special-token list.

    NOTE(review): the name shadows the builtin `filter`; kept unchanged so
    existing callers keep working.
    """
    kb = dd.read_csv(cn_path, sep='\t',
                     names=['URI', 'relation', 'from', 'to', 'extra'])
    # Dask builds this filter lazily; compute() below materializes it.
    result = kb[kb['from'].str.startswith('/c/en/') & kb['to'].str.startswith('/c/en/')]
    print("Filtering only english")
    en_kb = result.compute()
    print("Filtering by weight")
    # 'extra' holds per-edge JSON metadata; pull out the confidence weight.
    en_kb['weight'] = en_kb['extra'].apply(lambda x: json.loads(x)['weight'])
    en_kb = en_kb[en_kb['weight'] >= 1]
    print("Stemming")
    # Drop edges whose endpoints reduce to the same stem.
    from_stems = en_kb['from'].apply(stem)
    to_stems = en_kb['to'].apply(stem)
    en_kb = en_kb[from_stems != to_stems]
    en_kb.to_csv('conceptnet_en_filtered.csv', index=False)
    # Relation frequency as percentage strings; keep the 16 most common, then
    # remove three lexical relations, leaving the 13 the output name refers to.
    relations = (en_kb.relation.value_counts(normalize=True).mul(100).round(1).astype(str) + '%').head(16)
    relations = relations[~relations.index.isin(['/r/Synonym', '/r/Antonym', '/r/DerivedFrom'])].index.tolist()
    # BUG FIX: original code was `print(spec_tokens = [...])`, which passes
    # `spec_tokens` as a keyword argument to print() and raises TypeError.
    spec_tokens = [f'__{r[3:]}__' for r in relations]  # '/r/X' -> '__X__'
    print(spec_tokens)
    en_kb[en_kb.relation.isin(relations)].to_csv('conceptnet_en_filtered_13.csv', index=False)