-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_dictionary.py
35 lines (27 loc) · 1.24 KB
/
build_dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import csv
import os
import pandas as pd
import scipy.stats
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
'''
path1 = "/home/saurav/Documents/nlp_intern/scisumm-corpus-master/data/Development-Set-Apr8"
path2 = "/home/saurav/Documents/nlp_intern/scisumm-corpus-master/data/Test-Set-2016"
for folder in os.listdir(path):
if os.path.isdir(os.path.join(path, folder)):
csv_file = os.path.join(path, folder)+".csv"
csv_file = csv_file.replace('\\','/')
print("processing: ", csv_file)
dataframe = pd.read_csv(csv_file, header = None, delimiter = "\t")
a = dataframe[0]
b = dataframe[1]
label = dataframe[2]
feature = [(findFeature(x,y),labels) for x,y,labels in zip(a,b,label)]
featureSets = featureSets + feature
'''
a = "named entity recognizer (NER) useful many NLP applications information extraction, question answering, etc. own, NER also provide users looking person organization names quick information."
b = "automatically derived based correlation metric value used (Chieu Ng, 2002a)."
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
a = a.split()
X = vectorizer.fit_transform(a)
chi2score = chi2(X)