-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathtfidf2.py
81 lines (71 loc) · 2.63 KB
/
tfidf2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
__author__ = 'prem'
import numpy
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re
def term_freq(k, wordlist):
    """
    Compute raw term frequencies for one sentence/review.

    :param k: identifier (e.g. review/sentence id), passed through unchanged
    :param wordlist: iterable of term strings
    :return: (k, counts) where counts maps each term to its occurrence
        count as a float
    """
    counts = dict()
    for token in wordlist:
        if token in counts:
            counts[token] += 1.0
        else:
            counts[token] = 1.0
    return k, counts
def idf(n, docfreq):
    """
    Compute the inverse document frequency: log10(n / docfreq).

    :param n: total number of documents
    :param docfreq: document frequency (positive scalar or numpy array)
    :return: IDF value(s) as a numpy float / array

    Bug fix: the previous implementation used numpy.reciprocal(docfreq),
    which performs *integer* reciprocal on integer input (reciprocal(10)
    is 0), so integer document frequencies produced log10(0) = -inf.
    true_divide always performs float division, giving the correct result
    for both integer and float inputs.
    """
    return numpy.log10(numpy.true_divide(n, docfreq))
def read_doc(line):
    """Split one tab-separated review record into per-sentence word lists.

    The review text (tab field 5) is split on '.' into sentences, and each
    sentence gets an id of the form "<review_id>_<index>".  Only sentences
    with between 11 and 29 space-separated tokens are kept (the
    10 < sent_len < 30 check below); within each kept sentence the words
    are lowercased, stopword-filtered, lemmatized, and words shorter than
    4 characters are dropped.

    :param line: one tab-separated review record; field 0 is the review id,
        field 5 the review text -- assumed input format, confirm against caller
    :return: list of (sentence_id, wordlist) tuples
    """
    lmtz = WordNetLemmatizer()
    sw = stopwords.words('english')
    review = line.split("\t")
    review_id = review[0]
    sentences = review[5].split(".")
    result = []
    for idx, sent in enumerate(sentences):
        sent_id = review_id + '_' + str(idx)
        sent_len = len(sent.split(" "))
        # keep only mid-length sentences: 11..29 space-separated tokens
        if 10 < sent_len < 30:
            # alphabetic runs only; digits/punctuation are discarded
            words = re.findall(r'[a-zA-Z]+', sent)
            words = [lmtz.lemmatize(w.lower()) for w in words if w.lower() not in sw]
            # drop words shorter than 4 characters
            words = [w for w in words if len(w) > 3]
            result.append((sent_id, words))
    return result
# This is used to keep the review_id and original sentences from reviews
def read_reviews(line):
    """Split one tab-separated review record into (sentence_id, sentence) pairs.

    The review text (tab field 5) is split on '.'; each resulting sentence
    is paired with an id of the form "<review_id>_<index>".

    :param line: one tab-separated review record
    :return: list of (sentence_id, sentence) tuples
    """
    fields = line.split("\t")
    rid = fields[0]
    return [(rid + '_' + str(i), s)
            for i, s in enumerate(fields[5].split("."))]
def extract_sentences(VT, reviews, columnheader, k=10, n=5):
    """
    Collect the top-n sentences for each of the first k concepts.

    :param VT: right singular matrix of the SVD (concepts x sentences)
    :param reviews: RDD-like object mapping sentence id -> sentence via lookup()
    :param columnheader: sentence ids aligned with VT's columns
    :param k: number of concepts (rows of VT) to use
    :param n: number of sentences per concept
    :return: list of k lists, each holding n lookup() results in
        descending-weight order
    """
    summaries = []
    for row in VT[:k, :]:
        # indices of the n largest entries of this concept, largest first
        top = row.argsort()[::-1][:n]
        summaries.append([reviews.lookup(columnheader[j]) for j in top])
    return summaries
def extract_keywords(VT, rowheader, k=10, n=5):
    """
    Collect the top-n labels for each of the first k concepts in VT.

    :param VT: matrix (concepts x terms)
    :param rowheader: term labels aligned with VT's columns
    :param k: number of concepts (rows of VT) to inspect
    :param n: number of keywords per concept
    :return: list of k lists of labels in descending-weight order
    """
    return [
        [rowheader[j] for j in row.argsort()[::-1][:n]]
        for row in VT[:k, :]
    ]