indexer.py
"""
Indexer read each text file in documents directory and create Inverted Index and save it in a Json file
Also it calculate idf for each terms and save it in a file that named terms_idf
"""
import os
import math
import json

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Requires the NLTK 'stopwords', 'punkt' and 'wordnet' data to be downloaded
en_stops = set(stopwords.words('english'))
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Calculate total number of documents
docs_list = os.listdir('documents')
total_documents = len(docs_list)

# Build a dictionary of term frequencies per document
frequency_matrix = {}
for doc_name in docs_list:
    path = os.path.join('documents', doc_name)
    with open(path) as file:
        data = file.read().lower()
    doc_tokens = nltk.word_tokenize(data)
    doc_tokens = [word for word in doc_tokens if word.isalnum()]
    freq_table = {}
    for doc_token in doc_tokens:
        if doc_token not in en_stops:
            word = porter.stem(doc_token)
            word = wordnet_lemmatizer.lemmatize(word)
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
    frequency_matrix[doc_name] = freq_table
# Create tf_matrix: term frequency normalised by the total number of terms in the document
tf_matrix = {}
for doc, f_table in frequency_matrix.items():
    tf_table = {}
    count_words_in_document = sum(f_table.values())
    for word, count in f_table.items():
        tf_table[word] = count / count_words_in_document
    tf_matrix[doc] = tf_table
# Calculate idf for each word: idf = log10(total_documents / document_frequency)
idf_dic = {}
for doc, f_table in frequency_matrix.items():
    for word, count in f_table.items():
        if word in idf_dic:
            idf_dic[word] += 1
        else:
            idf_dic[word] = 1
for word, freq in idf_dic.items():
    idf_dic[word] = math.log10(total_documents / freq)
# Build the inverted index: word -> {document: tf-idf weight}
inverted_index = {}
for doc, tf_table in tf_matrix.items():
    for word, tf in tf_table.items():
        # For tf we take the natural logarithm of (1 + tf)
        tf_idf = math.log(1 + tf) * idf_dic[word]
        if word in inverted_index:
            inverted_index[word][doc] = tf_idf
        else:
            inverted_index[word] = {doc: tf_idf}
# Sort each posting list by tf-idf weight, highest first
for word, posting_list in inverted_index.items():
    inverted_index[word] = sorted(posting_list.items(), key=lambda x: x[1], reverse=True)
with open('inverted-index.json', 'w') as json_file:
    json.dump(inverted_index, json_file)
with open('terms-idf.json', 'w') as handler:
    json.dump(idf_dic, handler)
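
# A minimal sketch of how a search component might consume the generated files;
# the query term 'retrieval' is only an illustrative assumption, and queries must
# be normalised with the same stem-then-lemmatize steps used above:
#
#     with open('inverted-index.json') as f:
#         index = json.load(f)
#     term = wordnet_lemmatizer.lemmatize(porter.stem('retrieval'))
#     for doc, weight in index.get(term, []):
#         print(doc, weight)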