# Name - Tejas Dhananjay Rajopadhye
# UIN - 675873639
# CS 582 Final Project
# Please take a look at the project report for more information on this file and its usage
## ***** Please supply one input arg
## 1 - Input path to the directory of retrieved HTML documents (e.g. - c:\users\RetrievedDocs\ )
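## Example invocation (the directory name here is illustrative):
##   python searchRetrieval.py ./RetrievedDocs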
import json
import math
from pathlib import Path
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import argparse
"""
Funtion to extract the XML tag data from the SGML document
"""
def extractTextFromXML(XMldoc, tagLabel):
xmlDataLocal = BeautifulSoup(XMldoc)
return xmlDataLocal.find(tagLabel).string
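## Illustrative example (the document and tag name are hypothetical):
##   extractTextFromXML("<doc><title>News</title></doc>", "title")  ->  "News"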
"""
Function to process text - The processing happens as described in the HW2 description
"""
def textProcessing(text):
text = text.lower()
## Remove the \n's from the text
text = text.replace('\n', " ")
## Remove the punctuation
text = text.translate(str.maketrans('', '', string.punctuation))
## Remove numbers
text = text.translate(str.maketrans('', '', string.digits))
## Remove the stop words
arrOfTokens = text.split()
arrProcessed = []
stop_words = set(stopwords.words('english'))
for wordToken in arrOfTokens:
if (wordToken not in stop_words):
arrProcessed.append(wordToken)
arrOfTokens = arrProcessed
## Perform Stemming
porterStemmerObj = PorterStemmer()
arrayOfTokensWithStemming = []
for wordToken in arrOfTokens:
arrayOfTokensWithStemming.append(porterStemmerObj.stem(wordToken))
arrOfTokens = arrayOfTokensWithStemming
## Again Remove the stop words
arrProcessed = []
for wordToken in arrOfTokens:
if (wordToken not in stop_words):
arrProcessed.append(wordToken)
arrOfTokens = arrProcessed
## Join the string together
text = ' '.join(map(str, arrOfTokens))
## Remove words of some length
text = re.compile(r'\W*\b\w{1,2}\b').sub('', text)
processedText = text
return processedText
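## A worked example of the pipeline (assuming the NLTK stopword corpus is installed):
##   textProcessing("The 3 Running Dogs!")  ->  "run dog"
## ("the" is a stop word, "3" and "!" are stripped, "running"/"dogs" are Porter-stemmed)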
"""
Generate the TF ( word Frequency ) for a given text string
"""
def getTF(text):
wordDict = dict()
text = text.split()
for token in text:
if token in wordDict:
wordDict[token] = wordDict[token] + 1
else:
wordDict[token] = 1
return wordDict
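## Example: getTF("run dog run")  ->  {"run": 2, "dog": 1}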
"""
Generate the DF data for all docs
This function takes all docs dictionary and iterates over all the documents strings and constructs inverted index
This inverted index is used to construct DF dictionary for all documents
"""
def getDFFromAllDocs(allDocsDict):
dfDict = dict()
intDict = dict() #Intermediate dictionary
for doc in allDocsDict:
textFromDoc = allDocsDict[doc]
#Create a Inverted Index
tokens = textFromDoc.split()
for token in tokens:
if token in intDict:
if doc not in intDict[token]:
intDict[token].append(doc)
else:
intDict[token] = [doc]
for dictVal in intDict:
dfDict[dictVal] = len(intDict[dictVal])
return dfDict
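## Illustrative example (document names are hypothetical):
##   getDFFromAllDocs({"d1": "run dog", "d2": "run cat"})
##     ->  {"run": 2, "dog": 1, "cat": 1}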
"""
Function to generate tf-idf data from the doc
This function takes input doc ( string ), dfDict ( generated with getDFFromAllDocs ) , total number of docs
Calculates the tf-idf from the documents
"""
def generatetfIdfFromDoc(doc, dfDict, numOfDocs):
tfDict = getTF(doc)
words = doc.split()
tfIdfDict = dict()
for word in words:
## Adding + 1 to get rid of divide by zero errors
tfIdfDict[word] = tfDict[word] * math.log2(numOfDocs / dfDict[word] + 1)
return tfIdfDict
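## Worked example: with numOfDocs = 2, tf("dog") = 1 and df("dog") = 1,
##   weight = 1 * log2(2/1 + 1) = log2(3) ≈ 1.585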
"""
Calculate the cosine similarity
This function takes the docVec which is a dictioary of { word : tf-idf } values. queryVec is a similar dict
For each word in query , if this word exist in the docVec the multiplication is made and the sum is accumulated
getNorm function calculated the denominator
"""
def getCosineSim(docVec, queryVec):
numerator = 0.0
for qDictVal in queryVec:
if qDictVal in docVec:
numerator = numerator + queryVec[qDictVal] * docVec[qDictVal]
docNorm = getNorm(docVec)
queryNorm = getNorm(queryVec)
return (numerator / math.sqrt(docNorm * queryNorm + 1))
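## Worked example: docVec = {"run": 3.0}, queryVec = {"run": 4.0}
##   numerator = 12.0, denominator = sqrt(9 * 16 + 1) = sqrt(145) ≈ 12.042
##   similarity ≈ 0.9965 (slightly below 1.0 because of the +1 damping)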
"""
Calculate the norm of a dictionary vector
"""
def getNorm(dicVec):
value=0.0
for vec in dicVec:
value = dicVec[vec] * dicVec[vec]
return value
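## Example: getNorm({"a": 3.0, "b": 4.0})  ->  25.0
## Note this is the squared norm; getCosineSim takes the square root itself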
"""
Function to calculate the cosine similarities between all docs and query docs
Return dict of doc to query cosine similarity for all docs and a given query
"""
def calculateCosineVec(allTfIdfDoc, queryIdfDoc):
docToQuerySim = dict()
for docTfIdf in allTfIdfDoc:
docToQuerySim[docTfIdf] = getCosineSim(allTfIdfDoc[docTfIdf], queryIdfDoc)
return docToQuerySim
"""
Generate the (query_id, document_id) output from the cosine documents
This function takes the cosine similarities generated for all the documents
Later in the code convert it to the {query_id : rankedDoc_dict} . This represenation is useful to calculate the recall and precision. Please check the generate output function for the same
"""
def getQueryIdDocId(cosSineQueriesDoc):
arrayOfQueryIdDocId = []
outputDict = dict()
for cosDicVal in cosSineQueriesDoc:
docDict = cosSineQueriesDoc[cosDicVal]
#Sort it based on the cosine sim
docDict = dict(sorted(docDict.items(), key=lambda data: data[1], reverse=True))
for docDicVals in docDict:
arrayOfQueryIdDocId.append([str(cosDicVal), docDicVals])
outputDict[cosDicVal] = arrayOfQueryIdDocId
arrayOfQueryIdDocId = []
return outputDict
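## Illustrative output shape (the filenames are hypothetical):
##   {1: [["1", "doc7.html"], ["1", "doc3.html"], ...]}  # best match first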
"""
Find the ranked docs which match the relevance doc answers
This function finds the intersection between the relevant data and the docs ranked that are generated with cosine similarity
"""
def findCommanDocs(docRanked, relevantDocRanked):
numOfCommonDocs = 0
for doc in docRanked:
if doc in relevantDocRanked:
numOfCommonDocs = numOfCommonDocs + 1
return numOfCommonDocs
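## Example: findCommanDocs(["d1", "d2", "d3"], ["d2", "d4"])  ->  1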
"""
Get the conversion from the queryDoc dict to a queryURL to display the links searched by the query doc
"""
def getQueryURLDoc(queryDocId, mapDocToURL):
queryURLDoc = dict()
for query in queryDocId:
queryURLDoc[query] = list()
for element in queryDocId[query]:
queryURLDoc[query].append(mapDocToURL[element[1]])
return queryURLDoc
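## Illustrative example (filenames and URLs are hypothetical); mapDocToURL is assumed to be
## a {doc filename: source URL} dict, as loaded from the ./mapDocToURL JSON file below:
##   getQueryURLDoc({1: [["1", "doc7.html"]]}, {"doc7.html": "https://example.com/page"})
##     ->  {1: ["https://example.com/page"]}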
def startSearchTerminal():
    while True:
        print("**** Enter the string to search ****")
        print("** Type -1 to exit the search engine")
        searchText = input("Enter the string now ...")
        if searchText == "-1":
            return
        queriesDoc = list()
        queriesDoc.append(searchText)
        queriesDict = dict()
        for queryIndex in range(len(queriesDoc)):
            queriesDict[queryIndex + 1] = textProcessing(queriesDoc[queryIndex])
        ## Calculate the tf-idf for the queries
        queriesDocTfIdfDict = dict()
        dfDictForQueries = getDFFromAllDocs(queriesDict)
        for doc in queriesDict:
            # numOfDocs is the corpus size, set in the __main__ block below
            dictVal = generatetfIdfFromDoc(queriesDict[doc], dfDictForQueries, numOfDocs)
            queriesDocTfIdfDict[doc] = dictVal
        ## Now find the cosine similarity between the queries and the docs
        cosSineQueriesDoc = dict()
        for qDoc in queriesDocTfIdfDict:
            cosSineQueriesDoc[qDoc] = calculateCosineVec(allDocsTfIdf, queriesDocTfIdfDict[qDoc])
        ## Now generate the (queryId, documentId) combination
        queryIdDocId = getQueryIdDocId(cosSineQueriesDoc)
        # Load the doc-to-URL map from disk
        with open("./mapDocToURL", "r") as mapDocToURLFile:
            mapDocToURL = json.loads(mapDocToURLFile.read())
        queryURLDoc = getQueryURLDoc(queryIdDocId, mapDocToURL)
        count = 10
        while True:
            printURLSearched(queryURLDoc, 1, count)
            print("----------------------")
            moreRecords = input("Press 1 for more records OR press 0 for exiting OR -1 to search more")
            if moreRecords == "0":
                return
            if moreRecords == "-1":
                break
            count = count + 10
def printURLSearched(queryURLDoc, inst, count):
    # Clamp to the number of results so requesting more records than exist cannot overrun the list
    for itr in range(0, min(count, len(queryURLDoc[inst]))):
        print(queryURLDoc[inst][itr])
## Main entry point
if __name__ == '__main__':
    # Args configuration
    parser = argparse.ArgumentParser(description='Args to Search Terminal For Final Project')
    parser.add_argument('HTMLDocs', type=str,
                        help='Please specify a correct input path to the Retrieved HTML Docs')
    args = parser.parse_args()
    path = args.HTMLDocs
    # Hold text data for all docs in a hashmap
    allDocsTextDict = dict()
    numOfDocs = 0
    # Iterate over all files, read each one, and process its text
    fileEntries = Path(path)
    for fileEntry in fileEntries.iterdir():
        with open(fileEntry, mode='r', encoding='utf-8') as fileR:
            textWithoutProcessing = fileR.read()
        # Now process this string and tokenize
        textAfterProcessing = textProcessing(textWithoutProcessing)
        allDocsTextDict[fileEntry.name] = textAfterProcessing
        numOfDocs = numOfDocs + 1
    ## Generate the DF for all docs
    dfDict = getDFFromAllDocs(allDocsTextDict)
    ## Calculate the tf-idf values for all docs
    allDocsTfIdf = dict()
    for doc in allDocsTextDict:
        dictVal = generatetfIdfFromDoc(allDocsTextDict[doc], dfDict, numOfDocs)
        allDocsTfIdf[doc] = dictVal
    startSearchTerminal()