-
Notifications
You must be signed in to change notification settings - Fork 1
/
reuters-working-main.py
89 lines (72 loc) · 2.42 KB
/
reuters-working-main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import math
import nltk
# Fetch the NLTK resources this script needs: the 'punkt' tokenizer models,
# the English stop-word list, and the Reuters corpus. Downloads are cached
# locally, so repeat runs are cheap.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('reuters')
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.corpus import reuters
import streamlit as st
# Streamlit page header.
st.title("Information Retrieval System")
# NOTE(review): interactive input is disabled; the query is hard-coded below.
# Re-enable the text_input line to make the page interactive.
# query = st.text_input("Enter your query:")
query = "This is the first document."
# Step 1: Collect the corpus. The real Reuters corpus is loaded further down;
# the commented-out lines below are an earlier toy corpus kept for reference.
# corpus = reuters.fileids()
# corpus = [reuters.fileid for fileid in corpus]
# st.write(corpus)
# # Step 1: Collect the corpus
# corpus = ['This is the first document.',
# 'First document is good.',
# 'for the best document refer the first',
# 'THIS IS THE FIRST DOCUMENT',
# 'This is the second document.',
# 'And this is the third one.',
# 'Is this the first document?']
# Step 2: Preprocess the documents
def preprocess(text):
    """Tokenize, filter, and stem *text*; return the list of normalized tokens.

    Lower-cases the text, drops tokens that are not purely alphanumeric,
    removes English stop words, and Porter-stems what remains.
    """
    tokens = nltk.word_tokenize(text.lower())
    # Build the stop-word set once per call: stopwords.words('english')
    # returns a fresh list on every access, and membership testing on a
    # list is O(n) per token — the original re-did both for EVERY token.
    stop_words = set(stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(tok) for tok in tokens
            if tok.isalnum() and tok not in stop_words]
# Step 1 (actual): build the preprocessed corpus — one token list per
# Reuters article, in fileid order.
corpus = [preprocess(reuters.raw(str(file_id))) for file_id in reuters.fileids()]
# Step 3: Calculate term frequencies
def calculate_tf(document):
    """Return the term-frequency map for *document* (a list of tokens).

    Missing words look up as 0, matching the original defaultdict behavior.
    """
    # Counter is the stdlib's purpose-built frequency counter; the original
    # hand-rolled the same loop with a defaultdict(int).
    from collections import Counter
    return Counter(document)
# Precompute one term-frequency map per document, aligned with `corpus`.
tf_corpus = list(map(calculate_tf, corpus))
# Step 4: Calculate inverse document frequency (IDF)
def calculate_idf(corpus):
    """Return inverse document frequencies for *corpus* (lists of tokens).

    idf[w] = log(N / df(w)), where df(w) is the number of DOCUMENTS
    containing w and N is the corpus size. Returns a defaultdict(float),
    so unseen words look up as 0.0.
    """
    N = len(corpus)
    idf = defaultdict(float)
    for document in corpus:
        # BUG FIX: the original iterated every token occurrence, so a word
        # appearing twice in one document was counted twice — inflating its
        # document frequency and deflating its idf. Deduplicate per document.
        for word in set(document):
            idf[word] += 1
    for word in idf:
        idf[word] = math.log(N / idf[word])
    return idf
# Corpus-wide inverse document frequencies, shared by the scoring code below.
idf = calculate_idf(corpus)
# Step 5: Calculate document length
def calculate_document_length(document, idf_map=None):
    """Return the Euclidean norm of *document*'s tf-idf vector.

    document: list of tokens.
    idf_map:  optional idf mapping; defaults to the module-level `idf`
              (backward-compatible — existing one-argument callers work).

    FIX: the original did `tf_corpus[corpus.index(document)]`, an O(N)
    linear scan of the whole corpus on every call (and it silently picked
    the first equal document). Equal token lists have equal tf maps, so
    computing tf locally gives the identical value without the scan.
    """
    from collections import Counter
    if idf_map is None:
        idf_map = idf
    tf = Counter(document)
    # sum over unique words of (tf * idf)^2 == the original's per-token
    # accumulation of tf * idf^2 (each of the tf occurrences contributed once).
    length = sum((count * idf_map[word]) ** 2 for word, count in tf.items())
    return math.sqrt(length)
# Steps 6-8: score, rank, and render the results.
def perform_query(query, idf):
    """Rank corpus documents against *query*; return (doc_index, score) pairs.

    BUG FIX: the original script called perform_query without ever defining
    it, so the program crashed with a NameError at this point. The query is
    preprocessed exactly like the documents, scored with a tf-idf dot
    product normalized by document vector length (cosine-style), and only
    non-zero matches are returned, best first.
    """
    query_terms = preprocess(query)
    query_tf = calculate_tf(query_terms)
    ranked = []
    for doc_id, tf in enumerate(tf_corpus):
        # Dot product over the unique query terms only; `idf` is a
        # defaultdict(float), so unseen terms contribute 0.0.
        dot = sum(query_tf[term] * idf[term] * tf[term] * idf[term]
                  for term in set(query_terms))
        if dot == 0:
            continue
        # Document vector norm computed inline from the precomputed tf map
        # (avoids the O(N) corpus scan in calculate_document_length).
        length = math.sqrt(sum((tf[word] * idf[word]) ** 2 for word in tf))
        if length:
            ranked.append((doc_id, dot / length))
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)


results = perform_query(query, idf)
for document, score in results:
    st.write("Document:", document)
    st.write("Score:", score)
    st.write(corpus[document])
st.write('result generated')