Add InterRanker #28

Merged (18 commits) on Jul 16, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion README.md
@@ -1,2 +1,2 @@
# SuniBrownSnakeGaborone
# Modern Search Engine
A hyper-fast search engine
281 changes: 172 additions & 109 deletions engine/crawl.py

Large diffs are not rendered by default.

22 changes: 16 additions & 6 deletions engine/custom_db.py
@@ -1,3 +1,4 @@
import logging
import os

import pandas as pd
@@ -61,7 +62,7 @@ def add_tokens_to_index(url: str, tokenized_text: list[str]):
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'tokenized_text'] = tokenized_text
else:
print(f"Page with ID {page_id} not found")
logging.info(f"Page with ID {page_id} not found")


def add_title_to_index(url: str, title: str):
@@ -80,7 +81,7 @@ def add_title_to_index(url: str, title: str):
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'title'] = title
else:
print(f"Page with ID {page_id} not found")
logging.info(f"Page with ID {page_id} not found")


def add_snippet_to_index(url, snippet):
@@ -99,7 +100,7 @@ def add_snippet_to_index(url, snippet):
if not pages_df[pages_df['url'] == url].empty:
pages_df.at[pages_df[pages_df['url'] == url].index[0], 'snippet'] = snippet
else:
print(f"Page with URL {url} not found")
logging.info(f"Page with URL {url} not found")


def get_tokens() -> list[list[str]]:
@@ -152,6 +153,12 @@ def save_pages() -> None:
pages_df.to_csv("pages.csv", index=False, header=headers)


def get_doc_by_id(page_id: int):
global pages_df
page = pages_df[pages_df['id'] == page_id]
return page


def load_pages() -> pd.DataFrame:
"""
Load the pages DataFrame from a CSV file.
@@ -162,17 +169,20 @@ def load_pages() -> pd.DataFrame:

# Check if the file exists
if not os.path.exists(f"pages.csv"):
print("No pages found")
logging.info("No pages found")
return pages_df

try:
pages_df = pd.read_csv("pages.csv", header=0)
except pd.errors.EmptyDataError:
print("No pages found")
logging.info("No pages found")
return pages_df

# Convert the tokenized_text column to a list of lists
pages_df['tokenized_text'] = pages_df['tokenized_text'].apply(eval)

print("Loaded pages")
logging.info("Loaded pages")
return pages_df


load_pages()
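
The custom_db.py changes swap print for logging.info, which only produces output if the application configures a handler at INFO level. A minimal sketch of the setup this assumes (the basicConfig call and its placement at program start are an assumption, not part of this PR):

```python
import logging

# Assumed one-time setup at program start; without a handler at INFO level,
# the logging.info calls introduced in this PR emit nothing visible.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

logging.info("Loaded pages")  # now written to stderr by the root logger
```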
134 changes: 134 additions & 0 deletions engine/custom_rank.py
@@ -0,0 +1,134 @@
import pandas as pd

from custom_db import get_doc_by_id
from custom_tokenizer import tokenize_data
from sklearn.feature_extraction.text import TfidfVectorizer


def preprocess_query(Q):
tokenized_query = tokenize_data(Q)
return tokenized_query


def find_intersection_2(Q):
df_inverted = pd.read_csv("inverted_index.csv", sep=",", index_col=1)
df_inverted.drop(columns=["Unnamed: 0"], inplace=True)

# df_inverted.set_index("word", inplace=True)
print(df_inverted.columns)
print(df_inverted.head())
tokenized_query = preprocess_query(Q)
print(tokenized_query)
result = []
for token in tokenized_query:
if token in df_inverted.word.values:
print(f"Found token: {token}")
doc_ids = df_inverted[df_inverted["word"] == token]["doc_ids"].apply(eval)
print(f"It has {len(doc_ids)} doc_ids")
result.append(doc_ids)
# print(f"result: {result}")
# find intersection of all lists in result
intersection = set(result[0]).intersection(*result)
return intersection


def find_documents(Q) -> set:
df_inverted = pd.read_csv("inverted_index.csv", converters={'doc_ids': pd.eval})
df_inverted.set_index("word", inplace=True)
df_inverted.drop(columns=["Unnamed: 0"], inplace=True)

print(df_inverted.head())
tokenized_query = preprocess_query(Q)
print(df_inverted.index.values)
result = []
for token in tokenized_query:
if token in df_inverted.index.values:
print(f"Found token: {token}")
doc_ids = df_inverted.loc[token].doc_ids
print(f"It has {len(doc_ids)} doc_ids")
result.append(doc_ids)
# find intersection of all lists in result
intersection = set(result[0]).intersection(*result)
union = set(result[0]).union(*result)
if len(intersection) < 2:
print("No intersection found")
return union
return intersection


def dummy_tokenizer(tokens: list[str]):
return tokens


def generate_tf_idf_matrix(path):
df = pd.read_csv(path, converters={'tokenized_text': pd.eval})
df_text = df["tokenized_text"]
# create list of lists containing the tokenized text
tokenized_text = []
print(type(df_text.values))
vectorizer = TfidfVectorizer(tokenizer=dummy_tokenizer, preprocessor=dummy_tokenizer)
X = vectorizer.fit_transform(df_text.values)
features = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
print(features)

return features


def rank_documents(subset_D, Q, X):
# Filter the DataFrame to include only the documents in subset_D
subset_adj = [x - 1 for x in subset_D]
filtered_X = X.loc[list(subset_adj)]  # select the rows for the candidate documents

# Ensure Q is a list of query terms
query_terms = preprocess_query(Q)
query_terms_in_X = [term for term in query_terms if term in X.columns]
# Filter the DataFrame to include only the columns corresponding to the query terms
if not query_terms_in_X:
print("No query terms found in the TF-IDF matrix.")
return pd.DataFrame()
filtered_X_query_terms = filtered_X[query_terms_in_X]  # select the whole columns for the query terms

# Sum the TF-IDF values for each document
filtered_X['sum_tfidf'] = filtered_X_query_terms.sum(axis=1)

# Rank the documents by the summed TF-IDF values in descending order
ranked_docs = filtered_X.sort_values(by='sum_tfidf', ascending=False)

# Map document ID to document with title, URL, and snippet
ranking = []
for index, ranked_doc in ranked_docs.iterrows():
score = ranked_doc['sum_tfidf']

doc = get_doc_by_id(index)
title = str(doc['title'].values[0]) if not doc.empty else ""
url = str(doc['url'].values[0]) if not doc.empty else ""
snippet = str(doc['snippet'].values[0]) if not doc.empty else ""

result = {
"id": index,
"title": title,
"url": url,
"description": snippet if snippet else "",
"summary": "",
"score": score
}
ranking.append(result)

return ranking


# query = "food and drink"
# docs = find_documents(query)
X = generate_tf_idf_matrix('pages.csv')


# print(f"Found {len(docs)} documents, they look like this: {docs}")
# print(f"Result: {generate_tf_idf_matrix('pages.csv')}")

# ranked_docs = rank_documents(docs, query, X)
# print(f"Best 20 docs: {ranked_docs[:20]}")


def rank(query):
docs = find_documents(query)
return rank_documents(docs, query, X)
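
Taken together, custom_rank.py finds candidate documents by intersecting the per-token posting lists from inverted_index.csv (falling back to the union when the intersection is too small) and then ranks them by the sum of the TF-IDF weights of the query terms. A condensed, self-contained sketch of that scoring idea on a toy corpus (all names and data below are illustrative and not from the PR; the real code additionally shifts doc IDs by one to align them with the 0-based rows of the TF-IDF matrix):

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus standing in for pages.csv: doc ID -> pre-tokenized text.
docs = {1: ["food", "drink", "recipe"], 2: ["drink", "bar"], 3: ["food", "market"]}

def identity(tokens):
    return tokens  # documents are already tokenized, so skip re-tokenizing

vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=identity, token_pattern=None)
X = pd.DataFrame(
    vectorizer.fit_transform(docs.values()).toarray(),
    index=list(docs.keys()),
    columns=vectorizer.get_feature_names_out(),
)

query = ["food", "drink"]

# Candidate set: documents containing every query term, or every match if the
# intersection is too small (mirrors the union fallback in find_documents).
postings = [{doc_id for doc_id, toks in docs.items() if term in toks} for term in query]
intersection = set.intersection(*postings)
candidates = intersection if len(intersection) >= 2 else set.union(*postings)

# Score: sum of the TF-IDF weights of the query terms present in the matrix.
terms = [t for t in query if t in X.columns]
scores = X.loc[sorted(candidates), terms].sum(axis=1).sort_values(ascending=False)
print(scores)  # doc 1 ranks first: it is the only document containing both terms
```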
66 changes: 49 additions & 17 deletions engine/custom_tokenizer.py
@@ -1,3 +1,5 @@
import logging

import nltk as nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import re
@@ -9,11 +11,16 @@

from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index
from pipeline import PipelineElement
from utils import safe_join, safe_str

WN_LEMMATIZER = nltk.stem.WordNetLemmatizer()
STEMMER = nltk.stem.PorterStemmer()


def remove_punctuations(text):
punct_tag = re.compile(r'[^\w\s]')
text = punct_tag.sub(r'', text)
# Remove punctuations
punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]')
text = punctuations.sub(r'', text)
return text


@@ -51,31 +58,41 @@ def tokenize_plain_words(words: str):
return words.split()


def stem_and_remove_stopwords(words) -> list[str]:
# use english porterStemmer
def stem(words) -> list[str]:
words = [STEMMER.stem(word) for word in words] # added stemmer
return words


stemmer = nltk.stem.porter.PorterStemmer()
words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer
def remove_stopwords(words):
return [word for word in words if word not in stopwords.words("english")]


def lemmatize(words):
words = [WN_LEMMATIZER.lemmatize(word) for word in words]
return words


def tokenize_data(data) -> list[str]:
"""
Tokenizes the input data.
"""
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words,
stem_and_remove_stopwords]
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords,
lemmatize]
for pipe in pipeline:
data = pipe(data)
return data


# Known problem: the TF-IDF vectorizer takes a whole plain text and tokenizes it itself, but we already have tokenized data.
# To avoid converting data types back and forth and complicating things unnecessarily, steps like TF-IDF and tokenizing have to happen right after crawling.
# That is not the cleanest fit for the pipeline, but otherwise a lot of work would have to be done and stored twice.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
def tf_idf_vectorize(data):
"""
Vectorizes the input data using the TF-IDF algorithm.
"""
# Create the vectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english")
# vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english")  # still need to decide which tokenizer to use here
vectorizer = TfidfVectorizer()
# Vectorize the data
X = vectorizer.fit_transform(data)
return X
@@ -91,10 +108,10 @@ def top_30_words(data):
X = vectorizer.fit_transform(data)
# Get the feature names
feature_names = vectorizer.get_feature_names_out()
print(f"Feature names: {feature_names}")
print(f"X sieht so aus: {X}")
print(f"Shape of X: {X.shape}")
print(f"Summe: {X.sum(axis=0)}")
logging.info(f"Feature names: {feature_names}")
logging.info(f"X sieht so aus: {X}")
logging.info(f"Shape of X: {X.shape}")
logging.info(f"Summe: {X.sum(axis=0)}")
top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30]
return top_30_words

@@ -108,18 +125,33 @@ async def process(self, data, link):
Tokenizes the input data.
"""

if data is None:
logging.info(f"Failed to tokenize {link} because the data was empty.")
return

soup = data

# Get the text from the page
text = soup.get_text()
img_tags = soup.findAll("img")

# Get the meta description and title
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

# Get the alt texts from the images
img_tags = soup.findAll("img")
alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(description_content) + " " + str(title_content)

# Join all the text together
alt_texts_str = safe_join(alt_texts)
description_str = safe_str(description_content)
title_str = safe_str(title_content)
text = f"{text} {alt_texts_str} {description_str} {title_str}".strip()

# Tokenize the text
tokenized_text = tokenize_data(data=text)
add_tokens_to_index(url=link, tokenized_text=tokenized_text)

print(f"Tokenized text for {link}")
logging.info(f"Tokenized text for {link}")
8 changes: 7 additions & 1 deletion engine/index.py
@@ -1,3 +1,5 @@
import logging

from custom_db import upsert_page_to_index, add_title_to_index, add_snippet_to_index, load_pages
from pipeline import PipelineElement

@@ -17,6 +19,10 @@ async def process(self, data, link):
Indexes the input data.
"""

if data is None:
logging.info(f"Failed to index {link} because the data was empty.")
return

soup = data

# Title
@@ -32,7 +38,7 @@ async def process(self, data, link):
add_title_to_index(url=link, title=title_content)
add_snippet_to_index(url=link, snippet=description_content)

print(f"Indexed {link}")
logging.info(f"Indexed {link}")
if not self.is_shutdown():
await self.call_next(soup, link)
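
Both the tokenizer and the indexer pull the same fields out of the parsed page: the <title> text and the meta description, with empty strings when a tag is missing. A standalone sketch of that extraction with BeautifulSoup (the helper name and sample HTML are made up for illustration):

```python
from bs4 import BeautifulSoup

def extract_title_and_snippet(html: str) -> tuple[str, str]:
    """Pull the <title> text and meta description, tolerating missing tags."""
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find("title")
    description = soup.find("meta", attrs={"name": "description"})
    title_text = title.string if title is not None and title.string else ""
    description_text = description.get("content", "") if description is not None else ""
    return title_text.strip(), description_text.strip()

print(extract_title_and_snippet(
    '<html><head><title>Modern Search Engine</title>'
    '<meta name="description" content="A hyper-fast search engine"></head></html>'
))
# -> ('Modern Search Engine', 'A hyper-fast search engine')
```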
