Extended the pages.csv #26

Merged
merged 8 commits into from Jul 13, 2024
4 changes: 2 additions & 2 deletions engine/crawl.py
@@ -67,7 +67,8 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection):
"amazon.com", "cctue.de", "spotify.com"]
self.langs = ["en", "en-de", "eng", "en-GB", "en-US", "english"]
self.required_keywords = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"]
self.user_agent = "Modern Search Engines University of Tuebingen Project Crawler (https://uni-tuebingen.de/de/262377)"
self.user_agent = ("Modern Search Engines University of Tuebingen Project Crawler ("
"https://uni-tuebingen.de/de/262377)")

def __del__(self) -> None:
self.cursor.close()
@@ -87,7 +88,6 @@ async def fetch(self, session, url):
return None

async def process(self):
print("Crawler started")
async with ClientSession() as session:
while self.to_crawl and len(self.found_links) < self.max_size:
tasks = []
156 changes: 142 additions & 14 deletions engine/custom_db.py
@@ -1,46 +1,174 @@
import os

import pandas as pd

from collections import defaultdict
import re

# Create a DataFrame to store HTML pages
pages_df = pd.DataFrame(columns=['id', 'url', 'tokenized_text'])
headers = ['id', 'url', 'title', 'snippet', 'tokenized_text']
pages_df = pd.DataFrame({
'id': pd.Series(dtype='int'),
'url': pd.Series(dtype='str'),
'title': pd.Series(dtype='str'),
'snippet': pd.Series(dtype='str'),
'tokenized_text': pd.Series(dtype='object')
})
inverted_index = defaultdict(list)

def save_html_to_df(url, tokenized_text):

def upsert_page_to_index(url: str):
"""
Add a page to the index if it doesn't exist.
Args:
url: URL of the page

Returns:

"""

global pages_df
# Get an existing row with the same URL if it exists
existing_row = pages_df[pages_df['url'] == url]

if not existing_row.empty:
page_id = existing_row['id'].values[0]
else:
# Create a new row
page_id = len(pages_df) + 1
pages_df = pd.concat(
[pages_df, pd.DataFrame(
[
{'id': page_id, 'url': url, 'title': '', 'snippet': '', 'tokenized_text': []}
])],
ignore_index=True)

return page_id


def add_tokens_to_index(url: str, tokenized_text: list[str]):
"""
Add tokenized text to the index.
Args:
url:
tokenized_text: List of tokens

Returns:

"""
global pages_df

page_id = upsert_page_to_index(url)
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'tokenized_text'] = tokenized_text
else:
print(f"Page with ID {page_id} not found")


def add_title_to_index(url: str, title: str):
"""
Add a title to the index.
Args:
url:
title:

Returns:

"""
global pages_df

page_id = upsert_page_to_index(url)
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'title'] = title
else:
print(f"Page with ID {page_id} not found")


def add_snippet_to_index(url, snippet):
"""
Add a snippet/description to the index.
Args:
url:
snippet:

Returns:

"""
global pages_df
new_id = len(pages_df) + 1
new_row = {'id': new_id, 'url': url, 'tokenized_text': tokenized_text}
pages_df = pd.concat([pages_df,pd.DataFrame([new_row])], ignore_index=True)

upsert_page_to_index(url)
if not pages_df[pages_df['url'] == url].empty:
pages_df.at[pages_df[pages_df['url'] == url].index[0], 'snippet'] = snippet
else:
print(f"Page with URL {url} not found")


def get_tokens() -> list[list[str]]:
"""
Get the tokenized text from the pages DataFrame.
Tokenized text is a matrix of tokens.
One row per document, one column per token.

Returns: list[list[str]]

"""
global pages_df
tokens = pages_df['tokenized_text'].to_list()
return tokens

# Create an inverted index

def get_overview():
return pages_df.head()

def save_pages():
global pages_df
pages_df.to_csv("pages.csv")

def add_document_to_index(doc_id, words: list[str]):
print(f"Adding stuff")
global inverted_index

if not words:
return

for word in set(words):
inverted_index[word].append(doc_id)


def index_pages():
for index, row in pages_df.iterrows():

add_document_to_index(row['id'], row['tokenized_text'])
page_id = row['id']
tokenized_text = row['tokenized_text']
add_document_to_index(page_id, tokenized_text)


def access_index():
index_df = pd.DataFrame(list(inverted_index.items()), columns=['word', 'doc_ids'])
return index_df


# Convert the inverted index to a DataFrame
def save_pages() -> None:
"""
Save the pages DataFrame to a CSV file.
Returns: None
"""

global pages_df
pages_df.to_csv("pages.csv", index=False, header=headers)


def load_pages() -> pd.DataFrame:
"""
Load the pages DataFrame from a CSV file.
Returns: pd.DataFrame
"""

global pages_df

# Check if the file exists
if not os.path.exists(f"pages.csv"):
print("No pages found")
return pages_df

pages_df = pd.read_csv("pages.csv", header=0)

# Convert the tokenized_text column to a list of lists
pages_df['tokenized_text'] = pages_df['tokenized_text'].apply(eval)

print("Loaded pages")
return pages_df
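
A minimal usage sketch of the new custom_db helpers introduced here (assuming the module is run from the engine/ directory, since pages.csv is written to the current working directory):

from custom_db import (add_tokens_to_index, add_title_to_index,
                       add_snippet_to_index, save_pages, load_pages)

url = "https://uni-tuebingen.de/en/"
add_tokens_to_index(url=url, tokenized_text=["tübingen", "univers"])  # creates the page row on first use
add_title_to_index(url=url, title="University of Tübingen")
add_snippet_to_index(url=url, snippet="Official website of the University of Tübingen.")

save_pages()             # writes pages.csv with id, url, title, snippet, tokenized_text
pages_df = load_pages()  # round-trips the CSV; tokenized_text is re-parsed via eval
print(pages_df.head())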
15 changes: 7 additions & 8 deletions engine/custom_tokenizer.py
@@ -7,7 +7,7 @@
import re
import nltk

from custom_db import save_html_to_df
from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index
from pipeline import PipelineElement


@@ -51,23 +51,22 @@ def tokenize_plain_words(words: str):
return words.split()


def stem_and_remove_stopwords(words):
def stem_and_remove_stopwords(words) -> list[str]:
# use english porterStemmer

stemmer = nltk.stem.porter.PorterStemmer()
words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer
return words


def tokenize_data(data):
def tokenize_data(data) -> list[str]:
"""
Tokenizes the input data.
"""
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words,
stem_and_remove_stopwords]
for pipe in pipeline:
data = pipe(data)
print("We are done here in tokenizing")
return data


@@ -112,13 +111,13 @@ def process(self, data, link):
soup = data
text = soup.get_text()
img_tags = soup.findAll("img")
desciption = soup.find("meta", attrs={"name": "description"})
desciption_content = desciption.get("content") if desciption is not None else ""
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(desciption_content) + " " + str(title_content)
text = text + " ".join(alt_texts) + " " + str(description_content) + " " + str(title_content)

tokenized_text = tokenize_data(data=text)
save_html_to_df(url=link, tokenized_text=tokenized_text)
add_tokens_to_index(url=link, tokenized_text=tokenized_text)
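
A rough sketch of what the tokenizer pipeline returns (assuming the NLTK stopwords corpus has been downloaded via nltk.download("stopwords") and a flat import path like the other engine modules use):

from custom_tokenizer import tokenize_data

text = "Tübingen is a university town on the Neckar river."
tokens = tokenize_data(data=text)
# Punctuation, HTML, URLs and emoji are stripped, stopwords removed, the rest stemmed,
# e.g. roughly ['Tübingen', 'univers', 'town', 'Neckar', 'river'].
print(tokens)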
54 changes: 33 additions & 21 deletions engine/index.py
@@ -1,31 +1,43 @@
from .crawl import start_crawl
from custom_db import upsert_page_to_index, add_title_to_index, add_snippet_to_index, load_pages
from pipeline import PipelineElement


class Document:
def __init__(self, content: str, title: str, url: str):
self.content = content
self.title = title
self.url = url
class Indexer(PipelineElement):
"""
Adds the data to the index.
"""


class Index:
def __init__(self):
self.documents = []
super().__init__("Indexer")

self._load_state()

def process(self, data, link):
"""
Indexes the input data.
"""

soup = data

def add_document(self, document: Document):
self.documents.append(document)
# Title
title = soup.find("title")
title_content = title.string if title is not None else ""

def search(self, query: str) -> list[Document]:
pass
# Snippet or description
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""

# Add more data to the index
upsert_page_to_index(url=link)
add_title_to_index(url=link, title=title_content)
add_snippet_to_index(url=link, snippet=description_content)

class SearchEngine:
def __init__(self, index: Index):
self.index = index
self.call_next(soup, link)

def crawl(self):
start_crawl()
def _load_state(self):
"""
Load the state of the indexer.
"""

def search(self, query: str) -> list[Document]:
# TODO: Implement search
pass
# TODO: Not ideal! This should be in a database
load_pages()
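
A standalone sketch of the new Indexer on a tiny document (assuming BeautifulSoup objects are what flows through the pipeline, as in the tokenizer, and that a freshly constructed element has no downstream stages to call):

from bs4 import BeautifulSoup
from index import Indexer

html = """<html><head><title>Tübingen</title>
<meta name="description" content="A university town on the Neckar."></head>
<body>Welcome to Tübingen.</body></html>"""

indexer = Indexer()  # loads any existing pages.csv via _load_state()
indexer.process(BeautifulSoup(html, "html.parser"), "https://www.tuebingen.de/")
# pages_df now holds a row whose title and snippet come from <title> and the meta description.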
11 changes: 10 additions & 1 deletion engine/main.py
@@ -12,6 +12,8 @@
# Async
import asyncio

from index import Indexer

MAX_THREADS = 10

if __name__ == "__main__":
@@ -27,17 +29,24 @@
crawler.max_size = 1000
crawler.add_executor(executor)

indexer = Indexer()
indexer.add_executor(executor)

tokenizer = Tokenizer()
tokenizer.add_executor(executor)

# Add the pipeline elements
crawler.add_next(tokenizer)
crawler.add_next(indexer)
indexer.add_next(tokenizer)

# Start the pipeline
asyncio.run(crawler.process())
except (KeyboardInterrupt, SystemExit):
print("Exiting...")
crawler.save_state()
index_pages()
index_df = access_index()
index_df.to_csv("inverted_index.csv")
con.close()
print("State saved")

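The shutdown branch above leans on the inverted-index helpers from custom_db.py; a rough sketch of what they produce (assuming main.py imports index_pages and access_index from custom_db and pages_df has already been filled):

from custom_db import add_tokens_to_index, index_pages, access_index

add_tokens_to_index(url="https://uni-tuebingen.de/en/", tokenized_text=["tübingen", "univers"])
add_tokens_to_index(url="https://www.tuebingen.de/", tokenized_text=["tübingen", "stadt"])

index_pages()              # fills inverted_index: token -> list of page ids containing it
index_df = access_index()  # DataFrame with columns ['word', 'doc_ids']
index_df.to_csv("inverted_index.csv")
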
2 changes: 1 addition & 1 deletion engine/pipeline.py
@@ -20,7 +20,7 @@ def add_next(self, next_element):
def call_next(self, *args):
futures = []
for element in self.next:
print(f"Calling next for {self.name}: {element.name}")
print(f"{self.name} -> {element.name}")
future = element.executor.submit(element.process, *args)
futures.append(future)
wait(futures) # Wait for all futures to complete