Extended the pages.csv #26

Merged
merged 8 commits into from Jul 13, 2024
4 changes: 2 additions & 2 deletions engine/crawl.py
@@ -67,7 +67,8 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection):
"amazon.com", "cctue.de", "spotify.com"]
self.langs = ["en", "en-de", "eng", "en-GB", "en-US", "english"]
self.required_keywords = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"]
self.user_agent = "Modern Search Engines University of Tuebingen Project Crawler (https://uni-tuebingen.de/de/262377)"
self.user_agent = ("Modern Search Engines University of Tuebingen Project Crawler ("
"https://uni-tuebingen.de/de/262377)")

def __del__(self) -> None:
self.cursor.close()
@@ -87,7 +88,6 @@ async def fetch(self, session, url):
return None

async def process(self):
print("Crawler started")
async with ClientSession() as session:
while self.to_crawl and len(self.found_links) < self.max_size:
tasks = []
156 changes: 142 additions & 14 deletions engine/custom_db.py
@@ -1,46 +1,174 @@
import os

import pandas as pd

from collections import defaultdict
import re

# Create a DataFrame to store HTML pages
pages_df = pd.DataFrame(columns=['id', 'url', 'tokenized_text'])
headers = ['id', 'url', 'title', 'snippet', 'tokenized_text']
pages_df = pd.DataFrame({
'id': pd.Series(dtype='int'),
'url': pd.Series(dtype='str'),
'title': pd.Series(dtype='str'),
'snippet': pd.Series(dtype='str'),
'tokenized_text': pd.Series(dtype='object')
})
inverted_index = defaultdict(list)

def save_html_to_df(url, tokenized_text):

def upsert_page_to_index(url: str):
"""
Add a page to the index if it doesn't exist.
Args:
url: URL of the page

Returns:

"""

global pages_df
# Get an existing row with the same URL if it exists
existing_row = pages_df[pages_df['url'] == url]

if not existing_row.empty:
page_id = existing_row['id'].values[0]
else:
# Create a new row
page_id = len(pages_df) + 1
pages_df = pd.concat(
[pages_df, pd.DataFrame(
[
{'id': page_id, 'url': url, 'title': '', 'snippet': '', 'tokenized_text': []}
])],
ignore_index=True)

return page_id


def add_tokens_to_index(url: str, tokenized_text: list[str]):
"""
Add tokenized text to the index.
Args:
url:
tokenized_text: List of tokens

Returns:

"""
global pages_df

page_id = upsert_page_to_index(url)
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'tokenized_text'] = tokenized_text
else:
print(f"Page with ID {page_id} not found")


def add_title_to_index(url: str, title: str):
"""
Add a title to the index.
Args:
url:
title:

Returns:

"""
global pages_df

page_id = upsert_page_to_index(url)
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'title'] = title
else:
print(f"Page with ID {page_id} not found")


def add_snippet_to_index(url, snippet):
"""
Add a snippet/description to the index.
Args:
url:
snippet:

Returns:

"""
global pages_df
new_id = len(pages_df) + 1
new_row = {'id': new_id, 'url': url, 'tokenized_text': tokenized_text}
pages_df = pd.concat([pages_df,pd.DataFrame([new_row])], ignore_index=True)

upsert_page_to_index(url)
if not pages_df[pages_df['url'] == url].empty:
pages_df.at[pages_df[pages_df['url'] == url].index[0], 'snippet'] = snippet
else:
print(f"Page with URL {url} not found")


def get_tokens() -> list[list[str]]:
"""
Get the tokenized text from the pages DataFrame.
Tokenized text is a matrix of tokens.
One row per document, one column per token.

Returns: list[list[str]]

"""
global pages_df
tokens = pages_df['tokenized_text'].to_list()
return tokens

# Create an inverted index

def get_overview():
return pages_df.head()

def save_pages():
global pages_df
pages_df.to_csv("pages.csv")

def add_document_to_index(doc_id, words: list[str]):
print(f"Adding stuff")
global inverted_index

if not words:
return

for word in set(words):
inverted_index[word].append(doc_id)


def index_pages():
for index, row in pages_df.iterrows():

add_document_to_index(row['id'], row['tokenized_text'])
page_id = row['id']
tokenized_text = row['tokenized_text']
add_document_to_index(page_id, tokenized_text)


def access_index():
index_df = pd.DataFrame(list(inverted_index.items()), columns=['word', 'doc_ids'])
return index_df


# Convert the inverted index to a DataFrame
def save_pages() -> None:
"""
Save the pages DataFrame to a CSV file.
Returns: None
"""

global pages_df
pages_df.to_csv("pages.csv", index=False, header=headers)


def load_pages() -> pd.DataFrame:
"""
Load the pages DataFrame from a CSV file.
Returns: pd.DataFrame
"""

global pages_df

# Check if the file exists
if not os.path.exists(f"pages.csv"):
print("No pages found")
return pages_df

pages_df = pd.read_csv("pages.csv", header=0)

# Convert the tokenized_text column to a list of lists
pages_df['tokenized_text'] = pages_df['tokenized_text'].apply(eval)

print("Loaded pages")
return pages_df
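
A minimal usage sketch of the new custom_db helpers introduced here (assuming the module is run from the engine/ directory, since pages.csv is written to the current working directory):

from custom_db import (add_tokens_to_index, add_title_to_index,
                       add_snippet_to_index, save_pages, load_pages)

url = "https://uni-tuebingen.de/en/"
add_tokens_to_index(url=url, tokenized_text=["tübingen", "univers"])  # creates the page row on first use
add_title_to_index(url=url, title="University of Tübingen")
add_snippet_to_index(url=url, snippet="Official website of the University of Tübingen.")

save_pages()             # writes pages.csv with id, url, title, snippet, tokenized_text
pages_df = load_pages()  # round-trips the CSV; tokenized_text is re-parsed via eval
print(pages_df.head())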
15 changes: 7 additions & 8 deletions engine/custom_tokenizer.py
@@ -7,7 +7,7 @@
import re
import nltk

from custom_db import save_html_to_df
from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index
from pipeline import PipelineElement


@@ -51,23 +51,22 @@ def tokenize_plain_words(words: str):
return words.split()


def stem_and_remove_stopwords(words):
def stem_and_remove_stopwords(words) -> list[str]:
# use english porterStemmer

stemmer = nltk.stem.porter.PorterStemmer()
words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer
return words


def tokenize_data(data):
def tokenize_data(data) -> list[str]:
"""
Tokenizes the input data.
"""
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words,
stem_and_remove_stopwords]
for pipe in pipeline:
data = pipe(data)
print("We are done here in tokenizing")
return data


@@ -112,13 +111,13 @@ def process(self, data, link):
soup = data
text = soup.get_text()
img_tags = soup.findAll("img")
desciption = soup.find("meta", attrs={"name": "description"})
desciption_content = desciption.get("content") if desciption is not None else ""
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(desciption_content) + " " + str(title_content)
text = text + " ".join(alt_texts) + " " + str(description_content) + " " + str(title_content)

tokenized_text = tokenize_data(data=text)
save_html_to_df(url=link, tokenized_text=tokenized_text)
add_tokens_to_index(url=link, tokenized_text=tokenized_text)
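
A rough sketch of what the tokenizer pipeline returns (assuming the NLTK stopwords corpus has been downloaded via nltk.download("stopwords") and a flat import path like the other engine modules use):

from custom_tokenizer import tokenize_data

text = "Tübingen is a university town on the Neckar river."
tokens = tokenize_data(data=text)
# Punctuation, HTML, URLs and emoji are stripped, stopwords removed, the rest stemmed,
# e.g. roughly ['Tübingen', 'univers', 'town', 'Neckar', 'river'].
print(tokens)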
54 changes: 33 additions & 21 deletions engine/index.py
@@ -1,31 +1,43 @@
from .crawl import start_crawl
from custom_db import upsert_page_to_index, add_title_to_index, add_snippet_to_index, load_pages
from pipeline import PipelineElement


class Document:
def __init__(self, content: str, title: str, url: str):
self.content = content
self.title = title
self.url = url
class Indexer(PipelineElement):
"""
Adds the data to the index.
"""


class Index:
def __init__(self):
self.documents = []
super().__init__("Indexer")

self._load_state()

def process(self, data, link):
"""
Indexes the input data.
"""

soup = data

def add_document(self, document: Document):
self.documents.append(document)
# Title
title = soup.find("title")
title_content = title.string if title is not None else ""

def search(self, query: str) -> list[Document]:
pass
# Snippet or description
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""

# Add more data to the index
upsert_page_to_index(url=link)
add_title_to_index(url=link, title=title_content)
add_snippet_to_index(url=link, snippet=description_content)

class SearchEngine:
def __init__(self, index: Index):
self.index = index
self.call_next(soup, link)

def crawl(self):
start_crawl()
def _load_state(self):
"""
Load the state of the indexer.
"""

def search(self, query: str) -> list[Document]:
# TODO: Implement search
pass
# TODO: Not ideal! This should be in a database
load_pages()
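
A standalone sketch of the new Indexer on a tiny document (assuming BeautifulSoup objects are what flows through the pipeline, as in the tokenizer, and that a freshly constructed element has no downstream stages to call):

from bs4 import BeautifulSoup
from index import Indexer

html = """<html><head><title>Tübingen</title>
<meta name="description" content="A university town on the Neckar."></head>
<body>Welcome to Tübingen.</body></html>"""

indexer = Indexer()  # loads any existing pages.csv via _load_state()
indexer.process(BeautifulSoup(html, "html.parser"), "https://www.tuebingen.de/")
# pages_df now holds a row whose title and snippet come from <title> and the meta description.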
11 changes: 10 additions & 1 deletion engine/main.py
@@ -12,6 +12,8 @@
# Async
import asyncio

from index import Indexer

MAX_THREADS = 10

if __name__ == "__main__":
@@ -27,17 +29,24 @@
crawler.max_size = 1000
crawler.add_executor(executor)

indexer = Indexer()
indexer.add_executor(executor)

tokenizer = Tokenizer()
tokenizer.add_executor(executor)

# Add the pipeline elements
crawler.add_next(tokenizer)
crawler.add_next(indexer)
indexer.add_next(tokenizer)

# Start the pipeline
asyncio.run(crawler.process())
except (KeyboardInterrupt, SystemExit):
print("Exiting...")
crawler.save_state()
index_pages()
index_df = access_index()
index_df.to_csv("inverted_index.csv")
con.close()
print("State saved")

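The shutdown branch above leans on the inverted-index helpers from custom_db.py; a rough sketch of what they produce (assuming main.py imports index_pages and access_index from custom_db and pages_df has already been filled):

from custom_db import add_tokens_to_index, index_pages, access_index

add_tokens_to_index(url="https://uni-tuebingen.de/en/", tokenized_text=["tübingen", "univers"])
add_tokens_to_index(url="https://www.tuebingen.de/", tokenized_text=["tübingen", "stadt"])

index_pages()              # fills inverted_index: token -> list of page ids containing it
index_df = access_index()  # DataFrame with columns ['word', 'doc_ids']
index_df.to_csv("inverted_index.csv")
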
2 changes: 1 addition & 1 deletion engine/pipeline.py
@@ -20,7 +20,7 @@ def add_next(self, next_element):
def call_next(self, *args):
futures = []
for element in self.next:
print(f"Calling next for {self.name}: {element.name}")
print(f"{self.name} -> {element.name}")
future = element.executor.submit(element.process, *args)
futures.append(future)
wait(futures) # Wait for all futures to complete