Add InterRanker #28

Merged (18 commits) on Jul 16, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion README.md
@@ -1,2 +1,2 @@
# SuniBrownSnakeGaborone
# Modern Search Engine
A hyper-fast search engine
281 changes: 172 additions & 109 deletions engine/crawl.py

Large diffs are not rendered by default.

22 changes: 16 additions & 6 deletions engine/custom_db.py
@@ -1,3 +1,4 @@
import logging
import os

import pandas as pd
@@ -61,7 +62,7 @@ def add_tokens_to_index(url: str, tokenized_text: list[str]):
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'tokenized_text'] = tokenized_text
else:
print(f"Page with ID {page_id} not found")
logging.info(f"Page with ID {page_id} not found")


def add_title_to_index(url: str, title: str):
@@ -80,7 +81,7 @@ def add_title_to_index(url: str, title: str):
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'title'] = title
else:
print(f"Page with ID {page_id} not found")
logging.info(f"Page with ID {page_id} not found")


def add_snippet_to_index(url, snippet):
@@ -99,7 +100,7 @@ def add_snippet_to_index(url, snippet):
if not pages_df[pages_df['url'] == url].empty:
pages_df.at[pages_df[pages_df['url'] == url].index[0], 'snippet'] = snippet
else:
print(f"Page with URL {url} not found")
logging.info(f"Page with URL {url} not found")


def get_tokens() -> list[list[str]]:
@@ -152,6 +153,12 @@ def save_pages() -> None:
pages_df.to_csv("pages.csv", index=False, header=headers)


def get_doc_by_id(page_id: int):
global pages_df
page = pages_df[pages_df['id'] == page_id]
return page


def load_pages() -> pd.DataFrame:
"""
Load the pages DataFrame from a CSV file.
@@ -162,17 +169,20 @@ def load_pages() -> pd.DataFrame:

# Check if the file exists
if not os.path.exists(f"pages.csv"):
print("No pages found")
logging.info("No pages found")
return pages_df

try:
pages_df = pd.read_csv("pages.csv", header=0)
except pd.errors.EmptyDataError:
print("No pages found")
logging.info("No pages found")
return pages_df

# Convert the tokenized_text column to a list of lists
pages_df['tokenized_text'] = pages_df['tokenized_text'].apply(eval)

print("Loaded pages")
logging.info("Loaded pages")
return pages_df


load_pages()
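
The custom_db.py changes swap print for logging.info, which only produces output if the application configures a handler at INFO level. A minimal sketch of the setup this assumes (the basicConfig call and its placement at program start are an assumption, not part of this PR):

```python
import logging

# Assumed one-time setup at program start; without a handler at INFO level,
# the logging.info calls introduced in this PR emit nothing visible.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

logging.info("Loaded pages")  # now written to stderr by the root logger
```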
134 changes: 134 additions & 0 deletions engine/custom_rank.py
@@ -0,0 +1,134 @@
import pandas as pd

from custom_db import get_doc_by_id
from custom_tokenizer import tokenize_data
from sklearn.feature_extraction.text import TfidfVectorizer


def preprocess_query(Q):
tokenized_query = tokenize_data(Q)
return tokenized_query


def find_intersection_2(Q):
df_inverted = pd.read_csv("inverted_index.csv", sep=",", index_col=1)
df_inverted.drop(columns=["Unnamed: 0"], inplace=True)

# df_inverted.set_index("word", inplace=True)
print(df_inverted.columns)
print(df_inverted.head())
tokenized_query = preprocess_query(Q)
print(tokenized_query)
result = []
for token in tokenized_query:
if token in df_inverted.word.values:
print(f"Found token: {token}")
doc_ids = df_inverted[df_inverted["word"] == token]["doc_ids"].apply(eval)
print(f"It has {len(doc_ids)} doc_ids")
result.append(doc_ids)
# print(f"result: {result}")
# find intersection of all lists in result
intersection = set(result[0]).intersection(*result)
return intersection


def find_documents(Q) -> set:
df_inverted = pd.read_csv("inverted_index.csv", converters={'doc_ids': pd.eval})
df_inverted.set_index("word", inplace=True)
df_inverted.drop(columns=["Unnamed: 0"], inplace=True)

print(df_inverted.head())
tokenized_query = preprocess_query(Q)
print(df_inverted.index.values)
result = []
for token in tokenized_query:
if token in df_inverted.index.values:
print(f"Found token: {token}")
doc_ids = df_inverted.loc[token].doc_ids
print(f"It has {len(doc_ids)} doc_ids")
result.append(doc_ids)
# find intersection of all lists in result
intersection = set(result[0]).intersection(*result)
union = set(result[0]).union(*result)
if len(intersection) < 2:
print("No intersection found")
return union
return intersection


def dummy_tokenizer(tokens: list[str]):
return tokens


def generate_tf_idf_matrix(path):
df = pd.read_csv(path, converters={'tokenized_text': pd.eval})
df_text = df["tokenized_text"]
# create list of lists containing the tokenized text
tokenized_text = []
print(type(df_text.values))
vectorizer = TfidfVectorizer(tokenizer=dummy_tokenizer, preprocessor=dummy_tokenizer)
X = vectorizer.fit_transform(df_text.values)
features = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
print(features)

return features


def rank_documents(subset_D, Q, X):
# Filter the DataFrame to include only the documents in subset_D
subset_adj = [x - 1 for x in subset_D]
filtered_X = X.loc[list(subset_adj)]  # select the rows for the candidate documents

# Ensure Q is a list of query terms
query_terms = preprocess_query(Q)
query_terms_in_X = [term for term in query_terms if term in X.columns]
# Filter the DataFrame to include only the columns corresponding to the query terms
if not query_terms_in_X:
print("No query terms found in the TF-IDF matrix.")
return pd.DataFrame()
filtered_X_query_terms = filtered_X[query_terms_in_X]  # select the whole columns for the query terms

# Sum the TF-IDF values for each document
filtered_X['sum_tfidf'] = filtered_X_query_terms.sum(axis=1)

# Rank the documents by the summed TF-IDF values in descending order
ranked_docs = filtered_X.sort_values(by='sum_tfidf', ascending=False)

# Map document ID to document with title, URL, and snippet
ranking = []
for index, ranked_doc in ranked_docs.iterrows():
score = ranked_doc['sum_tfidf']

doc = get_doc_by_id(index)
title = str(doc['title'].values[0]) if not doc.empty else ""
url = str(doc['url'].values[0]) if not doc.empty else ""
snippet = str(doc['snippet'].values[0]) if not doc.empty else ""

result = {
"id": index,
"title": title,
"url": url,
"description": snippet if snippet else "",
"summary": "",
"score": score
}
ranking.append(result)

return ranking


# query = "food and drink"
# docs = find_documents(query)
X = generate_tf_idf_matrix('pages.csv')


# print(f"Found {len(docs)} documents, they look like this: {docs}")
# print(f"Result: {generate_tf_idf_matrix('pages.csv')}")

# ranked_docs = rank_documents(docs, query, X)
# print(f"Best 20 docs: {ranked_docs[:20]}")


def rank(query):
docs = find_documents(query)
return rank_documents(docs, query, X)
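
Taken together, custom_rank.py finds candidate documents by intersecting the per-token posting lists from inverted_index.csv (falling back to the union when the intersection is too small) and then ranks them by the sum of the TF-IDF weights of the query terms. A condensed, self-contained sketch of that scoring idea on a toy corpus (all names and data below are illustrative and not from the PR; the real code additionally shifts doc IDs by one to align them with the 0-based rows of the TF-IDF matrix):

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus standing in for pages.csv: doc ID -> pre-tokenized text.
docs = {1: ["food", "drink", "recipe"], 2: ["drink", "bar"], 3: ["food", "market"]}

def identity(tokens):
    return tokens  # documents are already tokenized, so skip re-tokenizing

vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=identity, token_pattern=None)
X = pd.DataFrame(
    vectorizer.fit_transform(docs.values()).toarray(),
    index=list(docs.keys()),
    columns=vectorizer.get_feature_names_out(),
)

query = ["food", "drink"]

# Candidate set: documents containing every query term, or every match if the
# intersection is too small (mirrors the union fallback in find_documents).
postings = [{doc_id for doc_id, toks in docs.items() if term in toks} for term in query]
intersection = set.intersection(*postings)
candidates = intersection if len(intersection) >= 2 else set.union(*postings)

# Score: sum of the TF-IDF weights of the query terms present in the matrix.
terms = [t for t in query if t in X.columns]
scores = X.loc[sorted(candidates), terms].sum(axis=1).sort_values(ascending=False)
print(scores)  # doc 1 ranks first: it is the only document containing both terms
```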
66 changes: 49 additions & 17 deletions engine/custom_tokenizer.py
@@ -1,3 +1,5 @@
import logging

import nltk as nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import re
@@ -9,11 +11,16 @@

from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index
from pipeline import PipelineElement
from utils import safe_join, safe_str

WN_LEMMATIZER = nltk.stem.WordNetLemmatizer()
STEMMER = nltk.stem.PorterStemmer()


def remove_punctuations(text):
punct_tag = re.compile(r'[^\w\s]')
text = punct_tag.sub(r'', text)
# Remove punctuations
punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]')
text = punctuations.sub(r'', text)
return text


@@ -51,31 +58,41 @@ def tokenize_plain_words(words: str):
return words.split()


def stem_and_remove_stopwords(words) -> list[str]:
# use english porterStemmer
def stem(words) -> list[str]:
words = [STEMMER.stem(word) for word in words] # added stemmer
return words


stemmer = nltk.stem.porter.PorterStemmer()
words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer
def remove_stopwords(words):
return [word for word in words if word not in stopwords.words("english")]


def lemmatize(words):
words = [WN_LEMMATIZER.lemmatize(word) for word in words]
return words


def tokenize_data(data) -> list[str]:
"""
Tokenizes the input data.
"""
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words,
stem_and_remove_stopwords]
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords,
lemmatize]
for pipe in pipeline:
data = pipe(data)
return data


# Known problem: the TF-IDF vectorizer takes a whole plain text and tokenizes it itself, but we already have tokenized data.
# To avoid converting data types back and forth and complicating things unnecessarily, steps like TF-IDF and tokenizing have to happen right after crawling.
# That is not the cleanest fit for the pipeline, but otherwise a lot of work would have to be done and stored twice.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
def tf_idf_vectorize(data):
"""
Vectorizes the input data using the TF-IDF algorithm.
"""
# Create the vectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english")
# vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english")  # still need to decide which tokenizer to use here
vectorizer = TfidfVectorizer()
# Vectorize the data
X = vectorizer.fit_transform(data)
return X
@@ -91,10 +108,10 @@ def top_30_words(data):
X = vectorizer.fit_transform(data)
# Get the feature names
feature_names = vectorizer.get_feature_names_out()
print(f"Feature names: {feature_names}")
print(f"X sieht so aus: {X}")
print(f"Shape of X: {X.shape}")
print(f"Summe: {X.sum(axis=0)}")
logging.info(f"Feature names: {feature_names}")
logging.info(f"X sieht so aus: {X}")
logging.info(f"Shape of X: {X.shape}")
logging.info(f"Summe: {X.sum(axis=0)}")
top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30]
return top_30_words

@@ -108,18 +125,33 @@ async def process(self, data, link):
Tokenizes the input data.
"""

if data is None:
logging.info(f"Failed to tokenize {link} because the data was empty.")
return

soup = data

# Get the text from the page
text = soup.get_text()
img_tags = soup.findAll("img")

# Get the meta description and title
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

# Get the alt texts from the images
img_tags = soup.findAll("img")
alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(description_content) + " " + str(title_content)

# Join all the text together
alt_texts_str = safe_join(alt_texts)
description_str = safe_str(description_content)
title_str = safe_str(title_content)
text = f"{text} {alt_texts_str} {description_str} {title_str}".strip()

# Tokenize the text
tokenized_text = tokenize_data(data=text)
add_tokens_to_index(url=link, tokenized_text=tokenized_text)

print(f"Tokenized text for {link}")
logging.info(f"Tokenized text for {link}")
8 changes: 7 additions & 1 deletion engine/index.py
@@ -1,3 +1,5 @@
import logging

from custom_db import upsert_page_to_index, add_title_to_index, add_snippet_to_index, load_pages
from pipeline import PipelineElement

@@ -17,6 +19,10 @@ async def process(self, data, link):
Indexes the input data.
"""

if data is None:
logging.info(f"Failed to index {link} because the data was empty.")
return

soup = data

# Title
@@ -32,7 +38,7 @@ async def process(self, data, link):
add_title_to_index(url=link, title=title_content)
add_snippet_to_index(url=link, snippet=description_content)

print(f"Indexed {link}")
logging.info(f"Indexed {link}")
if not self.is_shutdown():
await self.call_next(soup, link)
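
Both the tokenizer and the indexer pull the same fields out of the parsed page: the <title> text and the meta description, with empty strings when a tag is missing. A standalone sketch of that extraction with BeautifulSoup (the helper name and sample HTML are made up for illustration):

```python
from bs4 import BeautifulSoup

def extract_title_and_snippet(html: str) -> tuple[str, str]:
    """Pull the <title> text and meta description, tolerating missing tags."""
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find("title")
    description = soup.find("meta", attrs={"name": "description"})
    title_text = title.string if title is not None and title.string else ""
    description_text = description.get("content", "") if description is not None else ""
    return title_text.strip(), description_text.strip()

print(extract_title_and_snippet(
    '<html><head><title>Modern Search Engine</title>'
    '<meta name="description" content="A hyper-fast search engine"></head></html>'
))
# -> ('Modern Search Engine', 'A hyper-fast search engine')
```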
