Merge pull request #29 from am9zZWY/josef-crawler-tokenizer-update

Update crawler and tokenizer

am9zZWY authored Jul 16, 2024
2 parents ba68a0f + b3f4b89 commit 0f4f4ed
Showing 10 changed files with 311 additions and 174 deletions.
266 changes: 166 additions & 100 deletions engine/crawl.py

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions engine/custom_db.py
@@ -1,3 +1,4 @@
import logging
import os

import pandas as pd
@@ -61,7 +62,7 @@ def add_tokens_to_index(url: str, tokenized_text: list[str]):
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'tokenized_text'] = tokenized_text
else:
print(f"Page with ID {page_id} not found")
logging.info(f"Page with ID {page_id} not found")


def add_title_to_index(url: str, title: str):
@@ -80,7 +81,7 @@ def add_title_to_index(url: str, title: str):
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'title'] = title
else:
print(f"Page with ID {page_id} not found")
logging.info(f"Page with ID {page_id} not found")


def add_snippet_to_index(url, snippet):
@@ -99,7 +100,7 @@ def add_snippet_to_index(url, snippet):
if not pages_df[pages_df['url'] == url].empty:
pages_df.at[pages_df[pages_df['url'] == url].index[0], 'snippet'] = snippet
else:
print(f"Page with URL {url} not found")
logging.info(f"Page with URL {url} not found")


def get_tokens() -> list[list[str]]:
@@ -162,17 +163,17 @@ def load_pages() -> pd.DataFrame:

# Check if the file exists
if not os.path.exists(f"pages.csv"):
print("No pages found")
logging.info("No pages found")
return pages_df

try:
pages_df = pd.read_csv("pages.csv", header=0)
except pd.errors.EmptyDataError:
print("No pages found")
logging.info("No pages found")
return pages_df

# Convert the tokenized_text column to a list of lists
pages_df['tokenized_text'] = pages_df['tokenized_text'].apply(eval)

print("Loaded pages")
logging.info("Loaded pages")
return pages_df
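
The load_pages() change above keeps the eval-based round trip for the tokenized_text column. A minimal sketch of that round trip, with a made-up sample row (file name and column names follow custom_db.py):

import pandas as pd

# pages.csv stores the token list as its string repr ...
pages_df = pd.DataFrame([{"id": 1, "url": "https://uni-tuebingen.de",
                          "tokenized_text": ["tübingen", "university"]}])
pages_df.to_csv("pages.csv", index=False)

# ... so reading it back requires turning the string back into a list
loaded = pd.read_csv("pages.csv", header=0)
loaded["tokenized_text"] = loaded["tokenized_text"].apply(eval)
print(loaded["tokenized_text"][0])  # ['tübingen', 'university']
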
58 changes: 43 additions & 15 deletions engine/custom_tokenizer.py
@@ -1,3 +1,5 @@
import logging

import nltk as nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import re
@@ -9,11 +11,16 @@

from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index
from pipeline import PipelineElement
from utils import safe_join, safe_str

WN_LEMMATIZER = nltk.stem.WordNetLemmatizer()
STEMMER = nltk.stem.PorterStemmer()


def remove_punctuations(text):
punct_tag = re.compile(r'[^\w\s]')
text = punct_tag.sub(r'', text)
# Remove punctuations
punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]')
text = punctuations.sub(r'', text)
return text


@@ -51,20 +58,26 @@ def tokenize_plain_words(words: str):
return words.split()


def stem_and_remove_stopwords(words) -> list[str]:
# use english porterStemmer
def stem(words) -> list[str]:
words = [STEMMER.stem(word) for word in words] # added stemmer
return words


def remove_stopwords(words):
return [word for word in words if word not in stopwords.words("english")]


stemmer = nltk.stem.porter.PorterStemmer()
words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer
def lemmatize(words):
words = [WN_LEMMATIZER.lemmatize(word) for word in words]
return words


def tokenize_data(data) -> list[str]:
"""
Tokenizes the input data.
"""
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words,
stem_and_remove_stopwords]
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords,
lemmatize]
for pipe in pipeline:
data = pipe(data)
return data
@@ -91,10 +104,10 @@ def top_30_words(data):
X = vectorizer.fit_transform(data)
# Get the feature names
feature_names = vectorizer.get_feature_names_out()
print(f"Feature names: {feature_names}")
print(f"X sieht so aus: {X}")
print(f"Shape of X: {X.shape}")
print(f"Summe: {X.sum(axis=0)}")
logging.info(f"Feature names: {feature_names}")
logging.info(f"X sieht so aus: {X}")
logging.info(f"Shape of X: {X.shape}")
logging.info(f"Summe: {X.sum(axis=0)}")
top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30]
return top_30_words

@@ -108,18 +121,33 @@ async def process(self, data, link):
Tokenizes the input data.
"""

if data is None:
logging.info(f"Failed to tokenize {link} because the data was empty.")
return

soup = data

# Get the text from the page
text = soup.get_text()
img_tags = soup.findAll("img")

# Get the meta description and title
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

# Get the alt texts from the images
img_tags = soup.findAll("img")
alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(description_content) + " " + str(title_content)

# Join all the text together
alt_texts_str = safe_join(alt_texts)
description_str = safe_str(description_content)
title_str = safe_str(title_content)
text = f"{text} {alt_texts_str} {description_str} {title_str}".strip()

# Tokenize the text
tokenized_text = tokenize_data(data=text)
add_tokens_to_index(url=link, tokenized_text=tokenized_text)

print(f"Tokenized text for {link}")
logging.info(f"Tokenized text for {link}")
8 changes: 7 additions & 1 deletion engine/index.py
@@ -1,3 +1,5 @@
import logging

from custom_db import upsert_page_to_index, add_title_to_index, add_snippet_to_index, load_pages
from pipeline import PipelineElement

@@ -17,6 +19,10 @@ async def process(self, data, link):
Indexes the input data.
"""

if data is None:
logging.info(f"Failed to index {link} because the data was empty.")
return

soup = data

# Title
@@ -32,7 +38,7 @@ async def process(self, data, link):
add_title_to_index(url=link, title=title_content)
add_snippet_to_index(url=link, snippet=description_content)

print(f"Indexed {link}")
logging.info(f"Indexed {link}")
if not self.is_shutdown():
await self.call_next(soup, link)

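
The Indexer now bails out early on empty data before pulling the title and meta description from the soup. A minimal sketch of that extraction step with made-up HTML (the find/get pattern mirrors the tokenizer hunk above; add_title_to_index and add_snippet_to_index are the calls shown in the diff):

from bs4 import BeautifulSoup

html = ("<html><head><title>Tübingen</title>"
        "<meta name='description' content='A university town.'></head></html>")
soup = BeautifulSoup(html, "html.parser")

# Title and meta description, with fallbacks for missing tags
title = soup.find("title")
title_content = title.string if title is not None else ""
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""

print(title_content, "|", description_content)  # Tübingen | A university town.
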
21 changes: 14 additions & 7 deletions engine/main.py
@@ -7,7 +7,9 @@
from concurrent.futures import ThreadPoolExecutor
import asyncio
import nest_asyncio

import signal
# Logging
import logging
# Database
import duckdb
# Pipeline
@@ -16,9 +18,14 @@
from custom_tokenizer import Tokenizer
from index import Indexer

# Constants
MAX_THREADS = 10
# Logging setup
logging.basicConfig(
format='%(asctime)s %(levelname)-8s %(message)s',
level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S')

# Threading
MAX_THREADS = 10
# Patch asyncio to allow nested event loops
nest_asyncio.apply()

@@ -39,7 +46,7 @@


def signal_handler(signum, frame):
print("Interrupt received, shutting down... Please wait")
logging.info("Interrupt received, shutting down... Please wait. This may take a few seconds.")
for element in [crawler, indexer, tokenizer]:
element.shutdown()

@@ -60,7 +67,7 @@ async def main():
try:
await crawler.process()
except Exception as e:
print(f"An error occurred: {e}")
logging.info(f"An error occurred: {e}")
finally:
# Ensure states are saved even if an exception occurs
for element in [crawler, indexer, tokenizer]:
@@ -70,7 +77,7 @@ async def main():
index_df = access_index()
index_df.to_csv("inverted_index.csv")
con.close()
print("State saved")
logging.info("State saved")

# Save the state+
for element in [crawler, indexer, tokenizer]:
@@ -80,7 +87,7 @@ async def main():
index_df = access_index()
index_df.to_csv("inverted_index.csv")
con.close()
print("State saved")
logging.info("State saved")


if __name__ == "__main__":
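
main.py now configures the root logger once and logs from the signal handler instead of printing. A minimal sketch of that wiring; the signal.signal(...) registration and the simplified handler body are assumptions, since in the real module the handler flags crawler, indexer, and tokenizer for shutdown:

import logging
import signal
import time

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

shutting_down = False

def signal_handler(signum, frame):
    # the real handler calls element.shutdown() on each pipeline element
    global shutting_down
    logging.info("Interrupt received, shutting down... Please wait.")
    shutting_down = True

signal.signal(signal.SIGINT, signal_handler)

logging.info("Running; press Ctrl+C to trigger the handler.")
while not shutting_down:
    time.sleep(0.1)
logging.info("State saved")
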
7 changes: 4 additions & 3 deletions engine/pipeline.py
@@ -1,4 +1,5 @@
import asyncio
import logging
import threading


@@ -9,7 +10,7 @@ def __init__(self, name):
self.executor = None
self.tasks = []
self.shutdown_flag = threading.Event()
print(f"Initialized {self.name}")
logging.info(f"Initialized {self.name}")

def add_executor(self, executor):
self.executor = executor
@@ -25,10 +26,10 @@ def add_next(self, next_element):

async def call_next(self, *args):
if not self.next:
print(f"No next elements for {self.name}")
logging.info(f"No next elements for {self.name}")
return # No next elements to process

print(f"Processing next elements for {self.name}")
logging.info(f"Processing next elements for {self.name}")
tasks = []
for element in self.next:
if asyncio.iscoroutinefunction(element.process):
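
For context, PipelineElement.call_next fans work out to the downstream elements registered with add_next. A minimal, self-contained sketch of that chaining (the simplified class, the assumed add_next body, and the element names from main.py are illustrative, not the module's actual implementation):

import asyncio
import logging

logging.basicConfig(level=logging.INFO, format='%(message)s')

class PipelineElement:
    def __init__(self, name):
        self.name = name
        self.next = []
        logging.info(f"Initialized {self.name}")

    def add_next(self, next_element):
        # assumed to simply append to the list of downstream elements
        self.next.append(next_element)

    async def process(self, *args):
        logging.info(f"{self.name} received {len(args)} argument(s)")
        await self.call_next(*args)

    async def call_next(self, *args):
        if not self.next:
            logging.info(f"No next elements for {self.name}")
            return
        logging.info(f"Processing next elements for {self.name}")
        await asyncio.gather(*(element.process(*args) for element in self.next))

crawler, tokenizer, indexer = (PipelineElement(n) for n in ("Crawler", "Tokenizer", "Indexer"))
crawler.add_next(tokenizer)
tokenizer.add_next(indexer)
asyncio.run(crawler.process("<html>...</html>", "https://uni-tuebingen.de"))
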
82 changes: 41 additions & 41 deletions engine/queries.txt
@@ -1,41 +1,41 @@
tübingen
tübingen university
tübingen attractions
food and drinks
tübingen weather
tübingen hotels
tübingen traditional food
tübingen coffee shops
tübingen nightlife spots
tübingen museums
tübingen castles
tübingen outdoor activities
tübingen nightlife
tübingen markets
tübingen shopping centers
tübingen local products
Best cafes in Tübingen for students
Upcoming events at the University of Tübingen
History of Tübingen's old town
Popular hiking trails near Tübingen
Tübingen student housing options
Vegan and vegetarian restaurants in Tübingen
Cultural activities in Tübingen
Tübingen public transportation map
University of Tübingen research departments
Tübingen nightlife spots
Bookstores in Tübingen
Tübingen local farmers' markets
Tübingen weather forecast
Student discounts in Tübingen
Tübingen library hours and services
Language exchange programs in Tübingen
Top tourist attractions in Tübingen
Cycling routes in Tübingen
Tübingen sports clubs and gyms
Tübingen local festivals and fairs
Best places to study in Tübingen
Tübingen historical landmarks
Tübingen university application process
Local art galleries in Tübingen
Tübingen second-hand stores
1 tübingen
2 tübingen university
3 tübingen attractions
4 food and drinks
5 tübingen weather
6 tübingen hotels
7 tübingen traditional food
8 tübingen coffee shops
9 tübingen nightlife spots
10 tübingen museums
11 tübingen castles
12 tübingen outdoor activities
13 tübingen nightlife
14 tübingen markets
15 tübingen shopping centers
16 tübingen local products
17 Best cafes in Tübingen for students
18 Upcoming events at the University of Tübingen
19 History of Tübingen's old town
20 Popular hiking trails near Tübingen
21 Tübingen student housing options
22 Vegan and vegetarian restaurants in Tübingen
23 Cultural activities in Tübingen
24 Tübingen public transportation map
25 University of Tübingen research departments
26 Tübingen nightlife spots
27 Bookstores in Tübingen
28 Tübingen local farmers' markets
29 Tübingen weather forecast
30 Student discounts in Tübingen
31 Tübingen library hours and services
32 Language exchange programs in Tübingen
33 Top tourist attractions in Tübingen
34 Cycling routes in Tübingen
35 Tübingen sports clubs and gyms
36 Tübingen local festivals and fairs
37 Best places to study in Tübingen
38 Tübingen historical landmarks
39 Tübingen university application process
40 Local art galleries in Tübingen
41 Tübingen second-hand stores
1 change: 1 addition & 0 deletions engine/requirements.txt
@@ -1,6 +1,7 @@
# Automatically generated by https://github.com/damnever/pigar.

beautifulsoup4==4.12.3
certifi==2024.7.4
duckdb==1.0.0
eld==1.0.6
Flask==3.0.3
3 changes: 2 additions & 1 deletion engine/test.py
@@ -1,8 +1,9 @@
# file to test the written functions
import logging

from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words

CUSTOM_TEXT = "Lorem Ipsum is simply dummy text" + " " + " \n "+ "of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."

top_30_words = top_30_words([CUSTOM_TEXT])
print(top_30_words)
logging.info(top_30_words)
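
Note that test.py now routes its output through logging.info; with the root logger at its default WARNING level nothing is printed, so running the script standalone presumably needs a setup along these lines:

import logging

# assumed standalone setup so the logging.info call in test.py is visible
logging.basicConfig(level=logging.INFO)
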