Revert "Improve text processing" #36

Closed · wants to merge 1 commit
30 changes: 12 additions & 18 deletions engine/README.md
@@ -10,45 +10,39 @@

1. **Install Python 3:**

- Download and install the latest version of Python 3 from the official website.

2. **Install virtualenv:**

- Open a terminal and run the following command:

```
pip install virtualenv
```

3. **Create a virtual environment:**

- Create the virtual environment:

```
virtualenv --python=3.11 .venv
```

- Activate the virtual environment:

```
source .venv/bin/activate
```

4. **Install requirements:**

- Ensure you have a requirements.txt file in your project directory.
- Run the following command to install the dependencies:

```
pip install -r requirements.txt
```

- For the text-processing part, we use `spaCy`. You need to download the English model by running the following command:

```
python -m spacy download en_core_web_sm
```

5. **Start developing the project**

## Usage
26 changes: 4 additions & 22 deletions engine/crawl.py
@@ -148,12 +148,6 @@ def user_agent(self):
return self.user_agents[self._page_count % len(self.user_agents)]

async def process(self):
"""
Starts the crawling process.
Is called in the Pipeline.
Returns: None

"""
async with ClientSession(connector=self._connector, timeout=self._timeout) as session:
tasks = set()
while not self.is_shutdown() and len(self.urls_crawled) < self.max_size:
@@ -185,16 +179,7 @@ async def process(self):

print("Crawler finished processing")

async def _process_url_with_semaphore(self, session, url: str):
"""
Wrapper for _process_url that uses a semaphore to limit the number of concurrent requests.
Args:
session: aiohttp ClientSession
url: URL to crawl

Returns: None

"""
async def _process_url_with_semaphore(self, session, url):
async with self._semaphore:
await self._process_url(session, url)

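The deleted docstring above spells out the intent: a semaphore caps how many URLs are fetched concurrently. A minimal standalone sketch of that pattern with asyncio and aiohttp (the URLs and the limit of 5 are illustrative, not taken from the crawler's configuration):

```
import asyncio

from aiohttp import ClientSession


async def fetch(session: ClientSession, semaphore: asyncio.Semaphore, url: str) -> str:
    # The semaphore lets at most N coroutines past this point at the same time.
    async with semaphore:
        async with session.get(url) as response:
            return await response.text()


async def main():
    semaphore = asyncio.Semaphore(5)  # at most 5 concurrent requests
    urls = ["https://example.com/", "https://example.org/"]
    async with ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, semaphore, u) for u in urls))
        print([len(page) for page in pages])


asyncio.run(main())
```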
@@ -205,7 +190,7 @@ async def _process_url(self, session, url: str):
session: aiohttp ClientSession
url: URL to crawl

Returns: None
Returns:
"""
if url in self.currently_crawled:
log_warning(f"Ignoring {url} because it is already being crawled")
@@ -280,15 +265,15 @@ async def _process_url(self, session, url: str):
if not self.is_shutdown():
await self.call_next(soup, url)

async def _handle_links(self, soup: BeautifulSoup, url: str):
async def _handle_links(self, soup, url):
"""
Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the
found list, and not in the to_crawl_set.
Args:
soup: BeautifulSoup object
url: URL of the page

Returns: None
Returns:

"""
for a_tag in soup.find_all("a", href=True):
@@ -298,9 +283,6 @@ async def _handle_links(self, soup: BeautifulSoup, url: str):
if found_link.startswith("#"):
continue

# Strip out the fragment
found_link = found_link.split("#")[0]

# Check if link is relative
if found_link.startswith("/"):
base_url = get_base_url(url)
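For context, the link handling in this hunk comes down to two rules: skip pure fragment links and resolve relative links against the page's base URL (the fragment-stripping step is removed by this revert). A minimal standalone sketch of that normalization using the standard library, with a made-up HTML snippet and helper name:

```
from urllib.parse import urldefrag, urljoin

from bs4 import BeautifulSoup


def extract_links(html: str, page_url: str) -> list[str]:
    soup = BeautifulSoup(html, "html.parser")
    links = []
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.startswith("#"):
            continue  # pure fragment, points back into the same page
        absolute = urljoin(page_url, href)          # resolves relative links
        absolute, _fragment = urldefrag(absolute)   # optionally drop #fragments
        links.append(absolute)
    return links


html = '<a href="/about">About</a> <a href="#top">Top</a> <a href="docs/intro.html#x">Intro</a>'
print(extract_links(html, "https://example.com/index.html"))
# ['https://example.com/about', 'https://example.com/docs/intro.html']
```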
159 changes: 159 additions & 0 deletions engine/custom_tokenizer.py
@@ -0,0 +1,159 @@
import logging
import re

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Note: this module is named custom_tokenizer.py rather than tokenizer.py because the
# latter would conflict with the BeautifulSoup and/or NLTK tokenizer modules.

from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index
from pipeline import PipelineElement
from utils import safe_join, safe_str

WN_LEMMATIZER = nltk.stem.WordNetLemmatizer()
STEMMER = nltk.stem.PorterStemmer()


def remove_punctuations(text):
# Remove punctuations
punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]')
text = punctuations.sub(r'', text)
return text


# Removes HTML tags
def remove_html(text):
html_tag = re.compile(r'<.*?>')
text = html_tag.sub(r'', text)
return text


# Removes URLs (http, https, and www-prefixed)
def remove_url(text):
url_clean = re.compile(r"https?://\S+|www\.\S+")
text = url_clean.sub(r'', text)
return text


# Removes Emojis
def remove_emoji(text):
emoji_clean = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
text = emoji_clean.sub(r'', text)
return text


def tokenize_plain_words(words: str):
return words.split()


def stem(words) -> list[str]:
words = [STEMMER.stem(word) for word in words] # added stemmer
return words


def remove_stopwords(words):
return [word for word in words if word not in stopwords.words("english")]


def lemmatize(words):
words = [WN_LEMMATIZER.lemmatize(word) for word in words]
return words


def tokenize_data(data) -> list[str]:
"""
Tokenizes the input data.
"""
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords,
lemmatize]
for pipe in pipeline:
data = pipe(data)
return data

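A quick usage sketch of the pipeline above (it assumes the NLTK corpora used by remove_stopwords and lemmatize have been downloaded; the sample sentence and expected output are only illustrative):

```
import nltk

# One-time downloads needed by remove_stopwords and lemmatize
nltk.download("stopwords")
nltk.download("wordnet")

print(tokenize_data("The cats are running quickly, aren't they?"))
# e.g. ['The', 'cat', 'running', 'quickly'] -- stop-word matching is case-sensitive
# and the WordNet lemmatizer defaults to the noun part of speech
```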

# Known issue: the TF-IDF vectorizer takes a whole plain text and tokenizes it itself, but we
# already have tokenized data. To avoid converting back and forth between data types and
# complicating things unnecessarily, many steps such as TF-IDF and tokenizing have to happen
# right after crawling. This is not ideal in terms of pipeline design, but otherwise we would
# have to compute and store a lot of things twice.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
def tf_idf_vectorize(data):
"""
Vectorizes the input data using the TF-IDF algorithm.
"""
# Create the vectorizer
# vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english")  # TODO: decide which tokenizer to use here
vectorizer = TfidfVectorizer()
# Vectorize the data
X = vectorizer.fit_transform(data)
return X

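If the corpus has already been tokenized, one way to avoid the double tokenization described in the comment above is to give TfidfVectorizer a pass-through analyzer. A minimal sketch (the helper name is illustrative and not part of this module):

```
from sklearn.feature_extraction.text import TfidfVectorizer


def tf_idf_vectorize_pretokenized(tokenized_docs):
    """Vectorizes documents that are already lists of tokens."""
    # A callable analyzer receives each document unchanged, so the vectorizer does
    # no preprocessing, tokenization, or lowercasing of its own.
    vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
    return vectorizer.fit_transform(tokenized_docs), vectorizer


matrix, vectorizer = tf_idf_vectorize_pretokenized(
    [["web", "crawler"], ["web", "search", "engine"]]
)
print(vectorizer.get_feature_names_out())  # ['crawler' 'engine' 'search' 'web']
```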

def top_30_words(data):
"""
Returns the top 30 words from the input data.
"""
# Create the vectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english")
# Vectorize the data
X = vectorizer.fit_transform(data)
# Get the feature names
feature_names = vectorizer.get_feature_names_out()
print(f"Feature names: {feature_names}")
print(f"X sieht so aus: {X}")
print(f"Shape of X: {X.shape}")
print(f"Summe: {X.sum(axis=0)}")
top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30]
return top_30_words


class Tokenizer(PipelineElement):
def __init__(self):
super().__init__("Tokenizer")

async def process(self, data, link):
"""
Tokenizes the input data.
"""

if data is None:
print(f"Failed to tokenize {link} because the data was empty.")
return

soup = data

# Get the text from the main content
main_content = soup.find("main")
text = main_content.get_text() if main_content is not None else soup.get_text()

# Get the meta description and title
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

# Get the alt texts from the images
img_tags = soup.findAll("img")
alt_texts = [img.get("alt") for img in img_tags]

# Join all the text together
alt_texts_str = safe_join(alt_texts)
description_str = safe_str(description_content)
title_str = safe_str(title_content)
text = f"{text} {alt_texts_str} {description_str} {title_str}".strip()

# Tokenize the text
tokenized_text = tokenize_data(data=text)
add_tokens_to_index(url=link, tokenized_text=tokenized_text)

print(f"Tokenized text for {link}")
3 changes: 1 addition & 2 deletions engine/requirements.txt
@@ -7,8 +7,7 @@ Flask==3.0.3
Flask-Cors==4.0.1
nest-asyncio==1.6.0
nltk==3.8.1
numpy==1.26.4
numpy==2.0.0
pandas==2.2.2
scikit-learn==1.5.1
aiohttp==3.9.5
spacy==3.7.5
15 changes: 0 additions & 15 deletions engine/summarize.py

This file was deleted.
