From c19111b8f269171604d73dc5d3044ca368c592c4 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Wed, 17 Jul 2024 11:04:21 +0200 Subject: [PATCH 1/6] Add tests for the tokenizer --- engine/custom_tokenizer.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/engine/custom_tokenizer.py b/engine/custom_tokenizer.py index 7f45d8f..57bbf5d 100644 --- a/engine/custom_tokenizer.py +++ b/engine/custom_tokenizer.py @@ -157,3 +157,34 @@ async def process(self, data, link): add_tokens_to_index(url=link, tokenized_text=tokenized_text) print(f"Tokenized text for {link}") + + +# Test tokenization + +test_sentences = [ + "Mr. Smith's car is blue-green.", + "The URL is https://www.example.com/path?param=value#fragment", + "She said, 'I can't believe it!'", + "Send an e-mail to john.doe@example.com", + "The price is $19.99 (20% off)", + "I love the movie 'Star Wars: Episode IV - A New Hope'", + "Python 3.9.5 was released on 05/03/2021", + "Call me at +1 (555) 123-4567", + "The equation is E=mc^2", + "Use the #hashtag and @mention", + "I'm running... but I'm tired", + "It's 72°F outside", + "He said: Don't do that!", + "The file name is 'document_v1.2.txt'", + "1,000,000 people can't be wrong", + "The code is: Hello", + "Let's meet at 9:30 AM", + "The password is: P@ssw0rd!", + "I'll have a ham & cheese sandwich", + "The result was 42% (not 50%)", +] + +for sentence in test_sentences: + print(f"Original: {sentence}") + print(f"Tokenized: {tokenize_data(sentence)}") + print() From 9fdc4c1b699df0fed427c02f94ee5548ac70cede Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:14:38 +0200 Subject: [PATCH 2/6] Use spaCy-based text processing --- engine/custom_tokenizer.py | 190 ----------------------------- engine/requirements.txt | 3 +- engine/tokenizer.py | 237 +++++++++++++++++++++++++++++++++++++ 3 files changed, 239 insertions(+), 191 deletions(-) delete mode 100644 engine/custom_tokenizer.py create mode 100644 engine/tokenizer.py diff --git a/engine/custom_tokenizer.py b/engine/custom_tokenizer.py deleted file mode 100644 index 57bbf5d..0000000 --- a/engine/custom_tokenizer.py +++ /dev/null @@ -1,190 +0,0 @@ -import logging - -import nltk as nltk -from sklearn.feature_extraction.text import TfidfVectorizer -import re -# We have to name this file something else then tokenizer.py because otherweise there will be a conflict with the beautifoul soup tokenizer -# and/or nltk tokenizer -from nltk.corpus import stopwords -import re -import nltk - -from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index -from pipeline import PipelineElement -from utils import safe_join, safe_str - -WN_LEMMATIZER = nltk.stem.WordNetLemmatizer() -STEMMER = nltk.stem.PorterStemmer() - - -def remove_punctuations(text): - # Remove punctuations - punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]') - text = punctuations.sub(r'', text) - return text - - -# Removes HTML syntaxes -def remove_html(text): - html_tag = re.compile(r'<.*?>') - text = html_tag.sub(r'', text) - return text - - -# Removes URL data -def remove_url(text): - url_clean = re.compile(r"https://\S+|www\.\S+") - text = url_clean.sub(r'', text) - return text - - -# Removes Emojis -def remove_emoji(text): - emoji_clean = re.compile("[" - u"\U0001F600-\U0001F64F" # emoticons - u"\U0001F300-\U0001F5FF" # symbols & pictographs - u"\U0001F680-\U0001F6FF" # transport & map symbols - u"\U0001F1E0-\U0001F1FF" # flags (iOS) - 
u"\U00002702-\U000027B0" - u"\U000024C2-\U0001F251" - "]+", flags=re.UNICODE) - text = emoji_clean.sub(r'', text) - url_clean = re.compile(r"https://\S+|www\.\S+") - text = url_clean.sub(r'', text) - return text - - -def tokenize_plain_words(words: str): - return words.split() - - -def stem(words) -> list[str]: - words = [STEMMER.stem(word) for word in words] # added stemmer - return words - - -def remove_stopwords(words): - return [word for word in words if word not in stopwords.words("english")] - - -def lemmatize(words): - words = [WN_LEMMATIZER.lemmatize(word) for word in words] - return words - - -def tokenize_data(data) -> list[str]: - """ - Tokenizes the input data. - """ - pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords, - lemmatize] - for pipe in pipeline: - data = pipe(data) - return data - - -# Following problem: TFIDF vectorizer nimmt einen ganzen plain text und tokenized ihn dann selbst. Wir haben aber schon fertige tokenized sachen. -# Damit wir den datentypen nicht hin und her und wir unnötig das leben komolziert machen, müssen wir viele steps wie tf idf iund tokenizing direkt nach dem crawlen machen -# ist zwar in der pipeline nicht ganz so schön aber sonst müssen wir vieles doppelt machen und abspeichern -# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html -def tf_idf_vectorize(data): - """ - Vectorizes the input data using the TF-IDF algorithm. - """ - # Create the vectorizer - # vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english") # hier müssen wir schauen was wir für tokenizer machen - vectorizer = TfidfVectorizer() - # Vectorize the data - X = vectorizer.fit_transform(data) - return X - - -def top_30_words(data): - """ - Returns the top 30 words from the input data. - """ - # Create the vectorizer - vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english") - # Vectorize the data - X = vectorizer.fit_transform(data) - # Get the feature names - feature_names = vectorizer.get_feature_names_out() - print(f"Feature names: {feature_names}") - print(f"X sieht so aus: {X}") - print(f"Shape of X: {X.shape}") - print(f"Summe: {X.sum(axis=0)}") - top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30] - return top_30_words - - -class Tokenizer(PipelineElement): - def __init__(self): - super().__init__("Tokenizer") - - async def process(self, data, link): - """ - Tokenizes the input data. 
- """ - - if data is None: - print(f"Failed to tokenize {link} because the data was empty.") - return - - soup = data - - # Get the text from the main content - main_content = soup.find("main") - text = main_content.get_text() if main_content is not None else soup.get_text() - - # Get the meta description and title - description = soup.find("meta", attrs={"name": "description"}) - description_content = description.get("content") if description is not None else "" - title = soup.find("title") - title_content = title.string if title is not None else "" - - # Get the alt texts from the images - img_tags = soup.findAll("img") - alt_texts = [img.get("alt") for img in img_tags] - - # Join all the text together - alt_texts_str = safe_join(alt_texts) - description_str = safe_str(description_content) - title_str = safe_str(title_content) - text = f"{text} {alt_texts_str} {description_str} {title_str}".strip() - - # Tokenize the text - tokenized_text = tokenize_data(data=text) - add_tokens_to_index(url=link, tokenized_text=tokenized_text) - - print(f"Tokenized text for {link}") - - -# Test tokenization - -test_sentences = [ - "Mr. Smith's car is blue-green.", - "The URL is https://www.example.com/path?param=value#fragment", - "She said, 'I can't believe it!'", - "Send an e-mail to john.doe@example.com", - "The price is $19.99 (20% off)", - "I love the movie 'Star Wars: Episode IV - A New Hope'", - "Python 3.9.5 was released on 05/03/2021", - "Call me at +1 (555) 123-4567", - "The equation is E=mc^2", - "Use the #hashtag and @mention", - "I'm running... but I'm tired", - "It's 72°F outside", - "He said: Don't do that!", - "The file name is 'document_v1.2.txt'", - "1,000,000 people can't be wrong", - "The code is: Hello", - "Let's meet at 9:30 AM", - "The password is: P@ssw0rd!", - "I'll have a ham & cheese sandwich", - "The result was 42% (not 50%)", -] - -for sentence in test_sentences: - print(f"Original: {sentence}") - print(f"Tokenized: {tokenize_data(sentence)}") - print() diff --git a/engine/requirements.txt b/engine/requirements.txt index 0c0e6f9..9cf4449 100644 --- a/engine/requirements.txt +++ b/engine/requirements.txt @@ -7,7 +7,8 @@ Flask==3.0.3 Flask-Cors==4.0.1 nest-asyncio==1.6.0 nltk==3.8.1 -numpy==2.0.0 +numpy==1.26.4 pandas==2.2.2 scikit-learn==1.5.1 aiohttp==3.9.5 +spacy==3.7.5 diff --git a/engine/tokenizer.py b/engine/tokenizer.py new file mode 100644 index 0000000..7836ac7 --- /dev/null +++ b/engine/tokenizer.py @@ -0,0 +1,237 @@ +import re + +import spacy + +from custom_db import add_tokens_to_index +from pipeline import PipelineElement +from utils import safe_join, safe_str + + +# Define regular expressions for preprocessing + +def remove_html(text: str) -> str: + html_tag = re.compile(r'<.*?>') + text = html_tag.sub(r'', text) + return text + + +def remove_emails(text: str) -> str: + email_clean = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+") + text = email_clean.sub(r'', text) + return text + + +def remove_percentages(text: str) -> str: + percentage_clean = re.compile(r"\d+%") + text = percentage_clean.sub(r'', text) + return text + + +def remove_phone_number(text: str) -> str: + # This pattern matches various phone number formats + # Thanks to https://stackoverflow.com/a/56450924 + phone_pattern = re.compile(r''' + ((\+\d{1,2}\s?)?1?\-?\.?\s?\(?\d{3}\)?[\s.-]?)?\d{3}[\s.-]?\d{4} + ''', re.VERBOSE) + + # Replace matched phone numbers with an empty string + text = phone_pattern.sub('', text) + return text + + +def remove_dates(text: str) -> str: + # This pattern 
matches various date formats + # Thanks to https://stackoverflow.com/a/8768241 + date_pattern = re.compile(r''' + ^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$ + ''', re.VERBOSE) + + # Replace matched phone numbers with an empty string + text = date_pattern.sub('', text) + return text + + +def remove_times(text: str) -> str: + # This pattern matches various time formats + time_pattern = re.compile(r''' + \b # Word boundary + (?: + (?:1[0-2]|0?[1-9]) # Hours: 1-12 with optional leading zero + : # Colon separator + (?:[0-5][0-9]) # Minutes: 00-59 + (?: + :(?:[0-5][0-9]) # Optional seconds: 00-59 + (?:\.[0-9]{1,3})? # Optional milliseconds + )? + \s*(?:AM|PM|am|pm|A\.M\.|P\.M\.)? # Optional AM/PM indicator + ) + | + (?:(?:2[0-3]|[01]?[0-9]) # Hours: 00-23 + : # Colon separator + (?:[0-5][0-9]) # Minutes: 00-59 + (?::(?:[0-5][0-9]) # Optional seconds: 00-59 + (?:\.[0-9]{1,3})? # Optional milliseconds + )? + ) + \b # Word boundary + ''', re.VERBOSE | re.IGNORECASE) + + # Replace matched times with an empty string + text = time_pattern.sub('', text) + return text + + +def remove_url(text: str) -> str: + url_clean = re.compile(r"https://\S+|www\.\S+") + text = url_clean.sub(r'', text) + return text + + +# Removes Emojis +def remove_emoji(text: str) -> str: + emoji_clean = re.compile("[" + u"\U0001F600-\U0001F64F" # emoticons + u"\U0001F300-\U0001F5FF" # symbols & pictographs + u"\U0001F680-\U0001F6FF" # transport & map symbols + u"\U0001F1E0-\U0001F1FF" # flags (iOS) + u"\U00002702-\U000027B0" + u"\U000024C2-\U0001F251" + "]+", flags=re.UNICODE) + text = emoji_clean.sub(r'', text) + url_clean = re.compile(r"https://\S+|www\.\S+") + text = url_clean.sub(r'', text) + return text + + +def lower(tokens: list[str]) -> list[str]: + return [word.lower() for word in tokens] + + +def preprocess_text(text: str) -> str: + """Apply all preprocessing steps using regular expressions.""" + text = remove_url(text) + text = remove_html(text) + text = remove_emails(text) + text = remove_times(text) + text = remove_phone_number(text) + text = remove_dates(text) + text = remove_emoji(text) + text = remove_percentages(text) + return text + + +# Load the spaCy model +print("Loading spaCy model...") +nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "senter", "tagger", "attribute_ruler"]) +nlp.add_pipe("merge_entities") +nlp.add_pipe("merge_noun_chunks") + + +def process_text(text: str) -> list[str]: + """Process text using spaCy and custom logic.""" + + # Preprocess the text + text = preprocess_text(text) + + # Process with spaCy + doc = nlp(text) + tokens = [token.text for token in doc if not token.is_stop and not token.is_punct and not token.is_space] + + # Lowercase the tokens + tokens = lower(tokens) + + return tokens + + +class Tokenizer(PipelineElement): + def __init__(self): + super().__init__("Tokenizer") + + async def process(self, data, link): + """ + Tokenizes the input data. 
+ """ + + if data is None: + print(f"Failed to tokenize {link} because the data was empty.") + return + + soup = data + + # Get the text from the main content + main_content = soup.find("main") + text = main_content.get_text() if main_content is not None else soup.get_text() + + # Get the meta description and title + description = soup.find("meta", attrs={"name": "description"}) + description_content = description.get("content") if description is not None else "" + title = soup.find("title") + title_content = title.string if title is not None else "" + + # Get the alt texts from the images + img_tags = soup.findAll("img") + alt_texts = [img.get("alt") for img in img_tags] + + # Join all the text together + alt_texts_str = safe_join(alt_texts) + description_str = safe_str(description_content) + title_str = safe_str(title_content) + text = f"{text} {alt_texts_str} {description_str} {title_str}".strip() + + # Tokenize the text + tokenized_text = process_text(text=text) + add_tokens_to_index(url=link, tokenized_text=tokenized_text) + + print(f"Tokenized text for {link}") + + +# Test tokenization + +test_sentences = [ + "Mr. Smith's car is blue-green.", + # URLs, emails, prices, and code snippets + "The URL is https://www.example.com/path?param=value#fragment", + "She said, 'I can't believe it!'", + "Send an e-mail to john.doe@example.com", + "The price is $19.99 (20% off)", + "I love the movie 'Star Wars: Episode IV - A New Hope'", + "Python 3.9.5 was released on 05/03/2021", + "Call me at +1 (555) 123-4567", + "The equation is E=mc^2", + "Use the #hashtag and @mention", + "I'm running... but I'm tired", + "It's 72°F outside", + "He said: Don't do that!", + "The file name is 'document_v1.2.txt'", + "1,000,000 people can't be wrong", + "The code is: Hello", + "Let's meet at 9:30 AM", + "The password is: P@ssw0rd!", + "I'll have a ham & cheese sandwich", + "The result was 42% (not 50%)", + # Dates and times + "The time is 12:34 PM", + "The time is 12:34:56 PM", + "The date is 2021-05-03", + "The time is 12:34:56.789", + "The time is 12:34:56.789 PM", + "The time is 23:59", + "The time is 23:59:59", + "The time is 23:59:59.999", + "The time is 23:59:59.999 PM", + # Named entities + "I live in New York City", + "I work at Google", + "I visited the Statue of Liberty", + "I went to the United States of America", + "I flew with Lufthansa", + "I bought an iPhone", + "I use Microsoft Windows", + "Apple Inc. is a great company", + "I ate at McDonald's", +] + +for sentence in test_sentences: + print(f"Original: {sentence}") + print(f"Tokenized: {process_text(sentence)}") + print() From 58e1eff9fb0def6f82f61596287a34e555dce067 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:21:55 +0200 Subject: [PATCH 3/6] Add information to install spaCy model --- engine/README.md | 30 ++++++++++++++++++------------ engine/tokenizer.py | 8 +++++++- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/engine/README.md b/engine/README.md index eb596ef..9b64248 100644 --- a/engine/README.md +++ b/engine/README.md @@ -10,11 +10,11 @@ 1. **Install Python 3:** - - Download and install the latest version of Python 3 from the official website. +- Download and install the latest version of Python 3 from the official website. 2. **Install virtualenv:** - - Open a terminal and run the following command: +- Open a terminal and run the following command: ``` pip install virtualenv @@ -22,27 +22,33 @@ 3. 
**Create a virtual environment:** - - Create the virtual environment: +- Create the virtual environment: - ``` - virtualenv --python=3.11 .venv - ``` + ``` + virtualenv --python=3.11 .venv + ``` - - Activate the virtual environment: +- Activate the virtual environment: - ``` - source .venv/bin/activate - ``` + ``` + source .venv/bin/activate + ``` 4. **Install requirements:** - - Ensure you have a requirements.txt file in your project directory. - - Run the following command to install the dependencies: +- Ensure you have a requirements.txt file in your project directory. +- Run the following command to install the dependencies: ``` pip install -r requirements.txt ``` +- For the text-processing part, we use `spaCy`. You need to download the English model by running the following command: + + ``` + python -m spacy download en_core_web_sm + ``` + 5. **Start developing the project** ## Usage diff --git a/engine/tokenizer.py b/engine/tokenizer.py index 7836ac7..b30ae36 100644 --- a/engine/tokenizer.py +++ b/engine/tokenizer.py @@ -6,6 +6,12 @@ from pipeline import PipelineElement from utils import safe_join, safe_str +""" +IMPORTANT: +Make sure you install the spaCy model with: +python -m spacy download en_core_web_sm +""" + # Define regular expressions for preprocessing @@ -122,7 +128,7 @@ def preprocess_text(text: str) -> str: # Load the spaCy model print("Loading spaCy model...") -nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "senter", "tagger", "attribute_ruler"]) +nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "senter"]) nlp.add_pipe("merge_entities") nlp.add_pipe("merge_noun_chunks") From 3d18322afc234552c82b2c0063c978ffb7653e46 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:41:19 +0200 Subject: [PATCH 4/6] Add comments and improve anchor handling --- engine/crawl.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/engine/crawl.py b/engine/crawl.py index f54d796..c3105fd 100644 --- a/engine/crawl.py +++ b/engine/crawl.py @@ -148,6 +148,12 @@ def user_agent(self): return self.user_agents[self._page_count % len(self.user_agents)] async def process(self): + """ + Starts the crawling process. + Is called in the Pipeline. + Returns: None + + """ async with ClientSession(connector=self._connector, timeout=self._timeout) as session: tasks = set() while not self.is_shutdown() and len(self.urls_crawled) < self.max_size: @@ -179,7 +185,16 @@ async def process(self): print("Crawler finished processing") - async def _process_url_with_semaphore(self, session, url): + async def _process_url_with_semaphore(self, session, url: str): + """ + Wrapper for _process_url that uses a semaphore to limit the number of concurrent requests. 
+ Args: + session: aiohttp ClientSession + url: URL to crawl + + Returns: None + + """ async with self._semaphore: await self._process_url(session, url) @@ -190,7 +205,7 @@ async def _process_url(self, session, url: str): session: aiohttp ClientSession url: URL to crawl - Returns: + Returns: None """ if url in self.currently_crawled: log_warning(f"Ignoring {url} because it is already being crawled") @@ -265,7 +280,7 @@ async def _process_url(self, session, url: str): if not self.is_shutdown(): await self.call_next(soup, url) - async def _handle_links(self, soup, url): + async def _handle_links(self, soup: BeautifulSoup, url: str): """ Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the found list, and not in the to_crawl_set. @@ -273,7 +288,7 @@ async def _handle_links(self, soup, url): soup: BeautifulSoup object url: URL of the page - Returns: + Returns: None """ for a_tag in soup.find_all("a", href=True): @@ -283,6 +298,9 @@ async def _handle_links(self, soup, url): if found_link.startswith("#"): continue + # Strip out the fragment + found_link = found_link.split("#")[0] + # Check if link is relative if found_link.startswith("/"): base_url = get_base_url(url) From dfccdc481dba02220d7b099d7287f167fcef43b6 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:41:32 +0200 Subject: [PATCH 5/6] Improve tokenization --- engine/tokenizer.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/engine/tokenizer.py b/engine/tokenizer.py index b30ae36..925996d 100644 --- a/engine/tokenizer.py +++ b/engine/tokenizer.py @@ -27,6 +27,19 @@ def remove_emails(text: str) -> str: return text +def remove_prices(text: str) -> str: + price_pattern = re.compile(r''' + (?:(?:\$|€|£|¥)(?:\s?)) # Currency symbols at the start + \d{1,3}(?:,\d{3})*(?:\.\d{1,2})? # Numbers with optional thousands separators and decimal points + | + \d{1,3}(?:,\d{3})*(?:\.\d{1,2})? # Numbers with optional thousands separators and decimal points + (?:\s?(?:\$|€|£|¥|USD|EUR|GBP|JPY)) # Currency symbols or codes at the end + ''', re.VERBOSE | re.IGNORECASE) + + text = price_pattern.sub('', text) + return text + + def remove_percentages(text: str) -> str: percentage_clean = re.compile(r"\d+%") text = percentage_clean.sub(r'', text) @@ -122,6 +135,7 @@ def preprocess_text(text: str) -> str: text = remove_phone_number(text) text = remove_dates(text) text = remove_emoji(text) + text = remove_prices(text) text = remove_percentages(text) return text @@ -129,8 +143,6 @@ def preprocess_text(text: str) -> str: # Load the spaCy model print("Loading spaCy model...") nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "senter"]) -nlp.add_pipe("merge_entities") -nlp.add_pipe("merge_noun_chunks") def process_text(text: str) -> list[str]: @@ -235,6 +247,14 @@ async def process(self, data, link): "I use Microsoft Windows", "Apple Inc. is a great company", "I ate at McDonald's", + "I study at the Max Planck Institute", + "Tübingen is a nice city", + "Everyday I eat at Salam Burger in Tübingen and I love it", + # Misc + "I ❤️ Python", + "I'm 6'2\" tall", + "I'm 6'2\" tall and I weigh 180 lbs.", + "I'm 6'2\" tall and I weigh 180 lbs. 
I'm 25 years old.", ] for sentence in test_sentences: From bbbed163c74a4f25405b8aba735637d8798ba555 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:41:34 +0200 Subject: [PATCH 6/6] Improve tokenization --- engine/summarize.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 engine/summarize.py diff --git a/engine/summarize.py b/engine/summarize.py new file mode 100644 index 0000000..40373ff --- /dev/null +++ b/engine/summarize.py @@ -0,0 +1,15 @@ +from gensim.summarization import summarize + +# Input text to be summarized +input_text = """ +Your input text goes here. It can be a long paragraph or multiple paragraphs. +""" + +# Generate the summary using TextRank algorithm +summary = summarize(input_text, ratio=0.3) # You can adjust the ratio parameter based on the summary length you desire + +# Output the summary +print("Original Text:") +print(input_text) +print("\nSummary:") +print(summary)