From dfccdc481dba02220d7b099d7287f167fcef43b6 Mon Sep 17 00:00:00 2001
From: am9zZWY <46693545+am9zZWY@users.noreply.github.com>
Date: Wed, 17 Jul 2024 13:41:32 +0200
Subject: [PATCH] Improve tokenization

---
 engine/tokenizer.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/engine/tokenizer.py b/engine/tokenizer.py
index b30ae36..925996d 100644
--- a/engine/tokenizer.py
+++ b/engine/tokenizer.py
@@ -27,6 +27,19 @@ def remove_emails(text: str) -> str:
     return text
 
 
+def remove_prices(text: str) -> str:
+    """Return *text* with prices ("$1,000.50", "€ 20", "1000 USD", "5£") removed."""
+    price_pattern = re.compile(r'''
+        (?:\$|€|£|¥)\s?                                    # Currency symbol at the start
+        (?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{1,2})?           # Comma-grouped or plain digits, optional decimals
+        |
+        (?<!\d)(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{1,2})?    # Same amount, never starting inside a digit run
+        \s?(?:\$|€|£|¥|(?:USD|EUR|GBP|JPY)\b)              # Currency symbol, or a whole-word code, at the end
+    ''', re.VERBOSE | re.IGNORECASE)
+    # \d+ keeps "$1000" from leaving a stray "0"; \b keeps "10 eureka" from losing "10 eur".
+    return price_pattern.sub('', text)
+
+
 def remove_percentages(text: str) -> str:
     percentage_clean = re.compile(r"\d+%")
     text = percentage_clean.sub(r'', text)
@@ -122,6 +135,7 @@ def preprocess_text(text: str) -> str:
     text = remove_phone_number(text)
     text = remove_dates(text)
     text = remove_emoji(text)
+    text = remove_prices(text)
     text = remove_percentages(text)
     return text
 
@@ -129,8 +143,6 @@ def preprocess_text(text: str) -> str:
 # Load the spaCy model
 print("Loading spaCy model...")
 nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "senter"])
-nlp.add_pipe("merge_entities")
-nlp.add_pipe("merge_noun_chunks")
 
 
 def process_text(text: str) -> list[str]:
@@ -235,6 +247,14 @@ async def process(self, data, link):
     "I use Microsoft Windows",
     "Apple Inc. is a great company",
     "I ate at McDonald's",
+    "I study at the Max Planck Institute",
+    "Tübingen is a nice city",
+    "Everyday I eat at Salam Burger in Tübingen and I love it",
+    # Misc
+    "I ❤️ Python",
+    "I'm 6'2\" tall",
+    "I'm 6'2\" tall and I weigh 180 lbs.",
+    "I'm 6'2\" tall and I weigh 180 lbs. I'm 25 years old.",
 ]
 
 for sentence in test_sentences: