diff --git a/engine/README.md b/engine/README.md index eb596ef..9b64248 100644 --- a/engine/README.md +++ b/engine/README.md @@ -10,11 +10,11 @@ 1. **Install Python 3:** - - Download and install the latest version of Python 3 from the official website. +- Download and install the latest version of Python 3 from the official website. 2. **Install virtualenv:** - - Open a terminal and run the following command: +- Open a terminal and run the following command: ``` pip install virtualenv @@ -22,27 +22,33 @@ 3. **Create a virtual environment:** - - Create the virtual environment: +- Create the virtual environment: - ``` - virtualenv --python=3.11 .venv - ``` + ``` + virtualenv --python=3.11 .venv + ``` - - Activate the virtual environment: +- Activate the virtual environment: - ``` - source .venv/bin/activate - ``` + ``` + source .venv/bin/activate + ``` 4. **Install requirements:** - - Ensure you have a requirements.txt file in your project directory. - - Run the following command to install the dependencies: +- Ensure you have a requirements.txt file in your project directory. +- Run the following command to install the dependencies: ``` pip install -r requirements.txt ``` +- For the text-processing part, we use `spaCy`. You need to download the English model by running the following command: + + ``` + python -m spacy download en_core_web_sm + ``` + 5. **Start developing the project** ## Usage diff --git a/engine/crawl.py b/engine/crawl.py index f54d796..c3105fd 100644 --- a/engine/crawl.py +++ b/engine/crawl.py @@ -148,6 +148,12 @@ def user_agent(self): return self.user_agents[self._page_count % len(self.user_agents)] async def process(self): + """ + Starts the crawling process. + Is called in the Pipeline. 
+ Returns: None + + """ async with ClientSession(connector=self._connector, timeout=self._timeout) as session: tasks = set() while not self.is_shutdown() and len(self.urls_crawled) < self.max_size: @@ -179,7 +185,16 @@ async def process(self): print("Crawler finished processing") - async def _process_url_with_semaphore(self, session, url): + async def _process_url_with_semaphore(self, session, url: str): + """ + Wrapper for _process_url that uses a semaphore to limit the number of concurrent requests. + Args: + session: aiohttp ClientSession + url: URL to crawl + + Returns: None + + """ async with self._semaphore: await self._process_url(session, url) @@ -190,7 +205,7 @@ async def _process_url(self, session, url: str): session: aiohttp ClientSession url: URL to crawl - Returns: + Returns: None """ if url in self.currently_crawled: log_warning(f"Ignoring {url} because it is already being crawled") @@ -265,7 +280,7 @@ async def _process_url(self, session, url: str): if not self.is_shutdown(): await self.call_next(soup, url) - async def _handle_links(self, soup, url): + async def _handle_links(self, soup: BeautifulSoup, url: str): """ Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the found list, and not in the to_crawl_set. 
@@ -273,7 +288,7 @@ async def _handle_links(self, soup, url): soup: BeautifulSoup object url: URL of the page - Returns: + Returns: None """ for a_tag in soup.find_all("a", href=True): @@ -283,6 +298,9 @@ async def _handle_links(self, soup, url): if found_link.startswith("#"): continue + # Strip out the fragment + found_link = found_link.split("#")[0] + # Check if link is relative if found_link.startswith("/"): base_url = get_base_url(url) diff --git a/engine/custom_tokenizer.py b/engine/custom_tokenizer.py deleted file mode 100644 index 7f45d8f..0000000 --- a/engine/custom_tokenizer.py +++ /dev/null @@ -1,159 +0,0 @@ -import logging - -import nltk as nltk -from sklearn.feature_extraction.text import TfidfVectorizer -import re -# We have to name this file something else then tokenizer.py because otherweise there will be a conflict with the beautifoul soup tokenizer -# and/or nltk tokenizer -from nltk.corpus import stopwords -import re -import nltk - -from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index -from pipeline import PipelineElement -from utils import safe_join, safe_str - -WN_LEMMATIZER = nltk.stem.WordNetLemmatizer() -STEMMER = nltk.stem.PorterStemmer() - - -def remove_punctuations(text): - # Remove punctuations - punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]') - text = punctuations.sub(r'', text) - return text - - -# Removes HTML syntaxes -def remove_html(text): - html_tag = re.compile(r'<.*?>') - text = html_tag.sub(r'', text) - return text - - -# Removes URL data -def remove_url(text): - url_clean = re.compile(r"https://\S+|www\.\S+") - text = url_clean.sub(r'', text) - return text - - -# Removes Emojis -def remove_emoji(text): - emoji_clean = re.compile("[" - u"\U0001F600-\U0001F64F" # emoticons - u"\U0001F300-\U0001F5FF" # symbols & pictographs - u"\U0001F680-\U0001F6FF" # transport & map symbols - u"\U0001F1E0-\U0001F1FF" # flags (iOS) - u"\U00002702-\U000027B0" - u"\U000024C2-\U0001F251" - "]+", 
flags=re.UNICODE) - text = emoji_clean.sub(r'', text) - url_clean = re.compile(r"https://\S+|www\.\S+") - text = url_clean.sub(r'', text) - return text - - -def tokenize_plain_words(words: str): - return words.split() - - -def stem(words) -> list[str]: - words = [STEMMER.stem(word) for word in words] # added stemmer - return words - - -def remove_stopwords(words): - return [word for word in words if word not in stopwords.words("english")] - - -def lemmatize(words): - words = [WN_LEMMATIZER.lemmatize(word) for word in words] - return words - - -def tokenize_data(data) -> list[str]: - """ - Tokenizes the input data. - """ - pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords, - lemmatize] - for pipe in pipeline: - data = pipe(data) - return data - - -# Following problem: TFIDF vectorizer nimmt einen ganzen plain text und tokenized ihn dann selbst. Wir haben aber schon fertige tokenized sachen. -# Damit wir den datentypen nicht hin und her und wir unnötig das leben komolziert machen, müssen wir viele steps wie tf idf iund tokenizing direkt nach dem crawlen machen -# ist zwar in der pipeline nicht ganz so schön aber sonst müssen wir vieles doppelt machen und abspeichern -# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html -def tf_idf_vectorize(data): - """ - Vectorizes the input data using the TF-IDF algorithm. - """ - # Create the vectorizer - # vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english") # hier müssen wir schauen was wir für tokenizer machen - vectorizer = TfidfVectorizer() - # Vectorize the data - X = vectorizer.fit_transform(data) - return X - - -def top_30_words(data): - """ - Returns the top 30 words from the input data. 
- """ - # Create the vectorizer - vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english") - # Vectorize the data - X = vectorizer.fit_transform(data) - # Get the feature names - feature_names = vectorizer.get_feature_names_out() - print(f"Feature names: {feature_names}") - print(f"X sieht so aus: {X}") - print(f"Shape of X: {X.shape}") - print(f"Summe: {X.sum(axis=0)}") - top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30] - return top_30_words - - -class Tokenizer(PipelineElement): - def __init__(self): - super().__init__("Tokenizer") - - async def process(self, data, link): - """ - Tokenizes the input data. - """ - - if data is None: - print(f"Failed to tokenize {link} because the data was empty.") - return - - soup = data - - # Get the text from the main content - main_content = soup.find("main") - text = main_content.get_text() if main_content is not None else soup.get_text() - - # Get the meta description and title - description = soup.find("meta", attrs={"name": "description"}) - description_content = description.get("content") if description is not None else "" - title = soup.find("title") - title_content = title.string if title is not None else "" - - # Get the alt texts from the images - img_tags = soup.findAll("img") - alt_texts = [img.get("alt") for img in img_tags] - - # Join all the text together - alt_texts_str = safe_join(alt_texts) - description_str = safe_str(description_content) - title_str = safe_str(title_content) - text = f"{text} {alt_texts_str} {description_str} {title_str}".strip() - - # Tokenize the text - tokenized_text = tokenize_data(data=text) - add_tokens_to_index(url=link, tokenized_text=tokenized_text) - - print(f"Tokenized text for {link}") diff --git a/engine/requirements.txt b/engine/requirements.txt index 0c0e6f9..9cf4449 100644 --- a/engine/requirements.txt +++ b/engine/requirements.txt @@ -7,7 +7,8 @@ Flask==3.0.3 Flask-Cors==4.0.1 nest-asyncio==1.6.0 
nltk==3.8.1 -numpy==2.0.0 +numpy==1.26.4 pandas==2.2.2 scikit-learn==1.5.1 aiohttp==3.9.5 +spacy==3.7.5 diff --git a/engine/summarize.py b/engine/summarize.py new file mode 100644 index 0000000..40373ff --- /dev/null +++ b/engine/summarize.py @@ -0,0 +1,15 @@ +from gensim.summarization import summarize + +# Input text to be summarized +input_text = """ +Your input text goes here. It can be a long paragraph or multiple paragraphs. +""" + +# Generate the summary using TextRank algorithm +summary = summarize(input_text, ratio=0.3) # You can adjust the ratio parameter based on the summary length you desire + +# Output the summary +print("Original Text:") +print(input_text) +print("\nSummary:") +print(summary) diff --git a/engine/tokenizer.py b/engine/tokenizer.py new file mode 100644 index 0000000..925996d --- /dev/null +++ b/engine/tokenizer.py @@ -0,0 +1,263 @@ +import re + +import spacy + +from custom_db import add_tokens_to_index +from pipeline import PipelineElement +from utils import safe_join, safe_str + +""" +IMPORTANT: +Make sure you install the spaCy model with: +python -m spacy download en_core_web_sm +""" + + +# Define regular expressions for preprocessing + +def remove_html(text: str) -> str: + html_tag = re.compile(r'<.*?>') + text = html_tag.sub(r'', text) + return text + + +def remove_emails(text: str) -> str: + email_clean = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+") + text = email_clean.sub(r'', text) + return text + + +def remove_prices(text: str) -> str: + price_pattern = re.compile(r''' + (?:(?:\$|€|£|¥)(?:\s?)) # Currency symbols at the start + \d{1,3}(?:,\d{3})*(?:\.\d{1,2})? # Numbers with optional thousands separators and decimal points + | + \d{1,3}(?:,\d{3})*(?:\.\d{1,2})? 
# Numbers with optional thousands separators and decimal points + (?:\s?(?:\$|€|£|¥|USD|EUR|GBP|JPY)) # Currency symbols or codes at the end + ''', re.VERBOSE | re.IGNORECASE) + + text = price_pattern.sub('', text) + return text + + +def remove_percentages(text: str) -> str: + percentage_clean = re.compile(r"\d+%") + text = percentage_clean.sub(r'', text) + return text + + +def remove_phone_number(text: str) -> str: + # This pattern matches various phone number formats + # Thanks to https://stackoverflow.com/a/56450924 + phone_pattern = re.compile(r''' + ((\+\d{1,2}\s?)?1?\-?\.?\s?\(?\d{3}\)?[\s.-]?)?\d{3}[\s.-]?\d{4} + ''', re.VERBOSE) + + # Replace matched phone numbers with an empty string + text = phone_pattern.sub('', text) + return text + + +def remove_dates(text: str) -> str: + # This pattern matches various date formats + # Thanks to https://stackoverflow.com/a/8768241 + date_pattern = re.compile(r''' + ^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$ + ''', re.VERBOSE) + + # Replace matched dates with an empty string + text = date_pattern.sub('', text) + return text + + +def remove_times(text: str) -> str: + # This pattern matches various time formats + time_pattern = re.compile(r''' + \b # Word boundary + (?: + (?:1[0-2]|0?[1-9]) # Hours: 1-12 with optional leading zero + : # Colon separator + (?:[0-5][0-9]) # Minutes: 00-59 + (?: + :(?:[0-5][0-9]) # Optional seconds: 00-59 + (?:\.[0-9]{1,3})? # Optional milliseconds + )? + \s*(?:AM|PM|am|pm|A\.M\.|P\.M\.)? # Optional AM/PM indicator + ) + | + (?:(?:2[0-3]|[01]?[0-9]) # Hours: 00-23 + : # Colon separator + (?:[0-5][0-9]) # Minutes: 00-59 + (?::(?:[0-5][0-9]) # Optional seconds: 00-59 + (?:\.[0-9]{1,3})? 
# Optional milliseconds + )? + ) + \b # Word boundary + ''', re.VERBOSE | re.IGNORECASE) + + # Replace matched times with an empty string + text = time_pattern.sub('', text) + return text + + +def remove_url(text: str) -> str: + url_clean = re.compile(r"https://\S+|www\.\S+") + text = url_clean.sub(r'', text) + return text + + +# Removes Emojis +def remove_emoji(text: str) -> str: + emoji_clean = re.compile("[" + u"\U0001F600-\U0001F64F" # emoticons + u"\U0001F300-\U0001F5FF" # symbols & pictographs + u"\U0001F680-\U0001F6FF" # transport & map symbols + u"\U0001F1E0-\U0001F1FF" # flags (iOS) + u"\U00002702-\U000027B0" + u"\U000024C2-\U0001F251" + "]+", flags=re.UNICODE) + text = emoji_clean.sub(r'', text) + url_clean = re.compile(r"https://\S+|www\.\S+") + text = url_clean.sub(r'', text) + return text + + +def lower(tokens: list[str]) -> list[str]: + return [word.lower() for word in tokens] + + +def preprocess_text(text: str) -> str: + """Apply all preprocessing steps using regular expressions.""" + text = remove_url(text) + text = remove_html(text) + text = remove_emails(text) + text = remove_times(text) + text = remove_phone_number(text) + text = remove_dates(text) + text = remove_emoji(text) + text = remove_prices(text) + text = remove_percentages(text) + return text + + +# Load the spaCy model +print("Loading spaCy model...") +nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "parser", "senter"]) + + +def process_text(text: str) -> list[str]: + """Process text using spaCy and custom logic.""" + + # Preprocess the text + text = preprocess_text(text) + + # Process with spaCy + doc = nlp(text) + tokens = [token.text for token in doc if not token.is_stop and not token.is_punct and not token.is_space] + + # Lowercase the tokens + tokens = lower(tokens) + + return tokens + + +class Tokenizer(PipelineElement): + def __init__(self): + super().__init__("Tokenizer") + + async def process(self, data, link): + """ + Tokenizes the input data. 
+ """ + + if data is None: + print(f"Failed to tokenize {link} because the data was empty.") + return + + soup = data + + # Get the text from the main content + main_content = soup.find("main") + text = main_content.get_text() if main_content is not None else soup.get_text() + + # Get the meta description and title + description = soup.find("meta", attrs={"name": "description"}) + description_content = description.get("content") if description is not None else "" + title = soup.find("title") + title_content = title.string if title is not None else "" + + # Get the alt texts from the images + img_tags = soup.findAll("img") + alt_texts = [img.get("alt") for img in img_tags] + + # Join all the text together + alt_texts_str = safe_join(alt_texts) + description_str = safe_str(description_content) + title_str = safe_str(title_content) + text = f"{text} {alt_texts_str} {description_str} {title_str}".strip() + + # Tokenize the text + tokenized_text = process_text(text=text) + add_tokens_to_index(url=link, tokenized_text=tokenized_text) + + print(f"Tokenized text for {link}") + + +# Test tokenization + +test_sentences = [ + "Mr. Smith's car is blue-green.", + # URLs, emails, prices, and code snippets + "The URL is https://www.example.com/path?param=value#fragment", + "She said, 'I can't believe it!'", + "Send an e-mail to john.doe@example.com", + "The price is $19.99 (20% off)", + "I love the movie 'Star Wars: Episode IV - A New Hope'", + "Python 3.9.5 was released on 05/03/2021", + "Call me at +1 (555) 123-4567", + "The equation is E=mc^2", + "Use the #hashtag and @mention", + "I'm running... 
but I'm tired", + "It's 72°F outside", + "He said: Don't do that!", + "The file name is 'document_v1.2.txt'", + "1,000,000 people can't be wrong", + "The code is: Hello", + "Let's meet at 9:30 AM", + "The password is: P@ssw0rd!", + "I'll have a ham & cheese sandwich", + "The result was 42% (not 50%)", + # Dates and times + "The time is 12:34 PM", + "The time is 12:34:56 PM", + "The date is 2021-05-03", + "The time is 12:34:56.789", + "The time is 12:34:56.789 PM", + "The time is 23:59", + "The time is 23:59:59", + "The time is 23:59:59.999", + "The time is 23:59:59.999 PM", + # Named entities + "I live in New York City", + "I work at Google", + "I visited the Statue of Liberty", + "I went to the United States of America", + "I flew with Lufthansa", + "I bought an iPhone", + "I use Microsoft Windows", + "Apple Inc. is a great company", + "I ate at McDonald's", + "I study at the Max Planck Institute", + "Tübingen is a nice city", + "Everyday I eat at Salam Burger in Tübingen and I love it", + # Misc + "I ❤️ Python", + "I'm 6'2\" tall", + "I'm 6'2\" tall and I weigh 180 lbs.", + "I'm 6'2\" tall and I weigh 180 lbs. I'm 25 years old.", +] + +for sentence in test_sentences: + print(f"Original: {sentence}") + print(f"Tokenized: {process_text(sentence)}") + print()