Revert "Improve text processing" #36

Closed · wants to merge 1 commit
30 changes: 12 additions & 18 deletions engine/README.md
@@ -10,45 +10,39 @@

1. **Install Python 3:**

- Download and install the latest version of Python 3 from the official website.

2. **Install virtualenv:**

- Open a terminal and run the following command:

```
pip install virtualenv
```

3. **Create a virtual environment:**

- Create the virtual environment:

```
virtualenv --python=3.11 .venv
```

- Activate the virtual environment:

```
source .venv/bin/activate
```

4. **Install requirements:**

- Ensure you have a requirements.txt file in your project directory.
- Run the following command to install the dependencies:

```
pip install -r requirements.txt
```

- For the text-processing part, we use `spaCy`. You need to download the English model by running the following command:

```
python -m spacy download en_core_web_sm
```

5. **Start developing the project**

## Usage
26 changes: 4 additions & 22 deletions engine/crawl.py
@@ -148,12 +148,6 @@ def user_agent(self):
return self.user_agents[self._page_count % len(self.user_agents)]

async def process(self):
"""
Starts the crawling process.
Is called in the Pipeline.
Returns: None

"""
async with ClientSession(connector=self._connector, timeout=self._timeout) as session:
tasks = set()
while not self.is_shutdown() and len(self.urls_crawled) < self.max_size:
@@ -185,16 +179,7 @@ async def process(self):

print("Crawler finished processing")

async def _process_url_with_semaphore(self, session, url: str):
"""
Wrapper for _process_url that uses a semaphore to limit the number of concurrent requests.
Args:
session: aiohttp ClientSession
url: URL to crawl

Returns: None

"""
async def _process_url_with_semaphore(self, session, url):
async with self._semaphore:
await self._process_url(session, url)

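The deleted docstring above spells out the intent: a semaphore caps how many URLs are fetched concurrently. A minimal standalone sketch of that pattern with asyncio and aiohttp (the URLs and the limit of 5 are illustrative, not taken from the crawler's configuration):

```
import asyncio

from aiohttp import ClientSession


async def fetch(session: ClientSession, semaphore: asyncio.Semaphore, url: str) -> str:
    # The semaphore lets at most N coroutines past this point at the same time.
    async with semaphore:
        async with session.get(url) as response:
            return await response.text()


async def main():
    semaphore = asyncio.Semaphore(5)  # at most 5 concurrent requests
    urls = ["https://example.com/", "https://example.org/"]
    async with ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, semaphore, u) for u in urls))
        print([len(page) for page in pages])


asyncio.run(main())
```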
@@ -205,7 +190,7 @@ async def _process_url(self, session, url: str):
session: aiohttp ClientSession
url: URL to crawl

Returns: None
Returns:
"""
if url in self.currently_crawled:
log_warning(f"Ignoring {url} because it is already being crawled")
@@ -280,15 +265,15 @@ async def _process_url(self, session, url: str):
if not self.is_shutdown():
await self.call_next(soup, url)

async def _handle_links(self, soup: BeautifulSoup, url: str):
async def _handle_links(self, soup, url):
"""
Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the
found list, and not in the to_crawl_set.
Args:
soup: BeautifulSoup object
url: URL of the page

Returns: None
Returns:

"""
for a_tag in soup.find_all("a", href=True):
@@ -298,9 +283,6 @@ async def _handle_links(self, soup: BeautifulSoup, url: str):
if found_link.startswith("#"):
continue

# Strip out the fragment
found_link = found_link.split("#")[0]

# Check if link is relative
if found_link.startswith("/"):
base_url = get_base_url(url)
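For context, the link handling in this hunk comes down to two rules: skip pure fragment links and resolve relative links against the page's base URL (the fragment-stripping step is removed by this revert). A minimal standalone sketch of that normalization using the standard library, with a made-up HTML snippet and helper name:

```
from urllib.parse import urldefrag, urljoin

from bs4 import BeautifulSoup


def extract_links(html: str, page_url: str) -> list[str]:
    soup = BeautifulSoup(html, "html.parser")
    links = []
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.startswith("#"):
            continue  # pure fragment, points back into the same page
        absolute = urljoin(page_url, href)          # resolves relative links
        absolute, _fragment = urldefrag(absolute)   # optionally drop #fragments
        links.append(absolute)
    return links


html = '<a href="/about">About</a> <a href="#top">Top</a> <a href="docs/intro.html#x">Intro</a>'
print(extract_links(html, "https://example.com/index.html"))
# ['https://example.com/about', 'https://example.com/docs/intro.html']
```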
159 changes: 159 additions & 0 deletions engine/custom_tokenizer.py
@@ -0,0 +1,159 @@
import logging
import re

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Note: this module is named custom_tokenizer.py rather than tokenizer.py because the
# latter would conflict with the BeautifulSoup and/or NLTK tokenizer modules.

from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index
from pipeline import PipelineElement
from utils import safe_join, safe_str

WN_LEMMATIZER = nltk.stem.WordNetLemmatizer()
STEMMER = nltk.stem.PorterStemmer()


def remove_punctuations(text):
# Remove punctuations
punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]')
text = punctuations.sub(r'', text)
return text


# Removes HTML tags
def remove_html(text):
html_tag = re.compile(r'<.*?>')
text = html_tag.sub(r'', text)
return text


# Removes URLs (http, https, and www-prefixed)
def remove_url(text):
url_clean = re.compile(r"https?://\S+|www\.\S+")
text = url_clean.sub(r'', text)
return text


# Removes Emojis
def remove_emoji(text):
emoji_clean = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
text = emoji_clean.sub(r'', text)
return text


def tokenize_plain_words(words: str):
return words.split()


def stem(words) -> list[str]:
words = [STEMMER.stem(word) for word in words] # added stemmer
return words


def remove_stopwords(words):
return [word for word in words if word not in stopwords.words("english")]


def lemmatize(words):
words = [WN_LEMMATIZER.lemmatize(word) for word in words]
return words


def tokenize_data(data) -> list[str]:
"""
Tokenizes the input data.
"""
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords,
lemmatize]
for pipe in pipeline:
data = pipe(data)
return data

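A quick usage sketch of the pipeline above (it assumes the NLTK corpora used by remove_stopwords and lemmatize have been downloaded; the sample sentence and expected output are only illustrative):

```
import nltk

# One-time downloads needed by remove_stopwords and lemmatize
nltk.download("stopwords")
nltk.download("wordnet")

print(tokenize_data("The cats are running quickly, aren't they?"))
# e.g. ['The', 'cat', 'running', 'quickly'] -- stop-word matching is case-sensitive
# and the WordNet lemmatizer defaults to the noun part of speech
```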

# Known issue: the TF-IDF vectorizer takes a whole plain text and tokenizes it itself, but we
# already have tokenized data. To avoid converting back and forth between data types and
# complicating things unnecessarily, many steps such as TF-IDF and tokenizing have to happen
# right after crawling. This is not ideal in terms of pipeline design, but otherwise we would
# have to compute and store a lot of things twice.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
def tf_idf_vectorize(data):
"""
Vectorizes the input data using the TF-IDF algorithm.
"""
# Create the vectorizer
# vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english")  # TODO: decide which tokenizer to use here
vectorizer = TfidfVectorizer()
# Vectorize the data
X = vectorizer.fit_transform(data)
return X

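If the corpus has already been tokenized, one way to avoid the double tokenization described in the comment above is to give TfidfVectorizer a pass-through analyzer. A minimal sketch (the helper name is illustrative and not part of this module):

```
from sklearn.feature_extraction.text import TfidfVectorizer


def tf_idf_vectorize_pretokenized(tokenized_docs):
    """Vectorizes documents that are already lists of tokens."""
    # A callable analyzer receives each document unchanged, so the vectorizer does
    # no preprocessing, tokenization, or lowercasing of its own.
    vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
    return vectorizer.fit_transform(tokenized_docs), vectorizer


matrix, vectorizer = tf_idf_vectorize_pretokenized(
    [["web", "crawler"], ["web", "search", "engine"]]
)
print(vectorizer.get_feature_names_out())  # ['crawler' 'engine' 'search' 'web']
```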

def top_30_words(data):
"""
Returns the top 30 words from the input data.
"""
# Create the vectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english")
# Vectorize the data
X = vectorizer.fit_transform(data)
# Get the feature names
feature_names = vectorizer.get_feature_names_out()
print(f"Feature names: {feature_names}")
print(f"X sieht so aus: {X}")
print(f"Shape of X: {X.shape}")
print(f"Summe: {X.sum(axis=0)}")
top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30]
return top_30_words


class Tokenizer(PipelineElement):
def __init__(self):
super().__init__("Tokenizer")

async def process(self, data, link):
"""
Tokenizes the input data.
"""

if data is None:
print(f"Failed to tokenize {link} because the data was empty.")
return

soup = data

# Get the text from the main content
main_content = soup.find("main")
text = main_content.get_text() if main_content is not None else soup.get_text()

# Get the meta description and title
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

# Get the alt texts from the images
img_tags = soup.findAll("img")
alt_texts = [img.get("alt") for img in img_tags]

# Join all the text together
alt_texts_str = safe_join(alt_texts)
description_str = safe_str(description_content)
title_str = safe_str(title_content)
text = f"{text} {alt_texts_str} {description_str} {title_str}".strip()

# Tokenize the text
tokenized_text = tokenize_data(data=text)
add_tokens_to_index(url=link, tokenized_text=tokenized_text)

print(f"Tokenized text for {link}")
3 changes: 1 addition & 2 deletions engine/requirements.txt
@@ -7,8 +7,7 @@ Flask==3.0.3
Flask-Cors==4.0.1
nest-asyncio==1.6.0
nltk==3.8.1
numpy==1.26.4
numpy==2.0.0
pandas==2.2.2
scikit-learn==1.5.1
aiohttp==3.9.5
spacy==3.7.5
15 changes: 0 additions & 15 deletions engine/summarize.py

This file was deleted.
