Merge pull request #29 from am9zZWY/josef-crawler-tokenizer-update

Update crawler and tokenizer

am9zZWY authored Jul 16, 2024
2 parents ba68a0f + b3f4b89 commit 0f4f4ed
Showing 10 changed files with 311 additions and 174 deletions.
266 changes: 166 additions & 100 deletions engine/crawl.py

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions engine/custom_db.py
@@ -1,3 +1,4 @@
import logging
import os

import pandas as pd
@@ -61,7 +62,7 @@ def add_tokens_to_index(url: str, tokenized_text: list[str]):
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'tokenized_text'] = tokenized_text
else:
print(f"Page with ID {page_id} not found")
logging.info(f"Page with ID {page_id} not found")


def add_title_to_index(url: str, title: str):
@@ -80,7 +81,7 @@ def add_title_to_index(url: str, title: str):
if not pages_df[pages_df['id'] == page_id].empty:
pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'title'] = title
else:
print(f"Page with ID {page_id} not found")
logging.info(f"Page with ID {page_id} not found")


def add_snippet_to_index(url, snippet):
@@ -99,7 +100,7 @@ def add_snippet_to_index(url, snippet):
if not pages_df[pages_df['url'] == url].empty:
pages_df.at[pages_df[pages_df['url'] == url].index[0], 'snippet'] = snippet
else:
print(f"Page with URL {url} not found")
logging.info(f"Page with URL {url} not found")


def get_tokens() -> list[list[str]]:
@@ -162,17 +163,17 @@ def load_pages() -> pd.DataFrame:

# Check if the file exists
if not os.path.exists(f"pages.csv"):
print("No pages found")
logging.info("No pages found")
return pages_df

try:
pages_df = pd.read_csv("pages.csv", header=0)
except pd.errors.EmptyDataError:
print("No pages found")
logging.info("No pages found")
return pages_df

# Convert the tokenized_text column to a list of lists
pages_df['tokenized_text'] = pages_df['tokenized_text'].apply(eval)

print("Loaded pages")
logging.info("Loaded pages")
return pages_df
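
The load_pages() change above keeps the eval-based round trip for the tokenized_text column. A minimal sketch of that round trip, with a made-up sample row (file name and column names follow custom_db.py):

import pandas as pd

# pages.csv stores the token list as its string repr ...
pages_df = pd.DataFrame([{"id": 1, "url": "https://uni-tuebingen.de",
                          "tokenized_text": ["tübingen", "university"]}])
pages_df.to_csv("pages.csv", index=False)

# ... so reading it back requires turning the string back into a list
loaded = pd.read_csv("pages.csv", header=0)
loaded["tokenized_text"] = loaded["tokenized_text"].apply(eval)
print(loaded["tokenized_text"][0])  # ['tübingen', 'university']
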
58 changes: 43 additions & 15 deletions engine/custom_tokenizer.py
@@ -1,3 +1,5 @@
import logging

import nltk as nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import re
@@ -9,11 +11,16 @@

from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index
from pipeline import PipelineElement
from utils import safe_join, safe_str

WN_LEMMATIZER = nltk.stem.WordNetLemmatizer()
STEMMER = nltk.stem.PorterStemmer()


def remove_punctuations(text):
punct_tag = re.compile(r'[^\w\s]')
text = punct_tag.sub(r'', text)
# Remove punctuations
punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]')
text = punctuations.sub(r'', text)
return text


@@ -51,20 +58,26 @@ def tokenize_plain_words(words: str):
return words.split()


def stem_and_remove_stopwords(words) -> list[str]:
# use english porterStemmer
def stem(words) -> list[str]:
words = [STEMMER.stem(word) for word in words] # added stemmer
return words


def remove_stopwords(words):
return [word for word in words if word not in stopwords.words("english")]


stemmer = nltk.stem.porter.PorterStemmer()
words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer
def lemmatize(words):
words = [WN_LEMMATIZER.lemmatize(word) for word in words]
return words


def tokenize_data(data) -> list[str]:
"""
Tokenizes the input data.
"""
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words,
stem_and_remove_stopwords]
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords,
lemmatize]
for pipe in pipeline:
data = pipe(data)
return data
@@ -91,10 +104,10 @@ def top_30_words(data):
X = vectorizer.fit_transform(data)
# Get the feature names
feature_names = vectorizer.get_feature_names_out()
print(f"Feature names: {feature_names}")
print(f"X sieht so aus: {X}")
print(f"Shape of X: {X.shape}")
print(f"Summe: {X.sum(axis=0)}")
logging.info(f"Feature names: {feature_names}")
logging.info(f"X sieht so aus: {X}")
logging.info(f"Shape of X: {X.shape}")
logging.info(f"Summe: {X.sum(axis=0)}")
top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30]
return top_30_words

@@ -108,18 +121,33 @@ async def process(self, data, link):
Tokenizes the input data.
"""

if data is None:
logging.info(f"Failed to tokenize {link} because the data was empty.")
return

soup = data

# Get the text from the page
text = soup.get_text()
img_tags = soup.findAll("img")

# Get the meta description and title
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

# Get the alt texts from the images
img_tags = soup.findAll("img")
alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(description_content) + " " + str(title_content)

# Join all the text together
alt_texts_str = safe_join(alt_texts)
description_str = safe_str(description_content)
title_str = safe_str(title_content)
text = f"{text} {alt_texts_str} {description_str} {title_str}".strip()

# Tokenize the text
tokenized_text = tokenize_data(data=text)
add_tokens_to_index(url=link, tokenized_text=tokenized_text)

print(f"Tokenized text for {link}")
logging.info(f"Tokenized text for {link}")
8 changes: 7 additions & 1 deletion engine/index.py
@@ -1,3 +1,5 @@
import logging

from custom_db import upsert_page_to_index, add_title_to_index, add_snippet_to_index, load_pages
from pipeline import PipelineElement

@@ -17,6 +19,10 @@ async def process(self, data, link):
Indexes the input data.
"""

if data is None:
logging.info(f"Failed to index {link} because the data was empty.")
return

soup = data

# Title
@@ -32,7 +38,7 @@ async def process(self, data, link):
add_title_to_index(url=link, title=title_content)
add_snippet_to_index(url=link, snippet=description_content)

print(f"Indexed {link}")
logging.info(f"Indexed {link}")
if not self.is_shutdown():
await self.call_next(soup, link)

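
The Indexer now bails out early on empty data before pulling the title and meta description from the soup. A minimal sketch of that extraction step with made-up HTML (the find/get pattern mirrors the tokenizer hunk above; add_title_to_index and add_snippet_to_index are the calls shown in the diff):

from bs4 import BeautifulSoup

html = ("<html><head><title>Tübingen</title>"
        "<meta name='description' content='A university town.'></head></html>")
soup = BeautifulSoup(html, "html.parser")

# Title and meta description, with fallbacks for missing tags
title = soup.find("title")
title_content = title.string if title is not None else ""
description = soup.find("meta", attrs={"name": "description"})
description_content = description.get("content") if description is not None else ""

print(title_content, "|", description_content)  # Tübingen | A university town.
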
21 changes: 14 additions & 7 deletions engine/main.py
@@ -7,7 +7,9 @@
from concurrent.futures import ThreadPoolExecutor
import asyncio
import nest_asyncio

import signal
# Logging
import logging
# Database
import duckdb
# Pipeline
@@ -16,9 +18,14 @@
from custom_tokenizer import Tokenizer
from index import Indexer

# Constants
MAX_THREADS = 10
# Logging setup
logging.basicConfig(
format='%(asctime)s %(levelname)-8s %(message)s',
level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S')

# Threading
MAX_THREADS = 10
# Patch asyncio to allow nested event loops
nest_asyncio.apply()

@@ -39,7 +46,7 @@


def signal_handler(signum, frame):
print("Interrupt received, shutting down... Please wait")
logging.info("Interrupt received, shutting down... Please wait. This may take a few seconds.")
for element in [crawler, indexer, tokenizer]:
element.shutdown()

@@ -60,7 +67,7 @@ async def main():
try:
await crawler.process()
except Exception as e:
print(f"An error occurred: {e}")
logging.info(f"An error occurred: {e}")
finally:
# Ensure states are saved even if an exception occurs
for element in [crawler, indexer, tokenizer]:
@@ -70,7 +77,7 @@ async def main():
index_df = access_index()
index_df.to_csv("inverted_index.csv")
con.close()
print("State saved")
logging.info("State saved")

# Save the state+
for element in [crawler, indexer, tokenizer]:
@@ -80,7 +87,7 @@ async def main():
index_df = access_index()
index_df.to_csv("inverted_index.csv")
con.close()
print("State saved")
logging.info("State saved")


if __name__ == "__main__":
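
main.py now configures the root logger once and logs from the signal handler instead of printing. A minimal sketch of that wiring; the signal.signal(...) registration and the simplified handler body are assumptions, since in the real module the handler flags crawler, indexer, and tokenizer for shutdown:

import logging
import signal
import time

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

shutting_down = False

def signal_handler(signum, frame):
    # the real handler calls element.shutdown() on each pipeline element
    global shutting_down
    logging.info("Interrupt received, shutting down... Please wait.")
    shutting_down = True

signal.signal(signal.SIGINT, signal_handler)

logging.info("Running; press Ctrl+C to trigger the handler.")
while not shutting_down:
    time.sleep(0.1)
logging.info("State saved")
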
7 changes: 4 additions & 3 deletions engine/pipeline.py
@@ -1,4 +1,5 @@
import asyncio
import logging
import threading


@@ -9,7 +10,7 @@ def __init__(self, name):
self.executor = None
self.tasks = []
self.shutdown_flag = threading.Event()
print(f"Initialized {self.name}")
logging.info(f"Initialized {self.name}")

def add_executor(self, executor):
self.executor = executor
@@ -25,10 +26,10 @@ def add_next(self, next_element):

async def call_next(self, *args):
if not self.next:
print(f"No next elements for {self.name}")
logging.info(f"No next elements for {self.name}")
return # No next elements to process

print(f"Processing next elements for {self.name}")
logging.info(f"Processing next elements for {self.name}")
tasks = []
for element in self.next:
if asyncio.iscoroutinefunction(element.process):
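
For context, PipelineElement.call_next fans work out to the downstream elements registered with add_next. A minimal, self-contained sketch of that chaining (the simplified class, the assumed add_next body, and the element names from main.py are illustrative, not the module's actual implementation):

import asyncio
import logging

logging.basicConfig(level=logging.INFO, format='%(message)s')

class PipelineElement:
    def __init__(self, name):
        self.name = name
        self.next = []
        logging.info(f"Initialized {self.name}")

    def add_next(self, next_element):
        # assumed to simply append to the list of downstream elements
        self.next.append(next_element)

    async def process(self, *args):
        logging.info(f"{self.name} received {len(args)} argument(s)")
        await self.call_next(*args)

    async def call_next(self, *args):
        if not self.next:
            logging.info(f"No next elements for {self.name}")
            return
        logging.info(f"Processing next elements for {self.name}")
        await asyncio.gather(*(element.process(*args) for element in self.next))

crawler, tokenizer, indexer = (PipelineElement(n) for n in ("Crawler", "Tokenizer", "Indexer"))
crawler.add_next(tokenizer)
tokenizer.add_next(indexer)
asyncio.run(crawler.process("<html>...</html>", "https://uni-tuebingen.de"))
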
82 changes: 41 additions & 41 deletions engine/queries.txt
@@ -1,41 +1,41 @@
tübingen
tübingen university
tübingen attractions
food and drinks
tübingen weather
tübingen hotels
tübingen traditional food
tübingen coffee shops
tübingen nightlife spots
tübingen museums
tübingen castles
tübingen outdoor activities
tübingen nightlife
tübingen markets
tübingen shopping centers
tübingen local products
Best cafes in Tübingen for students
Upcoming events at the University of Tübingen
History of Tübingen's old town
Popular hiking trails near Tübingen
Tübingen student housing options
Vegan and vegetarian restaurants in Tübingen
Cultural activities in Tübingen
Tübingen public transportation map
University of Tübingen research departments
Tübingen nightlife spots
Bookstores in Tübingen
Tübingen local farmers' markets
Tübingen weather forecast
Student discounts in Tübingen
Tübingen library hours and services
Language exchange programs in Tübingen
Top tourist attractions in Tübingen
Cycling routes in Tübingen
Tübingen sports clubs and gyms
Tübingen local festivals and fairs
Best places to study in Tübingen
Tübingen historical landmarks
Tübingen university application process
Local art galleries in Tübingen
Tübingen second-hand stores
1 tübingen
2 tübingen university
3 tübingen attractions
4 food and drinks
5 tübingen weather
6 tübingen hotels
7 tübingen traditional food
8 tübingen coffee shops
9 tübingen nightlife spots
10 tübingen museums
11 tübingen castles
12 tübingen outdoor activities
13 tübingen nightlife
14 tübingen markets
15 tübingen shopping centers
16 tübingen local products
17 Best cafes in Tübingen for students
18 Upcoming events at the University of Tübingen
19 History of Tübingen's old town
20 Popular hiking trails near Tübingen
21 Tübingen student housing options
22 Vegan and vegetarian restaurants in Tübingen
23 Cultural activities in Tübingen
24 Tübingen public transportation map
25 University of Tübingen research departments
26 Tübingen nightlife spots
27 Bookstores in Tübingen
28 Tübingen local farmers' markets
29 Tübingen weather forecast
30 Student discounts in Tübingen
31 Tübingen library hours and services
32 Language exchange programs in Tübingen
33 Top tourist attractions in Tübingen
34 Cycling routes in Tübingen
35 Tübingen sports clubs and gyms
36 Tübingen local festivals and fairs
37 Best places to study in Tübingen
38 Tübingen historical landmarks
39 Tübingen university application process
40 Local art galleries in Tübingen
41 Tübingen second-hand stores
1 change: 1 addition & 0 deletions engine/requirements.txt
@@ -1,6 +1,7 @@
# Automatically generated by https://github.com/damnever/pigar.

beautifulsoup4==4.12.3
certifi==2024.7.4
duckdb==1.0.0
eld==1.0.6
Flask==3.0.3
3 changes: 2 additions & 1 deletion engine/test.py
@@ -1,8 +1,9 @@
# file to test the written functions
import logging

from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words

CUSTOM_TEXT = "Lorem Ipsum is simply dummy text" + " " + " \n "+ "of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."

top_30_words = top_30_words([CUSTOM_TEXT])
print(top_30_words)
logging.info(top_30_words)
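
Note that test.py now routes its output through logging.info; with the root logger at its default WARNING level nothing is printed, so running the script standalone presumably needs a setup along these lines:

import logging

# assumed standalone setup so the logging.info call in test.py is visible
logging.basicConfig(level=logging.INFO)
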