diff --git a/engine/crawl.py b/engine/crawl.py
index 7e5ea95..241b164 100644
--- a/engine/crawl.py
+++ b/engine/crawl.py
@@ -22,24 +22,50 @@
 # Constants
 # URL seeds to start crawling from
 SEEDS = [
+    # Official
     "https://www.tuebingen.de/en/",
-    "https://www.bio.mpg.de/2923/en",
+    # University
     "https://www.uni-tuebingen.de/en/",
-    "http://www.tuepedia.de",
+    "https://www.bio.mpg.de/2923/en",
     "https://health-nlp.com/index.html",
     "https://www.medizin.uni-tuebingen.de/en-de/startseite/",
     "https://www.my-stuwe.de/en/",
     "https://www.unimuseum.uni-tuebingen.de/en/",
-    "https://www.komoot.com/guide/210692/attractions-around-tuebingen",
-    "https://hoelderlinturm.de/english/",
     "https://www.fsi.uni-tuebingen.de/en/",
+    "https://studieren.de/international-business-eberhard-karls-universitaet-tuebingen.studienprofil.t-0.a-68.c-110.html",
+    "https://www.hih-tuebingen.de/en/?no_cache=1",
+    # Events
+    "https://www.dai-tuebingen.de/en/",
+    "https://pintofscience.de/events/tuebingen",
+    "http://www.tuepedia.de",
+    "https://hoelderlinturm.de/english/",
+    "https://www.tuebingen.de/en/leisure-tourism/culture/museums-galleries.html",
+    "https://www.eventbrite.com/ttd/germany--tübingen/",
+    # Tourism
+    "https://www.komoot.com/guide/210692/attractions-around-tuebingen",
     "https://www.stocherkahnfahrten.com/English/Stocherkahnrennen-English.html",
     "https://www.germany.travel/en/cities-culture/tuebingen.html",
     "https://justinpluslauren.com/things-to-do-in-tubingen-germany/",
     "https://www.yelp.de/search?find_desc=&find_loc=Tübingen%2C+Baden-Württemberg",
     "https://www.tripadvisor.com/Tourism-g198539-Tubingen_Baden_Wurttemberg-Vacations.html",
 ]
+# Language detector
 LANG_DETECTOR = LanguageDetector()
+# Suppress error logging when True
+SILENT_ERRORS = False
+
+
+def log_error(error_msg):
+    """
+    Logs an error message unless SILENT_ERRORS is True.
+
+    Args:
+        error_msg: The error message to log.
+    """
+    if not SILENT_ERRORS:
+        logging.error(error_msg)


 class Crawler(PipelineElement):
@@ -52,27 +78,32 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection):
         # Initialize the crawler state
         self.urls_crawled = set()
         self.ignore_links = set()
-        self.to_crawl = collections.deque(SEEDS)
-        self.to_crawl_set = set(self.to_crawl)
+        self.to_crawl_queue = collections.deque(SEEDS)
+        self.to_crawl_set = set(self.to_crawl_queue)
+        self._page_count = 0

         # Load the global state
         self._load_state()

-        # Internal state
-        self._connector = aiohttp.TCPConnector(limit=50, limit_per_host=10)
-        self._page_count = 0
-
-        self.max_size = 1000  # Example maximum size
+        # Crawler configuration
+        self.timeout = 10  # Timeout in seconds
+        self.max_retries = 3  # Maximum number of retries
+        self.retry_delay = 1  # Delay between retries in seconds
+        self.max_size = 1000  # Maximum number of pages to crawl
+        self.no_dynamic_content = False  # Disable dynamic content handling (Playwright)
         self.ignore_domains = ["github.com", "linkedin.com", "xing.com", "instagram.com", "twitter.com", "youtube.com",
                                "de.wikipedia.org", "wikipedia.org", "google.com", "google.de", "google.co.uk",
-                               "amazon.com", "cctue.de", "spotify.com"]
+                               "pinterest.com", "amazon.com", "cctue.de", "spotify.com"]

         self.langs = ["en", "en-de", "eng", "en-GB", "en-US", "english"]
         self.required_keywords = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"]
-        self.user_agents = [("Modern Search Engines University of Tuebingen Project Crawler ("
-                             "https://uni-tuebingen.de/de/262377)"),
-                            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-                             "Chrome/58.0.3029.110 Safari/537.3")]
-
+        self.user_agents = [
+            "University of Tuebingen Student Web Crawler Project (https://uni-tuebingen.de/de/262377; contact: "
+            "webmaster@uni-tuebingen.de)",
+            "Mozilla/5.0 (compatible; TuebingenUniBot/1.0; +https://uni-tuebingen.de/de/262377)",
+            "Tuebingen University Research Crawler/1.0 (+https://uni-tuebingen.de/de/262377; Academic purposes only)",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
+            "Safari/537.36 (Tuebingen University Web Crawling Project)"
+        ]
         self.headers = {
             "User-Agent": self.user_agent,
             "Accept-Language": "en-US,en;q=0.9,de;q=0.8",
@@ -85,50 +116,18 @@ def __del__(self) -> None:

     @property
     def user_agent(self):
+        # Cycle through user agents
         return self.user_agents[self._page_count % len(self.user_agents)]

-    async def fetch(self, session, url):
-        try:
-            async with session.get(url, timeout=5, headers=self.headers) as response:
-                return await response.text()
-        except Exception as e:
-            print(f"Error fetching {url}: {e}")
-            return None
-
-    @staticmethod
-    async def _fetch_with_playwright(url, max_retries=3):
-        for attempt in range(max_retries):
-            try:
-                async with async_playwright() as p:
-                    browser = await p.chromium.launch()
-                    page = await browser.new_page()
-                    await page.goto(url, wait_until='networkidle')
-                    content = await page.content()
-                    await browser.close()
-                    return content
-            except Exception as e:
-                print(f"Error on attempt {attempt + 1} for {url}: {e}")
-                if attempt == max_retries - 1:
-                    print(f"Max retries reached for {url}")
-                    return None
-
-    @staticmethod
-    def _needs_javascript_rendering(html: str) -> bool:
-        # Check for JavaScript frameworks
-        if any(framework in html.lower() for framework in ['react', 'vue', 'angular']):
-            return True
-
-        return False
-
     async def process(self):
-        async with ClientSession(connector=self._connector) as session:
-            while not self.is_shutdown() and self.to_crawl and len(self.urls_crawled) < self.max_size:
-                # Process multiple links concurrently
+        connector = TCPConnector(limit=100, force_close=True, enable_cleanup_closed=True)
+        async with ClientSession(connector=connector, trust_env=True) as session:
+            while not self.is_shutdown() and self.to_crawl_queue and len(self.urls_crawled) < self.max_size:
                 tasks = []
-                for _ in range(min(10, len(self.to_crawl))):  # Process up to 10 links at a time
-                    if self.to_crawl and len(self.urls_crawled) < self.max_size:
-                        link = self.to_crawl.popleft()
-                        task = asyncio.create_task(self._handle_link(session, link))
+                for _ in range(min(10, len(self.to_crawl_queue))):
+                    if self.to_crawl_queue and len(self.urls_crawled) < self.max_size:
+                        url = self.to_crawl_queue.popleft()
+                        task = asyncio.create_task(self._process_url(session, url))
                         tasks.append(task)
                     else:
                         break
@@ -137,74 +136,117 @@ async def process(self):
                     break

                 # Wait for all tasks to complete or for shutdown
-                done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+                completed, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)

-                # Process completed tasks
-                for task in done:
+                for task in completed:
                     try:
                         await task
                     except Exception as e:
-                        print(f"Error processing link: {e}")
+                        log_error(f"Unhandled exception in task: {e}")

-                # Check for shutdown
                 if self.is_shutdown():
-                    # Cancel pending tasks
                     for task in pending:
                         task.cancel()
-                    # Wait for cancellation to complete
                     await asyncio.gather(*pending, return_exceptions=True)
                     self.save_state()
                     break

-    async def _handle_link(self, session, link):
+            # If there are still pending tasks, add them back to the queue
+            for task in pending:
+                url = task.get_coro().cr_frame.f_locals.get('url')
+                if url:
+                    self.to_crawl_queue.appendleft(url)
+                task.cancel()
+
+        logging.info("Crawler finished processing")
+
+    async def _process_url(self, session, url: str):
+        """
+        Crawls a URL and processes the content.
+
+        Args:
+            session: The aiohttp session used for the request.
+            url: The URL to crawl.
+        """
         if len(self.urls_crawled) >= self.max_size:
-            print("Maximum size reached")
+            logging.info("Maximum size reached")
             return

-        print(f"Crawler crawling {link}...")
-
-        if not link.startswith("http"):
-            print(f"Invalid URL: {link}")
+        if not url.startswith("http"):
+            logging.info(f"Invalid URL: {url}")
             return

-        if any(domain in link for domain in self.ignore_domains):
-            print(f"Ignoring {link} because it is in the ignore domains list")
-            self.ignore_links.add(link)
+        if any(domain in url for domain in self.ignore_domains):
+            logging.info(f"Ignoring {url} because it is in the ignore domains list")
+            self.ignore_links.add(url)
             return

-        if link in self.ignore_links or link in self.urls_crawled:
-            print(f"Ignoring {link} because it is in the ignore or found list")
+        if url in self.ignore_links or url in self.urls_crawled:
+            logging.info(f"Ignoring {url} because it is in the ignore or found list")
             return

-        if not check_robots(link):
-            print(f"Ignoring {link} because it is disallowed by robots.txt")
-            self.ignore_links.add(link)
+        if not check_robots(url):
+            logging.info(f"Ignoring {url} because it is disallowed by robots.txt")
+            self.ignore_links.add(url)
             return

-        html_content = await self.fetch(session, link)
+        html_content = await self._fetch(session, url)

         if html_content is None:
-            print(f"Error fetching {link}")
-            self.ignore_links.add(link)
+            logging.info(f"Error fetching {url}")
+            self.ignore_links.add(url)
+            return
+
+        try:
+            soup = BeautifulSoup(html_content, "lxml")
+            text = soup.get_text().lower()
+        except Exception as e:
+            log_error(f"Error parsing {url}: {e}")
+            self.ignore_links.add(url)
             return

-        soup = BeautifulSoup(html_content, "lxml")
-        text = soup.get_text().lower()
+        if not text or not soup:
+            logging.info(f"Ignoring {url} because it is empty")
+            self.ignore_links.add(url)
+            return

         check_html_tag_lang = soup.find("html").get("lang") in self.langs
         check_xml_tag_lang = soup.find("html").get("xml:lang") in self.langs
-        check_link_lang = any(split == lang for split in link.split("/") for lang in self.langs)
+        check_link_lang = any(split == lang for split in url.split("/") for lang in self.langs)
         check_text_lang = LANG_DETECTOR.detect(text) in self.langs

         if not check_html_tag_lang and not check_xml_tag_lang and not check_link_lang and not check_text_lang:
-            print(f"Ignoring {link} because it is not in the correct language")
-            self.ignore_links.add(link)
+            logging.info(f"Ignoring {url} because it is not in the correct language")
+            self.ignore_links.add(url)
             return

         if not any(keyword in text for keyword in self.required_keywords):
-            print(f"Ignoring {link} because it does not contain the required keywords")
-            self.ignore_links.add(link)
+            logging.info(f"Ignoring {url} because it does not contain the required keywords")
+            self.ignore_links.add(url)
             return

+        # Handle links
+        await self._handle_links(soup, url)
+
+        if url not in self.urls_crawled and url not in self.ignore_links:
+            self.urls_crawled.add(url)
+
+        logging.info(f"Finished crawling {url}. Total: {len(self.urls_crawled)} links.")
+        if not self.is_shutdown():
+            await self.call_next(soup, url)
+
+    async def _handle_links(self, soup, url):
+        """
+        Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the
+        found list, and not in the to_crawl_set.
+
+        Args:
+            soup: BeautifulSoup object of the page
+            url: URL of the page
+        """
         for a_tag in soup.find_all("a", href=True):
             found_link = a_tag.get("href")

@@ -214,10 +256,10 @@ async def _handle_link(self, session, link):
             # Check if link is relative
             if found_link.startswith("/"):
-                base_url = get_base_url(link)
+                base_url = get_base_url(url)
                 found_link = get_full_url(base_url, found_link)
             elif found_link.startswith("../"):
-                base_url = get_base_url(link)
+                base_url = get_base_url(url)
                 found_link = get_full_url(base_url, found_link)

             # Check if link is an email
@@ -239,15 +281,39 @@ async def _handle_link(self, session, link):
                     and found_link not in self.urls_crawled
                     and found_link not in self.to_crawl_set
                     and found_link.startswith("http")):
-                self.to_crawl.append(found_link)
+                self.to_crawl_queue.append(found_link)
                 self.to_crawl_set.add(found_link)

-        if link not in self.urls_crawled and link not in self.ignore_links:
-            self.urls_crawled.add(link)
+    async def _fetch(self, session, url: str) -> str or None:
+        """
+        Fetches the content of a URL using the given session.
+
+        Args:
+            session: The aiohttp session used for the request.
+            url: The URL to fetch.
-        print(f"Finished crawling {link}. Total: {len(self.urls_crawled)} links.")
-        if not self.is_shutdown():
-            await self.call_next(soup, link)
+        Returns: the HTML content of the URL, or None if the fetch failed
+        """
+
+        max_retries = self.max_retries
+        retry_delay = self.retry_delay
+
+        self._page_count += 1
+        for attempt in range(max_retries):
+            logging.info(f"Fetching {url} (attempt {attempt + 1}/{max_retries})" if attempt > 0 else f"Fetching {url}")
+            try:
+                async with session.get(url, timeout=self.timeout, headers=self.headers) as response:
+                    response.raise_for_status()
+                    html_text = await response.text()
+                    return html_text
+            except (TimeoutError, ClientError) as e:
+                if attempt == max_retries - 1:
+                    log_error(f"Failed to process {url} after {max_retries} attempts: {str(e)}")
+                    return
+                # Exponential wait time
+                await asyncio.sleep(retry_delay * (2 ** attempt))
+            except Exception as e:
+                log_error(f"Error fetching {url}: {e}")
+                return None

     def save_state(self):
         """
@@ -261,7 +327,7 @@ def save_state(self):
         with open(f"crawler_states/global.json", "w") as f:
             # Write it as json
             f.write(json.dumps({
-                "to_crawl": list(self.to_crawl),
+                "to_crawl": list(self.to_crawl_queue),
                 "ignore_links": list(self.ignore_links),
                 "found_links": list(self.urls_crawled)
             }))
@@ -272,13 +338,13 @@ def _load_state(self):
         """
         if not os.path.exists(f"crawler_states/global.json"):
-            print("No global state found")
-            self.to_crawl = collections.deque(SEEDS)
+            logging.info("No global state found")
+            self.to_crawl_queue = collections.deque(SEEDS)
             return

         with open(f"crawler_states/global.json", "r") as f:
             data = json.loads(f.read())
-            self.to_crawl = collections.deque(data["to_crawl"])
+            self.to_crawl_queue = collections.deque(data["to_crawl"])
             self.to_crawl_set = set(data["to_crawl"])
             self.ignore_links = set(data["ignore_links"])
             self.urls_crawled = set(data["found_links"])
diff --git a/engine/custom_db.py b/engine/custom_db.py
index 96640b5..855b19e 100644
--- a/engine/custom_db.py
+++ b/engine/custom_db.py
@@ -1,3 +1,4 @@
+import logging
 import os

 import pandas as pd
@@ -61,7 +62,7 @@ def add_tokens_to_index(url: str, tokenized_text: list[str]):
     if not pages_df[pages_df['id'] == page_id].empty:
         pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'tokenized_text'] = tokenized_text
     else:
-        print(f"Page with ID {page_id} not found")
+        logging.info(f"Page with ID {page_id} not found")


 def add_title_to_index(url: str, title: str):
@@ -80,7 +81,7 @@ def add_title_to_index(url: str, title: str):
     if not pages_df[pages_df['id'] == page_id].empty:
         pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'title'] = title
     else:
-        print(f"Page with ID {page_id} not found")
+        logging.info(f"Page with ID {page_id} not found")


 def add_snippet_to_index(url, snippet):
@@ -99,7 +100,7 @@ def add_snippet_to_index(url, snippet):
     if not pages_df[pages_df['url'] == url].empty:
         pages_df.at[pages_df[pages_df['url'] == url].index[0], 'snippet'] = snippet
     else:
-        print(f"Page with URL {url} not found")
+        logging.info(f"Page with URL {url} not found")


 def get_tokens() -> list[list[str]]:
@@ -162,17 +163,17 @@ def load_pages() -> pd.DataFrame:

     # Check if the file exists
     if not os.path.exists(f"pages.csv"):
-        print("No pages found")
+        logging.info("No pages found")
         return pages_df

     try:
         pages_df = pd.read_csv("pages.csv", header=0)
     except pd.errors.EmptyDataError:
-        print("No pages found")
+        logging.info("No pages found")
         return pages_df

     # Convert the tokenized_text column to a list of lists
     pages_df['tokenized_text'] = pages_df['tokenized_text'].apply(eval)
-    print("Loaded pages")
+    logging.info("Loaded pages")

     return pages_df
diff --git a/engine/custom_tokenizer.py b/engine/custom_tokenizer.py
index 9457ac7..fff429a 100644
--- a/engine/custom_tokenizer.py
+++ b/engine/custom_tokenizer.py
@@ -1,3 +1,5 @@
+import logging
+
 import nltk as nltk
 from sklearn.feature_extraction.text import TfidfVectorizer
 import re
@@ -9,11 +11,16 @@
 from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index
 from pipeline import PipelineElement
+from utils import safe_join, safe_str
+
+WN_LEMMATIZER = nltk.stem.WordNetLemmatizer()
+STEMMER = nltk.stem.PorterStemmer()


 def remove_punctuations(text):
-    punct_tag = re.compile(r'[^\w\s]')
-    text = punct_tag.sub(r'', text)
+    # Remove punctuations
+    punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]')
+    text = punctuations.sub(r'', text)
     return text


@@ -51,11 +58,17 @@ def tokenize_plain_words(words: str):
     return words.split()


-def stem_and_remove_stopwords(words) -> list[str]:
-    # use english porterStemmer
+def stem(words) -> list[str]:
+    words = [STEMMER.stem(word) for word in words]  # added stemmer
+    return words
+
+
+def remove_stopwords(words):
+    return [word for word in words if word not in stopwords.words("english")]
+

-    stemmer = nltk.stem.porter.PorterStemmer()
-    words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")]  # added stemmer
+def lemmatize(words):
+    words = [WN_LEMMATIZER.lemmatize(word) for word in words]
     return words


@@ -63,8 +76,8 @@ def tokenize_data(data) -> list[str]:
     """
     Tokenizes the input data.
""" - pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, - stem_and_remove_stopwords] + pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords, + lemmatize] for pipe in pipeline: data = pipe(data) return data @@ -91,10 +104,10 @@ def top_30_words(data): X = vectorizer.fit_transform(data) # Get the feature names feature_names = vectorizer.get_feature_names_out() - print(f"Feature names: {feature_names}") - print(f"X sieht so aus: {X}") - print(f"Shape of X: {X.shape}") - print(f"Summe: {X.sum(axis=0)}") + logging.info(f"Feature names: {feature_names}") + logging.info(f"X sieht so aus: {X}") + logging.info(f"Shape of X: {X.shape}") + logging.info(f"Summe: {X.sum(axis=0)}") top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30] return top_30_words @@ -108,18 +121,33 @@ async def process(self, data, link): Tokenizes the input data. """ + if data is None: + logging.info(f"Failed to tokenize {link} because the data was empty.") + return + soup = data + + # Get the text from the page text = soup.get_text() - img_tags = soup.findAll("img") + + # Get the meta description and title description = soup.find("meta", attrs={"name": "description"}) description_content = description.get("content") if description is not None else "" title = soup.find("title") title_content = title.string if title is not None else "" + # Get the alt texts from the images + img_tags = soup.findAll("img") alt_texts = [img.get("alt") for img in img_tags] - text = text + " ".join(alt_texts) + " " + str(description_content) + " " + str(title_content) + # Join all the text together + alt_texts_str = safe_join(alt_texts) + description_str = safe_str(description_content) + title_str = safe_str(title_content) + text = f"{text} {alt_texts_str} {description_str} {title_str}".strip() + + # Tokenize the text tokenized_text = tokenize_data(data=text) add_tokens_to_index(url=link, tokenized_text=tokenized_text) - print(f"Tokenized text for {link}") + logging.info(f"Tokenized text for {link}") diff --git a/engine/index.py b/engine/index.py index 5b9f1cb..a125f96 100644 --- a/engine/index.py +++ b/engine/index.py @@ -1,3 +1,5 @@ +import logging + from custom_db import upsert_page_to_index, add_title_to_index, add_snippet_to_index, load_pages from pipeline import PipelineElement @@ -17,6 +19,10 @@ async def process(self, data, link): Indexes the input data. 
""" + if data is None: + logging.info(f"Failed to index {link} because the data was empty.") + return + soup = data # Title @@ -32,7 +38,7 @@ async def process(self, data, link): add_title_to_index(url=link, title=title_content) add_snippet_to_index(url=link, snippet=description_content) - print(f"Indexed {link}") + logging.info(f"Indexed {link}") if not self.is_shutdown(): await self.call_next(soup, link) diff --git a/engine/main.py b/engine/main.py index b5c636b..aa392da 100644 --- a/engine/main.py +++ b/engine/main.py @@ -7,7 +7,9 @@ from concurrent.futures import ThreadPoolExecutor import asyncio import nest_asyncio - +import signal +# Logging +import logging # Database import duckdb # Pipeline @@ -16,9 +18,14 @@ from custom_tokenizer import Tokenizer from index import Indexer -# Constants -MAX_THREADS = 10 +# Logging setup +logging.basicConfig( + format='%(asctime)s %(levelname)-8s %(message)s', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S') +# Threading +MAX_THREADS = 10 # Patch asyncio to allow nested event loops nest_asyncio.apply() @@ -39,7 +46,7 @@ def signal_handler(signum, frame): - print("Interrupt received, shutting down... Please wait") + logging.info("Interrupt received, shutting down... Please wait. This may take a few seconds.") for element in [crawler, indexer, tokenizer]: element.shutdown() @@ -60,7 +67,7 @@ async def main(): try: await crawler.process() except Exception as e: - print(f"An error occurred: {e}") + logging.info(f"An error occurred: {e}") finally: # Ensure states are saved even if an exception occurs for element in [crawler, indexer, tokenizer]: @@ -70,7 +77,7 @@ async def main(): index_df = access_index() index_df.to_csv("inverted_index.csv") con.close() - print("State saved") + logging.info("State saved") # Save the state+ for element in [crawler, indexer, tokenizer]: @@ -80,7 +87,7 @@ async def main(): index_df = access_index() index_df.to_csv("inverted_index.csv") con.close() - print("State saved") + logging.info("State saved") if __name__ == "__main__": diff --git a/engine/pipeline.py b/engine/pipeline.py index 37ba1b4..f1ae453 100644 --- a/engine/pipeline.py +++ b/engine/pipeline.py @@ -1,4 +1,5 @@ import asyncio +import logging import threading @@ -9,7 +10,7 @@ def __init__(self, name): self.executor = None self.tasks = [] self.shutdown_flag = threading.Event() - print(f"Initialized {self.name}") + logging.info(f"Initialized {self.name}") def add_executor(self, executor): self.executor = executor @@ -25,10 +26,10 @@ def add_next(self, next_element): async def call_next(self, *args): if not self.next: - print(f"No next elements for {self.name}") + logging.info(f"No next elements for {self.name}") return # No next elements to process - print(f"Processing next elements for {self.name}") + logging.info(f"Processing next elements for {self.name}") tasks = [] for element in self.next: if asyncio.iscoroutinefunction(element.process): diff --git a/engine/queries.txt b/engine/queries.txt index f2543dd..40bab39 100644 --- a/engine/queries.txt +++ b/engine/queries.txt @@ -1,41 +1,41 @@ -tübingen -tübingen university -tübingen attractions -food and drinks -tübingen weather -tübingen hotels -tübingen traditional food -tübingen coffee shops -tübingen nightlife spots -tübingen museums -tübingen castles -tübingen outdoor activities -tübingen nightlife -tübingen markets -tübingen shopping centers -tübingen local products -Best cafes in Tübingen for students -Upcoming events at the University of Tübingen -History of Tübingen's old town -Popular hiking trails 
-Tübingen student housing options
-Vegan and vegetarian restaurants in Tübingen
-Cultural activities in Tübingen
-Tübingen public transportation map
-University of Tübingen research departments
-Tübingen nightlife spots
-Bookstores in Tübingen
-Tübingen local farmers' markets
-Tübingen weather forecast
-Student discounts in Tübingen
-Tübingen library hours and services
-Language exchange programs in Tübingen
-Top tourist attractions in Tübingen
-Cycling routes in Tübingen
-Tübingen sports clubs and gyms
-Tübingen local festivals and fairs
-Best places to study in Tübingen
-Tübingen historical landmarks
-Tübingen university application process
-Local art galleries in Tübingen
-Tübingen second-hand stores
+1 tübingen
+2 tübingen university
+3 tübingen attractions
+4 food and drinks
+5 tübingen weather
+6 tübingen hotels
+7 tübingen traditional food
+8 tübingen coffee shops
+9 tübingen nightlife spots
+10 tübingen museums
+11 tübingen castles
+12 tübingen outdoor activities
+13 tübingen nightlife
+14 tübingen markets
+15 tübingen shopping centers
+16 tübingen local products
+17 Best cafes in Tübingen for students
+18 Upcoming events at the University of Tübingen
+19 History of Tübingen's old town
+20 Popular hiking trails near Tübingen
+21 Tübingen student housing options
+22 Vegan and vegetarian restaurants in Tübingen
+23 Cultural activities in Tübingen
+24 Tübingen public transportation map
+25 University of Tübingen research departments
+26 Tübingen nightlife spots
+27 Bookstores in Tübingen
+28 Tübingen local farmers' markets
+29 Tübingen weather forecast
+30 Student discounts in Tübingen
+31 Tübingen library hours and services
+32 Language exchange programs in Tübingen
+33 Top tourist attractions in Tübingen
+34 Cycling routes in Tübingen
+35 Tübingen sports clubs and gyms
+36 Tübingen local festivals and fairs
+37 Best places to study in Tübingen
+38 Tübingen historical landmarks
+39 Tübingen university application process
+40 Local art galleries in Tübingen
+41 Tübingen second-hand stores
diff --git a/engine/requirements.txt b/engine/requirements.txt
index e850b9a..36f516a 100644
--- a/engine/requirements.txt
+++ b/engine/requirements.txt
@@ -1,6 +1,7 @@
 # Automatically generated by https://github.com/damnever/pigar.

 beautifulsoup4==4.12.3
+certifi==2024.7.4
 duckdb==1.0.0
 eld==1.0.6
 Flask==3.0.3
diff --git a/engine/test.py b/engine/test.py
index 389ca59..08bed22 100644
--- a/engine/test.py
+++ b/engine/test.py
@@ -1,8 +1,9 @@
 # file to test the written functions
+import logging

 from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words

 CUSTOM_TEXT = "Lorem Ipsum is simply dummy text" + " " + " \n "+ "of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."

 top_30_words = top_30_words([CUSTOM_TEXT])
-print(top_30_words)
+logging.info(top_30_words)
diff --git a/engine/utils.py b/engine/utils.py
index dbefc6d..8755602 100644
--- a/engine/utils.py
+++ b/engine/utils.py
@@ -71,3 +71,29 @@ def check_robots(url: str) -> bool:
     except:
         return True
     return rp.can_fetch("*", url)
+
+
+def safe_join(items):
+    """
+    Safely joins a list of items into a string, separating them with a space.
+    If an item is None, it is skipped.
+
+    Args:
+        items: The items to join.
+
+    Returns:
+        The joined string.
+    """
+    return " ".join(str(item) for item in items if item is not None)
+
+
+def safe_str(item):
+    """
+    Safely converts an item to a string. If the item is None, an empty string is returned.
+
+    Args:
+        item: The item to convert.
+
+    Returns:
+        The string representation of the item, or "" if it is None.
+    """
+    return str(item) if item is not None else ""
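Reviewer note: a minimal sketch of how the new safe_join/safe_str helpers behave when the tokenizer combines page text with possibly missing alt texts and meta descriptions; the example values are hypothetical and not part of this patch.

    # Illustration only, assuming engine/utils.py is on the import path
    from utils import safe_join, safe_str

    alt_texts = ["Neckarfront", None, "Hölderlinturm"]  # img alt attributes may be missing
    description = None                                  # page may have no meta description

    combined = f"{safe_join(alt_texts)} {safe_str(description)}".strip()
    print(combined)  # -> "Neckarfront Hölderlinturm"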