From 94edd89c347bfb9e56c8fb65127f4c37a187477b Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:53:38 +0200 Subject: [PATCH 1/7] Experiment with dynamic content and buttons. Add more frontiers. Add retries. Dynamic content handling is disabled since it causes errors. --- engine/crawl.py | 402 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 298 insertions(+), 104 deletions(-) diff --git a/engine/crawl.py b/engine/crawl.py index ede0ec1..bae7cea 100644 --- a/engine/crawl.py +++ b/engine/crawl.py @@ -23,24 +23,50 @@ # Constants # URL seeds to start crawling from SEEDS = [ + # Official "https://www.tuebingen.de/en/", - "https://www.bio.mpg.de/2923/en", + # University "https://www.uni-tuebingen.de/en/", - "http://www.tuepedia.de", + "https://www.bio.mpg.de/2923/en", "https://health-nlp.com/index.html", "https://www.medizin.uni-tuebingen.de/en-de/startseite/", "https://www.my-stuwe.de/en/", "https://www.unimuseum.uni-tuebingen.de/en/", - "https://www.komoot.com/guide/210692/attractions-around-tuebingen", - "https://hoelderlinturm.de/english/", "https://www.fsi.uni-tuebingen.de/en/", + "https://studieren.de/international-business-eberhard-karls-universitaet-tuebingen.studienprofil.t-0.a-68.c-110.html", + "https://www.hih-tuebingen.de/en/?no_cache=1" + # Events + "https://www.dai-tuebingen.de/en/", + "https://pintofscience.de/events/tuebingen", + "http://www.tuepedia.de", + "https://hoelderlinturm.de/english/", + "https://www.tuebingen.de/en/leisure-tourism/culture/museums-galleries.html", + "https://www.eventbrite.com/ttd/germany--tübingen/", + # Tourism + "https://www.komoot.com/guide/210692/attractions-around-tuebingen", "https://www.stocherkahnfahrten.com/English/Stocherkahnrennen-English.html", "https://www.germany.travel/en/cities-culture/tuebingen.html", "https://justinpluslauren.com/things-to-do-in-tubingen-germany/", "https://www.yelp.de/search?find_desc=&find_loc=Tübingen%2C+Baden-Württemberg", "https://www.tripadvisor.com/Tourism-g198539-Tubingen_Baden_Wurttemberg-Vacations.html", ] +# Language detector LANG_DETECTOR = LanguageDetector() +# Ignore errors +SILENT_ERRORS = False + + +def log_error(error_msg): + """ + Prints an error message if SILENT_ERRORS is False. + Args: + error_msg: The error message to print. 
+ + Returns: + + """ + if not SILENT_ERRORS: + logging.error(error_msg) class Crawler(PipelineElement): @@ -53,27 +79,32 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection): # Initialize the crawler state self.urls_crawled = set() self.ignore_links = set() - self.to_crawl = collections.deque(SEEDS) - self.to_crawl_set = set(self.to_crawl) + self.to_crawl_queue = collections.deque(SEEDS) + self.to_crawl_set = set(self.to_crawl_queue) + self._page_count = 0 # Load the global state self._load_state() - # Internal state - self._connector = aiohttp.TCPConnector(limit=50, limit_per_host=10) - self._page_count = 0 - - self.max_size = 1000 # Example maximum size + # Crawler configuration + self.timeout = 10 # Timeout in seconds + self.max_retries = 3 # Maximum number of retries + self.retry_delay = 1 # Delay between retries in seconds + self.max_size = 1000 # Maximum number of pages to crawl + self.no_dynamic_content = False # Disable dynamic content handling (Playwright) self.ignore_domains = ["github.com", "linkedin.com", "xing.com", "instagram.com", "twitter.com", "youtube.com", "de.wikipedia.org", "wikipedia.org", "google.com", "google.de", "google.co.uk", - "amazon.com", "cctue.de", "spotify.com"] + "pinterest.com", "amazon.com", "cctue.de", "spotify.com"] self.langs = ["en", "en-de", "eng", "en-GB", "en-US", "english"] self.required_keywords = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"] - self.user_agents = [("Modern Search Engines University of Tuebingen Project Crawler (" - "https://uni-tuebingen.de/de/262377)"), - ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/58.0.3029.110 Safari/537.3")] - + self.user_agents = [ + "University of Tuebingen Student Web Crawler Project (https://uni-tuebingen.de/de/262377; contact: " + "webmaster@uni-tuebingen.de)", + "Mozilla/5.0 (compatible; TuebingenUniBot/1.0; +https://uni-tuebingen.de/de/262377)", + "Tuebingen University Research Crawler/1.0 (+https://uni-tuebingen.de/de/262377; Academic purposes only)", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 " + "Safari/537.36 (Tuebingen University Web Crawling Project)" + ] self.headers = { "User-Agent": self.user_agent, "Accept-Language": "en-US,en;q=0.9,de;q=0.8", @@ -84,54 +115,23 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection): def __del__(self) -> None: self.cursor.close() - async def fetch(self, session, url): - headers = { - "User-Agent": self.user_agent, - "Accept-Language": "en-US,en;q=0.9,de;q=0.8", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - } - try: - async with session.get(url, timeout=5, headers=headers) as response: - return await response.text() - except Exception as e: - print(f"Error fetching {url}: {e}") - return None - - @staticmethod - async def _fetch_with_playwright(url, max_retries=3): - for attempt in range(max_retries): - try: - async with async_playwright() as p: - browser = await p.chromium.launch() - page = await browser.new_page() - await page.goto(url, wait_until='networkidle') - content = await page.content() - await browser.close() - return content - except Exception as e: - print(f"Error on attempt {attempt + 1} for {url}: {e}") - if attempt == max_retries - 1: - print(f"Max retries reached for {url}") - return None - - @staticmethod - def _needs_javascript_rendering(html: str) -> bool: - # Check for JavaScript frameworks - if any(framework in html.lower() for framework in ['react', 'vue', 'angular']): - return True + # 
Internal state + self._page_count = 0 - return False + @property + def user_agent(self): + # Cycle through user agents + return self.user_agents[self._page_count % len(self.user_agents)] async def process(self): - async with ClientSession(connector=self._connector) as session: - while not self.is_shutdown() and self.to_crawl and len(self.urls_crawled) < self.max_size: - # Process multiple links concurrently + connector = TCPConnector(limit=100, force_close=True, enable_cleanup_closed=True) + async with ClientSession(connector=connector, trust_env=True) as session: + while not self.is_shutdown() and self.to_crawl_queue and len(self.urls_crawled) < self.max_size: tasks = [] - for _ in range(min(10, len(self.to_crawl))): # Process up to 10 links at a time - if self.to_crawl and len(self.urls_crawled) < self.max_size: - link = self.to_crawl.popleft() - task = asyncio.create_task(self._handle_link(session, link)) + for _ in range(min(10, len(self.to_crawl_queue))): + if self.to_crawl_queue and len(self.urls_crawled) < self.max_size: + url = self.to_crawl_queue.popleft() + task = asyncio.create_task(self._process_url(session, url)) tasks.append(task) else: break @@ -140,74 +140,137 @@ async def process(self): break # Wait for all tasks to complete or for shutdown - done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) + completed, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) - # Process completed tasks - for task in done: + for task in completed: try: await task except Exception as e: - print(f"Error processing link: {e}") + log_error(f"Unhandled exception in task: {e}") - # Check for shutdown if self.is_shutdown(): - # Cancel pending tasks for task in pending: task.cancel() - # Wait for cancellation to complete await asyncio.gather(*pending, return_exceptions=True) self.save_state() break - async def _handle_link(self, session, link): + # If there are still pending tasks, add them back to the queue + for task in pending: + url = task.get_coro().cr_frame.f_locals.get('url') + if url: + self.to_crawl_queue.appendleft(url) + task.cancel() + + logging.info("Crawler finished processing") + + async def _process_url(self, session, url: str): + """ + Crawls a URL and processes the content. 
+ Args: + session: + url: + + Returns: + + """ if len(self.urls_crawled) >= self.max_size: - print("Maximum size reached") + logging.info("Maximum size reached") return - print(f"Crawler crawling {link}...") - - if not link.startswith("http"): - print(f"Invalid URL: {link}") + if not url.startswith("http"): + logging.info(f"Invalid URL: {url}") return - if any(domain in link for domain in self.ignore_domains): - print(f"Ignoring {link} because it is in the ignore domains list") - self.ignore_links.add(link) + if any(domain in url for domain in self.ignore_domains): + logging.info(f"Ignoring {url} because it is in the ignore domains list") + self.ignore_links.add(url) return - if link in self.ignore_links or link in self.urls_crawled: - print(f"Ignoring {link} because it is in the ignore or found list") + if url in self.ignore_links or url in self.urls_crawled: + logging.info(f"Ignoring {url} because it is in the ignore or found list") return - if not check_robots(link): - print(f"Ignoring {link} because it is disallowed by robots.txt") - self.ignore_links.add(link) + if not check_robots(url): + logging.info(f"Ignoring {url} because it is disallowed by robots.txt") + self.ignore_links.add(url) return - html_content = await self.fetch(session, link) + html_content = await self._fetch(session, url) if html_content is None: - print(f"Error fetching {link}") - self.ignore_links.add(link) + logging.info(f"Error fetching {url}") + self.ignore_links.add(url) + return + + try: + soup = BeautifulSoup(html_content, "lxml") + text = soup.get_text().lower() + except Exception as e: + log_error(f"Error parsing {url}: {e}") + self.ignore_links.add(url) return - soup = BeautifulSoup(html_content, "lxml") - text = soup.get_text().lower() + if not text or not soup: + logging.info(f"Ignoring {url} because it is empty") + self.ignore_links.add(url) + return check_html_tag_lang = soup.find("html").get("lang") in self.langs check_xml_tag_lang = soup.find("html").get("xml:lang") in self.langs - check_link_lang = any(split == lang for split in link.split("/") for lang in self.langs) + check_link_lang = any(split == lang for split in url.split("/") for lang in self.langs) check_text_lang = LANG_DETECTOR.detect(text) in self.langs if not check_html_tag_lang and not check_xml_tag_lang and not check_link_lang and not check_text_lang: - print(f"Ignoring {link} because it is not in the correct language") - self.ignore_links.add(link) + logging.info(f"Ignoring {url} because it is not in the correct language") + self.ignore_links.add(url) return if not any(keyword in text for keyword in self.required_keywords): - print(f"Ignoring {link} because it does not contain the required keywords") - self.ignore_links.add(link) + logging.info(f"Ignoring {url} because it does not contain the required keywords") + self.ignore_links.add(url) return + # Handle links + await self._handle_links(soup, url) + + # Handle buttons + #await self._handle_dynamic_content(url) + + if url not in self.urls_crawled and url not in self.ignore_links: + self.urls_crawled.add(url) + + logging.info(f"Finished crawling {url}. Total: {len(self.urls_crawled)} links.") + if not self.is_shutdown(): + await self.call_next(soup, url) + + async def _handle_dynamic_links(self, page: Page, url: str): + """ + Handles the links on a page using Playwright. 
+ Args: + page: + url: + + Returns: + + """ + try: + content = await page.content() + soup = BeautifulSoup(content, "lxml") + await self._handle_links(soup, url) + except Exception as e: + log_error(f"Error handling links on {url}: {e}") + + async def _handle_links(self, soup, url): + """ + Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the + found list, and not in the to_crawl_set. + Args: + soup: BeautifulSoup object + url: URL of the page + + Returns: + + """ for a_tag in soup.find_all("a", href=True): found_link = a_tag.get("href") @@ -217,10 +280,10 @@ async def _handle_link(self, session, link): # Check if link is relative if found_link.startswith("/"): - base_url = get_base_url(link) + base_url = get_base_url(url) found_link = get_full_url(base_url, found_link) elif found_link.startswith("../"): - base_url = get_base_url(link) + base_url = get_base_url(url) found_link = get_full_url(base_url, found_link) # Check if link is an email @@ -242,15 +305,146 @@ async def _handle_link(self, session, link): and found_link not in self.urls_crawled and found_link not in self.to_crawl_set and found_link.startswith("http")): - self.to_crawl.append(found_link) + self.to_crawl_queue.append(found_link) self.to_crawl_set.add(found_link) - if link not in self.urls_crawled and link not in self.ignore_links: - self.urls_crawled.add(link) + async def _handle_dynamic_content(self, url: str): + """ + Handles dynamic content on a page. This is useful for websites that require JavaScript rendering. + Args: + url: - print(f"Finished crawling {link}. Total: {len(self.urls_crawled)} links.") - if not self.is_shutdown(): - await self.call_next(soup, link) + Returns: + + """ + try: + await self._fetch_with_playwright(url, [self._handle_dynamic_buttons]) + logging.info(f"Finished handling dynamic content on {url}. 
Total: {len(self.urls_crawled)} links.") + except Exception as e: + log_error(f"Error handling dynamic content: {e}") + + async def _handle_dynamic_buttons(self, page: Page, url: str, ignore_names=None): + if ignore_names is None: + ignore_names = ['search', 'submit', 'login', 'register', 'sign in', 'sign up', 'accept', 'close', 'agree'] + try: + buttons = await page.query_selector_all("button:visible:not(:disabled)") + if not buttons: + return + logging.info(f"Found {len(buttons)} clickable buttons on {url}") + + for button_index, button in enumerate(buttons): + if self.is_shutdown(): + logging.info("Shutdown signal received during button handling.") + return + try: + if page.is_closed(): + logging.info("Page was closed, stopping button handling.") + return + + await button.scroll_into_view_if_needed() + button_text = await button.text_content() + if any(ignore_name.lower() in button_text.lower() for ignore_name in ignore_names): + continue + + try: + async with page.expect_navigation(wait_until="networkidle", timeout=5000): + await button.click(timeout=5000) + except PlaywrightTimeoutError: + log_error(f"Navigation timeout after clicking button {button_index} on {url}") + continue + except TargetClosedError: + log_error(f"Page was closed while clicking button {button_index} on {url}") + return + + new_content = await page.content() + soup = BeautifulSoup(new_content, "lxml") + await self._handle_links(soup, url) + + except TargetClosedError: + log_error(f"Page was closed while handling button {button_index} on {url}") + return + except Exception as e: + log_error(f"Error handling button {button_index} on {url}: {e}") + except TargetClosedError: + log_error(f"Page was closed during button discovery on {url}") + except Exception as e: + log_error(f"Error during button discovery on {url}: {e}") + + async def _fetch(self, session, url: str) -> str or None: + """ + Fetches the content of a URL using the given session. 
+ Args: + session: + url: + + Returns: the HTML content of the URL + """ + + max_retries = self.max_retries + retry_delay = self.retry_delay + + self._page_count += 1 + for attempt in range(max_retries): + logging.info(f"Fetching {url} (attempt {attempt + 1}/{max_retries})" if attempt > 0 else f"Fetching {url}") + try: + async with session.get(url, timeout=self.timeout, headers=self.headers) as response: + response.raise_for_status() + html_text = await response.text() + return html_text + except (TimeoutError, ClientError) as e: + if attempt == max_retries - 1: + log_error(f"Failed to process {url} after {max_retries} attempts: {str(e)}") + return + # Exponential wait time + await asyncio.sleep(retry_delay * (2 ** attempt)) + except Exception as e: + log_error(f"Error fetching {url}: {e}") + return None + + async def _fetch_with_playwright(self, url: str, callbacks: list[callable] = None): + max_retries = self.max_retries + retry_delay = self.retry_delay + + for attempt in range(max_retries): + if self.is_shutdown(): + logging.info("Shutdown signal received before starting fetch.") + return + + logging.info(f"Fetching {url} with playwright (attempt {attempt + 1}/{max_retries})" if attempt > 0 + else f"Fetching {url} with playwright") + + try: + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + try: + page = await browser.new_page() + try: + await page.goto(url, wait_until='networkidle', timeout=30000) + + if self.is_shutdown(): + logging.info("Shutdown signal received after page load.") + return + + if callbacks: + for callback in callbacks: + await callback(page, url) + + return # Exit after successful fetch + finally: + await page.close() + finally: + await browser.close() + except Exception as e: + log_error(f"Error fetching {url} with playwright (attempt {attempt + 1}/{max_retries}): {e}") + if attempt == max_retries - 1: + log_error(f"Failed to process {url} after {max_retries} attempts.") + return + # Exponential wait time + await asyncio.sleep(retry_delay * (2 ** attempt)) + + if self.is_shutdown(): + logging.info("Shutdown signal received during retry delay.") + return def save_state(self): """ @@ -264,7 +458,7 @@ def save_state(self): with open(f"crawler_states/global.json", "w") as f: # Write it as json f.write(json.dumps({ - "to_crawl": list(self.to_crawl), + "to_crawl": list(self.to_crawl_queue), "ignore_links": list(self.ignore_links), "found_links": list(self.urls_crawled) })) @@ -275,13 +469,13 @@ def _load_state(self): """ if not os.path.exists(f"crawler_states/global.json"): - print("No global state found") - self.to_crawl = collections.deque(SEEDS) + logging.info("No global state found") + self.to_crawl_queue = collections.deque(SEEDS) return with open(f"crawler_states/global.json", "r") as f: data = json.loads(f.read()) - self.to_crawl = collections.deque(data["to_crawl"]) + self.to_crawl_queue = collections.deque(data["to_crawl"]) self.to_crawl_set = set(data["to_crawl"]) self.ignore_links = set(data["ignore_links"]) self.urls_crawled = set(data["found_links"]) From 4d8db248012e92547bca60426c7a2bf8b2e54edb Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:54:06 +0200 Subject: [PATCH 2/7] Update tokenizer --- engine/custom_tokenizer.py | 58 ++++++++++++++++++++++++++++---------- engine/utils.py | 26 +++++++++++++++++ 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/engine/custom_tokenizer.py b/engine/custom_tokenizer.py index 9457ac7..fff429a 100644 --- 
a/engine/custom_tokenizer.py +++ b/engine/custom_tokenizer.py @@ -1,3 +1,5 @@ +import logging + import nltk as nltk from sklearn.feature_extraction.text import TfidfVectorizer import re @@ -9,11 +11,16 @@ from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index from pipeline import PipelineElement +from utils import safe_join, safe_str + +WN_LEMMATIZER = nltk.stem.WordNetLemmatizer() +STEMMER = nltk.stem.PorterStemmer() def remove_punctuations(text): - punct_tag = re.compile(r'[^\w\s]') - text = punct_tag.sub(r'', text) + # Remove punctuations + punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]') + text = punctuations.sub(r'', text) return text @@ -51,11 +58,17 @@ def tokenize_plain_words(words: str): return words.split() -def stem_and_remove_stopwords(words) -> list[str]: - # use english porterStemmer +def stem(words) -> list[str]: + words = [STEMMER.stem(word) for word in words] # added stemmer + return words + + +def remove_stopwords(words): + return [word for word in words if word not in stopwords.words("english")] + - stemmer = nltk.stem.porter.PorterStemmer() - words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer +def lemmatize(words): + words = [WN_LEMMATIZER.lemmatize(word) for word in words] return words @@ -63,8 +76,8 @@ def tokenize_data(data) -> list[str]: """ Tokenizes the input data. """ - pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, - stem_and_remove_stopwords] + pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords, + lemmatize] for pipe in pipeline: data = pipe(data) return data @@ -91,10 +104,10 @@ def top_30_words(data): X = vectorizer.fit_transform(data) # Get the feature names feature_names = vectorizer.get_feature_names_out() - print(f"Feature names: {feature_names}") - print(f"X sieht so aus: {X}") - print(f"Shape of X: {X.shape}") - print(f"Summe: {X.sum(axis=0)}") + logging.info(f"Feature names: {feature_names}") + logging.info(f"X sieht so aus: {X}") + logging.info(f"Shape of X: {X.shape}") + logging.info(f"Summe: {X.sum(axis=0)}") top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30] return top_30_words @@ -108,18 +121,33 @@ async def process(self, data, link): Tokenizes the input data. 
""" + if data is None: + logging.info(f"Failed to tokenize {link} because the data was empty.") + return + soup = data + + # Get the text from the page text = soup.get_text() - img_tags = soup.findAll("img") + + # Get the meta description and title description = soup.find("meta", attrs={"name": "description"}) description_content = description.get("content") if description is not None else "" title = soup.find("title") title_content = title.string if title is not None else "" + # Get the alt texts from the images + img_tags = soup.findAll("img") alt_texts = [img.get("alt") for img in img_tags] - text = text + " ".join(alt_texts) + " " + str(description_content) + " " + str(title_content) + # Join all the text together + alt_texts_str = safe_join(alt_texts) + description_str = safe_str(description_content) + title_str = safe_str(title_content) + text = f"{text} {alt_texts_str} {description_str} {title_str}".strip() + + # Tokenize the text tokenized_text = tokenize_data(data=text) add_tokens_to_index(url=link, tokenized_text=tokenized_text) - print(f"Tokenized text for {link}") + logging.info(f"Tokenized text for {link}") diff --git a/engine/utils.py b/engine/utils.py index dbefc6d..8755602 100644 --- a/engine/utils.py +++ b/engine/utils.py @@ -71,3 +71,29 @@ def check_robots(url: str) -> bool: except: return True return rp.can_fetch("*", url) + + +def safe_join(items): + """ + Safely joins a list of items into a string, separating them with a space. + If an item is None, it is skipped. + + Args: + items: + + Returns: + + """ + return " ".join(str(item) for item in items if item is not None) + + +def safe_str(item): + """ + Safely converts an item to a string. If the item is None, an empty string is returned. + Args: + item: + + Returns: + + """ + return str(item) if item is not None else "" From 628d364a6a36e6835877acca008fe00c440aa0f3 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:54:48 +0200 Subject: [PATCH 3/7] Improve logging --- engine/custom_db.py | 13 +++++++------ engine/index.py | 8 +++++++- engine/main.py | 21 ++++++++++++++------- engine/pipeline.py | 7 ++++--- engine/test.py | 3 ++- 5 files changed, 34 insertions(+), 18 deletions(-) diff --git a/engine/custom_db.py b/engine/custom_db.py index 96640b5..855b19e 100644 --- a/engine/custom_db.py +++ b/engine/custom_db.py @@ -1,3 +1,4 @@ +import logging import os import pandas as pd @@ -61,7 +62,7 @@ def add_tokens_to_index(url: str, tokenized_text: list[str]): if not pages_df[pages_df['id'] == page_id].empty: pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'tokenized_text'] = tokenized_text else: - print(f"Page with ID {page_id} not found") + logging.info(f"Page with ID {page_id} not found") def add_title_to_index(url: str, title: str): @@ -80,7 +81,7 @@ def add_title_to_index(url: str, title: str): if not pages_df[pages_df['id'] == page_id].empty: pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'title'] = title else: - print(f"Page with ID {page_id} not found") + logging.info(f"Page with ID {page_id} not found") def add_snippet_to_index(url, snippet): @@ -99,7 +100,7 @@ def add_snippet_to_index(url, snippet): if not pages_df[pages_df['url'] == url].empty: pages_df.at[pages_df[pages_df['url'] == url].index[0], 'snippet'] = snippet else: - print(f"Page with URL {url} not found") + logging.info(f"Page with URL {url} not found") def get_tokens() -> list[list[str]]: @@ -162,17 +163,17 @@ def load_pages() -> pd.DataFrame: # Check if the file exists if not 
os.path.exists(f"pages.csv"): - print("No pages found") + logging.info("No pages found") return pages_df try: pages_df = pd.read_csv("pages.csv", header=0) except pd.errors.EmptyDataError: - print("No pages found") + logging.info("No pages found") return pages_df # Convert the tokenized_text column to a list of lists pages_df['tokenized_text'] = pages_df['tokenized_text'].apply(eval) - print("Loaded pages") + logging.info("Loaded pages") return pages_df diff --git a/engine/index.py b/engine/index.py index 5b9f1cb..a125f96 100644 --- a/engine/index.py +++ b/engine/index.py @@ -1,3 +1,5 @@ +import logging + from custom_db import upsert_page_to_index, add_title_to_index, add_snippet_to_index, load_pages from pipeline import PipelineElement @@ -17,6 +19,10 @@ async def process(self, data, link): Indexes the input data. """ + if data is None: + logging.info(f"Failed to index {link} because the data was empty.") + return + soup = data # Title @@ -32,7 +38,7 @@ async def process(self, data, link): add_title_to_index(url=link, title=title_content) add_snippet_to_index(url=link, snippet=description_content) - print(f"Indexed {link}") + logging.info(f"Indexed {link}") if not self.is_shutdown(): await self.call_next(soup, link) diff --git a/engine/main.py b/engine/main.py index a1c1802..8ce2fd3 100644 --- a/engine/main.py +++ b/engine/main.py @@ -4,7 +4,9 @@ from concurrent.futures import ThreadPoolExecutor import asyncio import nest_asyncio - +import signal +# Logging +import logging # Database import duckdb # Pipeline @@ -13,9 +15,14 @@ from custom_tokenizer import Tokenizer from index import Indexer -# Constants -MAX_THREADS = 10 +# Logging setup +logging.basicConfig( + format='%(asctime)s %(levelname)-8s %(message)s', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S') +# Threading +MAX_THREADS = 10 # Patch asyncio to allow nested event loops nest_asyncio.apply() @@ -36,7 +43,7 @@ def signal_handler(signum, frame): - print("Interrupt received, shutting down... Please wait") + logging.info("Interrupt received, shutting down... Please wait. 
This may take a few seconds.") for element in [crawler, indexer, tokenizer]: element.shutdown() @@ -57,7 +64,7 @@ async def main(): try: await crawler.process() except Exception as e: - print(f"An error occurred: {e}") + logging.info(f"An error occurred: {e}") finally: # Ensure states are saved even if an exception occurs for element in [crawler, indexer, tokenizer]: @@ -67,7 +74,7 @@ async def main(): index_df = access_index() index_df.to_csv("inverted_index.csv") con.close() - print("State saved") + logging.info("State saved") # Save the state+ for element in [crawler, indexer, tokenizer]: @@ -77,7 +84,7 @@ async def main(): index_df = access_index() index_df.to_csv("inverted_index.csv") con.close() - print("State saved") + logging.info("State saved") if __name__ == "__main__": diff --git a/engine/pipeline.py b/engine/pipeline.py index 37ba1b4..f1ae453 100644 --- a/engine/pipeline.py +++ b/engine/pipeline.py @@ -1,4 +1,5 @@ import asyncio +import logging import threading @@ -9,7 +10,7 @@ def __init__(self, name): self.executor = None self.tasks = [] self.shutdown_flag = threading.Event() - print(f"Initialized {self.name}") + logging.info(f"Initialized {self.name}") def add_executor(self, executor): self.executor = executor @@ -25,10 +26,10 @@ def add_next(self, next_element): async def call_next(self, *args): if not self.next: - print(f"No next elements for {self.name}") + logging.info(f"No next elements for {self.name}") return # No next elements to process - print(f"Processing next elements for {self.name}") + logging.info(f"Processing next elements for {self.name}") tasks = [] for element in self.next: if asyncio.iscoroutinefunction(element.process): diff --git a/engine/test.py b/engine/test.py index 389ca59..08bed22 100644 --- a/engine/test.py +++ b/engine/test.py @@ -1,8 +1,9 @@ # file to test the written functions +import logging from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words CUSTOM_TEXT = "Lorem Ipsum is simply dummy text" + " " + " \n "+ "of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum." 
top_30_words = top_30_words([CUSTOM_TEXT]) -print(top_30_words) +logging.info(top_30_words) From b3b1c3697b50ecd48bc7aebfea847145523104ed Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:54:53 +0200 Subject: [PATCH 4/7] Add queries --- engine/queries.txt | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) diff --git a/engine/queries.txt b/engine/queries.txt index f2543dd..3429346 100644 --- a/engine/queries.txt +++ b/engine/queries.txt @@ -1,41 +1,2 @@ -tübingen -tübingen university -tübingen attractions -food and drinks -tübingen weather -tübingen hotels -tübingen traditional food -tübingen coffee shops -tübingen nightlife spots -tübingen museums -tübingen castles -tübingen outdoor activities -tübingen nightlife -tübingen markets -tübingen shopping centers -tübingen local products -Best cafes in Tübingen for students -Upcoming events at the University of Tübingen -History of Tübingen's old town -Popular hiking trails near Tübingen -Tübingen student housing options -Vegan and vegetarian restaurants in Tübingen -Cultural activities in Tübingen -Tübingen public transportation map -University of Tübingen research departments -Tübingen nightlife spots -Bookstores in Tübingen -Tübingen local farmers' markets -Tübingen weather forecast -Student discounts in Tübingen -Tübingen library hours and services -Language exchange programs in Tübingen -Top tourist attractions in Tübingen -Cycling routes in Tübingen -Tübingen sports clubs and gyms -Tübingen local festivals and fairs -Best places to study in Tübingen -Tübingen historical landmarks -Tübingen university application process -Local art galleries in Tübingen -Tübingen second-hand stores +1 tübingen attractions +2 food and drinks \ No newline at end of file From 0fcb896f1250ee45e6af822120371159d3bc3ca1 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:54:59 +0200 Subject: [PATCH 5/7] Add certifi --- engine/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/engine/requirements.txt b/engine/requirements.txt index e850b9a..36f516a 100644 --- a/engine/requirements.txt +++ b/engine/requirements.txt @@ -1,6 +1,7 @@ # Automatically generated by https://github.com/damnever/pigar. 
beautifulsoup4==4.12.3 +certifi==2024.7.4 duckdb==1.0.0 eld==1.0.6 Flask==3.0.3 From cd2201f5c0d4952ce30ef3543beca62ca725cdaf Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:57:37 +0200 Subject: [PATCH 6/7] Update query format --- engine/queries.txt | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/engine/queries.txt b/engine/queries.txt index 3429346..40bab39 100644 --- a/engine/queries.txt +++ b/engine/queries.txt @@ -1,2 +1,41 @@ -1 tübingen attractions -2 food and drinks \ No newline at end of file +1 tübingen +2 tübingen university +3 tübingen attractions +4 food and drinks +5 tübingen weather +6 tübingen hotels +7 tübingen traditional food +8 tübingen coffee shops +9 tübingen nightlife spots +10 tübingen museums +11 tübingen castles +12 tübingen outdoor activities +13 tübingen nightlife +14 tübingen markets +15 tübingen shopping centers +16 tübingen local products +17 Best cafes in Tübingen for students +18 Upcoming events at the University of Tübingen +19 History of Tübingen's old town +20 Popular hiking trails near Tübingen +21 Tübingen student housing options +22 Vegan and vegetarian restaurants in Tübingen +23 Cultural activities in Tübingen +24 Tübingen public transportation map +25 University of Tübingen research departments +26 Tübingen nightlife spots +27 Bookstores in Tübingen +28 Tübingen local farmers' markets +29 Tübingen weather forecast +30 Student discounts in Tübingen +31 Tübingen library hours and services +32 Language exchange programs in Tübingen +33 Top tourist attractions in Tübingen +34 Cycling routes in Tübingen +35 Tübingen sports clubs and gyms +36 Tübingen local festivals and fairs +37 Best places to study in Tübingen +38 Tübingen historical landmarks +39 Tübingen university application process +40 Local art galleries in Tübingen +41 Tübingen second-hand stores From 3f835bc73c41af2e17cba04744f25054035b003d Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 22:04:51 +0200 Subject: [PATCH 7/7] Remove code that handles dynamic stuff. If we still need it, it's in commit 94edd89c347bfb9e56c8fb65127f4c37a187477b --- engine/crawl.py | 127 ------------------------------------------------ 1 file changed, 127 deletions(-) diff --git a/engine/crawl.py b/engine/crawl.py index bae7cea..b63d048 100644 --- a/engine/crawl.py +++ b/engine/crawl.py @@ -233,9 +233,6 @@ async def _process_url(self, session, url: str): # Handle links await self._handle_links(soup, url) - # Handle buttons - #await self._handle_dynamic_content(url) - if url not in self.urls_crawled and url not in self.ignore_links: self.urls_crawled.add(url) @@ -243,23 +240,6 @@ async def _process_url(self, session, url: str): if not self.is_shutdown(): await self.call_next(soup, url) - async def _handle_dynamic_links(self, page: Page, url: str): - """ - Handles the links on a page using Playwright. 
- Args: - page: - url: - - Returns: - - """ - try: - content = await page.content() - soup = BeautifulSoup(content, "lxml") - await self._handle_links(soup, url) - except Exception as e: - log_error(f"Error handling links on {url}: {e}") - async def _handle_links(self, soup, url): """ Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the @@ -308,68 +288,6 @@ async def _handle_links(self, soup, url): self.to_crawl_queue.append(found_link) self.to_crawl_set.add(found_link) - async def _handle_dynamic_content(self, url: str): - """ - Handles dynamic content on a page. This is useful for websites that require JavaScript rendering. - Args: - url: - - Returns: - - """ - try: - await self._fetch_with_playwright(url, [self._handle_dynamic_buttons]) - logging.info(f"Finished handling dynamic content on {url}. Total: {len(self.urls_crawled)} links.") - except Exception as e: - log_error(f"Error handling dynamic content: {e}") - - async def _handle_dynamic_buttons(self, page: Page, url: str, ignore_names=None): - if ignore_names is None: - ignore_names = ['search', 'submit', 'login', 'register', 'sign in', 'sign up', 'accept', 'close', 'agree'] - try: - buttons = await page.query_selector_all("button:visible:not(:disabled)") - if not buttons: - return - logging.info(f"Found {len(buttons)} clickable buttons on {url}") - - for button_index, button in enumerate(buttons): - if self.is_shutdown(): - logging.info("Shutdown signal received during button handling.") - return - try: - if page.is_closed(): - logging.info("Page was closed, stopping button handling.") - return - - await button.scroll_into_view_if_needed() - button_text = await button.text_content() - if any(ignore_name.lower() in button_text.lower() for ignore_name in ignore_names): - continue - - try: - async with page.expect_navigation(wait_until="networkidle", timeout=5000): - await button.click(timeout=5000) - except PlaywrightTimeoutError: - log_error(f"Navigation timeout after clicking button {button_index} on {url}") - continue - except TargetClosedError: - log_error(f"Page was closed while clicking button {button_index} on {url}") - return - - new_content = await page.content() - soup = BeautifulSoup(new_content, "lxml") - await self._handle_links(soup, url) - - except TargetClosedError: - log_error(f"Page was closed while handling button {button_index} on {url}") - return - except Exception as e: - log_error(f"Error handling button {button_index} on {url}: {e}") - except TargetClosedError: - log_error(f"Page was closed during button discovery on {url}") - except Exception as e: - log_error(f"Error during button discovery on {url}: {e}") - async def _fetch(self, session, url: str) -> str or None: """ Fetches the content of a URL using the given session. 
@@ -401,51 +319,6 @@ async def _fetch(self, session, url: str) -> str or None: log_error(f"Error fetching {url}: {e}") return None - async def _fetch_with_playwright(self, url: str, callbacks: list[callable] = None): - max_retries = self.max_retries - retry_delay = self.retry_delay - - for attempt in range(max_retries): - if self.is_shutdown(): - logging.info("Shutdown signal received before starting fetch.") - return - - logging.info(f"Fetching {url} with playwright (attempt {attempt + 1}/{max_retries})" if attempt > 0 - else f"Fetching {url} with playwright") - - try: - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - try: - page = await browser.new_page() - try: - await page.goto(url, wait_until='networkidle', timeout=30000) - - if self.is_shutdown(): - logging.info("Shutdown signal received after page load.") - return - - if callbacks: - for callback in callbacks: - await callback(page, url) - - return # Exit after successful fetch - finally: - await page.close() - finally: - await browser.close() - except Exception as e: - log_error(f"Error fetching {url} with playwright (attempt {attempt + 1}/{max_retries}): {e}") - if attempt == max_retries - 1: - log_error(f"Failed to process {url} after {max_retries} attempts.") - return - # Exponential wait time - await asyncio.sleep(retry_delay * (2 ** attempt)) - - if self.is_shutdown(): - logging.info("Shutdown signal received during retry delay.") - return - def save_state(self): """ Saves the global state to a file.
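
The retry loop that patch 1 adds to Crawler._fetch (and that patch 7 keeps) sleeps retry_delay * (2 ** attempt) seconds between attempts, i.e. roughly 1 s, 2 s, 4 s with the defaults configured in __init__. Below is a minimal standalone sketch of that exponential-backoff fetch pattern, assuming aiohttp is available; the helper name fetch_with_backoff, its default values, and the example URL are illustrative and not taken from the patch itself.

# Sketch of the exponential-backoff retry pattern used by Crawler._fetch.
# Names and defaults here are hypothetical; only the backoff formula
# (retry_delay * 2 ** attempt) mirrors the patch above.
import asyncio
import logging
from typing import Optional

import aiohttp


async def fetch_with_backoff(session: aiohttp.ClientSession, url: str,
                             max_retries: int = 3, retry_delay: float = 1.0,
                             timeout: float = 10.0) -> Optional[str]:
    """Fetch a URL, retrying transient failures with exponential backoff."""
    for attempt in range(max_retries):
        try:
            async with session.get(
                    url, timeout=aiohttp.ClientTimeout(total=timeout)) as response:
                response.raise_for_status()
                return await response.text()
        except (asyncio.TimeoutError, aiohttp.ClientError) as e:
            if attempt == max_retries - 1:
                logging.error(f"Failed to fetch {url} after {max_retries} attempts: {e}")
                return None
            # Wait 1 s, 2 s, 4 s, ... before the next attempt.
            await asyncio.sleep(retry_delay * (2 ** attempt))
    return None


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        html = await fetch_with_backoff(session, "https://www.tuebingen.de/en/")
        print(len(html) if html else "fetch failed")


if __name__ == "__main__":
    asyncio.run(main())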