From 94342f368338446b199ffcaab6cec6534cb931f8 Mon Sep 17 00:00:00 2001 From: okihnjo Date: Mon, 15 Jul 2024 15:30:38 +0200 Subject: [PATCH 01/15] introduced ranking by simply summing up the tf-idf vals. Query, X and subset of documents are used --- engine/custom_rank.py | 108 +++++++++++++++++++++++++++++++++++++ engine/custom_tokenizer.py | 8 ++- 2 files changed, 114 insertions(+), 2 deletions(-) create mode 100644 engine/custom_rank.py diff --git a/engine/custom_rank.py b/engine/custom_rank.py new file mode 100644 index 0000000..8a492fe --- /dev/null +++ b/engine/custom_rank.py @@ -0,0 +1,108 @@ +import pandas as pd +from custom_tokenizer import tokenize_data, tf_idf_vectorize +from sklearn.feature_extraction.text import TfidfVectorizer +def preprocess_query(Q): + tokenized_query = tokenize_data(Q) + return tokenized_query + + +def find_intersection_2(Q): + df_inverted = pd.read_csv("engine/inverted_index.csv",sep=",", index_col=1) + df_inverted.drop(columns=["Unnamed: 0"], inplace=True) + + # df_inverted.set_index("word", inplace=True) + print(df_inverted.columns) + print(df_inverted.head()) + tokenized_query = preprocess_query(Q) + print(tokenized_query) + result = [] + for token in tokenized_query: + if token in df_inverted.word.values: + print(f"Found token: {token}") + doc_ids = df_inverted[df_inverted["word"]==token]["doc_ids"].apply(eval) + print(f"It has {len(doc_ids)} doc_ids") + result.append(doc_ids) + # print(f"result: {result}") + # find intersection of all lists in result + intersection = set(result[0]).intersection(*result) + return intersection + + + +def find_documents(Q): + df_inverted = pd.read_csv("engine/inverted_index.csv", converters={'doc_ids': pd.eval}) + df_inverted.set_index("word", inplace=True) + df_inverted.drop(columns=["Unnamed: 0"], inplace=True) + + + + print(df_inverted.head()) + tokenized_query = preprocess_query(Q) + print(df_inverted.index.values) + result = [] + for token in tokenized_query: + if token in df_inverted.index.values: + print(f"Found token: {token}") + doc_ids = df_inverted.loc[token].doc_ids + print(f"It has {len(doc_ids)} doc_ids") + result.append(doc_ids) + # find intersection of all lists in result + intersection = set(result[0]).intersection(*result) + union = set(result[0]).union(*result) + if len(intersection) < 2: + print("No intersection found") + return union + return intersection + +def dummy(tokens): + return tokens + +def generate_tf_idf_matrix(path): + df = pd.read_csv("engine/pages.csv", converters={'tokenized_text': pd.eval}) + df_text = df["tokenized_text"] + # create list of lists containing the tokenized text + tokenized_text = [] + print(type(df_text.values)) + vectorizer = TfidfVectorizer(tokenizer=dummy, preprocessor=dummy) + X = vectorizer.fit_transform(df_text.values) + features = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out()) + print(features) + + return features + +def rank_documents(subset_D, Q, X): + # Filter the DataFrame to include only the documents in subset_D + subset_adj = [x-1 for x in subset_D] + filtered_X = X.loc[list(subset_adj)] # here accessen wir rows + + # Ensure Q is a list of query terms + query_terms = preprocess_query(Q) + query_terms_in_X = [term for term in query_terms if term in X.columns] + # Filter the DataFrame to include only the columns corresponding to the query terms + if not query_terms_in_X: + print("No query terms found in the TF-IDF matrix.") + return pd.DataFrame() + filtered_X_query_terms = filtered_X[query_terms_in_X] # here accessen wir ganze columns + + # Sum the TF-IDF values for each document + filtered_X['sum_tfidf'] = filtered_X_query_terms.sum(axis=1) + + # Rank the documents by the summed TF-IDF values in descending order + ranked_docs = filtered_X.sort_values(by='sum_tfidf', ascending=False) + + return ranked_docs + + + + + + +query = "max animal future" +docs = find_documents(query) +X = generate_tf_idf_matrix('engine/pages.csv') +print(f"Found {len(docs)} documents, they look like this: {docs}") +print(f"Result: {generate_tf_idf_matrix('engine/pages.csv')}") + +ranked_docs = rank_documents(docs, query, X) +best_20_docs = ranked_docs.head(20).index + 1 +print(f"Best 20 docs: {best_20_docs}") diff --git a/engine/custom_tokenizer.py b/engine/custom_tokenizer.py index 9457ac7..891ddf3 100644 --- a/engine/custom_tokenizer.py +++ b/engine/custom_tokenizer.py @@ -69,13 +69,17 @@ def tokenize_data(data) -> list[str]: data = pipe(data) return data - +# Following problem: TFIDF vectorizer nimmt einen ganzen plain text und tokenized ihn dann selbst. Wir haben aber schon fertige tokenized sachen. +# Damit wir den datentypen nicht hin und her und wir unnötig das leben komolziert machen, müssen wir viele steps wie tf idf iund tokenizing direkt nach dem crawlen machen +# ist zwar in der pipeline nicht ganz so schön aber sonst müssen wir vieles doppelt machen und abspeichern +# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html def tf_idf_vectorize(data): """ Vectorizes the input data using the TF-IDF algorithm. """ # Create the vectorizer - vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english") + # vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english") # hier müssen wir schauen was wir für tokenizer machen + vectorizer = TfidfVectorizer() # Vectorize the data X = vectorizer.fit_transform(data) return X From 94edd89c347bfb9e56c8fb65127f4c37a187477b Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:53:38 +0200 Subject: [PATCH 02/15] Experiment with dynamic content and buttons. Add more frontiers. Add retries. Dynamic content handling is disabled since it causes errors. --- engine/crawl.py | 402 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 298 insertions(+), 104 deletions(-) diff --git a/engine/crawl.py b/engine/crawl.py index ede0ec1..bae7cea 100644 --- a/engine/crawl.py +++ b/engine/crawl.py @@ -23,24 +23,50 @@ # Constants # URL seeds to start crawling from SEEDS = [ + # Official "https://www.tuebingen.de/en/", - "https://www.bio.mpg.de/2923/en", + # University "https://www.uni-tuebingen.de/en/", - "http://www.tuepedia.de", + "https://www.bio.mpg.de/2923/en", "https://health-nlp.com/index.html", "https://www.medizin.uni-tuebingen.de/en-de/startseite/", "https://www.my-stuwe.de/en/", "https://www.unimuseum.uni-tuebingen.de/en/", - "https://www.komoot.com/guide/210692/attractions-around-tuebingen", - "https://hoelderlinturm.de/english/", "https://www.fsi.uni-tuebingen.de/en/", + "https://studieren.de/international-business-eberhard-karls-universitaet-tuebingen.studienprofil.t-0.a-68.c-110.html", + "https://www.hih-tuebingen.de/en/?no_cache=1" + # Events + "https://www.dai-tuebingen.de/en/", + "https://pintofscience.de/events/tuebingen", + "http://www.tuepedia.de", + "https://hoelderlinturm.de/english/", + "https://www.tuebingen.de/en/leisure-tourism/culture/museums-galleries.html", + "https://www.eventbrite.com/ttd/germany--tübingen/", + # Tourism + "https://www.komoot.com/guide/210692/attractions-around-tuebingen", "https://www.stocherkahnfahrten.com/English/Stocherkahnrennen-English.html", "https://www.germany.travel/en/cities-culture/tuebingen.html", "https://justinpluslauren.com/things-to-do-in-tubingen-germany/", "https://www.yelp.de/search?find_desc=&find_loc=Tübingen%2C+Baden-Württemberg", "https://www.tripadvisor.com/Tourism-g198539-Tubingen_Baden_Wurttemberg-Vacations.html", ] +# Language detector LANG_DETECTOR = LanguageDetector() +# Ignore errors +SILENT_ERRORS = False + + +def log_error(error_msg): + """ + Prints an error message if SILENT_ERRORS is False. + Args: + error_msg: The error message to print. + + Returns: + + """ + if not SILENT_ERRORS: + logging.error(error_msg) class Crawler(PipelineElement): @@ -53,27 +79,32 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection): # Initialize the crawler state self.urls_crawled = set() self.ignore_links = set() - self.to_crawl = collections.deque(SEEDS) - self.to_crawl_set = set(self.to_crawl) + self.to_crawl_queue = collections.deque(SEEDS) + self.to_crawl_set = set(self.to_crawl_queue) + self._page_count = 0 # Load the global state self._load_state() - # Internal state - self._connector = aiohttp.TCPConnector(limit=50, limit_per_host=10) - self._page_count = 0 - - self.max_size = 1000 # Example maximum size + # Crawler configuration + self.timeout = 10 # Timeout in seconds + self.max_retries = 3 # Maximum number of retries + self.retry_delay = 1 # Delay between retries in seconds + self.max_size = 1000 # Maximum number of pages to crawl + self.no_dynamic_content = False # Disable dynamic content handling (Playwright) self.ignore_domains = ["github.com", "linkedin.com", "xing.com", "instagram.com", "twitter.com", "youtube.com", "de.wikipedia.org", "wikipedia.org", "google.com", "google.de", "google.co.uk", - "amazon.com", "cctue.de", "spotify.com"] + "pinterest.com", "amazon.com", "cctue.de", "spotify.com"] self.langs = ["en", "en-de", "eng", "en-GB", "en-US", "english"] self.required_keywords = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"] - self.user_agents = [("Modern Search Engines University of Tuebingen Project Crawler (" - "https://uni-tuebingen.de/de/262377)"), - ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/58.0.3029.110 Safari/537.3")] - + self.user_agents = [ + "University of Tuebingen Student Web Crawler Project (https://uni-tuebingen.de/de/262377; contact: " + "webmaster@uni-tuebingen.de)", + "Mozilla/5.0 (compatible; TuebingenUniBot/1.0; +https://uni-tuebingen.de/de/262377)", + "Tuebingen University Research Crawler/1.0 (+https://uni-tuebingen.de/de/262377; Academic purposes only)", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 " + "Safari/537.36 (Tuebingen University Web Crawling Project)" + ] self.headers = { "User-Agent": self.user_agent, "Accept-Language": "en-US,en;q=0.9,de;q=0.8", @@ -84,54 +115,23 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection): def __del__(self) -> None: self.cursor.close() - async def fetch(self, session, url): - headers = { - "User-Agent": self.user_agent, - "Accept-Language": "en-US,en;q=0.9,de;q=0.8", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - } - try: - async with session.get(url, timeout=5, headers=headers) as response: - return await response.text() - except Exception as e: - print(f"Error fetching {url}: {e}") - return None - - @staticmethod - async def _fetch_with_playwright(url, max_retries=3): - for attempt in range(max_retries): - try: - async with async_playwright() as p: - browser = await p.chromium.launch() - page = await browser.new_page() - await page.goto(url, wait_until='networkidle') - content = await page.content() - await browser.close() - return content - except Exception as e: - print(f"Error on attempt {attempt + 1} for {url}: {e}") - if attempt == max_retries - 1: - print(f"Max retries reached for {url}") - return None - - @staticmethod - def _needs_javascript_rendering(html: str) -> bool: - # Check for JavaScript frameworks - if any(framework in html.lower() for framework in ['react', 'vue', 'angular']): - return True + # Internal state + self._page_count = 0 - return False + @property + def user_agent(self): + # Cycle through user agents + return self.user_agents[self._page_count % len(self.user_agents)] async def process(self): - async with ClientSession(connector=self._connector) as session: - while not self.is_shutdown() and self.to_crawl and len(self.urls_crawled) < self.max_size: - # Process multiple links concurrently + connector = TCPConnector(limit=100, force_close=True, enable_cleanup_closed=True) + async with ClientSession(connector=connector, trust_env=True) as session: + while not self.is_shutdown() and self.to_crawl_queue and len(self.urls_crawled) < self.max_size: tasks = [] - for _ in range(min(10, len(self.to_crawl))): # Process up to 10 links at a time - if self.to_crawl and len(self.urls_crawled) < self.max_size: - link = self.to_crawl.popleft() - task = asyncio.create_task(self._handle_link(session, link)) + for _ in range(min(10, len(self.to_crawl_queue))): + if self.to_crawl_queue and len(self.urls_crawled) < self.max_size: + url = self.to_crawl_queue.popleft() + task = asyncio.create_task(self._process_url(session, url)) tasks.append(task) else: break @@ -140,74 +140,137 @@ async def process(self): break # Wait for all tasks to complete or for shutdown - done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) + completed, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) - # Process completed tasks - for task in done: + for task in completed: try: await task except Exception as e: - print(f"Error processing link: {e}") + log_error(f"Unhandled exception in task: {e}") - # Check for shutdown if self.is_shutdown(): - # Cancel pending tasks for task in pending: task.cancel() - # Wait for cancellation to complete await asyncio.gather(*pending, return_exceptions=True) self.save_state() break - async def _handle_link(self, session, link): + # If there are still pending tasks, add them back to the queue + for task in pending: + url = task.get_coro().cr_frame.f_locals.get('url') + if url: + self.to_crawl_queue.appendleft(url) + task.cancel() + + logging.info("Crawler finished processing") + + async def _process_url(self, session, url: str): + """ + Crawls a URL and processes the content. + Args: + session: + url: + + Returns: + + """ if len(self.urls_crawled) >= self.max_size: - print("Maximum size reached") + logging.info("Maximum size reached") return - print(f"Crawler crawling {link}...") - - if not link.startswith("http"): - print(f"Invalid URL: {link}") + if not url.startswith("http"): + logging.info(f"Invalid URL: {url}") return - if any(domain in link for domain in self.ignore_domains): - print(f"Ignoring {link} because it is in the ignore domains list") - self.ignore_links.add(link) + if any(domain in url for domain in self.ignore_domains): + logging.info(f"Ignoring {url} because it is in the ignore domains list") + self.ignore_links.add(url) return - if link in self.ignore_links or link in self.urls_crawled: - print(f"Ignoring {link} because it is in the ignore or found list") + if url in self.ignore_links or url in self.urls_crawled: + logging.info(f"Ignoring {url} because it is in the ignore or found list") return - if not check_robots(link): - print(f"Ignoring {link} because it is disallowed by robots.txt") - self.ignore_links.add(link) + if not check_robots(url): + logging.info(f"Ignoring {url} because it is disallowed by robots.txt") + self.ignore_links.add(url) return - html_content = await self.fetch(session, link) + html_content = await self._fetch(session, url) if html_content is None: - print(f"Error fetching {link}") - self.ignore_links.add(link) + logging.info(f"Error fetching {url}") + self.ignore_links.add(url) + return + + try: + soup = BeautifulSoup(html_content, "lxml") + text = soup.get_text().lower() + except Exception as e: + log_error(f"Error parsing {url}: {e}") + self.ignore_links.add(url) return - soup = BeautifulSoup(html_content, "lxml") - text = soup.get_text().lower() + if not text or not soup: + logging.info(f"Ignoring {url} because it is empty") + self.ignore_links.add(url) + return check_html_tag_lang = soup.find("html").get("lang") in self.langs check_xml_tag_lang = soup.find("html").get("xml:lang") in self.langs - check_link_lang = any(split == lang for split in link.split("/") for lang in self.langs) + check_link_lang = any(split == lang for split in url.split("/") for lang in self.langs) check_text_lang = LANG_DETECTOR.detect(text) in self.langs if not check_html_tag_lang and not check_xml_tag_lang and not check_link_lang and not check_text_lang: - print(f"Ignoring {link} because it is not in the correct language") - self.ignore_links.add(link) + logging.info(f"Ignoring {url} because it is not in the correct language") + self.ignore_links.add(url) return if not any(keyword in text for keyword in self.required_keywords): - print(f"Ignoring {link} because it does not contain the required keywords") - self.ignore_links.add(link) + logging.info(f"Ignoring {url} because it does not contain the required keywords") + self.ignore_links.add(url) return + # Handle links + await self._handle_links(soup, url) + + # Handle buttons + #await self._handle_dynamic_content(url) + + if url not in self.urls_crawled and url not in self.ignore_links: + self.urls_crawled.add(url) + + logging.info(f"Finished crawling {url}. Total: {len(self.urls_crawled)} links.") + if not self.is_shutdown(): + await self.call_next(soup, url) + + async def _handle_dynamic_links(self, page: Page, url: str): + """ + Handles the links on a page using Playwright. + Args: + page: + url: + + Returns: + + """ + try: + content = await page.content() + soup = BeautifulSoup(content, "lxml") + await self._handle_links(soup, url) + except Exception as e: + log_error(f"Error handling links on {url}: {e}") + + async def _handle_links(self, soup, url): + """ + Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the + found list, and not in the to_crawl_set. + Args: + soup: BeautifulSoup object + url: URL of the page + + Returns: + + """ for a_tag in soup.find_all("a", href=True): found_link = a_tag.get("href") @@ -217,10 +280,10 @@ async def _handle_link(self, session, link): # Check if link is relative if found_link.startswith("/"): - base_url = get_base_url(link) + base_url = get_base_url(url) found_link = get_full_url(base_url, found_link) elif found_link.startswith("../"): - base_url = get_base_url(link) + base_url = get_base_url(url) found_link = get_full_url(base_url, found_link) # Check if link is an email @@ -242,15 +305,146 @@ async def _handle_link(self, session, link): and found_link not in self.urls_crawled and found_link not in self.to_crawl_set and found_link.startswith("http")): - self.to_crawl.append(found_link) + self.to_crawl_queue.append(found_link) self.to_crawl_set.add(found_link) - if link not in self.urls_crawled and link not in self.ignore_links: - self.urls_crawled.add(link) + async def _handle_dynamic_content(self, url: str): + """ + Handles dynamic content on a page. This is useful for websites that require JavaScript rendering. + Args: + url: - print(f"Finished crawling {link}. Total: {len(self.urls_crawled)} links.") - if not self.is_shutdown(): - await self.call_next(soup, link) + Returns: + + """ + try: + await self._fetch_with_playwright(url, [self._handle_dynamic_buttons]) + logging.info(f"Finished handling dynamic content on {url}. Total: {len(self.urls_crawled)} links.") + except Exception as e: + log_error(f"Error handling dynamic content: {e}") + + async def _handle_dynamic_buttons(self, page: Page, url: str, ignore_names=None): + if ignore_names is None: + ignore_names = ['search', 'submit', 'login', 'register', 'sign in', 'sign up', 'accept', 'close', 'agree'] + try: + buttons = await page.query_selector_all("button:visible:not(:disabled)") + if not buttons: + return + logging.info(f"Found {len(buttons)} clickable buttons on {url}") + + for button_index, button in enumerate(buttons): + if self.is_shutdown(): + logging.info("Shutdown signal received during button handling.") + return + try: + if page.is_closed(): + logging.info("Page was closed, stopping button handling.") + return + + await button.scroll_into_view_if_needed() + button_text = await button.text_content() + if any(ignore_name.lower() in button_text.lower() for ignore_name in ignore_names): + continue + + try: + async with page.expect_navigation(wait_until="networkidle", timeout=5000): + await button.click(timeout=5000) + except PlaywrightTimeoutError: + log_error(f"Navigation timeout after clicking button {button_index} on {url}") + continue + except TargetClosedError: + log_error(f"Page was closed while clicking button {button_index} on {url}") + return + + new_content = await page.content() + soup = BeautifulSoup(new_content, "lxml") + await self._handle_links(soup, url) + + except TargetClosedError: + log_error(f"Page was closed while handling button {button_index} on {url}") + return + except Exception as e: + log_error(f"Error handling button {button_index} on {url}: {e}") + except TargetClosedError: + log_error(f"Page was closed during button discovery on {url}") + except Exception as e: + log_error(f"Error during button discovery on {url}: {e}") + + async def _fetch(self, session, url: str) -> str or None: + """ + Fetches the content of a URL using the given session. + Args: + session: + url: + + Returns: the HTML content of the URL + """ + + max_retries = self.max_retries + retry_delay = self.retry_delay + + self._page_count += 1 + for attempt in range(max_retries): + logging.info(f"Fetching {url} (attempt {attempt + 1}/{max_retries})" if attempt > 0 else f"Fetching {url}") + try: + async with session.get(url, timeout=self.timeout, headers=self.headers) as response: + response.raise_for_status() + html_text = await response.text() + return html_text + except (TimeoutError, ClientError) as e: + if attempt == max_retries - 1: + log_error(f"Failed to process {url} after {max_retries} attempts: {str(e)}") + return + # Exponential wait time + await asyncio.sleep(retry_delay * (2 ** attempt)) + except Exception as e: + log_error(f"Error fetching {url}: {e}") + return None + + async def _fetch_with_playwright(self, url: str, callbacks: list[callable] = None): + max_retries = self.max_retries + retry_delay = self.retry_delay + + for attempt in range(max_retries): + if self.is_shutdown(): + logging.info("Shutdown signal received before starting fetch.") + return + + logging.info(f"Fetching {url} with playwright (attempt {attempt + 1}/{max_retries})" if attempt > 0 + else f"Fetching {url} with playwright") + + try: + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + try: + page = await browser.new_page() + try: + await page.goto(url, wait_until='networkidle', timeout=30000) + + if self.is_shutdown(): + logging.info("Shutdown signal received after page load.") + return + + if callbacks: + for callback in callbacks: + await callback(page, url) + + return # Exit after successful fetch + finally: + await page.close() + finally: + await browser.close() + except Exception as e: + log_error(f"Error fetching {url} with playwright (attempt {attempt + 1}/{max_retries}): {e}") + if attempt == max_retries - 1: + log_error(f"Failed to process {url} after {max_retries} attempts.") + return + # Exponential wait time + await asyncio.sleep(retry_delay * (2 ** attempt)) + + if self.is_shutdown(): + logging.info("Shutdown signal received during retry delay.") + return def save_state(self): """ @@ -264,7 +458,7 @@ def save_state(self): with open(f"crawler_states/global.json", "w") as f: # Write it as json f.write(json.dumps({ - "to_crawl": list(self.to_crawl), + "to_crawl": list(self.to_crawl_queue), "ignore_links": list(self.ignore_links), "found_links": list(self.urls_crawled) })) @@ -275,13 +469,13 @@ def _load_state(self): """ if not os.path.exists(f"crawler_states/global.json"): - print("No global state found") - self.to_crawl = collections.deque(SEEDS) + logging.info("No global state found") + self.to_crawl_queue = collections.deque(SEEDS) return with open(f"crawler_states/global.json", "r") as f: data = json.loads(f.read()) - self.to_crawl = collections.deque(data["to_crawl"]) + self.to_crawl_queue = collections.deque(data["to_crawl"]) self.to_crawl_set = set(data["to_crawl"]) self.ignore_links = set(data["ignore_links"]) self.urls_crawled = set(data["found_links"]) From 4d8db248012e92547bca60426c7a2bf8b2e54edb Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:54:06 +0200 Subject: [PATCH 03/15] Update tokenizer --- engine/custom_tokenizer.py | 58 ++++++++++++++++++++++++++++---------- engine/utils.py | 26 +++++++++++++++++ 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/engine/custom_tokenizer.py b/engine/custom_tokenizer.py index 9457ac7..fff429a 100644 --- a/engine/custom_tokenizer.py +++ b/engine/custom_tokenizer.py @@ -1,3 +1,5 @@ +import logging + import nltk as nltk from sklearn.feature_extraction.text import TfidfVectorizer import re @@ -9,11 +11,16 @@ from custom_db import add_tokens_to_index, upsert_page_to_index, add_title_to_index from pipeline import PipelineElement +from utils import safe_join, safe_str + +WN_LEMMATIZER = nltk.stem.WordNetLemmatizer() +STEMMER = nltk.stem.PorterStemmer() def remove_punctuations(text): - punct_tag = re.compile(r'[^\w\s]') - text = punct_tag.sub(r'', text) + # Remove punctuations + punctuations = re.compile(r'[.!?,;:\-_`´()\[\]{}<>"]') + text = punctuations.sub(r'', text) return text @@ -51,11 +58,17 @@ def tokenize_plain_words(words: str): return words.split() -def stem_and_remove_stopwords(words) -> list[str]: - # use english porterStemmer +def stem(words) -> list[str]: + words = [STEMMER.stem(word) for word in words] # added stemmer + return words + + +def remove_stopwords(words): + return [word for word in words if word not in stopwords.words("english")] + - stemmer = nltk.stem.porter.PorterStemmer() - words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer +def lemmatize(words): + words = [WN_LEMMATIZER.lemmatize(word) for word in words] return words @@ -63,8 +76,8 @@ def tokenize_data(data) -> list[str]: """ Tokenizes the input data. """ - pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, - stem_and_remove_stopwords] + pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, remove_stopwords, + lemmatize] for pipe in pipeline: data = pipe(data) return data @@ -91,10 +104,10 @@ def top_30_words(data): X = vectorizer.fit_transform(data) # Get the feature names feature_names = vectorizer.get_feature_names_out() - print(f"Feature names: {feature_names}") - print(f"X sieht so aus: {X}") - print(f"Shape of X: {X.shape}") - print(f"Summe: {X.sum(axis=0)}") + logging.info(f"Feature names: {feature_names}") + logging.info(f"X sieht so aus: {X}") + logging.info(f"Shape of X: {X.shape}") + logging.info(f"Summe: {X.sum(axis=0)}") top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30] return top_30_words @@ -108,18 +121,33 @@ async def process(self, data, link): Tokenizes the input data. """ + if data is None: + logging.info(f"Failed to tokenize {link} because the data was empty.") + return + soup = data + + # Get the text from the page text = soup.get_text() - img_tags = soup.findAll("img") + + # Get the meta description and title description = soup.find("meta", attrs={"name": "description"}) description_content = description.get("content") if description is not None else "" title = soup.find("title") title_content = title.string if title is not None else "" + # Get the alt texts from the images + img_tags = soup.findAll("img") alt_texts = [img.get("alt") for img in img_tags] - text = text + " ".join(alt_texts) + " " + str(description_content) + " " + str(title_content) + # Join all the text together + alt_texts_str = safe_join(alt_texts) + description_str = safe_str(description_content) + title_str = safe_str(title_content) + text = f"{text} {alt_texts_str} {description_str} {title_str}".strip() + + # Tokenize the text tokenized_text = tokenize_data(data=text) add_tokens_to_index(url=link, tokenized_text=tokenized_text) - print(f"Tokenized text for {link}") + logging.info(f"Tokenized text for {link}") diff --git a/engine/utils.py b/engine/utils.py index dbefc6d..8755602 100644 --- a/engine/utils.py +++ b/engine/utils.py @@ -71,3 +71,29 @@ def check_robots(url: str) -> bool: except: return True return rp.can_fetch("*", url) + + +def safe_join(items): + """ + Safely joins a list of items into a string, separating them with a space. + If an item is None, it is skipped. + + Args: + items: + + Returns: + + """ + return " ".join(str(item) for item in items if item is not None) + + +def safe_str(item): + """ + Safely converts an item to a string. If the item is None, an empty string is returned. + Args: + item: + + Returns: + + """ + return str(item) if item is not None else "" From 628d364a6a36e6835877acca008fe00c440aa0f3 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:54:48 +0200 Subject: [PATCH 04/15] Improve logging --- engine/custom_db.py | 13 +++++++------ engine/index.py | 8 +++++++- engine/main.py | 21 ++++++++++++++------- engine/pipeline.py | 7 ++++--- engine/test.py | 3 ++- 5 files changed, 34 insertions(+), 18 deletions(-) diff --git a/engine/custom_db.py b/engine/custom_db.py index 96640b5..855b19e 100644 --- a/engine/custom_db.py +++ b/engine/custom_db.py @@ -1,3 +1,4 @@ +import logging import os import pandas as pd @@ -61,7 +62,7 @@ def add_tokens_to_index(url: str, tokenized_text: list[str]): if not pages_df[pages_df['id'] == page_id].empty: pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'tokenized_text'] = tokenized_text else: - print(f"Page with ID {page_id} not found") + logging.info(f"Page with ID {page_id} not found") def add_title_to_index(url: str, title: str): @@ -80,7 +81,7 @@ def add_title_to_index(url: str, title: str): if not pages_df[pages_df['id'] == page_id].empty: pages_df.at[pages_df[pages_df['id'] == page_id].index[0], 'title'] = title else: - print(f"Page with ID {page_id} not found") + logging.info(f"Page with ID {page_id} not found") def add_snippet_to_index(url, snippet): @@ -99,7 +100,7 @@ def add_snippet_to_index(url, snippet): if not pages_df[pages_df['url'] == url].empty: pages_df.at[pages_df[pages_df['url'] == url].index[0], 'snippet'] = snippet else: - print(f"Page with URL {url} not found") + logging.info(f"Page with URL {url} not found") def get_tokens() -> list[list[str]]: @@ -162,17 +163,17 @@ def load_pages() -> pd.DataFrame: # Check if the file exists if not os.path.exists(f"pages.csv"): - print("No pages found") + logging.info("No pages found") return pages_df try: pages_df = pd.read_csv("pages.csv", header=0) except pd.errors.EmptyDataError: - print("No pages found") + logging.info("No pages found") return pages_df # Convert the tokenized_text column to a list of lists pages_df['tokenized_text'] = pages_df['tokenized_text'].apply(eval) - print("Loaded pages") + logging.info("Loaded pages") return pages_df diff --git a/engine/index.py b/engine/index.py index 5b9f1cb..a125f96 100644 --- a/engine/index.py +++ b/engine/index.py @@ -1,3 +1,5 @@ +import logging + from custom_db import upsert_page_to_index, add_title_to_index, add_snippet_to_index, load_pages from pipeline import PipelineElement @@ -17,6 +19,10 @@ async def process(self, data, link): Indexes the input data. """ + if data is None: + logging.info(f"Failed to index {link} because the data was empty.") + return + soup = data # Title @@ -32,7 +38,7 @@ async def process(self, data, link): add_title_to_index(url=link, title=title_content) add_snippet_to_index(url=link, snippet=description_content) - print(f"Indexed {link}") + logging.info(f"Indexed {link}") if not self.is_shutdown(): await self.call_next(soup, link) diff --git a/engine/main.py b/engine/main.py index a1c1802..8ce2fd3 100644 --- a/engine/main.py +++ b/engine/main.py @@ -4,7 +4,9 @@ from concurrent.futures import ThreadPoolExecutor import asyncio import nest_asyncio - +import signal +# Logging +import logging # Database import duckdb # Pipeline @@ -13,9 +15,14 @@ from custom_tokenizer import Tokenizer from index import Indexer -# Constants -MAX_THREADS = 10 +# Logging setup +logging.basicConfig( + format='%(asctime)s %(levelname)-8s %(message)s', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S') +# Threading +MAX_THREADS = 10 # Patch asyncio to allow nested event loops nest_asyncio.apply() @@ -36,7 +43,7 @@ def signal_handler(signum, frame): - print("Interrupt received, shutting down... Please wait") + logging.info("Interrupt received, shutting down... Please wait. This may take a few seconds.") for element in [crawler, indexer, tokenizer]: element.shutdown() @@ -57,7 +64,7 @@ async def main(): try: await crawler.process() except Exception as e: - print(f"An error occurred: {e}") + logging.info(f"An error occurred: {e}") finally: # Ensure states are saved even if an exception occurs for element in [crawler, indexer, tokenizer]: @@ -67,7 +74,7 @@ async def main(): index_df = access_index() index_df.to_csv("inverted_index.csv") con.close() - print("State saved") + logging.info("State saved") # Save the state+ for element in [crawler, indexer, tokenizer]: @@ -77,7 +84,7 @@ async def main(): index_df = access_index() index_df.to_csv("inverted_index.csv") con.close() - print("State saved") + logging.info("State saved") if __name__ == "__main__": diff --git a/engine/pipeline.py b/engine/pipeline.py index 37ba1b4..f1ae453 100644 --- a/engine/pipeline.py +++ b/engine/pipeline.py @@ -1,4 +1,5 @@ import asyncio +import logging import threading @@ -9,7 +10,7 @@ def __init__(self, name): self.executor = None self.tasks = [] self.shutdown_flag = threading.Event() - print(f"Initialized {self.name}") + logging.info(f"Initialized {self.name}") def add_executor(self, executor): self.executor = executor @@ -25,10 +26,10 @@ def add_next(self, next_element): async def call_next(self, *args): if not self.next: - print(f"No next elements for {self.name}") + logging.info(f"No next elements for {self.name}") return # No next elements to process - print(f"Processing next elements for {self.name}") + logging.info(f"Processing next elements for {self.name}") tasks = [] for element in self.next: if asyncio.iscoroutinefunction(element.process): diff --git a/engine/test.py b/engine/test.py index 389ca59..08bed22 100644 --- a/engine/test.py +++ b/engine/test.py @@ -1,8 +1,9 @@ # file to test the written functions +import logging from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words CUSTOM_TEXT = "Lorem Ipsum is simply dummy text" + " " + " \n "+ "of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum." top_30_words = top_30_words([CUSTOM_TEXT]) -print(top_30_words) +logging.info(top_30_words) From b3b1c3697b50ecd48bc7aebfea847145523104ed Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:54:53 +0200 Subject: [PATCH 05/15] Add queries --- engine/queries.txt | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) diff --git a/engine/queries.txt b/engine/queries.txt index f2543dd..3429346 100644 --- a/engine/queries.txt +++ b/engine/queries.txt @@ -1,41 +1,2 @@ -tübingen -tübingen university -tübingen attractions -food and drinks -tübingen weather -tübingen hotels -tübingen traditional food -tübingen coffee shops -tübingen nightlife spots -tübingen museums -tübingen castles -tübingen outdoor activities -tübingen nightlife -tübingen markets -tübingen shopping centers -tübingen local products -Best cafes in Tübingen for students -Upcoming events at the University of Tübingen -History of Tübingen's old town -Popular hiking trails near Tübingen -Tübingen student housing options -Vegan and vegetarian restaurants in Tübingen -Cultural activities in Tübingen -Tübingen public transportation map -University of Tübingen research departments -Tübingen nightlife spots -Bookstores in Tübingen -Tübingen local farmers' markets -Tübingen weather forecast -Student discounts in Tübingen -Tübingen library hours and services -Language exchange programs in Tübingen -Top tourist attractions in Tübingen -Cycling routes in Tübingen -Tübingen sports clubs and gyms -Tübingen local festivals and fairs -Best places to study in Tübingen -Tübingen historical landmarks -Tübingen university application process -Local art galleries in Tübingen -Tübingen second-hand stores +1 tübingen attractions +2 food and drinks \ No newline at end of file From 0fcb896f1250ee45e6af822120371159d3bc3ca1 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:54:59 +0200 Subject: [PATCH 06/15] Add certifi --- engine/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/engine/requirements.txt b/engine/requirements.txt index e850b9a..36f516a 100644 --- a/engine/requirements.txt +++ b/engine/requirements.txt @@ -1,6 +1,7 @@ # Automatically generated by https://github.com/damnever/pigar. beautifulsoup4==4.12.3 +certifi==2024.7.4 duckdb==1.0.0 eld==1.0.6 Flask==3.0.3 From cd2201f5c0d4952ce30ef3543beca62ca725cdaf Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:57:37 +0200 Subject: [PATCH 07/15] Update query format --- engine/queries.txt | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/engine/queries.txt b/engine/queries.txt index 3429346..40bab39 100644 --- a/engine/queries.txt +++ b/engine/queries.txt @@ -1,2 +1,41 @@ -1 tübingen attractions -2 food and drinks \ No newline at end of file +1 tübingen +2 tübingen university +3 tübingen attractions +4 food and drinks +5 tübingen weather +6 tübingen hotels +7 tübingen traditional food +8 tübingen coffee shops +9 tübingen nightlife spots +10 tübingen museums +11 tübingen castles +12 tübingen outdoor activities +13 tübingen nightlife +14 tübingen markets +15 tübingen shopping centers +16 tübingen local products +17 Best cafes in Tübingen for students +18 Upcoming events at the University of Tübingen +19 History of Tübingen's old town +20 Popular hiking trails near Tübingen +21 Tübingen student housing options +22 Vegan and vegetarian restaurants in Tübingen +23 Cultural activities in Tübingen +24 Tübingen public transportation map +25 University of Tübingen research departments +26 Tübingen nightlife spots +27 Bookstores in Tübingen +28 Tübingen local farmers' markets +29 Tübingen weather forecast +30 Student discounts in Tübingen +31 Tübingen library hours and services +32 Language exchange programs in Tübingen +33 Top tourist attractions in Tübingen +34 Cycling routes in Tübingen +35 Tübingen sports clubs and gyms +36 Tübingen local festivals and fairs +37 Best places to study in Tübingen +38 Tübingen historical landmarks +39 Tübingen university application process +40 Local art galleries in Tübingen +41 Tübingen second-hand stores From 3f835bc73c41af2e17cba04744f25054035b003d Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Mon, 15 Jul 2024 22:04:51 +0200 Subject: [PATCH 08/15] Remove code that handles dynamic stuff. If we still need it, it's in commit 94edd89c347bfb9e56c8fb65127f4c37a187477b --- engine/crawl.py | 127 ------------------------------------------------ 1 file changed, 127 deletions(-) diff --git a/engine/crawl.py b/engine/crawl.py index bae7cea..b63d048 100644 --- a/engine/crawl.py +++ b/engine/crawl.py @@ -233,9 +233,6 @@ async def _process_url(self, session, url: str): # Handle links await self._handle_links(soup, url) - # Handle buttons - #await self._handle_dynamic_content(url) - if url not in self.urls_crawled and url not in self.ignore_links: self.urls_crawled.add(url) @@ -243,23 +240,6 @@ async def _process_url(self, session, url: str): if not self.is_shutdown(): await self.call_next(soup, url) - async def _handle_dynamic_links(self, page: Page, url: str): - """ - Handles the links on a page using Playwright. - Args: - page: - url: - - Returns: - - """ - try: - content = await page.content() - soup = BeautifulSoup(content, "lxml") - await self._handle_links(soup, url) - except Exception as e: - log_error(f"Error handling links on {url}: {e}") - async def _handle_links(self, soup, url): """ Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the @@ -308,68 +288,6 @@ async def _handle_links(self, soup, url): self.to_crawl_queue.append(found_link) self.to_crawl_set.add(found_link) - async def _handle_dynamic_content(self, url: str): - """ - Handles dynamic content on a page. This is useful for websites that require JavaScript rendering. - Args: - url: - - Returns: - - """ - try: - await self._fetch_with_playwright(url, [self._handle_dynamic_buttons]) - logging.info(f"Finished handling dynamic content on {url}. Total: {len(self.urls_crawled)} links.") - except Exception as e: - log_error(f"Error handling dynamic content: {e}") - - async def _handle_dynamic_buttons(self, page: Page, url: str, ignore_names=None): - if ignore_names is None: - ignore_names = ['search', 'submit', 'login', 'register', 'sign in', 'sign up', 'accept', 'close', 'agree'] - try: - buttons = await page.query_selector_all("button:visible:not(:disabled)") - if not buttons: - return - logging.info(f"Found {len(buttons)} clickable buttons on {url}") - - for button_index, button in enumerate(buttons): - if self.is_shutdown(): - logging.info("Shutdown signal received during button handling.") - return - try: - if page.is_closed(): - logging.info("Page was closed, stopping button handling.") - return - - await button.scroll_into_view_if_needed() - button_text = await button.text_content() - if any(ignore_name.lower() in button_text.lower() for ignore_name in ignore_names): - continue - - try: - async with page.expect_navigation(wait_until="networkidle", timeout=5000): - await button.click(timeout=5000) - except PlaywrightTimeoutError: - log_error(f"Navigation timeout after clicking button {button_index} on {url}") - continue - except TargetClosedError: - log_error(f"Page was closed while clicking button {button_index} on {url}") - return - - new_content = await page.content() - soup = BeautifulSoup(new_content, "lxml") - await self._handle_links(soup, url) - - except TargetClosedError: - log_error(f"Page was closed while handling button {button_index} on {url}") - return - except Exception as e: - log_error(f"Error handling button {button_index} on {url}: {e}") - except TargetClosedError: - log_error(f"Page was closed during button discovery on {url}") - except Exception as e: - log_error(f"Error during button discovery on {url}: {e}") - async def _fetch(self, session, url: str) -> str or None: """ Fetches the content of a URL using the given session. @@ -401,51 +319,6 @@ async def _fetch(self, session, url: str) -> str or None: log_error(f"Error fetching {url}: {e}") return None - async def _fetch_with_playwright(self, url: str, callbacks: list[callable] = None): - max_retries = self.max_retries - retry_delay = self.retry_delay - - for attempt in range(max_retries): - if self.is_shutdown(): - logging.info("Shutdown signal received before starting fetch.") - return - - logging.info(f"Fetching {url} with playwright (attempt {attempt + 1}/{max_retries})" if attempt > 0 - else f"Fetching {url} with playwright") - - try: - async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - try: - page = await browser.new_page() - try: - await page.goto(url, wait_until='networkidle', timeout=30000) - - if self.is_shutdown(): - logging.info("Shutdown signal received after page load.") - return - - if callbacks: - for callback in callbacks: - await callback(page, url) - - return # Exit after successful fetch - finally: - await page.close() - finally: - await browser.close() - except Exception as e: - log_error(f"Error fetching {url} with playwright (attempt {attempt + 1}/{max_retries}): {e}") - if attempt == max_retries - 1: - log_error(f"Failed to process {url} after {max_retries} attempts.") - return - # Exponential wait time - await asyncio.sleep(retry_delay * (2 ** attempt)) - - if self.is_shutdown(): - logging.info("Shutdown signal received during retry delay.") - return - def save_state(self): """ Saves the global state to a file. From c136b4ece0c6b3849868124444d6e358e015aed8 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Tue, 16 Jul 2024 00:09:21 +0200 Subject: [PATCH 09/15] Fix master --- engine/crawl.py | 19 ++++++++----------- engine/main.py | 1 + 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/engine/crawl.py b/engine/crawl.py index ede0ec1..7e5ea95 100644 --- a/engine/crawl.py +++ b/engine/crawl.py @@ -2,13 +2,12 @@ import asyncio import collections # For deque import json -import os -import sys ##### Parsing ##### from bs4 import BeautifulSoup # HTML parsing +import aiohttp from aiohttp import ClientSession -from utils import check_robots, get_base_url -import requests # HTTP requests +from playwright.async_api import async_playwright +from utils import check_robots, get_base_url, get_full_url ##### Threading ##### from pipeline import PipelineElement from concurrent.futures import ThreadPoolExecutor @@ -84,15 +83,13 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection): def __del__(self) -> None: self.cursor.close() + @property + def user_agent(self): + return self.user_agents[self._page_count % len(self.user_agents)] + async def fetch(self, session, url): - headers = { - "User-Agent": self.user_agent, - "Accept-Language": "en-US,en;q=0.9,de;q=0.8", - "Accept-Encoding": "gzip, deflate, br", - "Connection": "keep-alive", - } try: - async with session.get(url, timeout=5, headers=headers) as response: + async with session.get(url, timeout=5, headers=self.headers) as response: return await response.text() except Exception as e: print(f"Error fetching {url}: {e}") diff --git a/engine/main.py b/engine/main.py index a1c1802..9e10041 100644 --- a/engine/main.py +++ b/engine/main.py @@ -1,6 +1,7 @@ """ Pipeline for Crawling, Tokenizing, and Indexing """ +import signal from concurrent.futures import ThreadPoolExecutor import asyncio import nest_asyncio From 14827ce5cbd7ae943663f3ce456cee861570e5a9 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Tue, 16 Jul 2024 00:31:45 +0200 Subject: [PATCH 10/15] Add shebang support --- engine/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/engine/main.py b/engine/main.py index 9e10041..b5c636b 100644 --- a/engine/main.py +++ b/engine/main.py @@ -1,3 +1,5 @@ +#!.venv/bin/python + """ Pipeline for Crawling, Tokenizing, and Indexing """ From ba68a0fd32e1823ddc22b2b4d820b7f98aa6fc4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20M=C3=BCller?= <46693545+am9zZWY@users.noreply.github.com> Date: Tue, 16 Jul 2024 01:54:54 +0200 Subject: [PATCH 11/15] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c1a072d..c94f0e3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ -# SuniBrownSnakeGaborone +# Modern Search Engine A hyper-fast search engine From e0e2eb8208871b96d6bdef69df41464de32d6618 Mon Sep 17 00:00:00 2001 From: okihnjo Date: Mon, 15 Jul 2024 15:30:38 +0200 Subject: [PATCH 12/15] introduced ranking by simply summing up the tf-idf vals. Query, X and subset of documents are used --- engine/custom_rank.py | 108 +++++++++++++++++++++++++++++++++++++ engine/custom_tokenizer.py | 8 ++- 2 files changed, 114 insertions(+), 2 deletions(-) create mode 100644 engine/custom_rank.py diff --git a/engine/custom_rank.py b/engine/custom_rank.py new file mode 100644 index 0000000..8a492fe --- /dev/null +++ b/engine/custom_rank.py @@ -0,0 +1,108 @@ +import pandas as pd +from custom_tokenizer import tokenize_data, tf_idf_vectorize +from sklearn.feature_extraction.text import TfidfVectorizer +def preprocess_query(Q): + tokenized_query = tokenize_data(Q) + return tokenized_query + + +def find_intersection_2(Q): + df_inverted = pd.read_csv("engine/inverted_index.csv",sep=",", index_col=1) + df_inverted.drop(columns=["Unnamed: 0"], inplace=True) + + # df_inverted.set_index("word", inplace=True) + print(df_inverted.columns) + print(df_inverted.head()) + tokenized_query = preprocess_query(Q) + print(tokenized_query) + result = [] + for token in tokenized_query: + if token in df_inverted.word.values: + print(f"Found token: {token}") + doc_ids = df_inverted[df_inverted["word"]==token]["doc_ids"].apply(eval) + print(f"It has {len(doc_ids)} doc_ids") + result.append(doc_ids) + # print(f"result: {result}") + # find intersection of all lists in result + intersection = set(result[0]).intersection(*result) + return intersection + + + +def find_documents(Q): + df_inverted = pd.read_csv("engine/inverted_index.csv", converters={'doc_ids': pd.eval}) + df_inverted.set_index("word", inplace=True) + df_inverted.drop(columns=["Unnamed: 0"], inplace=True) + + + + print(df_inverted.head()) + tokenized_query = preprocess_query(Q) + print(df_inverted.index.values) + result = [] + for token in tokenized_query: + if token in df_inverted.index.values: + print(f"Found token: {token}") + doc_ids = df_inverted.loc[token].doc_ids + print(f"It has {len(doc_ids)} doc_ids") + result.append(doc_ids) + # find intersection of all lists in result + intersection = set(result[0]).intersection(*result) + union = set(result[0]).union(*result) + if len(intersection) < 2: + print("No intersection found") + return union + return intersection + +def dummy(tokens): + return tokens + +def generate_tf_idf_matrix(path): + df = pd.read_csv("engine/pages.csv", converters={'tokenized_text': pd.eval}) + df_text = df["tokenized_text"] + # create list of lists containing the tokenized text + tokenized_text = [] + print(type(df_text.values)) + vectorizer = TfidfVectorizer(tokenizer=dummy, preprocessor=dummy) + X = vectorizer.fit_transform(df_text.values) + features = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out()) + print(features) + + return features + +def rank_documents(subset_D, Q, X): + # Filter the DataFrame to include only the documents in subset_D + subset_adj = [x-1 for x in subset_D] + filtered_X = X.loc[list(subset_adj)] # here accessen wir rows + + # Ensure Q is a list of query terms + query_terms = preprocess_query(Q) + query_terms_in_X = [term for term in query_terms if term in X.columns] + # Filter the DataFrame to include only the columns corresponding to the query terms + if not query_terms_in_X: + print("No query terms found in the TF-IDF matrix.") + return pd.DataFrame() + filtered_X_query_terms = filtered_X[query_terms_in_X] # here accessen wir ganze columns + + # Sum the TF-IDF values for each document + filtered_X['sum_tfidf'] = filtered_X_query_terms.sum(axis=1) + + # Rank the documents by the summed TF-IDF values in descending order + ranked_docs = filtered_X.sort_values(by='sum_tfidf', ascending=False) + + return ranked_docs + + + + + + +query = "max animal future" +docs = find_documents(query) +X = generate_tf_idf_matrix('engine/pages.csv') +print(f"Found {len(docs)} documents, they look like this: {docs}") +print(f"Result: {generate_tf_idf_matrix('engine/pages.csv')}") + +ranked_docs = rank_documents(docs, query, X) +best_20_docs = ranked_docs.head(20).index + 1 +print(f"Best 20 docs: {best_20_docs}") diff --git a/engine/custom_tokenizer.py b/engine/custom_tokenizer.py index fff429a..df77f86 100644 --- a/engine/custom_tokenizer.py +++ b/engine/custom_tokenizer.py @@ -82,13 +82,17 @@ def tokenize_data(data) -> list[str]: data = pipe(data) return data - +# Following problem: TFIDF vectorizer nimmt einen ganzen plain text und tokenized ihn dann selbst. Wir haben aber schon fertige tokenized sachen. +# Damit wir den datentypen nicht hin und her und wir unnötig das leben komolziert machen, müssen wir viele steps wie tf idf iund tokenizing direkt nach dem crawlen machen +# ist zwar in der pipeline nicht ganz so schön aber sonst müssen wir vieles doppelt machen und abspeichern +# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html def tf_idf_vectorize(data): """ Vectorizes the input data using the TF-IDF algorithm. """ # Create the vectorizer - vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english") + # vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english") # hier müssen wir schauen was wir für tokenizer machen + vectorizer = TfidfVectorizer() # Vectorize the data X = vectorizer.fit_transform(data) return X From 7c476620f1b8c8bcda8fa1fdeca860289c8cee71 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Tue, 16 Jul 2024 17:22:44 +0200 Subject: [PATCH 13/15] Add more information to ranked results to be compatible with web interface --- engine/custom_db.py | 9 +++++++++ engine/custom_rank.py | 37 +++++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/engine/custom_db.py b/engine/custom_db.py index 855b19e..fde3d91 100644 --- a/engine/custom_db.py +++ b/engine/custom_db.py @@ -153,6 +153,12 @@ def save_pages() -> None: pages_df.to_csv("pages.csv", index=False, header=headers) +def get_doc_by_id(page_id: int): + global pages_df + page = pages_df[pages_df['id'] == page_id] + return page + + def load_pages() -> pd.DataFrame: """ Load the pages DataFrame from a CSV file. @@ -177,3 +183,6 @@ def load_pages() -> pd.DataFrame: logging.info("Loaded pages") return pages_df + + +load_pages() diff --git a/engine/custom_rank.py b/engine/custom_rank.py index 8a492fe..0b2deb9 100644 --- a/engine/custom_rank.py +++ b/engine/custom_rank.py @@ -1,5 +1,7 @@ import pandas as pd -from custom_tokenizer import tokenize_data, tf_idf_vectorize + +from custom_db import get_doc_by_id +from custom_tokenizer import tokenize_data from sklearn.feature_extraction.text import TfidfVectorizer def preprocess_query(Q): tokenized_query = tokenize_data(Q) @@ -89,13 +91,29 @@ def rank_documents(subset_D, Q, X): # Rank the documents by the summed TF-IDF values in descending order ranked_docs = filtered_X.sort_values(by='sum_tfidf', ascending=False) - - return ranked_docs - - - - - + + # Map document ID to document with title, URL, and snippet + ranking = [] + for index, ranked_doc in ranked_docs.iterrows(): + score = ranked_doc['sum_tfidf'] + + doc = get_doc_by_id(index) + title = str(doc['title'].values[0]) if not doc.empty else "" + url = str(doc['url'].values[0]) if not doc.empty else "" + snippet = str(doc['snippet'].values[0]) if not doc.empty else "" + + result = { + "id": index, + "title": title, + "url": url, + "description": snippet if snippet else "", + "summary": "", + "score": score + } + ranking.append(result) + + return ranking + query = "max animal future" docs = find_documents(query) @@ -104,5 +122,4 @@ def rank_documents(subset_D, Q, X): print(f"Result: {generate_tf_idf_matrix('engine/pages.csv')}") ranked_docs = rank_documents(docs, query, X) -best_20_docs = ranked_docs.head(20).index + 1 -print(f"Best 20 docs: {best_20_docs}") +print(f"Best 20 docs: {ranked_docs[:20]}") From 4e90b151e43d5b1f421b1eb35b52bae295f03cf5 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Tue, 16 Jul 2024 17:23:13 +0200 Subject: [PATCH 14/15] Format custom_rank.py and adapt paths --- engine/custom_rank.py | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/engine/custom_rank.py b/engine/custom_rank.py index 0b2deb9..20e8c48 100644 --- a/engine/custom_rank.py +++ b/engine/custom_rank.py @@ -3,15 +3,17 @@ from custom_db import get_doc_by_id from custom_tokenizer import tokenize_data from sklearn.feature_extraction.text import TfidfVectorizer + + def preprocess_query(Q): tokenized_query = tokenize_data(Q) return tokenized_query def find_intersection_2(Q): - df_inverted = pd.read_csv("engine/inverted_index.csv",sep=",", index_col=1) + df_inverted = pd.read_csv("inverted_index.csv", sep=",", index_col=1) df_inverted.drop(columns=["Unnamed: 0"], inplace=True) - + # df_inverted.set_index("word", inplace=True) print(df_inverted.columns) print(df_inverted.head()) @@ -21,7 +23,7 @@ def find_intersection_2(Q): for token in tokenized_query: if token in df_inverted.word.values: print(f"Found token: {token}") - doc_ids = df_inverted[df_inverted["word"]==token]["doc_ids"].apply(eval) + doc_ids = df_inverted[df_inverted["word"] == token]["doc_ids"].apply(eval) print(f"It has {len(doc_ids)} doc_ids") result.append(doc_ids) # print(f"result: {result}") @@ -30,14 +32,11 @@ def find_intersection_2(Q): return intersection - -def find_documents(Q): - df_inverted = pd.read_csv("engine/inverted_index.csv", converters={'doc_ids': pd.eval}) +def find_documents(Q) -> set: + df_inverted = pd.read_csv("inverted_index.csv", converters={'doc_ids': pd.eval}) df_inverted.set_index("word", inplace=True) df_inverted.drop(columns=["Unnamed: 0"], inplace=True) - - - + print(df_inverted.head()) tokenized_query = preprocess_query(Q) print(df_inverted.index.values) @@ -45,7 +44,7 @@ def find_documents(Q): for token in tokenized_query: if token in df_inverted.index.values: print(f"Found token: {token}") - doc_ids = df_inverted.loc[token].doc_ids + doc_ids = df_inverted.loc[token].doc_ids print(f"It has {len(doc_ids)} doc_ids") result.append(doc_ids) # find intersection of all lists in result @@ -56,11 +55,13 @@ def find_documents(Q): return union return intersection + def dummy(tokens): return tokens + def generate_tf_idf_matrix(path): - df = pd.read_csv("engine/pages.csv", converters={'tokenized_text': pd.eval}) + df = pd.read_csv("pages.csv", converters={'tokenized_text': pd.eval}) df_text = df["tokenized_text"] # create list of lists containing the tokenized text tokenized_text = [] @@ -72,11 +73,12 @@ def generate_tf_idf_matrix(path): return features + def rank_documents(subset_D, Q, X): - # Filter the DataFrame to include only the documents in subset_D - subset_adj = [x-1 for x in subset_D] - filtered_X = X.loc[list(subset_adj)] # here accessen wir rows - + # Filter the DataFrame to include only the documents in subset_D + subset_adj = [x - 1 for x in subset_D] + filtered_X = X.loc[list(subset_adj)] # here accessen wir rows + # Ensure Q is a list of query terms query_terms = preprocess_query(Q) query_terms_in_X = [term for term in query_terms if term in X.columns] @@ -84,11 +86,11 @@ def rank_documents(subset_D, Q, X): if not query_terms_in_X: print("No query terms found in the TF-IDF matrix.") return pd.DataFrame() - filtered_X_query_terms = filtered_X[query_terms_in_X] # here accessen wir ganze columns - + filtered_X_query_terms = filtered_X[query_terms_in_X] # here accessen wir ganze columns + # Sum the TF-IDF values for each document filtered_X['sum_tfidf'] = filtered_X_query_terms.sum(axis=1) - + # Rank the documents by the summed TF-IDF values in descending order ranked_docs = filtered_X.sort_values(by='sum_tfidf', ascending=False) @@ -117,9 +119,9 @@ def rank_documents(subset_D, Q, X): query = "max animal future" docs = find_documents(query) -X = generate_tf_idf_matrix('engine/pages.csv') +X = generate_tf_idf_matrix('pages.csv') print(f"Found {len(docs)} documents, they look like this: {docs}") -print(f"Result: {generate_tf_idf_matrix('engine/pages.csv')}") +print(f"Result: {generate_tf_idf_matrix('pages.csv')}") ranked_docs = rank_documents(docs, query, X) print(f"Best 20 docs: {ranked_docs[:20]}") From 2e114675e3f0cf1e814c792065ecb95ff6e42ed7 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Tue, 16 Jul 2024 17:34:47 +0200 Subject: [PATCH 15/15] Add rank method that can be called from server --- engine/custom_rank.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/engine/custom_rank.py b/engine/custom_rank.py index 20e8c48..c62b378 100644 --- a/engine/custom_rank.py +++ b/engine/custom_rank.py @@ -56,17 +56,17 @@ def find_documents(Q) -> set: return intersection -def dummy(tokens): +def dummy_tokenizer(tokens: list[str]): return tokens def generate_tf_idf_matrix(path): - df = pd.read_csv("pages.csv", converters={'tokenized_text': pd.eval}) + df = pd.read_csv(path, converters={'tokenized_text': pd.eval}) df_text = df["tokenized_text"] # create list of lists containing the tokenized text tokenized_text = [] print(type(df_text.values)) - vectorizer = TfidfVectorizer(tokenizer=dummy, preprocessor=dummy) + vectorizer = TfidfVectorizer(tokenizer=dummy_tokenizer, preprocessor=dummy_tokenizer) X = vectorizer.fit_transform(df_text.values) features = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out()) print(features) @@ -117,11 +117,18 @@ def rank_documents(subset_D, Q, X): return ranking -query = "max animal future" -docs = find_documents(query) +# query = "food and drink" +# docs = find_documents(query) X = generate_tf_idf_matrix('pages.csv') -print(f"Found {len(docs)} documents, they look like this: {docs}") -print(f"Result: {generate_tf_idf_matrix('pages.csv')}") -ranked_docs = rank_documents(docs, query, X) -print(f"Best 20 docs: {ranked_docs[:20]}") + +# print(f"Found {len(docs)} documents, they look like this: {docs}") +# print(f"Result: {generate_tf_idf_matrix('pages.csv')}") + +# ranked_docs = rank_documents(docs, query, X) +# print(f"Best 20 docs: {ranked_docs[:20]}") + + +def rank(query): + docs = find_documents(query) + return rank_documents(docs, query, X)