From 23dc6441331e6dfaf4989d013e2152337861ee11 Mon Sep 17 00:00:00 2001
From: Phi
Date: Tue, 9 Jul 2024 16:34:10 +0200
Subject: [PATCH 1/3] Add database

---
 engine/crawl.py         | 19 +++++++++++++++++--
 engine/requirements.txt |  1 +
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/engine/crawl.py b/engine/crawl.py
index 4db7373..b4f03bb 100644
--- a/engine/crawl.py
+++ b/engine/crawl.py
@@ -15,6 +15,8 @@ from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words
 ##### Language detection #####
 from nltk.classify import textcat
 
+##### Database #####
+import duckdb
 
 ##### Constants #####
 # Maximum size of the links
@@ -148,11 +150,16 @@ def get_lang(text: str) -> str:
 
 to_crawl_set = set(SEEDS)
 
+
 class Crawler:
-    def __init__(self, identifier: str) -> None:
+    def __init__(self, identifier: str, dbcon: duckdb.DuckDBPyConnection) -> None:
         self.identifier = identifier
+        self.cursor = dbcon.cursor()
         print(f"Initialized Crawler {self.identifier}")
 
+    def __del__(self) -> None:
+        self.cursor.close()
+
     def crawl(self) -> None:
         """
         Crawls a website iteratively and extracts links from HTML pages.
@@ -331,13 +338,17 @@ def start_crawl():
     load_state()
 
     crawlers = []
+    con = duckdb.connect("crawlies.db")
     try:
+        con.install_extension("fts")
+        con.load_extension("fts")
         with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
             for i in range(MAX_THREADS):
-                crawler = Crawler(i)
+                crawler = Crawler(i, con)
                 crawlers.append(crawler)
                 executor.submit(crawler.crawl)
 
+        con.close()
         save_state()
 
         print("Found", len(found_links), "links")
@@ -347,9 +358,13 @@ def start_crawl():
     except KeyboardInterrupt:
         try:
             save_state()
+            con.close()
             sys.exit(130)
         except SystemExit:
+            con.close()
             os._exit(130)
+    except Exception as e:
+        con.close()
 
 
 if __name__ == "__main__":
diff --git a/engine/requirements.txt b/engine/requirements.txt
index ba16fa2..0cd20a7 100644
--- a/engine/requirements.txt
+++ b/engine/requirements.txt
@@ -1,3 +1,4 @@
 beautifulsoup4==4.12.3
 Requests==2.32.3
 lxml==5.2.2
+duckdb==1.0.0
\ No newline at end of file

From bf6c632712d7667205218a53f61fca6ccbc42752 Mon Sep 17 00:00:00 2001
From: am9zZWY <46693545+am9zZWY@users.noreply.github.com>
Date: Sat, 13 Jul 2024 02:20:36 +0200
Subject: [PATCH 2/3] Move some duckdb code to main

---
 engine/main.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/engine/main.py b/engine/main.py
index 2bb8b6b..a41ee53 100644
--- a/engine/main.py
+++ b/engine/main.py
@@ -3,18 +3,27 @@
 """
 
 from concurrent.futures import ThreadPoolExecutor
 
+# Database
+import duckdb
+# Pipeline
 from crawl import Crawler
 from custom_db import index_pages, access_index, save_pages
 from custom_tokenizer import Tokenizer
 
+# Async
 import asyncio
 
 MAX_THREADS = 10
 
 if __name__ == "__main__":
+    con = duckdb.connect("crawlies.db")
     try:
+        # Database setup
+        con.install_extension("fts")
+        con.load_extension("fts")
+        # Initialize the pipeline
         with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
-            crawler = Crawler()
+            crawler = Crawler(con)
             crawler.max_size = 1000
             crawler.add_executor(executor)
@@ -29,9 +38,13 @@
     except (KeyboardInterrupt, SystemExit):
         print("Exiting...")
         crawler.save_state()
+        con.close()
         print("State saved")
 
     index_pages()
     index_df = access_index()
     index_df.to_csv("inverted_index.csv")
     save_pages()
+
+    # Close the connection
+    con.close()

From 59e08bfe3e13171ef88cca33c70a3e853a635e32 Mon Sep 17 00:00:00 2001
From: am9zZWY <46693545+am9zZWY@users.noreply.github.com>
Date: Sat, 13 Jul 2024 02:20:58 +0200
Subject: [PATCH 3/3] Clean up crawl

---
 engine/crawl.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/engine/crawl.py b/engine/crawl.py
index 80d3eb0..a43fa22 100644
--- a/engine/crawl.py
+++ b/engine/crawl.py
@@ -204,9 +204,4 @@ def _load_state(self):
 crawler = Crawler(con)
 crawler.process()
 
-# TODO - seperarw crawling and tokenizing
-index_pages()
-index_df = access_index()
-index_df.to_csv("inverted_index.csv")
 con.close()
-save_pages()
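
Background on the pattern these patches rely on: a single DuckDB connection can be shared across worker threads as long as each thread talks to the database through its own cursor obtained from con.cursor(), which is exactly what the new Crawler.__init__ does with dbcon.cursor(). The sketch below is a minimal, self-contained illustration of that pattern plus one possible use of the fts extension the patches install and load; the "pages" table, its columns, the crawl_worker function, and the sample query are illustrative assumptions and do not appear in the patches themselves.

    # Minimal sketch of the shared-connection / per-thread-cursor pattern.
    # The "pages" table, its columns, and the sample query are hypothetical;
    # the patches only open the connection and install/load the FTS extension.
    from concurrent.futures import ThreadPoolExecutor

    import duckdb

    con = duckdb.connect("crawlies.db")  # one process-wide connection
    con.install_extension("fts")         # full-text search extension,
    con.load_extension("fts")            # installed and loaded once

    con.execute("CREATE TABLE IF NOT EXISTS pages (id INTEGER, url TEXT, content TEXT)")


    def crawl_worker(worker_id: int) -> None:
        # Each worker gets its own cursor; cursors created from the same DuckDB
        # connection share the database but give every thread its own state.
        cur = con.cursor()
        try:
            cur.execute(
                "INSERT INTO pages VALUES (?, ?, ?)",
                [worker_id, f"https://example.org/{worker_id}", f"page number {worker_id}"],
            )
        finally:
            cur.close()


    with ThreadPoolExecutor(max_workers=4) as executor:
        for i in range(4):
            executor.submit(crawl_worker, i)

    # Hypothetical follow-up: build an FTS index over the crawled text and rank
    # documents with BM25 (the patches install the extension but do not use it yet).
    con.execute("PRAGMA create_fts_index('pages', 'id', 'content')")
    hits = con.execute(
        "SELECT id, url, fts_main_pages.match_bm25(id, 'page') AS score "
        "FROM pages ORDER BY score DESC NULLS LAST LIMIT 5"
    ).fetchall()
    print(hits)

    con.close()

Keeping the connection in main.py (as the second patch does) and handing it to the crawler leaves a single owner responsible for con.close(), while the per-thread cursors avoid sharing one cursor object across threads.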