diff --git a/engine/crawl.py b/engine/crawl.py
index 57a23cc..a43fa22 100644
--- a/engine/crawl.py
+++ b/engine/crawl.py
@@ -17,6 +17,7 @@
 ##### Language detection #####
 from eld import LanguageDetector
 ##### Database #####
+import duckdb
 from custom_db import *
 
 ##### Constants #####
@@ -45,9 +46,12 @@ class Crawler(PipelineElement):
-    def __init__(self):
+    def __init__(self, dbcon: duckdb.DuckDBPyConnection):
         super().__init__("Crawler")
 
+        # Initialize the duckdb connection
+        self.cursor = dbcon.cursor()
+
         # Initialize the crawler state
         self.found_links = set()
         self.ignore_links = set()
@@ -65,6 +69,9 @@ def __init__(self):
         self.required_keywords = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"]
         self.user_agent = "Modern Search Engines University of Tuebingen Project Crawler (https://uni-tuebingen.de/de/262377)"
 
+    def __del__(self) -> None:
+        self.cursor.close()
+
     async def fetch(self, session, url):
         headers = {
             "User-Agent": self.user_agent,
@@ -191,10 +198,10 @@ def _load_state(self):
 
 # IMPORTANT: Please use main.py instead of this file
 if __name__ == "__main__":
-    crawler = Crawler()
+    con = duckdb.connect("crawlies.db")
+    con.install_extension("fts")
+    con.load_extension("fts")
+
+    crawler = Crawler(con)
     crawler.process()
-    # TODO - seperarw crawling and tokenizing
-    index_pages()
-    index_df = access_index()
-    index_df.to_csv("inverted_index.csv")
-    save_pages()
+    con.close()
diff --git a/engine/main.py b/engine/main.py
index 2bb8b6b..a41ee53 100644
--- a/engine/main.py
+++ b/engine/main.py
@@ -3,18 +3,27 @@
 """
 
 from concurrent.futures import ThreadPoolExecutor
+# Database
+import duckdb
+# Pipeline
 from crawl import Crawler
 from custom_db import index_pages, access_index, save_pages
 from custom_tokenizer import Tokenizer
+# Async
 import asyncio
 
 MAX_THREADS = 10
 
 if __name__ == "__main__":
+    con = duckdb.connect("crawlies.db")
     try:
+        # Database setup
+        con.install_extension("fts")
+        con.load_extension("fts")
+
+        # Initialize the pipeline
         with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
-            crawler = Crawler()
+            crawler = Crawler(con)
             crawler.max_size = 1000
             crawler.add_executor(executor)
@@ -29,9 +38,13 @@
     except (KeyboardInterrupt, SystemExit):
         print("Exiting...")
         crawler.save_state()
+        con.close()
         print("State saved")
 
     index_pages()
     index_df = access_index()
     index_df.to_csv("inverted_index.csv")
     save_pages()
+
+    # Close the connection
+    con.close()
diff --git a/engine/requirements.txt b/engine/requirements.txt
index f48312a..a167548 100644
--- a/engine/requirements.txt
+++ b/engine/requirements.txt
@@ -2,6 +2,7 @@ beautifulsoup4==4.12.3
 eld==1.0.6
 Flask==3.0.3
 lxml==5.2.2
+duckdb==1.0.0
 nltk==3.8.1
 pandas==2.2.2
 requests==2.32.3
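
Note on the threading pattern (a sketch, not part of the patch itself): DuckDB's Python API expects each thread to work through its own cursor rather than sharing one DuckDBPyConnection, which is why Crawler.__init__ takes the shared connection and derives self.cursor = dbcon.cursor(). A minimal standalone illustration follows; the in-memory database, the hits table, and worker() are hypothetical stand-ins, not code from this repository:

    # Sketch: one connection, one cursor per thread.
    # "hits" and worker() are hypothetical; connection.cursor() is DuckDB's
    # documented way to use a single database from several threads.
    import duckdb
    from concurrent.futures import ThreadPoolExecutor

    con = duckdb.connect()  # in-memory database for illustration
    con.execute("CREATE TABLE hits (url VARCHAR)")

    def worker(url: str) -> None:
        cur = con.cursor()  # independent handle onto the same database
        try:
            cur.execute("INSERT INTO hits VALUES (?)", [url])
        finally:
            cur.close()

    with ThreadPoolExecutor(max_workers=4) as pool:
        list(pool.map(worker, ["https://uni-tuebingen.de/", "https://example.org/"]))

    print(con.execute("SELECT count(*) FROM hits").fetchone())  # (2,)
    con.close()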
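
The fts install/load in main.py only makes the extension available; nothing in this patch builds an index yet. For context, a sketch of how the extension is typically used once crawled text lands in a table — the documents table and its rows are hypothetical, while PRAGMA create_fts_index and match_bm25 are the extension's documented interface:

    # Sketch: full-text index over a hypothetical "documents" table.
    import duckdb

    con = duckdb.connect()  # in-memory for illustration
    con.install_extension("fts")
    con.load_extension("fts")

    con.execute("CREATE TABLE documents (id INTEGER, url VARCHAR, body VARCHAR)")
    con.execute("INSERT INTO documents VALUES "
                "(1, 'https://uni-tuebingen.de/', 'welcome to tübingen'), "
                "(2, 'https://example.org/', 'unrelated text')")

    # overwrite=1 keeps the call idempotent across runs.
    con.execute("PRAGMA create_fts_index('documents', 'id', 'body', overwrite=1)")

    # BM25 scoring lives in the generated fts_main_documents schema.
    rows = con.execute("""
        SELECT url, fts_main_documents.match_bm25(id, 'tübingen') AS score
        FROM documents
        WHERE score IS NOT NULL
        ORDER BY score DESC
    """).fetchall()
    print(rows)  # the Tübingen page ranks first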