From 23dc6441331e6dfaf4989d013e2152337861ee11 Mon Sep 17 00:00:00 2001
From: Phi
Date: Tue, 9 Jul 2024 16:34:10 +0200
Subject: [PATCH 1/3] Add database

---
 engine/crawl.py         | 19 +++++++++++++++++--
 engine/requirements.txt |  1 +
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/engine/crawl.py b/engine/crawl.py
index 4db7373..b4f03bb 100644
--- a/engine/crawl.py
+++ b/engine/crawl.py
@@ -15,6 +15,8 @@ from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words
 ##### Language detection #####
 from nltk.classify import textcat
 
+##### Database #####
+import duckdb
 
 ##### Constants #####
 # Maximum size of the links
@@ -148,11 +150,16 @@ def get_lang(text: str) -> str:
 
 to_crawl_set = set(SEEDS)
 
+
 class Crawler:
-    def __init__(self, identifier: str) -> None:
+    def __init__(self, identifier: str, dbcon: duckdb.DuckDBPyConnection) -> None:
         self.identifier = identifier
+        self.cursor = dbcon.cursor()
         print(f"Initialized Crawler {self.identifier}")
 
+    def __del__(self) -> None:
+        self.cursor.close()
+
     def crawl(self) -> None:
         """
         Crawls a website iteratively and extracts links from HTML pages.
@@ -331,13 +338,17 @@ def start_crawl():
     load_state()
 
     crawlers = []
+    con = duckdb.connect("crawlies.db")
     try:
+        con.install_extension("fts")
+        con.load_extension("fts")
         with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
             for i in range(MAX_THREADS):
-                crawler = Crawler(i)
+                crawler = Crawler(i, con)
                 crawlers.append(crawler)
                 executor.submit(crawler.crawl)
 
+        con.close()
         save_state()
 
         print("Found", len(found_links), "links")
@@ -347,9 +358,13 @@ def start_crawl():
     except KeyboardInterrupt:
         try:
             save_state()
+            con.close()
             sys.exit(130)
         except SystemExit:
+            con.close()
             os._exit(130)
+    except Exception as e:
+        con.close()
 
 
 if __name__ == "__main__":
diff --git a/engine/requirements.txt b/engine/requirements.txt
index ba16fa2..0cd20a7 100644
--- a/engine/requirements.txt
+++ b/engine/requirements.txt
@@ -1,3 +1,4 @@
 beautifulsoup4==4.12.3
 Requests==2.32.3
 lxml==5.2.2
+duckdb==1.0.0
\ No newline at end of file

From bf6c632712d7667205218a53f61fca6ccbc42752 Mon Sep 17 00:00:00 2001
From: am9zZWY <46693545+am9zZWY@users.noreply.github.com>
Date: Sat, 13 Jul 2024 02:20:36 +0200
Subject: [PATCH 2/3] Move some duckdb code to main

---
 engine/main.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/engine/main.py b/engine/main.py
index 2bb8b6b..a41ee53 100644
--- a/engine/main.py
+++ b/engine/main.py
@@ -3,18 +3,27 @@
 """
 
 from concurrent.futures import ThreadPoolExecutor
 
+# Database
+import duckdb
+# Pipeline
 from crawl import Crawler
 from custom_db import index_pages, access_index, save_pages
 from custom_tokenizer import Tokenizer
 
+# Async
 import asyncio
 
 MAX_THREADS = 10
 
 if __name__ == "__main__":
+    con = duckdb.connect("crawlies.db")
     try:
+        # Database setup
+        con.install_extension("fts")
+        con.load_extension("fts")
+        # Initialize the pipeline
         with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
-            crawler = Crawler()
+            crawler = Crawler(con)
             crawler.max_size = 1000
             crawler.add_executor(executor)
@@ -29,9 +38,13 @@
     except (KeyboardInterrupt, SystemExit):
         print("Exiting...")
         crawler.save_state()
+        con.close()
         print("State saved")
 
     index_pages()
     index_df = access_index()
     index_df.to_csv("inverted_index.csv")
     save_pages()
+
+    # Close the connection
+    con.close()

From 59e08bfe3e13171ef88cca33c70a3e853a635e32 Mon Sep 17 00:00:00 2001
From: am9zZWY <46693545+am9zZWY@users.noreply.github.com>
Date: Sat, 13 Jul 2024 02:20:58 +0200
Subject: [PATCH 3/3] Clean up crawl

---
 engine/crawl.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/engine/crawl.py b/engine/crawl.py
index 80d3eb0..a43fa22 100644
--- a/engine/crawl.py
+++ b/engine/crawl.py
@@ -204,9 +204,4 @@ def _load_state(self):
 crawler = Crawler(con)
 crawler.process()
 
-# TODO - seperarw crawling and tokenizing
-index_pages()
-index_df = access_index()
-index_df.to_csv("inverted_index.csv")
 con.close()
-save_pages()
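
Background on the pattern these patches rely on: a single DuckDB connection can be shared across worker threads as long as each thread talks to the database through its own cursor obtained from con.cursor(), which is exactly what the new Crawler.__init__ does with dbcon.cursor(). The sketch below is a minimal, self-contained illustration of that pattern plus one possible use of the fts extension the patches install and load; the "pages" table, its columns, the crawl_worker function, and the sample query are illustrative assumptions and do not appear in the patches themselves.

    # Minimal sketch of the shared-connection / per-thread-cursor pattern.
    # The "pages" table, its columns, and the sample query are hypothetical;
    # the patches only open the connection and install/load the FTS extension.
    from concurrent.futures import ThreadPoolExecutor

    import duckdb

    con = duckdb.connect("crawlies.db")  # one process-wide connection
    con.install_extension("fts")         # full-text search extension,
    con.load_extension("fts")            # installed and loaded once

    con.execute("CREATE TABLE IF NOT EXISTS pages (id INTEGER, url TEXT, content TEXT)")


    def crawl_worker(worker_id: int) -> None:
        # Each worker gets its own cursor; cursors created from the same DuckDB
        # connection share the database but give every thread its own state.
        cur = con.cursor()
        try:
            cur.execute(
                "INSERT INTO pages VALUES (?, ?, ?)",
                [worker_id, f"https://example.org/{worker_id}", f"page number {worker_id}"],
            )
        finally:
            cur.close()


    with ThreadPoolExecutor(max_workers=4) as executor:
        for i in range(4):
            executor.submit(crawl_worker, i)

    # Hypothetical follow-up: build an FTS index over the crawled text and rank
    # documents with BM25 (the patches install the extension but do not use it yet).
    con.execute("PRAGMA create_fts_index('pages', 'id', 'content')")
    hits = con.execute(
        "SELECT id, url, fts_main_pages.match_bm25(id, 'page') AS score "
        "FROM pages ORDER BY score DESC NULLS LAST LIMIT 5"
    ).fetchall()
    print(hits)

    con.close()

Keeping the connection in main.py (as the second patch does) and handing it to the crawler leaves a single owner responsible for con.close(), while the per-thread cursors avoid sharing one cursor object across threads.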