
Merge pull request #8 from am9zZWY/intro-duckdb
Add duckdb database
am9zZWY authored Jul 13, 2024
2 parents 6c29614 + 59e08bf commit 07afb35
Showing 3 changed files with 29 additions and 8 deletions.
21 changes: 14 additions & 7 deletions engine/crawl.py
@@ -17,6 +17,7 @@
 ##### Language detection #####
 from eld import LanguageDetector
 ##### Database #####
+import duckdb
 from custom_db import *

 ##### Constants #####
@@ -45,9 +46,12 @@


 class Crawler(PipelineElement):
-    def __init__(self):
+    def __init__(self, dbcon: duckdb.DuckDBPyConnection):
         super().__init__("Crawler")

+        # Initialize the duckdb connection
+        self.cursor = dbcon.cursor()
+
         # Initialize the crawler state
         self.found_links = set()
         self.ignore_links = set()
@@ -65,6 +69,9 @@ def __init__(self):
         self.required_keywords = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"]
         self.user_agent = "Modern Search Engines University of Tuebingen Project Crawler (https://uni-tuebingen.de/de/262377)"

+    def __del__(self) -> None:
+        self.cursor.close()
+
     async def fetch(self, session, url):
         headers = {
             "User-Agent": self.user_agent,
@@ -191,10 +198,10 @@ def _load_state(self):

 # IMPORTANT: Please use main.py instead of this file
 if __name__ == "__main__":
-    crawler = Crawler()
+    con = duckdb.connect("crawlies.db")
+    con.install_extension("fts")
+    con.load_extension("fts")
+
+    crawler = Crawler(con)
     crawler.process()
     # TODO - separate crawling and tokenizing
     index_pages()
     index_df = access_index()
     index_df.to_csv("inverted_index.csv")
     save_pages()
+    con.close()
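Handing the Crawler a shared duckdb.DuckDBPyConnection and opening a per-instance cursor in __init__ follows DuckDB's recommended pattern for multi-threaded use: the parent connection should not be shared across threads directly, while .cursor() returns an independent connection to the same database. A minimal sketch of the pattern outside the project, where the documents table and its columns are illustrative assumptions (the real schema lives in custom_db, which this diff does not touch):

    import duckdb
    import threading

    con = duckdb.connect("crawlies.db")  # one parent connection for the process
    # Hypothetical table; the project's actual schema comes from custom_db.
    con.execute("CREATE TABLE IF NOT EXISTS documents (url TEXT, body TEXT)")

    def worker(url, body):
        cursor = con.cursor()  # per-thread connection, as in Crawler.__init__
        try:
            cursor.execute("INSERT INTO documents VALUES (?, ?)", [url, body])
        finally:
            cursor.close()  # what Crawler.__del__ does on teardown

    threads = [threading.Thread(target=worker, args=(f"https://example.com/{i}", "page text"))
               for i in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    con.close()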
15 changes: 14 additions & 1 deletion engine/main.py
@@ -3,18 +3,27 @@
 """
 from concurrent.futures import ThreadPoolExecutor

+# Database
+import duckdb
 # Pipeline
 from crawl import Crawler
 from custom_db import index_pages, access_index, save_pages
 from custom_tokenizer import Tokenizer
 # Async
 import asyncio

 MAX_THREADS = 10

 if __name__ == "__main__":
+    con = duckdb.connect("crawlies.db")
     try:
+        # Database setup
+        con.install_extension("fts")
+        con.load_extension("fts")
+
         # Initialize the pipeline
         with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
-            crawler = Crawler()
+            crawler = Crawler(con)
             crawler.max_size = 1000
             crawler.add_executor(executor)
@@ -29,9 +38,13 @@
     except (KeyboardInterrupt, SystemExit):
         print("Exiting...")
         crawler.save_state()
+        con.close()
         print("State saved")

     index_pages()
     index_df = access_index()
     index_df.to_csv("inverted_index.csv")
     save_pages()
+
+    # Close the connection
+    con.close()
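main.py only installs and loads DuckDB's fts (full-text search) extension; nothing in this commit builds an index with it yet. As a hedged sketch of what the loaded extension enables — the pages table, its columns, and the query are assumptions for illustration, since the project's actual indexing goes through index_pages() and access_index():

    import duckdb

    con = duckdb.connect("crawlies.db")
    con.install_extension("fts")  # one-time download, as in main.py
    con.load_extension("fts")

    # Hypothetical table standing in for the project's real data.
    con.execute("CREATE TABLE IF NOT EXISTS pages (id INTEGER, content TEXT)")
    con.execute("INSERT INTO pages VALUES (1, 'Tübingen is a university town')")

    # Build a BM25 index over pages.content, keyed by pages.id.
    con.execute("PRAGMA create_fts_index('pages', 'id', 'content')")

    # Query through the generated fts_main_pages schema; match_bm25
    # returns NULL for non-matching rows, hence the filter.
    rows = con.execute("""
        SELECT id, score FROM (
            SELECT id, fts_main_pages.match_bm25(id, 'Tübingen') AS score
            FROM pages
        ) WHERE score IS NOT NULL
        ORDER BY score DESC
    """).fetchall()
    print(rows)
    con.close()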
1 change: 1 addition & 0 deletions engine/requirements.txt
@@ -2,6 +2,7 @@ beautifulsoup4==4.12.3
 eld==1.0.6
 Flask==3.0.3
 lxml==5.2.2
+duckdb==1.0.0
 nltk==3.8.1
 pandas==2.2.2
 requests==2.32.3
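With duckdb pinned in requirements.txt, a quick sanity check after pip install -r requirements.txt confirms the expected build is on the path:

    import duckdb
    print(duckdb.__version__)  # expect "1.0.0", the version pinned above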
