diff --git a/engine/crawl.py b/engine/crawl.py index b63d048..241b164 100644 --- a/engine/crawl.py +++ b/engine/crawl.py @@ -2,13 +2,12 @@ import asyncio import collections # For deque import json -import os -import sys ##### Parsing ##### from bs4 import BeautifulSoup # HTML parsing +import aiohttp from aiohttp import ClientSession -from utils import check_robots, get_base_url -import requests # HTTP requests +from playwright.async_api import async_playwright +from utils import check_robots, get_base_url, get_full_url ##### Threading ##### from pipeline import PipelineElement from concurrent.futures import ThreadPoolExecutor @@ -115,9 +114,6 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection): def __del__(self) -> None: self.cursor.close() - # Internal state - self._page_count = 0 - @property def user_agent(self): # Cycle through user agents diff --git a/engine/main.py b/engine/main.py index 8ce2fd3..2b5878e 100644 --- a/engine/main.py +++ b/engine/main.py @@ -1,6 +1,7 @@ """ Pipeline for Crawling, Tokenizing, and Indexing """ +import signal from concurrent.futures import ThreadPoolExecutor import asyncio import nest_asyncio