Skip to content

Commit

Permalink
Merge branch 'master' into josef-crawler-tokenizer-update
Browse files: browse the repository at this point in the history
  • Loading branch information
am9zZWY authored Jul 15, 2024
2 parents 3f835bc + c136b4e commit b3f4b89
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 7 deletions.
10 changes: 3 additions & 7 deletions engine/crawl.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,13 +2,12 @@
import asyncio
import collections # For deque
import json
import os
import sys
##### Parsing #####
from bs4 import BeautifulSoup # HTML parsing
import aiohttp
from aiohttp import ClientSession
from utils import check_robots, get_base_url
import requests # HTTP requests
from playwright.async_api import async_playwright
from utils import check_robots, get_base_url, get_full_url
##### Threading #####
from pipeline import PipelineElement
from concurrent.futures import ThreadPoolExecutor
Expand Down Expand Up @@ -115,9 +114,6 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection):
def __del__(self) -> None:
self.cursor.close()

# Internal state
self._page_count = 0

@property
def user_agent(self):
# Cycle through user agents
Expand Down
1 change: 1 addition & 0 deletions engine/main.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
"""
Pipeline for Crawling, Tokenizing, and Indexing
"""
import signal
from concurrent.futures import ThreadPoolExecutor
import asyncio
import nest_asyncio
Expand Down

0 comments on commit b3f4b89

Please sign in to comment.