
Commit

Remove code that handles dynamic content.
If we still need it, it's in commit 94edd89
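Should the dynamic-content handling be needed again, it can be inspected or restored straight from the referenced commit, e.g. with git show 94edd89:engine/crawl.py, without reverting anything. Below is a minimal sketch of doing the same from Python; it assumes the repository is cloned locally and that 94edd89 is reachable there, and the output filename is only an example.

# Hypothetical helper: dump the pre-removal crawl.py for side-by-side comparison.
# Assumes a local clone where commit 94edd89 exists.
import subprocess

old_source = subprocess.run(
    ["git", "show", "94edd89:engine/crawl.py"],
    capture_output=True, text=True, check=True,
).stdout

with open("crawl_with_dynamic_content.py", "w", encoding="utf-8") as f:
    f.write(old_source)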
am9zZWY committed Jul 15, 2024
1 parent cd2201f commit 3f835bc
Showing 1 changed file with 0 additions and 127 deletions.
127 changes: 0 additions & 127 deletions engine/crawl.py
@@ -233,33 +233,13 @@ async def _process_url(self, session, url: str):
        # Handle links
        await self._handle_links(soup, url)

        # Handle buttons
        #await self._handle_dynamic_content(url)

        if url not in self.urls_crawled and url not in self.ignore_links:
            self.urls_crawled.add(url)

        logging.info(f"Finished crawling {url}. Total: {len(self.urls_crawled)} links.")
        if not self.is_shutdown():
            await self.call_next(soup, url)

    async def _handle_dynamic_links(self, page: Page, url: str):
        """
        Handles the links on a page using Playwright.
        Args:
            page: The Playwright page whose rendered content should be scanned for links.
            url: The URL of the page being crawled.
        Returns:
        """
        try:
            content = await page.content()
            soup = BeautifulSoup(content, "lxml")
            await self._handle_links(soup, url)
        except Exception as e:
            log_error(f"Error handling links on {url}: {e}")

    async def _handle_links(self, soup, url):
        """
        Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the
@@ -308,68 +288,6 @@ async def _handle_links(self, soup, url):
                self.to_crawl_queue.append(found_link)
                self.to_crawl_set.add(found_link)

    async def _handle_dynamic_content(self, url: str):
        """
        Handles dynamic content on a page. This is useful for websites that require JavaScript rendering.
        Args:
            url: The URL of the page whose dynamic content should be rendered and crawled.
        Returns:
        """
        try:
            await self._fetch_with_playwright(url, [self._handle_dynamic_buttons])
            logging.info(f"Finished handling dynamic content on {url}. Total: {len(self.urls_crawled)} links.")
        except Exception as e:
            log_error(f"Error handling dynamic content: {e}")

    async def _handle_dynamic_buttons(self, page: Page, url: str, ignore_names=None):
        if ignore_names is None:
            ignore_names = ['search', 'submit', 'login', 'register', 'sign in', 'sign up', 'accept', 'close', 'agree']
        try:
            buttons = await page.query_selector_all("button:visible:not(:disabled)")
            if not buttons:
                return
            logging.info(f"Found {len(buttons)} clickable buttons on {url}")

            for button_index, button in enumerate(buttons):
                if self.is_shutdown():
                    logging.info("Shutdown signal received during button handling.")
                    return
                try:
                    if page.is_closed():
                        logging.info("Page was closed, stopping button handling.")
                        return

                    await button.scroll_into_view_if_needed()
                    button_text = await button.text_content()
                    if any(ignore_name.lower() in button_text.lower() for ignore_name in ignore_names):
                        continue

                    try:
                        async with page.expect_navigation(wait_until="networkidle", timeout=5000):
                            await button.click(timeout=5000)
                    except PlaywrightTimeoutError:
                        log_error(f"Navigation timeout after clicking button {button_index} on {url}")
                        continue
                    except TargetClosedError:
                        log_error(f"Page was closed while clicking button {button_index} on {url}")
                        return

                    new_content = await page.content()
                    soup = BeautifulSoup(new_content, "lxml")
                    await self._handle_links(soup, url)

                except TargetClosedError:
                    log_error(f"Page was closed while handling button {button_index} on {url}")
                    return
                except Exception as e:
                    log_error(f"Error handling button {button_index} on {url}: {e}")
        except TargetClosedError:
            log_error(f"Page was closed during button discovery on {url}")
        except Exception as e:
            log_error(f"Error during button discovery on {url}: {e}")

    async def _fetch(self, session, url: str) -> str or None:
        """
        Fetches the content of a URL using the given session.
@@ -401,51 +319,6 @@ async def _fetch(self, session, url: str) -> str or None:
log_error(f"Error fetching {url}: {e}")
return None

    async def _fetch_with_playwright(self, url: str, callbacks: list[callable] = None):
        max_retries = self.max_retries
        retry_delay = self.retry_delay

        for attempt in range(max_retries):
            if self.is_shutdown():
                logging.info("Shutdown signal received before starting fetch.")
                return

            logging.info(f"Fetching {url} with playwright (attempt {attempt + 1}/{max_retries})" if attempt > 0
                         else f"Fetching {url} with playwright")

            try:
                async with async_playwright() as p:
                    browser = await p.chromium.launch(headless=True)
                    try:
                        page = await browser.new_page()
                        try:
                            await page.goto(url, wait_until='networkidle', timeout=30000)

                            if self.is_shutdown():
                                logging.info("Shutdown signal received after page load.")
                                return

                            if callbacks:
                                for callback in callbacks:
                                    await callback(page, url)

                            return  # Exit after successful fetch
                        finally:
                            await page.close()
                    finally:
                        await browser.close()
            except Exception as e:
                log_error(f"Error fetching {url} with playwright (attempt {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    log_error(f"Failed to process {url} after {max_retries} attempts.")
                    return
                # Exponential wait time
                await asyncio.sleep(retry_delay * (2 ** attempt))

                if self.is_shutdown():
                    logging.info("Shutdown signal received during retry delay.")
                    return

    def save_state(self):
        """
        Saves the global state to a file.
