Skip to content

Commit

Permalink
Add async and workflows
Browse files Browse the repository at this point in the history
  • Loading branch information
lalalune committed Jul 18, 2023
1 parent f59c9db commit 9f5641e
Show file tree
Hide file tree
Showing 10 changed files with 342 additions and 249 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

# Runs only when a GitHub release is published.
on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python
      uses: actions/setup-python@v3
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install build
    - name: Build package
      run: python -m build
    - name: Publish package
      # Third-party action pinned to a full commit SHA.
      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
      with:
        user: ${{ secrets.pypi_username }}
        password: ${{ secrets.pypi_password }}
25 changes: 25 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
name: Lint and Test

# Runs on every push to any branch.
on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest pytest-asyncio
          pip install -r requirements.txt
      - name: Running tests
        # The sync and async suites are invoked as two separate pytest runs.
        run: |
          pytest agentbrowser/test.py
          pytest agentbrowser/test_async.py
34 changes: 1 addition & 33 deletions agentbrowser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,33 +1 @@
"""
agentbrowser
A browser for your agent, built on Chrome and Pyppeteer.
"""

__version__ = "0.1.1"
__author__ = "Moon (https://github.com/lalalune)"
__credits__ = "https://github.com/lalalune/agentbrowser"

from .browser import (
get_browser,
init_browser,
create_page,
close_page,
navigate_to,
get_body_text,
get_document_html,
get_body_html,
evaluate_javascript
)

__all__ = [
"get_browser",
"init_browser",
"create_page",
"navigate_to",
"close_page",
"get_document_html",
"get_body_text",
"get_body_html",
"evaluate_javascript",
]
from .browser import *
178 changes: 80 additions & 98 deletions agentbrowser/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,140 +7,122 @@

browser = None


def get_browser():
    """Return the module-level ``browser``, initialising it first if unset.

    When ``browser`` is ``None`` this calls ``init_browser()`` with its
    default arguments before returning the global.
    """
    if browser is None:
        init_browser()
    return browser

async def async_get_browser():
    """Return the module-level ``browser``, awaiting its creation if unset.

    Coroutine counterpart of ``get_browser``: when ``browser`` is ``None`` it
    awaits ``async_init_browser()`` with default arguments first.
    """
    if browser is None:
        await async_init_browser()
    return browser


def init_browser(headless=True, executable_path=None):
    """Synchronously initialise the shared browser.

    Blocks until ``async_init_browser(headless, executable_path)`` completes
    on the loop returned by ``asyncio.get_event_loop()``. Executable-path
    resolution and the launch itself are left to ``async_init_browser``.

    Args:
        headless: Forwarded unchanged to ``async_init_browser``.
        executable_path: Forwarded unchanged to ``async_init_browser``.
    """
    asyncio.get_event_loop().run_until_complete(
        async_init_browser(headless, executable_path)
    )

async def init():
global browser
def create_page(site=None):
    """Blocking wrapper around ``async_create_page``.

    Runs ``async_create_page(site)`` to completion on the loop returned by
    ``asyncio.get_event_loop()`` and returns its result.
    """
    return asyncio.get_event_loop().run_until_complete(async_create_page(site))

def handle_interrupt():
asyncio.ensure_future(browser.close())
asyncio.get_event_loop().stop()

browser = await launch(headless=headless, executablePath=executable_path)
signal.signal(signal.SIGINT, handle_interrupt)
def close_page(page):
    """Blocking wrapper around ``async_close_page``.

    Runs ``async_close_page(page)`` to completion on the loop returned by
    ``asyncio.get_event_loop()``. Returns ``None``.
    """
    asyncio.get_event_loop().run_until_complete(async_close_page(page))

asyncio.get_event_loop().run_until_complete(init())

def navigate_to(url, page):
    """Blocking wrapper around ``async_navigate_to``.

    Runs ``async_navigate_to(url, page)`` to completion on the loop returned
    by ``asyncio.get_event_loop()`` and returns its result.
    """
    return asyncio.get_event_loop().run_until_complete(async_navigate_to(url, page))

def check_browser_inited():
if browser is None:
init_browser()

def get_document_html(page):
    """Blocking wrapper around ``async_get_document_html``.

    Runs ``async_get_document_html(page)`` to completion on the loop returned
    by ``asyncio.get_event_loop()`` and returns its result.
    """
    return asyncio.get_event_loop().run_until_complete(async_get_document_html(page))

def create_page(site=None):
check_browser_inited()
page = asyncio.get_event_loop().run_until_complete(browser.newPage())

def get_body_text(page):
    """Blocking wrapper around ``async_get_body_text``.

    Runs ``async_get_body_text(page)`` to completion on the loop returned by
    ``asyncio.get_event_loop()`` and returns its result.
    """
    return asyncio.get_event_loop().run_until_complete(async_get_body_text(page))


def get_body_text_raw(page):
    """Blocking wrapper around ``async_get_body_text_raw``.

    Runs ``async_get_body_text_raw(page)`` to completion on the loop returned
    by ``asyncio.get_event_loop()`` and returns its result.
    """
    return asyncio.get_event_loop().run_until_complete(async_get_body_text_raw(page))


def get_body_html(page):
    """Blocking wrapper around ``async_get_body_html``.

    Runs ``async_get_body_html(page)`` to completion on the loop returned by
    ``asyncio.get_event_loop()`` and returns its result.
    """
    return asyncio.get_event_loop().run_until_complete(async_get_body_html(page))


def evaluate_javascript(code, page):
    """Blocking wrapper around ``async_evaluate_javascript``.

    Runs ``async_evaluate_javascript(code, page)`` to completion on the loop
    returned by ``asyncio.get_event_loop()`` and returns its result.
    """
    return asyncio.get_event_loop().run_until_complete(
        async_evaluate_javascript(code, page)
    )


# async version of init_browser
async def async_init_browser(headless=True, executable_path=None):
    """Launch the shared browser if none exists yet and return it.

    The launched instance is stored in the module-level ``browser``. When
    ``browser`` is already set it is returned unchanged and both arguments
    are ignored.

    Args:
        headless: Passed to ``launch`` as ``headless``.
        executable_path: Passed to ``launch`` as ``executablePath``; when
            ``None`` the value returned by ``find_chrome()`` is used.

    Returns:
        The module-level ``browser``.
    """
    global browser

    if browser is None:
        # Only look up the Chrome binary when a launch is actually needed.
        if executable_path is None:
            executable_path = find_chrome()
        browser = await launch(
            headless=headless, executablePath=executable_path, autoClose=False
        )
    return browser

# async version of create_page
async def async_create_page(site=None):
    """Open a new page on the shared browser, optionally navigating to ``site``.

    Uses the module-level ``browser`` when it is set; otherwise awaits
    ``async_init_browser()`` and uses the browser it returns.

    Args:
        site: URL to load in the new page. When falsy, no navigation is done.

    Returns:
        The page object returned by the browser's ``newPage()``.
    """
    active_browser = browser if browser is not None else await async_init_browser()
    page = await active_browser.newPage()
    if site:
        # Await the navigation directly: run_until_complete() cannot be used
        # from inside a coroutine because the loop is already running.
        await page.goto(site, {"waitUntil": ["domcontentloaded", "networkidle0"]})
    return page


def close_page(page):
asyncio.get_event_loop().run_until_complete(page.close())
# async version of close_page
async def async_close_page(page):
    """Await ``page.close()``. Returns ``None``."""
    await page.close()


def navigate_to(url, page):
check_browser_inited()
# async version of navigate_to
async def async_navigate_to(url, page):
    """Navigate ``page`` to ``url`` and return the page, or ``None`` on failure.

    Args:
        url: Address handed to ``page.goto``.
        page: Page to navigate. When falsy, a new page is obtained from
            ``async_create_page(None)`` first.

    Returns:
        The navigated page, or ``None`` if ``page.goto`` raised. The error is
        printed rather than propagated.
    """
    if not page:
        page = await async_create_page(None)
    try:
        await page.goto(url, {"waitUntil": ["domcontentloaded", "networkidle0"]})
    except Exception as e:
        # Best-effort navigation: report the failure and signal it with None.
        # The f-string keeps the report from raising on a non-str url.
        print(f"Error navigating to: {url}")
        print(e)
        return None
    return page


def get_document_html(page):
return asyncio.get_event_loop().run_until_complete(page.content())

# async version of get_document_html
async def async_get_document_html(page):
    """Return the awaited result of ``page.content()``."""
    return await page.content()

def get_body_text(page):
# get the body, but remove some junk first
output = asyncio.get_event_loop().run_until_complete(
page.Jeval(
"body",
"""
(element) => {
const element_blacklist = [
"sidebar",
"footer",
"account",
"login",
"signup",
"search",
"advertisement",
"masthead",
"popup",
"floater",
"modal",
];
// first, filter out all the script tags, noscript tags, <footer>, <header>, etc
[...element.querySelectorAll('script, noscript, form, footer, header, img, svg, style')].forEach(element => element && element.remove())
// find any element which contains any class or id which includes the words in the blacklist
const blacklist = element_blacklist.join('|')
const regex = new RegExp(blacklist, 'i')
const blacklist_elements = [...element.querySelectorAll('*')].filter(element => element && ((element.id && element.id.match(regex)) || (element.className && element.className.match && element.className.match(regex))))
// remove all the blacklist elements
blacklist_elements.forEach(element => element && element.remove())
// replace any tags inside of the body with just their text content
const tags = [...element.querySelectorAll('*')]
tags.forEach(element => element && element.replaceWith(element.textContent))
// then, get the text content of the body element
let text = element.textContent
// finally, remove all the extra whitespace
text = text.replace(/\s+/g, ' ')
return text
}
""",
)
)

# remove any extra whitespace
output = re.sub(r"\s+", " ", output)
async def async_get_body_text(page):
    """Return the ``innerText`` of the page's ``body``, stripped.

    Evaluates ``(element) => element.innerText`` against the ``body``
    selector via ``page.querySelectorEval`` and strips leading and trailing
    whitespace from the result.
    """
    output = await page.querySelectorEval("body", "(element) => element.innerText")
    return output.strip()

return output
async def async_get_body_text_raw(page):
    """Return the ``innerText`` of the page's ``body``, stripped.

    Evaluates ``(element) => element.innerText`` against the ``body``
    selector via ``page.querySelectorEval`` and strips leading and trailing
    whitespace from the result.
    """
    output = await page.querySelectorEval("body", "(element) => element.innerText")
    return output.strip()


def get_body_text_raw(page):
# get the raw body text, without any filtering
return asyncio.get_event_loop().run_until_complete(
page.Jeval(
"body",
"""
(element) => {
return element.textContent
}
""",
)
)
# async version of get_body_html
async def async_get_body_html(page):
    """Return the ``innerHTML`` of the page's ``body``.

    Evaluates ``(element) => element.innerHTML`` against the ``body``
    selector via ``page.Jeval`` and returns the result unmodified.
    """
    return await page.Jeval("body", "(element) => element.innerHTML")


def get_body_html(page):
return asyncio.get_event_loop().run_until_complete(
page.Jeval("body", "(element) => element.innerHTML")
)


def evaluate_javascript(code, page):
return asyncio.get_event_loop().run_until_complete(page.evaluate(code))
# async version of evaluate_javascript
async def async_evaluate_javascript(code, page):
    """Return the awaited result of ``page.evaluate(code)``.

    Args:
        code: Passed unchanged as the only argument to ``page.evaluate``.
        page: Object providing an awaitable ``evaluate`` method.
    """
    return await page.evaluate(code)


def find_chrome():
Expand Down
70 changes: 0 additions & 70 deletions agentbrowser/browser_test.py

This file was deleted.

Loading

0 comments on commit 9f5641e

Please sign in to comment.