Skip to content

Commit

Permalink
Updated scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
broomva committed Jan 26, 2024
1 parent cdace87 commit d4a6099
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 26 deletions.
31 changes: 23 additions & 8 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,8 +1,23 @@
databricks_experiment_name=''
databricks_experiment_id=''
databricks_host=''
databricks_token=''
databricks_username=''
databricks_password=''
databricks_cluster_id=''
databricks_sql_http_path=''
PG_USERNAME=''
PG_PASSWORD=''
PG_HOST=''
PG_PORT=''
PG_DATABASE=''
LANGCHAIN_TRACING_V2=''
LANGCHAIN_ENDPOINT=''
LANGCHAIN_API_KEY=''
LANGCHAIN_PROJECT=''
DAGSTER_HOME=''
OPENAI_API_BASE_URL=''
OPENAI_MODEL_NAME=''
TOGETHER_API_KEY=''
BROWSERLESS_API_KEY=''
SERP_API_KEY=''
SQLALCHEMY_URL=''
OPENAI_API_KEY=''
CHAINLIT_API_KEY=''
OAUTH_GITHUB_CLIENT_ID=''
OAUTH_GITHUB_CLIENT_SECRET=''
CHAINLIT_AUTH_SECRET=''
HUGGINGFACEHUB_API_TOKEN=''
SENDGRID_API_KEY=''
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ __pycache__/

.streamlit/secrets.toml


dagster_runtime/

# C extensions
*.so

Expand Down
10 changes: 8 additions & 2 deletions vortex/dagster/vortex/assets/vortex.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,15 @@ def get_article(context, get_url) -> str:
if not get_url:
return None
try:
response = scrape_website_selenium(get_url[0])
try:
response = scrape_website_selenium(get_url[0])
context.log.debug(f"Selenium response {response}")
except Exception as e:
response = None
if response is None:
context.log.warning(f"Selenium response was None. Using BS4")
response = scrape_website(get_url[0])
context.log.debug(f"Bs4 Scrape response {response}")
except Exception as e:
context.log.info(f"Error {e}")
response = None
Expand Down Expand Up @@ -218,7 +224,7 @@ def write_consolidated_summary(context, consolidated_summary, get_articles_summa
def send_email_with_sendgrid(context, get_url, summarize_article):
email = get_url[2]
message = Mail(
from_email="[email protected]",
from_email="Vortex Summaries",
to_emails=email,
subject="Here is your URL summary! 🎉",
plain_text_content=summarize_article,
Expand Down
36 changes: 20 additions & 16 deletions vortex/dagster/vortex/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,28 +61,32 @@ def scrape_website(url: str):


def scrape_website_selenium(url):
    """Load *url* in headless Chrome and return the rendered page as Markdown.

    The page is opened with Selenium, given a fixed pause so JavaScript can
    render, and its HTML source is converted with ``html2text``.

    Args:
        url: Address of the page to load.

    Returns:
        The Markdown text produced by ``html2text`` from the page source.

    Raises:
        Exception: Any error raised while starting the browser, loading the
            page, or converting it is printed and then re-raised unchanged,
            so callers can fall back to another scraper.
    """
    try:
        # Configure Selenium with a headless browser. The flag is passed as a
        # command-line argument rather than through the `headless` attribute.
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)

        try:
            # Access the webpage
            driver.get(url)

            # Wait for JavaScript to render. Adjust time as needed.
            time.sleep(5)  # Time in seconds

            # Extract the page source
            page_source = driver.page_source
        finally:
            # Always close the browser, even when loading the page fails,
            # so no Chrome process is left behind.
            driver.quit()

        # Convert HTML to Markdown
        converter = html2text.HTML2Text()
        return converter.handle(page_source)
    except Exception as e:
        print(f"Error scraping website: {e}")
        # Bare raise keeps the original traceback intact.
        raise


tools = [
Expand Down

0 comments on commit d4a6099

Please sign in to comment.