From d4a60998501fdb78259b14fb7cd7f16516561955 Mon Sep 17 00:00:00 2001 From: "Carlos D. Escobar-Valbuena" Date: Fri, 26 Jan 2024 09:52:34 -0500 Subject: [PATCH] Updated scrapping --- .env.example | 31 +++++++++++++++------ .gitignore | 3 +++ vortex/dagster/vortex/assets/vortex.py | 10 +++++-- vortex/dagster/vortex/tools/__init__.py | 36 ++++++++++++++----------- 4 files changed, 54 insertions(+), 26 deletions(-) diff --git a/.env.example b/.env.example index 93dc65b..ace4e91 100644 --- a/.env.example +++ b/.env.example @@ -1,8 +1,23 @@ -databricks_experiment_name='' -databricks_experiment_id='' -databricks_host='' -databricks_token='' -databricks_username='' -databricks_password='' -databricks_cluster_id='' -databricks_sql_http_path='' \ No newline at end of file +PG_USERNAME='' +PG_PASSWORD='' +PG_HOST='' +PG_PORT='' +PG_DATABASE='' +LANGCHAIN_TRACING_V2='' +LANGCHAIN_ENDPOINT='' +LANGCHAIN_API_KEY='' +LANGCHAIN_PROJECT='' +DAGSTER_HOME='' +OPENAI_API_BASE_URL='' +OPENAI_MODEL_NAME='' +TOGETHER_API_KEY='' +BROWSERLESS_API_KEY='' +SERP_API_KEY='' +SQLALCHEMY_URL='' +OPENAI_API_KEY='' +CHAINLIT_API_KEY='' +OAUTH_GITHUB_CLIENT_ID='' +OAUTH_GITHUB_CLIENT_SECRET='' +CHAINLIT_AUTH_SECRET='' +HUGGINGFACEHUB_API_TOKEN='' +SENDGRID_API_KEY='' \ No newline at end of file diff --git a/.gitignore b/.gitignore index b2320cc..2e65c74 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ __pycache__/ .streamlit/secrets.toml + +dagster_runtime/ + # C extensions *.so diff --git a/vortex/dagster/vortex/assets/vortex.py b/vortex/dagster/vortex/assets/vortex.py index fc71125..88591e9 100644 --- a/vortex/dagster/vortex/assets/vortex.py +++ b/vortex/dagster/vortex/assets/vortex.py @@ -73,9 +73,15 @@ def get_article(context, get_url) -> str: if not get_url: return None try: - response = scrape_website_selenium(get_url[0]) + try: + response = scrape_website_selenium(get_url[0]) + context.log.debug(f"Selenium response {response}") + except Exception as e: + response = None if response is None: + context.log.warning(f"Selenium response was None. Using BS4") response = scrape_website(get_url[0]) + context.log.debug(f"Bs4 Scrape response {response}") except Exception as e: context.log.info(f"Error {e}") response = None @@ -218,7 +224,7 @@ def write_consolidated_summary(context, consolidated_summary, get_articles_summa def send_email_with_sendgrid(context, get_url, summarize_article): email = get_url[2] message = Mail( - from_email="carlos@broomva.tech", + from_email="Vortex Summaries", to_emails=email, subject="Here is your URL summary! 🎉", plain_text_content=summarize_article, diff --git a/vortex/dagster/vortex/tools/__init__.py b/vortex/dagster/vortex/tools/__init__.py index f204980..d13b1ae 100644 --- a/vortex/dagster/vortex/tools/__init__.py +++ b/vortex/dagster/vortex/tools/__init__.py @@ -61,28 +61,32 @@ def scrape_website(url: str): def scrape_website_selenium(url): - # Configure Selenium with a headless browser - options = Options() - options.headless = True - driver = webdriver.Chrome(options=options) + try: + # Configure Selenium with a headless browser + options = Options() + options.headless = True + driver = webdriver.Chrome(options=options) - # Access the webpage - driver.get(url) + # Access the webpage + driver.get(url) - # Wait for JavaScript to render. Adjust time as needed. - time.sleep(5) # Time in seconds + # Wait for JavaScript to render. Adjust time as needed. + time.sleep(5) # Time in seconds - # Extract the page source - page_source = driver.page_source + # Extract the page source + page_source = driver.page_source - # Close the browser - driver.quit() + # Close the browser + driver.quit() - # Convert HTML to Markdown - converter = html2text.HTML2Text() - markdown = converter.handle(page_source) + # Convert HTML to Markdown + converter = html2text.HTML2Text() + markdown = converter.handle(page_source) - return markdown + return markdown + except Exception as e: + print(f"Error scraping website: {e}") + raise e tools = [