Updated scraping

broomva · Jan 26, 2024 · d4a6099
1 parent cdace87
commit d4a6099
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 26 deletions.
diff --git a/.env.example b/.env.example
@@ -1,8 +1,23 @@
-databricks_experiment_name=''
-databricks_experiment_id=''
-databricks_host=''
-databricks_token=''
-databricks_username=''
-databricks_password=''
-databricks_cluster_id=''
-databricks_sql_http_path=''
+PG_USERNAME=''
+PG_PASSWORD=''
+PG_HOST=''
+PG_PORT=''
+PG_DATABASE=''
+LANGCHAIN_TRACING_V2=''
+LANGCHAIN_ENDPOINT=''
+LANGCHAIN_API_KEY=''
+LANGCHAIN_PROJECT=''
+DAGSTER_HOME=''
+OPENAI_API_BASE_URL=''
+OPENAI_MODEL_NAME=''
+TOGETHER_API_KEY=''
+BROWSERLESS_API_KEY=''
+SERP_API_KEY=''
+SQLALCHEMY_URL=''
+OPENAI_API_KEY=''
+CHAINLIT_API_KEY=''
+OAUTH_GITHUB_CLIENT_ID=''
+OAUTH_GITHUB_CLIENT_SECRET=''
+CHAINLIT_AUTH_SECRET=''
+HUGGINGFACEHUB_API_TOKEN=''
+SENDGRID_API_KEY=''
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,9 @@ __pycache__/
 
 .streamlit/secrets.toml
 
+
+dagster_runtime/
+
 # C extensions
 *.so
 

diff --git a/vortex/dagster/vortex/assets/vortex.py b/vortex/dagster/vortex/assets/vortex.py
@@ -73,9 +73,15 @@ def get_article(context, get_url) -> str:
     if not get_url:
         return None
     try:
-        response = scrape_website_selenium(get_url[0])
+        try:
+            response = scrape_website_selenium(get_url[0])
+            context.log.debug(f"Selenium response {response}")
+        except Exception as e:
+            response = None
         if response is None:
+            context.log.warning(f"Selenium response was None. Using BS4")
             response = scrape_website(get_url[0])
+            context.log.debug(f"Bs4 Scrape response {response}")
     except Exception as e:
         context.log.info(f"Error {e}")
         response = None
@@ -218,7 +224,7 @@ def write_consolidated_summary(context, consolidated_summary, get_articles_summa
 def send_email_with_sendgrid(context, get_url, summarize_article):
     email = get_url[2]
     message = Mail(
-        from_email="[email protected]",
+        from_email="Vortex Summaries",
         to_emails=email,
         subject="Here is your URL summary! 🎉",
         plain_text_content=summarize_article,

diff --git a/vortex/dagster/vortex/tools/__init__.py b/vortex/dagster/vortex/tools/__init__.py
@@ -61,28 +61,32 @@ def scrape_website(url: str):
 
 
 def scrape_website_selenium(url):
-    # Configure Selenium with a headless browser
-    options = Options()
-    options.headless = True
-    driver = webdriver.Chrome(options=options)
+    try:
+        # Configure Selenium with a headless browser
+        options = Options()
+        options.headless = True
+        driver = webdriver.Chrome(options=options)
 
-    # Access the webpage
-    driver.get(url)
+        # Access the webpage
+        driver.get(url)
 
-    # Wait for JavaScript to render. Adjust time as needed.
-    time.sleep(5)  # Time in seconds
+        # Wait for JavaScript to render. Adjust time as needed.
+        time.sleep(5)  # Time in seconds
 
-    # Extract the page source
-    page_source = driver.page_source
+        # Extract the page source
+        page_source = driver.page_source
 
-    # Close the browser
-    driver.quit()
+        # Close the browser
+        driver.quit()
 
-    # Convert HTML to Markdown
-    converter = html2text.HTML2Text()
-    markdown = converter.handle(page_source)
+        # Convert HTML to Markdown
+        converter = html2text.HTML2Text()
+        markdown = converter.handle(page_source)
 
-    return markdown
+        return markdown
+    except Exception as e:
+        print(f"Error scraping website: {e}")
+        raise e
 
 
 tools = [
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,9 @@ __pycache__/ @@
     .streamlit/secrets.toml
+    dagster_runtime/
     # C extensions
     *.so
@@ Expand Down @@