Commit
Merge pull request #51 from lexara-prime-ai/dev
Dev
irfanghat authored Jun 12, 2024
2 parents 5a6da3c + 802312f commit d4cea2c
Showing 5 changed files with 109 additions and 4 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -6,5 +6,8 @@ service_account.json
# Added by cargo

/target


/**/hyper/hyper/__pycache__
/**/data
/**/data
__pycache__
4 changes: 2 additions & 2 deletions scripts/bash/devcontainer.sh
@@ -4,6 +4,6 @@
source /opt/venv/bin/activate

# Run maturin develop only if in a dev container
-if [[ "$IS_DEV_CONTAINER" == "true" ]]; then
-    maturin develop -m python_wrapper/Cargo.toml
+if [[ ${IS_DEV_CONTAINER} == "true" ]]; then
+    maturin develop -m python_wrapper/Cargo.toml
fi
3 changes: 2 additions & 1 deletion scripts/bash/python_deps.sh
@@ -10,7 +10,7 @@ fi


# Modules that will be installed/upgraded.
-modules=("mkdocs" "maturin" "patchelf" "tableauhyperapi" "google-api-python-client" "google-auth-httplib2" "google-auth-oauthlib")
+modules=("modal" "mkdocs" "maturin" "patchelf" "tableauhyperapi" "google-api-python-client" "google-auth-httplib2" "google-auth-oauthlib")

echo "Installing dependencies: ${modules[*]}..."
pip install "${modules[@]}" --upgrade
@@ -29,6 +29,7 @@ verify_installation() {

# The following dictionary contains module to import name mappings.
declare -A module_import_map=(
+    ["modal"]="modal" # To Do -> Update Docker configuration to include modal cli configuration.
    ["mkdocs"]="mkdocs"
    ["maturin"]="maturin"
    ["patchelf"]="patchelf"
37 changes: 37 additions & 0 deletions web_scraper/README.md
@@ -0,0 +1,37 @@
# Web Scraper

```python
@app.local_entrypoint()
def main(urls: str):
    url_list = urls.split(',')
    for url in url_list:
        try:
            tables = get_tables(url)  # Direct call to get_tables
            print(f"Tables from {url}:\n")
            for table in tables:
                print(table)
            print("\n\n")
        except Exception as e:
            print(f"Error processing {url}: {e}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Receive multiple URLs as arguments.")
    parser.add_argument("urls", type=str, help="Comma-separated list of URLs to scrape.")

    args = parser.parse_args()
    main(args.urls)
```

1. **Modal Integration**: The script uses Modal's `@app.local_entrypoint()` decorator to define the entry point for the Modal app. This makes it easy to run the script as a Modal job.
2. **Argument Parsing**: The script uses `argparse` to accept a comma-separated list of URLs from the command line.
3. **Main Function**: The `main` function splits the comma-separated input and scrapes the tables from each URL via the `get_tables` helper, shown below for reference.
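
For reference, the `get_tables` helper the entrypoint relies on is defined alongside it in `web_scraper/__init__.py` (part of this commit): it validates the URL scheme, fetches the page, and returns every `<table>` element as an HTML string.

```python
def get_tables(url):
    validate_url_scheme(url)  # Only http/https URLs are accepted
    RESPONSE = urllib.request.urlopen(url)
    html = RESPONSE.read().decode("utf8")

    soup = BeautifulSoup(html, "lxml")
    tables = soup.find_all("table")

    TABLE_CONTENTS = []
    for table in tables:
        table_content = str(table)
        TABLE_CONTENTS.append(table_content)

    return TABLE_CONTENTS
```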

## Running the Script with Modal

To run the script with Modal, invoke the module through Modal's CLI with `modal run`, passing the URLs as a comma-separated `--urls` argument:

```sh
modal run ./web_scraper/__init__.py --urls "https://asn.flightsafety.org/wikibase/dblist2.php?yr=2024&at=&re=&pc=&op=&lo=&co=&ph=&na=&submit=Submit,https://asn.flightsafety.org/wikibase/dblist2.php?at=&re=&pc=&op=&fa=&lo=&co=&ph=&na=&yr=2024&page=1"
```

This will run the script on Modal’s infrastructure, leveraging its **serverless** compute resources to handle the _web scraping_ tasks.
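
The module also defines a standard `argparse` entry point under `if __name__ == "__main__"`, so it can be run as a plain Python script as well. A minimal local invocation, assuming `modal`, `beautifulsoup4`, and `lxml` are installed in the active environment:

```sh
python web_scraper/__init__.py "https://asn.flightsafety.org/wikibase/dblist2.php?yr=2024&at=&re=&pc=&op=&lo=&co=&ph=&na=&submit=Submit"
```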
64 changes: 64 additions & 0 deletions web_scraper/__init__.py
@@ -0,0 +1,64 @@
# Example [invocation]
# modal run ./web_scraper/__init__.py --urls "https://asn.flightsafety.org/wikibase/dblist2.php?yr=2024&at=&re=&pc=&op=&lo=&co=&ph=&na=&submit=Submit,https://asn.flightsafety.org/wikibase/dblist2.php?at=&re=&pc=&op=&fa=&lo=&co=&ph=&na=&yr=2024&page=1"


import argparse
import urllib.request
from urllib.parse import urlparse

import modal
from bs4 import BeautifulSoup

app = modal.App(name="wspr-scraper-v0.0.1")


# URL format [validation].
def validate_url_scheme(url):
    ALLOWED_SCHEMES = {"http", "https"}
    SCHEME = urlparse(url).scheme
    if SCHEME not in ALLOWED_SCHEMES:
        raise ValueError(
            f"URL scheme '{SCHEME}' is not allowed. Allowed schemes: {ALLOWED_SCHEMES}"
        )


def get_tables(url):
    validate_url_scheme(url)  # Validate URL scheme before proceeding
    RESPONSE = urllib.request.urlopen(url)
    html = RESPONSE.read().decode("utf8")

    soup = BeautifulSoup(html, "lxml")
    tables = soup.find_all("table")

    TABLE_CONTENTS = []
    for table in tables:
        table_content = str(table)
        TABLE_CONTENTS.append(table_content)

    return TABLE_CONTENTS


@app.local_entrypoint()
def main(urls: str):
    url_list = urls.split(",")
    for url in url_list:
        try:
            # Direct call to get_tables.
            tables = get_tables(url)
            print(f"Tables from {url}:\n")
            for table in tables:
                print(table)
            print("\n\n")
        except Exception as e:
            print(f"Error processing {url}: {e}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Receive multiple URLs as arguments.")
    parser.add_argument(
        "urls", type=str, help="Comma-separated list of URLs to scrape."
    )

    args = parser.parse_args()
    main(args.urls)
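
The scraper returns each table as a raw HTML string, so downstream code can parse the output into structured data. A minimal sketch, assuming `pandas` is available (it is not among the modules installed by `scripts/bash/python_deps.sh`) and that the repository root is on `PYTHONPATH`:

```python
# Sketch only: pandas is an assumed extra dependency, not part of this commit.
from io import StringIO

import pandas as pd

from web_scraper import get_tables  # helper defined in web_scraper/__init__.py above

URL = "https://asn.flightsafety.org/wikibase/dblist2.php?yr=2024&at=&re=&pc=&op=&lo=&co=&ph=&na=&submit=Submit"

# Each element returned by get_tables() is one "<table>...</table>" string;
# pandas.read_html() parses it into a DataFrame.
frames = [pd.read_html(StringIO(html))[0] for html in get_tables(URL)]
print(f"Parsed {len(frames)} table(s); first rows of the first table:")
print(frames[0].head())
```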
