Merge pull request #51 from lexara-prime-ai/dev
Dev
Showing 5 changed files with 109 additions and 4 deletions.
```
@@ -6,5 +6,8 @@ service_account.json
# Added by cargo

/target

/**/hyper/hyper/__pycache__
/**/data
/**/data
__pycache__
```
# Web Scraper

```python
@app.local_entrypoint()
def main(urls: str):
    url_list = urls.split(',')
    for url in url_list:
        try:
            tables = get_tables(url)  # Direct call to get_tables
            print(f"Tables from {url}:\n")
            for table in tables:
                print(table)
            print("\n\n")
        except Exception as e:
            print(f"Error processing {url}: {e}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Receive multiple URLs as arguments.")
    parser.add_argument("urls", type=str, help="Comma-separated list of URLs to scrape.")

    args = parser.parse_args()
    main(args.urls)
```

1. **Modal Integration**: The script uses Modal's `@app.local_entrypoint()` decorator to define the entry point for the Modal app. This makes it easy to run the script as a Modal job.
2. **Argument Parsing**: The script uses `argparse` to accept a comma-separated list of URLs from the command line.
3. **Main Function**: The `main` function splits the input URLs and processes each one to scrape tables.

## Running the Script with Modal
To run the script with Modal, you would typically run the app through Modal's CLI. Here's a general outline of how you might do this:

```sh
modal run ./web_scraper/__init__.py --urls "https://asn.flightsafety.org/wikibase/dblist2.php?yr=2024&at=&re=&pc=&op=&lo=&co=&ph=&na=&submit=Submit,https://asn.flightsafety.org/wikibase/dblist2.php?at=&re=&pc=&op=&fa=&lo=&co=&ph=&na=&yr=2024&page=1"
```

Note that because all of the work happens inside a `local_entrypoint`, the _web scraping_ itself still executes on the invoking machine; Modal's CLI only orchestrates the run. To hand the scraping off to Modal's **serverless** compute, the fetch-and-parse step would be moved into an `@app.function()`, as sketched below.
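A minimal sketch of that variant, assuming a container image built with `beautifulsoup4` and `lxml` (the app name, image definition, and function name here are illustrative, not part of this commit):

```python
import modal

# Hypothetical app/image; package names are assumed from the local code's imports.
app = modal.App(name="wspr-scraper-remote-sketch")
image = modal.Image.debian_slim().pip_install("beautifulsoup4", "lxml")


@app.function(image=image)
def get_tables_remote(url: str) -> list[str]:
    """Fetch a page and return its <table> elements as HTML strings, inside a Modal container."""
    import urllib.request

    from bs4 import BeautifulSoup

    # URL-scheme validation is omitted here for brevity.
    html = urllib.request.urlopen(url).read().decode("utf8")
    soup = BeautifulSoup(html, "lxml")
    return [str(table) for table in soup.find_all("table")]


@app.local_entrypoint()
def main(urls: str):
    for url in urls.split(","):
        # .remote() dispatches the call to a container on Modal's infrastructure.
        for table in get_tables_remote.remote(url):
            print(table)
```

Each `.remote()` call then runs in its own container, which is where the serverless scaling actually comes from.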
`web_scraper/__init__.py` (as added in this PR):

```python
# Example [invocation]
# modal run ./web_scraper/__init__.py --urls "https://asn.flightsafety.org/wikibase/dblist2.php?yr=2024&at=&re=&pc=&op=&lo=&co=&ph=&na=&submit=Submit,https://asn.flightsafety.org/wikibase/dblist2.php?at=&re=&pc=&op=&fa=&lo=&co=&ph=&na=&yr=2024&page=1"

import argparse
import urllib.request
from urllib.parse import urlparse

import modal
from bs4 import BeautifulSoup

app = modal.App(name="wspr-scraper-v0.0.1")


# URL format [validation].
def validate_url_scheme(url):
    ALLOWED_SCHEMES = {"http", "https"}
    SCHEME = urlparse(url).scheme
    if SCHEME not in ALLOWED_SCHEMES:
        raise ValueError(
            f"URL scheme '{SCHEME}' is not allowed. Allowed schemes: {ALLOWED_SCHEMES}"
        )


def get_tables(url):
    validate_url_scheme(url)  # Validate URL scheme before proceeding
    RESPONSE = urllib.request.urlopen(url)
    html = RESPONSE.read().decode("utf8")

    soup = BeautifulSoup(html, "lxml")
    tables = soup.find_all("table")

    TABLE_CONTENTS = []
    for table in tables:
        table_content = str(table)
        TABLE_CONTENTS.append(table_content)

    return TABLE_CONTENTS


@app.local_entrypoint()
def main(urls: str):
    url_list = urls.split(",")
    for url in url_list:
        try:
            # Direct call to get_tables.
            tables = get_tables(url)
            print(f"Tables from {url}:\n")
            for table in tables:
                print(table)
            print("\n\n")
        except Exception as e:
            print(f"Error processing {url}: {e}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Receive multiple URLs as arguments.")
    parser.add_argument(
        "urls", type=str, help="Comma-separated list of URLs to scrape."
    )

    args = parser.parse_args()
    main(args.urls)
```
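Since `get_tables` returns the tables as raw HTML strings, a downstream step will typically parse them into structured data. One possible sketch, assuming `pandas` (with an HTML parser such as `lxml`) is installed; this helper is illustrative and not part of this commit:

```python
from io import StringIO

import pandas as pd


def tables_to_dataframes(table_htmls: list[str]) -> list[pd.DataFrame]:
    """Convert raw <table> HTML strings (as returned by get_tables) into DataFrames."""
    frames = []
    for table_html in table_htmls:
        # read_html returns a list of DataFrames for each HTML fragment it parses.
        frames.extend(pd.read_html(StringIO(table_html)))
    return frames
```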