From 4a3ba3ff061f5033dbbcbf3b36299749e06c61bb Mon Sep 17 00:00:00 2001
From: Nehanth
Date: Sun, 15 Sep 2024 12:08:29 -0500
Subject: [PATCH 1/3] first

---
 requirements.txt  |   2 +
 scrape_website.py | 131 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 scrape_website.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0125311
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+firecrawl-py
+python-dotenv
\ No newline at end of file
diff --git a/scrape_website.py b/scrape_website.py
new file mode 100644
index 0000000..3d75839
--- /dev/null
+++ b/scrape_website.py
@@ -0,0 +1,131 @@
+import os
+import io
+from dotenv import load_dotenv
+from contextlib import redirect_stdout
+from firecrawl import FirecrawlApp
+
+# Load environment variables from .env file
+load_dotenv()
+
+api_key = os.getenv('FIRECRAWL_API_KEY')
+if not api_key:
+    raise ValueError("API key not found. Please set the FIRECRAWL_API_KEY environment variable.")
+
+app = FirecrawlApp(api_key=api_key)
+
+def map_website(base_url):
+    print(f"Mapping {base_url} for available links...")
+    try:
+        # Capture and suppress the output from app.map_url
+        captured_output = io.StringIO()
+        with redirect_stdout(captured_output):
+            response = app.map_url(base_url)
+
+        # Check if the response was successful and contains links
+        if isinstance(response, dict) and response.get('success') and 'links' in response:
+            links = response['links']
+        elif isinstance(response, list):
+            links = response
+        else:
+            print("API call was not successful or didn't return links in the expected format.")
+            return []
+
+        if links:
+            print(f"Found {len(links)} links.")
+            return links
+        else:
+            print("No links found in the response.")
+            return []
+    except Exception as e:
+        print(f"An error occurred during mapping: {str(e)}")
+        return []
+
+# Step 2: Allow user to select which URL they want to scrape
+def choose_url(links):
+    if not links:
+        print("No links available to choose from.")
+        return None
+
+    def display_links(start, end):
+        print("\nAvailable URLs:")
+        print("--------------------")
+        for idx, link in enumerate(links[start:end], start=start+1):
+            print(f"{idx:3}. {link}")
+        print("--------------------")
+
+    start = 0
+    page_size = 5
+    total_links = len(links)
+
+    while True:
+        display_links(start, start + page_size)
+
+        if start + page_size < total_links:
+            print("\nOptions:")
+            print("- Enter a number to select a URL")
+            print("- Type 'more' to see the next 5 links")
+            print("- Type 'all' to see all links")
+            print("- Type 'exit' to quit")
+        else:
+            print("\nOptions:")
+            print("- Enter a number to select a URL")
+            print("- Type 'exit' to quit")
+
+        choice = input("\nYour choice: ").lower().strip()
+
+        if choice == 'exit':
+            print("Exiting.")
+            return None
+        elif choice == 'more' and start + page_size < total_links:
+            start += page_size
+        elif choice == 'all':
+            display_links(0, total_links)
+        else:
+            try:
+                index = int(choice) - 1
+                if 0 <= index < total_links:
+                    return links[index]
+                else:
+                    print(f"Invalid choice. Please choose a number between 1 and {total_links}.")
+            except ValueError:
+                print("Invalid input. Please enter a valid option.")
Please enter a valid option.") + + print() # Add a blank line for better readability + +# Step 3: Scrape the chosen URL for markdown content +def scrape_url(url): + print(f"Scraping content from {url}...") + try: + response = app.scrape_url(url=url, params={ + 'formats': ['markdown'], + }) + + if isinstance(response, dict) and 'markdown' in response: + # Extract the markdown content + markdown_content = response['markdown'] + # Save the markdown content to a new file + output_path = 'scraped_content.md' + with open(output_path, 'w', encoding='utf-8') as markdown_file: + markdown_file.write(markdown_content) + print(f"Markdown content saved to {output_path}") + else: + print("Scraping failed or markdown content is not present.") + except Exception as e: + print(f"An error occurred during scraping: {str(e)}") + +# Main process +if __name__ == "__main__": + # Step 1: Input the base URL to map the website + base_url = input("Enter the website URL: ") + + # Step 2: Map the website and get the list of available links + links = map_website(base_url) + + # Step 3: If links are available, allow the user to choose one to scrape + if links: + selected_url = choose_url(links) + # Step 4: Scrape the selected URL for markdown content + if selected_url: + scrape_url(selected_url) + else: + print("No links found or available to scrape.") From f33d728f0c7e52334d2cfe997a7fab5459c8168d Mon Sep 17 00:00:00 2001 From: Nehanth Date: Sun, 15 Sep 2024 12:22:09 -0500 Subject: [PATCH 2/3] updated readme --- README.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7b388f2..b4d5e67 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,71 @@ -# firecrawl-experiment -Experiment to use firecrawl to enable website ingestion +# Firecrawl Experiment + +This project is an experiment using the Firecrawl API to map and scrape websites. It maps a given website to get all subpage links, allows the user to choose a URL, and scrapes the content, saving it in markdown format. + +## Features + +- Map a website to retrieve all subpage links. +- Interactive CLI to select a URL for scraping. +- Save scraped content as a Markdown file. + +## Prerequisites + +- Python 3.x +- Firecrawl API Key + - To obtain an API key, visit https://www.firecrawl.dev and sign up for an account. +## Installation + +1. Clone this repository: + ```bash + git clone https://github.com/instructlab/firecrawl-experiment.git + + cd firecrawl-experiment + ``` + +2. Create a `.env` file in the root directory: + ```bash + touch .env + ``` + Add your Firecrawl API key to the `.env` file: + ``` + FIRECRAWL_API_KEY=your_api_key_here + ``` + +3. Install the required dependencies: + ```bash + pip install -r requirements.txt + ``` + +## Usage + +1. Run the script: + ```bash + python scrape_website.py + ``` +2. Enter the base URL of the website you want to scrape. +3. Select a URL from the list of available links. +4. The content will be scraped and saved as `scraped_content.md`. + +## Example + +```bash +Enter the website URL: https://www.example.com +Mapping https://www.example.com for available links... +Found 20 links. +Available URLs: +-------------------- +1. https://www.example.com +2. https://www.example.com/about +3. https://www.example.com/contact +... + +Options: +- Enter a number to select a URL +- Type 'more' to see the next 5 links +- Type 'all' to see all links +- Type 'exit' to quit + +Your choice: 2 +Scraping content from https://www.example.com/about... 
+Make sure your local Firecrawl instance is running before using the `scrape_website_local.py` script.
\ No newline at end of file
diff --git a/scrape_website_local.py b/scrape_website_local.py
new file mode 100644
index 0000000..9225bf5
--- /dev/null
+++ b/scrape_website_local.py
@@ -0,0 +1,104 @@
+import requests
+
+# Define the endpoints
+MAP_URL = "http://localhost:3002/v1/map"
+CRAWL_URL = "http://localhost:3002/v1/scrape"
+
+def map_website(base_url):
+    print(f"Mapping {base_url} for available links...")
+    try:
+        response = requests.post(MAP_URL, json={'url': base_url})
+        response.raise_for_status()  # Check for request errors
+        result = response.json()
+        links = result.get('links', [])
+        if links:
+            print(f"Found {len(links)} links.")
+            return links
+        else:
+            print("No links found in the response.")
+            return []
+    except Exception as e:
+        print(f"An error occurred during mapping: {str(e)}")
+        return []
+
+def choose_url(links):
+    if not links:
+        print("No links available to choose from.")
+        return None
+
+    def display_links(start, end):
+        print("\nAvailable URLs:")
+        print("--------------------")
+        for idx, link in enumerate(links[start:end], start=start+1):
+            print(f"{idx:3}. {link}")
+        print("--------------------")
+
+    start = 0
+    page_size = 5
+    total_links = len(links)
+
+    while True:
+        display_links(start, start + page_size)
+
+        if start + page_size < total_links:
+            print("\nOptions:")
+            print("- Enter a number to select a URL")
+            print("- Type 'more' to see the next 5 links")
+            print("- Type 'all' to see all links")
+            print("- Type 'exit' to quit")
+        else:
+            print("\nOptions:")
+            print("- Enter a number to select a URL")
+            print("- Type 'exit' to quit")
+
+        choice = input("\nYour choice: ").lower().strip()
+
+        if choice == 'exit':
+            print("Exiting.")
+            return None
+        elif choice == 'more' and start + page_size < total_links:
+            start += page_size
+        elif choice == 'all':
+            display_links(0, total_links)
+        else:
+            try:
+                index = int(choice) - 1
+                if 0 <= index < total_links:
+                    return links[index]
+                else:
+                    print(f"Invalid choice. Please choose a number between 1 and {total_links}.")
+            except ValueError:
+                print("Invalid input. Please enter a valid option.")
+
+        print()  # Add a blank line for better readability
+
+def scrape_url(url):
+    print(f"Scraping content from {url}...")
+    try:
+        response = requests.post(CRAWL_URL, json={'url': url, 'formats': ['markdown']})
+        response.raise_for_status()  # Check for request errors
+        result = response.json()
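+        # Assumption: the local v1 scrape endpoint nests its result under 'data',
+        # e.g. {"data": {"markdown": "..."}}; adjust the lookup if your Firecrawl version differs.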
{link}") + print("--------------------") + + start = 0 + page_size = 5 + total_links = len(links) + + while True: + display_links(start, start + page_size) + + if start + page_size < total_links: + print("\nOptions:") + print("- Enter a number to select a URL") + print("- Type 'more' to see the next 5 links") + print("- Type 'all' to see all links") + print("- Type 'exit' to quit") + else: + print("\nOptions:") + print("- Enter a number to select a URL") + print("- Type 'exit' to quit") + + choice = input("\nYour choice: ").lower().strip() + + if choice == 'exit': + print("Exiting.") + return None + elif choice == 'more' and start + page_size < total_links: + start += page_size + elif choice == 'all': + display_links(0, total_links) + else: + try: + index = int(choice) - 1 + if 0 <= index < total_links: + return links[index] + else: + print(f"Invalid choice. Please choose a number between 1 and {total_links}.") + except ValueError: + print("Invalid input. Please enter a valid option.") + + print() # Add a blank line for better readability + +def scrape_url(url): + print(f"Scraping content from {url}...") + try: + response = requests.post(CRAWL_URL, json={'url': url, 'formats': ['markdown']}) + response.raise_for_status() # Check for request errors + result = response.json() + markdown_content = result.get('data', {}).get('markdown', '') + if markdown_content: + output_path = 'scraped_content.md' + with open(output_path, 'w', encoding='utf-8') as markdown_file: + markdown_file.write(markdown_content) + print(f"Markdown content saved to {output_path}") + else: + print("No markdown content found.") + except Exception as e: + print(f"An error occurred during scraping: {str(e)}") + +# Main process +if __name__ == "__main__": + base_url = input("Enter the website URL: ") + links = map_website(base_url) + + if links: + selected_url = choose_url(links) + if selected_url: + scrape_url(selected_url) + else: + print("No links found or available to scrape.")