Firstversion #1

Open · wants to merge 3 commits into main
90 changes: 88 additions & 2 deletions README.md
@@ -1,2 +1,88 @@
# firecrawl-experiment
Experiment to use firecrawl to enable website ingestion
# Firecrawl Experiment

This project is an experiment that uses the Firecrawl API to map and scrape websites. It maps a given website to collect all subpage links, lets the user choose one of those URLs, scrapes its content, and saves the result in Markdown format.
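
Under the hood, the workflow boils down to two Firecrawl calls: one to map the site and one to scrape the chosen page. Below is a minimal sketch of those calls, mirroring what `scrape_website.py` does in full; it assumes `firecrawl-py` is installed and `FIRECRAWL_API_KEY` is set in the environment.

```python
import os
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

# Map the site; depending on the library version the response is either a
# list of links or a dict with a 'links' key (the script handles both).
response = app.map_url("https://www.example.com")
links = response["links"] if isinstance(response, dict) else response

# Scrape one of the discovered pages and print the start of its Markdown.
page = app.scrape_url(url=links[0], params={"formats": ["markdown"]})
print(page.get("markdown", "")[:200])
```

The full script adds interactive URL selection, output suppression, and error handling around these two calls.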

## Features

- Map a website to retrieve all subpage links.
- Interactive CLI to select a URL for scraping.
- Save scraped content as a Markdown file.

## Prerequisites

- Python 3.x
- A Firecrawl API key (sign up for an account at https://www.firecrawl.dev to obtain one)

## Installation

1. Clone this repository:
```bash
git clone https://github.com/instructlab/firecrawl-experiment.git
cd firecrawl-experiment
```

2. Create a `.env` file in the root directory:
```bash
touch .env
```
Add your Firecrawl API key to the `.env` file (the scripts load it at startup; see the snippet after these steps):
```
FIRECRAWL_API_KEY=your_api_key_here
```

3. Install the required dependencies:
```bash
pip install -r requirements.txt
```
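
As mentioned in step 2, the API key is read at startup with `python-dotenv`; this is the lookup `scrape_website.py` performs:

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads FIRECRAWL_API_KEY from the .env file
api_key = os.getenv('FIRECRAWL_API_KEY')
if not api_key:
    raise ValueError("API key not found. Please set the FIRECRAWL_API_KEY environment variable.")
```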

## Usage

1. Run the script:
```bash
python scrape_website.py
```

2. Enter the base URL of the website you want to scrape.

3. Select a URL from the list of available links.

4. The content will be scraped and saved as `scraped_content.md`.

## Example

```bash
Enter the website URL: https://www.example.com
Mapping https://www.example.com for available links...
Found 20 links.
Available URLs:
--------------------
1. https://www.example.com
2. https://www.example.com/about
3. https://www.example.com/contact
...
Options:
- Enter a number to select a URL
- Type 'more' to see the next 5 links
- Type 'all' to see all links
- Type 'exit' to quit
Your choice: 2
Scraping content from https://www.example.com/about...
Markdown content saved to scraped_content.md
```

## Local Firecrawl Option

Instead of using the hosted Firecrawl API, you can run Firecrawl locally. To do this:

1. Clone the Firecrawl repository:
```bash
git clone https://github.com/mendableai/firecrawl.git
```

2. Follow the instructions in the Firecrawl README to set up and run Firecrawl locally:
https://github.com/mendableai/firecrawl/blob/main/README.md

3. Once Firecrawl is set up locally, use the `scrape_website_local.py` script instead of `scrape_website.py`.

Make sure your local Firecrawl instance is running before using the `scrape_website_local.py` script.
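
A quick way to confirm the local instance is reachable is to call the same map endpoint the script uses; the port and payload below are taken from `scrape_website_local.py`.

```python
import requests

# Ask the local Firecrawl instance to map a site; the response is expected
# to contain a 'links' list, as in scrape_website_local.py.
resp = requests.post("http://localhost:3002/v1/map", json={"url": "https://www.example.com"})
resp.raise_for_status()
print(resp.json().get("links", [])[:5])
```
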
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
firecrawl-py
python-dotenv
requests
131 changes: 131 additions & 0 deletions scrape_website.py
@@ -0,0 +1,131 @@
import os
import io
from dotenv import load_dotenv
from contextlib import redirect_stdout
from firecrawl import FirecrawlApp

# Load environment variables from .env file
load_dotenv()

api_key = os.getenv('FIRECRAWL_API_KEY')
if not api_key:
    raise ValueError("API key not found. Please set the FIRECRAWL_API_KEY environment variable.")

app = FirecrawlApp(api_key=api_key)

def map_website(base_url):
    print(f"Mapping {base_url} for available links...")
    try:
        # Capture and suppress the output from app.map_url
        captured_output = io.StringIO()
        with redirect_stdout(captured_output):
            response = app.map_url(base_url)

        # Check if the response was successful and contains links
        if isinstance(response, dict) and response.get('success') and 'links' in response:
            links = response['links']
        elif isinstance(response, list):
            links = response
        else:
            print("API call was not successful or didn't return links in the expected format.")
            return []

        if links:
            print(f"Found {len(links)} links.")
            return links
        else:
            print("No links found in the response.")
            return []
    except Exception as e:
        print(f"An error occurred during mapping: {str(e)}")
        return []

# Step 2: Allow user to select which URL they want to scrape
def choose_url(links):
    if not links:
        print("No links available to choose from.")
        return None

    def display_links(start, end):
        print("\nAvailable URLs:")
        print("--------------------")
        for idx, link in enumerate(links[start:end], start=start+1):
            print(f"{idx:3}. {link}")
        print("--------------------")

    start = 0
    page_size = 5
    total_links = len(links)

    while True:
        display_links(start, start + page_size)

        if start + page_size < total_links:
            print("\nOptions:")
            print("- Enter a number to select a URL")
            print("- Type 'more' to see the next 5 links")
            print("- Type 'all' to see all links")
            print("- Type 'exit' to quit")
        else:
            print("\nOptions:")
            print("- Enter a number to select a URL")
            print("- Type 'exit' to quit")

        choice = input("\nYour choice: ").lower().strip()

        if choice == 'exit':
            print("Exiting.")
            return None
        elif choice == 'more' and start + page_size < total_links:
            start += page_size
        elif choice == 'all':
            display_links(0, total_links)
        else:
            try:
                index = int(choice) - 1
                if 0 <= index < total_links:
                    return links[index]
                else:
                    print(f"Invalid choice. Please choose a number between 1 and {total_links}.")
            except ValueError:
                print("Invalid input. Please enter a valid option.")

        print()  # Add a blank line for better readability

# Step 3: Scrape the chosen URL for markdown content
def scrape_url(url):
    print(f"Scraping content from {url}...")
    try:
        response = app.scrape_url(url=url, params={
            'formats': ['markdown'],
        })

        if isinstance(response, dict) and 'markdown' in response:
            # Extract the markdown content
            markdown_content = response['markdown']
            # Save the markdown content to a new file
            output_path = 'scraped_content.md'
            with open(output_path, 'w', encoding='utf-8') as markdown_file:
                markdown_file.write(markdown_content)
            print(f"Markdown content saved to {output_path}")
        else:
            print("Scraping failed or markdown content is not present.")
    except Exception as e:
        print(f"An error occurred during scraping: {str(e)}")

# Main process
if __name__ == "__main__":
    # Step 1: Input the base URL to map the website
    base_url = input("Enter the website URL: ")

    # Step 2: Map the website and get the list of available links
    links = map_website(base_url)

    # Step 3: If links are available, allow the user to choose one to scrape
    if links:
        selected_url = choose_url(links)
        # Step 4: Scrape the selected URL for markdown content
        if selected_url:
            scrape_url(selected_url)
    else:
        print("No links found or available to scrape.")
102 changes: 102 additions & 0 deletions scrape_website_local.py
@@ -0,0 +1,102 @@
import requests

# Define the endpoints
MAP_URL = "http://localhost:3002/v1/map"
CRAWL_URL = "http://localhost:3002/v1/scrape"

def map_website(base_url):
    print(f"Mapping {base_url} for available links...")
    try:
        response = requests.post(MAP_URL, json={'url': base_url})
        response.raise_for_status()  # Check for request errors
        result = response.json()
        links = result.get('links', [])
        if links:
            print(f"Found {len(links)} links.")
            return links
        else:
            print("No links found in the response.")
            return []
    except Exception as e:
        print(f"An error occurred during mapping: {str(e)}")
        return []

def choose_url(links):
    if not links:
        print("No links available to choose from.")
        return None

    def display_links(start, end):
        print("\nAvailable URLs:")
        print("--------------------")
        for idx, link in enumerate(links[start:end], start=start+1):
            print(f"{idx:3}. {link}")
        print("--------------------")

    start = 0
    page_size = 5
    total_links = len(links)

    while True:
        display_links(start, start + page_size)

        if start + page_size < total_links:
            print("\nOptions:")
            print("- Enter a number to select a URL")
            print("- Type 'more' to see the next 5 links")
            print("- Type 'all' to see all links")
            print("- Type 'exit' to quit")
        else:
            print("\nOptions:")
            print("- Enter a number to select a URL")
            print("- Type 'exit' to quit")

        choice = input("\nYour choice: ").lower().strip()

        if choice == 'exit':
            print("Exiting.")
            return None
        elif choice == 'more' and start + page_size < total_links:
            start += page_size
        elif choice == 'all':
            display_links(0, total_links)
        else:
            try:
                index = int(choice) - 1
                if 0 <= index < total_links:
                    return links[index]
                else:
                    print(f"Invalid choice. Please choose a number between 1 and {total_links}.")
            except ValueError:
                print("Invalid input. Please enter a valid option.")

        print()  # Add a blank line for better readability

def scrape_url(url):
    print(f"Scraping content from {url}...")
    try:
        response = requests.post(CRAWL_URL, json={'url': url, 'formats': ['markdown']})
        response.raise_for_status()  # Check for request errors
        result = response.json()
        markdown_content = result.get('data', {}).get('markdown', '')
        if markdown_content:
            output_path = 'scraped_content.md'
            with open(output_path, 'w', encoding='utf-8') as markdown_file:
                markdown_file.write(markdown_content)
            print(f"Markdown content saved to {output_path}")
        else:
            print("No markdown content found.")
    except Exception as e:
        print(f"An error occurred during scraping: {str(e)}")

# Main process
if __name__ == "__main__":
    base_url = input("Enter the website URL: ")
    links = map_website(base_url)

    if links:
        selected_url = choose_url(links)
        if selected_url:
            scrape_url(selected_url)
    else:
        print("No links found or available to scrape.")