From 4a3ba3ff061f5033dbbcbf3b36299749e06c61bb Mon Sep 17 00:00:00 2001
From: Nehanth
Date: Sun, 15 Sep 2024 12:08:29 -0500
Subject: [PATCH 1/3] first

---
 requirements.txt  |   2 +
 scrape_website.py | 131 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 scrape_website.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0125311
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+firecrawl-py
+python-dotenv
\ No newline at end of file
diff --git a/scrape_website.py b/scrape_website.py
new file mode 100644
index 0000000..3d75839
--- /dev/null
+++ b/scrape_website.py
@@ -0,0 +1,131 @@
+import os
+import io
+from dotenv import load_dotenv
+from contextlib import redirect_stdout
+from firecrawl import FirecrawlApp
+
+# Load environment variables from .env file
+load_dotenv()
+
+api_key = os.getenv('FIRECRAWL_API_KEY')
+if not api_key:
+    raise ValueError("API key not found. Please set the FIRECRAWL_API_KEY environment variable.")
+
+app = FirecrawlApp(api_key=api_key)
+
+def map_website(base_url):
+    print(f"Mapping {base_url} for available links...")
+    try:
+        # Capture and suppress the output from app.map_url
+        captured_output = io.StringIO()
+        with redirect_stdout(captured_output):
+            response = app.map_url(base_url)
+
+        # Check if the response was successful and contains links
+        if isinstance(response, dict) and response.get('success') and 'links' in response:
+            links = response['links']
+        elif isinstance(response, list):
+            links = response
+        else:
+            print("API call was not successful or didn't return links in the expected format.")
+            return []
+
+        if links:
+            print(f"Found {len(links)} links.")
+            return links
+        else:
+            print("No links found in the response.")
+            return []
+    except Exception as e:
+        print(f"An error occurred during mapping: {str(e)}")
+        return []
+
+# Step 2: Allow user to select which URL they want to scrape
+def choose_url(links):
+    if not links:
+        print("No links available to choose from.")
+        return None
+
+    def display_links(start, end):
+        print("\nAvailable URLs:")
+        print("--------------------")
+        for idx, link in enumerate(links[start:end], start=start+1):
+            print(f"{idx:3}. {link}")
+        print("--------------------")
+
+    start = 0
+    page_size = 5
+    total_links = len(links)
+
+    while True:
+        display_links(start, start + page_size)
+
+        if start + page_size < total_links:
+            print("\nOptions:")
+            print("- Enter a number to select a URL")
+            print("- Type 'more' to see the next 5 links")
+            print("- Type 'all' to see all links")
+            print("- Type 'exit' to quit")
+        else:
+            print("\nOptions:")
+            print("- Enter a number to select a URL")
+            print("- Type 'exit' to quit")
+
+        choice = input("\nYour choice: ").lower().strip()
+
+        if choice == 'exit':
+            print("Exiting.")
+            return None
+        elif choice == 'more' and start + page_size < total_links:
+            start += page_size
+        elif choice == 'all':
+            display_links(0, total_links)
+        else:
+            try:
+                index = int(choice) - 1
+                if 0 <= index < total_links:
+                    return links[index]
+                else:
+                    print(f"Invalid choice. Please choose a number between 1 and {total_links}.")
+            except ValueError:
+                print("Invalid input. Please enter a valid option.")
Please enter a valid option.") + + print() # Add a blank line for better readability + +# Step 3: Scrape the chosen URL for markdown content +def scrape_url(url): + print(f"Scraping content from {url}...") + try: + response = app.scrape_url(url=url, params={ + 'formats': ['markdown'], + }) + + if isinstance(response, dict) and 'markdown' in response: + # Extract the markdown content + markdown_content = response['markdown'] + # Save the markdown content to a new file + output_path = 'scraped_content.md' + with open(output_path, 'w', encoding='utf-8') as markdown_file: + markdown_file.write(markdown_content) + print(f"Markdown content saved to {output_path}") + else: + print("Scraping failed or markdown content is not present.") + except Exception as e: + print(f"An error occurred during scraping: {str(e)}") + +# Main process +if __name__ == "__main__": + # Step 1: Input the base URL to map the website + base_url = input("Enter the website URL: ") + + # Step 2: Map the website and get the list of available links + links = map_website(base_url) + + # Step 3: If links are available, allow the user to choose one to scrape + if links: + selected_url = choose_url(links) + # Step 4: Scrape the selected URL for markdown content + if selected_url: + scrape_url(selected_url) + else: + print("No links found or available to scrape.") From f33d728f0c7e52334d2cfe997a7fab5459c8168d Mon Sep 17 00:00:00 2001 From: Nehanth Date: Sun, 15 Sep 2024 12:22:09 -0500 Subject: [PATCH 2/3] updated readme --- README.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7b388f2..b4d5e67 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,71 @@ -# firecrawl-experiment -Experiment to use firecrawl to enable website ingestion +# Firecrawl Experiment + +This project is an experiment using the Firecrawl API to map and scrape websites. It maps a given website to get all subpage links, allows the user to choose a URL, and scrapes the content, saving it in markdown format. + +## Features + +- Map a website to retrieve all subpage links. +- Interactive CLI to select a URL for scraping. +- Save scraped content as a Markdown file. + +## Prerequisites + +- Python 3.x +- Firecrawl API Key + - To obtain an API key, visit https://www.firecrawl.dev and sign up for an account. +## Installation + +1. Clone this repository: + ```bash + git clone https://github.com/instructlab/firecrawl-experiment.git + + cd firecrawl-experiment + ``` + +2. Create a `.env` file in the root directory: + ```bash + touch .env + ``` + Add your Firecrawl API key to the `.env` file: + ``` + FIRECRAWL_API_KEY=your_api_key_here + ``` + +3. Install the required dependencies: + ```bash + pip install -r requirements.txt + ``` + +## Usage + +1. Run the script: + ```bash + python scrape_website.py + ``` +2. Enter the base URL of the website you want to scrape. +3. Select a URL from the list of available links. +4. The content will be scraped and saved as `scraped_content.md`. + +## Example + +```bash +Enter the website URL: https://www.example.com +Mapping https://www.example.com for available links... +Found 20 links. +Available URLs: +-------------------- +1. https://www.example.com +2. https://www.example.com/about +3. https://www.example.com/contact +... + +Options: +- Enter a number to select a URL +- Type 'more' to see the next 5 links +- Type 'all' to see all links +- Type 'exit' to quit + +Your choice: 2 +Scraping content from https://www.example.com/about... 
+Make sure your local Firecrawl instance is running before using the `scrape_website_local.py` script.
\ No newline at end of file
diff --git a/scrape_website_local.py b/scrape_website_local.py
new file mode 100644
index 0000000..9225bf5
--- /dev/null
+++ b/scrape_website_local.py
@@ -0,0 +1,104 @@
+import requests
+
+# Define the endpoints
+MAP_URL = "http://localhost:3002/v1/map"
+CRAWL_URL = "http://localhost:3002/v1/scrape"
+
+def map_website(base_url):
+    print(f"Mapping {base_url} for available links...")
+    try:
+        response = requests.post(MAP_URL, json={'url': base_url})
+        response.raise_for_status()  # Check for request errors
+        result = response.json()
+        links = result.get('links', [])
+        if links:
+            print(f"Found {len(links)} links.")
+            return links
+        else:
+            print("No links found in the response.")
+            return []
+    except Exception as e:
+        print(f"An error occurred during mapping: {str(e)}")
+        return []
+
+def choose_url(links):
+    if not links:
+        print("No links available to choose from.")
+        return None
+
+    def display_links(start, end):
+        print("\nAvailable URLs:")
+        print("--------------------")
+        for idx, link in enumerate(links[start:end], start=start+1):
+            print(f"{idx:3}. {link}")
+        print("--------------------")
+
+    start = 0
+    page_size = 5
+    total_links = len(links)
+
+    while True:
+        display_links(start, start + page_size)
+
+        if start + page_size < total_links:
+            print("\nOptions:")
+            print("- Enter a number to select a URL")
+            print("- Type 'more' to see the next 5 links")
+            print("- Type 'all' to see all links")
+            print("- Type 'exit' to quit")
+        else:
+            print("\nOptions:")
+            print("- Enter a number to select a URL")
+            print("- Type 'exit' to quit")
+
+        choice = input("\nYour choice: ").lower().strip()
+
+        if choice == 'exit':
+            print("Exiting.")
+            return None
+        elif choice == 'more' and start + page_size < total_links:
+            start += page_size
+        elif choice == 'all':
+            display_links(0, total_links)
+        else:
+            try:
+                index = int(choice) - 1
+                if 0 <= index < total_links:
+                    return links[index]
+                else:
+                    print(f"Invalid choice. Please choose a number between 1 and {total_links}.")
+            except ValueError:
+                print("Invalid input. Please enter a valid option.")
+
+        print()  # Add a blank line for better readability
+
+def scrape_url(url):
+    print(f"Scraping content from {url}...")
+    try:
+        response = requests.post(CRAWL_URL, json={'url': url, 'formats': ['markdown']})
+        response.raise_for_status()  # Check for request errors
+        result = response.json()
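+        # Assumption: the local v1 scrape endpoint nests its result under 'data',
+        # e.g. {"data": {"markdown": "..."}}; adjust the lookup if your Firecrawl version differs.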
{link}") + print("--------------------") + + start = 0 + page_size = 5 + total_links = len(links) + + while True: + display_links(start, start + page_size) + + if start + page_size < total_links: + print("\nOptions:") + print("- Enter a number to select a URL") + print("- Type 'more' to see the next 5 links") + print("- Type 'all' to see all links") + print("- Type 'exit' to quit") + else: + print("\nOptions:") + print("- Enter a number to select a URL") + print("- Type 'exit' to quit") + + choice = input("\nYour choice: ").lower().strip() + + if choice == 'exit': + print("Exiting.") + return None + elif choice == 'more' and start + page_size < total_links: + start += page_size + elif choice == 'all': + display_links(0, total_links) + else: + try: + index = int(choice) - 1 + if 0 <= index < total_links: + return links[index] + else: + print(f"Invalid choice. Please choose a number between 1 and {total_links}.") + except ValueError: + print("Invalid input. Please enter a valid option.") + + print() # Add a blank line for better readability + +def scrape_url(url): + print(f"Scraping content from {url}...") + try: + response = requests.post(CRAWL_URL, json={'url': url, 'formats': ['markdown']}) + response.raise_for_status() # Check for request errors + result = response.json() + markdown_content = result.get('data', {}).get('markdown', '') + if markdown_content: + output_path = 'scraped_content.md' + with open(output_path, 'w', encoding='utf-8') as markdown_file: + markdown_file.write(markdown_content) + print(f"Markdown content saved to {output_path}") + else: + print("No markdown content found.") + except Exception as e: + print(f"An error occurred during scraping: {str(e)}") + +# Main process +if __name__ == "__main__": + base_url = input("Enter the website URL: ") + links = map_website(base_url) + + if links: + selected_url = choose_url(links) + if selected_url: + scrape_url(selected_url) + else: + print("No links found or available to scrape.")