
Commit

feat(crawl_terraform): Add Crawl Terraform (#87)
* crawl(terraform): crawl terraform docs - first part

* crawl(terraform): add crawl for more info

* feat(crawl_terraform): crawl terraform docs

* feat(crawl_terraform): add readme

* feat(crawl_terraform): add default csv file
rfa447eh authored Nov 22, 2024
1 parent 2a365ee commit 6a2c668
Showing 4 changed files with 237 additions and 0 deletions.
105 changes: 105 additions & 0 deletions crawl/content_parser.py
@@ -0,0 +1,105 @@
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


class WebContentParser:
    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/50.0.2661.102 Safari/537.36'
            )
        }
        self.session = self._initialize_session()
        self.main_response = None
        self.all_page_data = []

    def _initialize_session(self):
        """Set up the session with retry strategy."""
        retry_strategy = Retry(
            total=5,
            backoff_factor=8,
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        adapter.max_retries.respect_retry_after_header = False

        session = requests.Session()
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        return session

    def fetch_content(self):
        """Fetch the main content from the URL."""
        try:
            self.main_response = self.session.get(
                self.url, verify=False, timeout=30, headers=self.headers
            )
            print(f'URL fetched: {self.url}')
            return self.main_response
        except requests.RequestException as e:
            print(f"Failed to fetch the URL: {e}")
            return None

    def parse_content(self):
        """Parse the fetched HTML content."""
        if not self.main_response:
            print("No response available to parse.")
            return []

        main_soup = BeautifulSoup(self.main_response.content, 'html.parser')
        datas = main_soup.find('main', {'id': 'main'})
        if not datas:
            print("No 'main' element found.")
            return []

        all_tag = datas.find_all(['h1', 'h2', 'h3', 'p', 'blockquote', 'ul'])
        each_title_data = {}

        for tag in all_tag:
            if tag.name in ['h1', 'h2']:
                if each_title_data:
                    self.all_page_data.append(each_title_data)
                    each_title_data = {}
                each_title_data['metadata'] = tag.text.strip()

            elif tag.name == 'h3':
                if tag.text.strip() == 'Resources':
                    each_title_data[tag.text.strip()] = ''
                else:
                    if each_title_data:
                        self.all_page_data.append(each_title_data)
                        each_title_data = {}
                    each_title_data['metadata'] = tag.text.strip()

            elif tag.name in ['p', 'blockquote']:
                num = len(each_title_data)
                key = f'content {num}'
                if tag.text.strip():
                    each_title_data[key] = tag.text.strip()

            elif tag.name == 'ul':
                text = ' '.join(
                    li.text.strip()
                    for li in tag.find_all('li', {'class': 'mdx-lists_listItem__nkqhg'})
                )
                if 'Resources' in each_title_data:
                    each_title_data['Resources'] = text
                else:
                    num = len(each_title_data)
                    key = f'content {num}'
                    if text:
                        each_title_data[key] = text

        if each_title_data:
            self.all_page_data.append(each_title_data)

        return self.all_page_data

    def get_data(self):
        """Main method to fetch and parse content."""
        self.fetch_content()
        return self.parse_content()
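
For reference, a minimal usage sketch of `WebContentParser` (the URL and the printed summary are illustrative; the actual keys depend on the headings found on the page):

```python
from content_parser import WebContentParser

# Any docs page with a <main id="main"> element should work; this URL is illustrative.
parser = WebContentParser('https://developer.hashicorp.com/terraform/docs')
sections = parser.get_data()

# Each section dict has a 'metadata' key (the heading text) plus
# numbered 'content N' entries and, where present, a 'Resources' key.
for section in sections:
    print(section.get('metadata', '<no heading>'))
```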

92 changes: 92 additions & 0 deletions crawl/main.py
@@ -0,0 +1,92 @@

import argparse
import csv
import logging
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from content_parser import WebContentParser


def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler()]
    )


def setup_http_session():
    retry_strategy = Retry(
        total=5,
        backoff_factor=8,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    adapter.max_retries.respect_retry_after_header = False
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session


def process_urls(file_path, save_result):
    http = setup_http_session()
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/50.0.2661.102 Safari/537.36'
        )
    }

    with open(file_path, 'r') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if row:  # Check if the row is not empty
                main_url = row[0]
                try:
                    main_response = http.get(main_url, verify=False, timeout=30, headers=headers)
                    logging.info(f'Fetched URL: {main_url}')
                except requests.RequestException as e:
                    logging.error(f"Failed to fetch URL {main_url}: {e}")
                    continue

                main_soup = BeautifulSoup(main_response.content, 'html.parser')
                content_root = main_soup.find('div', {'class': 'marketing-content_root__DE3hU'})
                if not content_root:
                    logging.error(f"No marketing content found on page: {main_url}")
                    continue
                products = content_root.find_all('div', {'class': 'card-grid-block_root__yDdm_'})
                logging.info(f'Found {len(products)} products on page: {main_url}')
                all_data = []
                for product in products:
                    # Get org title
                    title = product.find('h2').text
                    sub_content_link = []
                    all_sub_title = product.find_all('li')
                    for res in all_sub_title:
                        sub_part_content = {}
                        sub_part_content['main_title'] = title
                        sub_title = res.find('span', {'class': 'card-title_text__F97Wj'}).get_text()
                        sub_part_content['sub_title'] = sub_title
                        sub_title_link = 'https://developer.hashicorp.com' + res.find('a').attrs['href']
                        sub_part_content['sub_title_link'] = sub_title_link

                        parser = WebContentParser(sub_title_link)
                        data = parser.get_data()
                        sub_part_content['all_data_info'] = data

                        logging.info(f'Parsed content for sub-title: {sub_title}')
                        sub_content_link.append(sub_part_content)
                    all_data.append(sub_content_link)
                if save_result:
                    # Logic to save all_data goes here (e.g., writing to a file or database)
                    logging.info(f'Saving result for: {all_data}')
                else:
                    print(all_data)


def main():
    setup_logging()

    parser = argparse.ArgumentParser(description='Process URLs from a CSV file.')
    parser.add_argument('--csv_path', type=str, default='./urls.csv',
                        help='Path to the CSV file containing URLs')
    # Note: argparse's type=bool treats any non-empty string (including "False") as True,
    # so parse the value explicitly instead.
    parser.add_argument('--save_result',
                        type=lambda v: str(v).lower() in ('true', '1', 'yes'),
                        default=False,
                        help='Flag to indicate if the results should be saved (True/False)')
    args = parser.parse_args()

    process_urls(args.csv_path, args.save_result)


if __name__ == '__main__':
    main()
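
The `save_result` branch is only a placeholder in this commit. A minimal sketch of one way it could persist the crawled data, assuming one JSON file per run (the helper name and file layout are assumptions, not part of the commit):

```python
import json
from pathlib import Path


def save_results(all_data, output_path='results.json'):
    """Append one page's crawled data to a JSON file (illustrative helper, not in the commit)."""
    path = Path(output_path)
    existing = json.loads(path.read_text(encoding='utf-8')) if path.exists() else []
    existing.append(all_data)
    path.write_text(json.dumps(existing, ensure_ascii=False, indent=2), encoding='utf-8')
```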
39 changes: 39 additions & 0 deletions crawl/readme.md
@@ -0,0 +1,39 @@
# Documentation for Web Content Scraper

## Overview
This script scrapes data from a list of URLs provided in a CSV file. For each URL it fetches the page, extracts the product information, and logs the operations performed; the extracted content can optionally be saved. It is built on `requests`, `BeautifulSoup`, and `argparse`, with an HTTP session that retries failed requests.
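
The data printed (or handed to the save step) for each URL is a nested list built by `process_urls` and `WebContentParser`. A rough sketch of its shape, with illustrative values:

```python
# Shape of the data produced for one URL in urls.csv (values are illustrative):
all_data = [
    [  # one inner list per product card found on the landing page
        {
            'main_title': 'Card heading',             # <h2> of the product card
            'sub_title': 'Linked doc page',           # card title text
            'sub_title_link': 'https://developer.hashicorp.com/terraform/docs',
            'all_data_info': [                        # sections parsed by WebContentParser
                {'metadata': 'Section heading', 'content 1': 'First paragraph text'},
            ],
        },
    ],
]
```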

## Prerequisites
Make sure the following Python packages are installed:
- `requests`
- `beautifulsoup4`
- `urllib3`

To install the dependencies, run the following command (`urllib3` is installed automatically as a dependency of `requests`):
```sh
pip install requests beautifulsoup4
```
## How to Use

### Arguments
The script accepts the following command-line arguments:
- `--csv_path`: Path to the CSV file containing URLs to scrape. Defaults to `./urls.csv`.
- `--save_result`: Whether to save the scraped results (`True`/`False`). Defaults to `False`.
## Running the Script
You can run the script by using the following command:

```sh
python main.py --csv_path <path_to_csv> --save_result <True/False>
```
For example:
```sh
python main.py --csv_path ./urls.csv --save_result True
```
## CSV File Format
The CSV file should contain a list of URLs, one URL per line. For example:
```
https://example.com/page1
https://example.com/page2
```

1 change: 1 addition & 0 deletions crawl/urls.csv
@@ -0,0 +1 @@
https://developer.hashicorp.com/terraform/docs
