feat(crawl_Terraform) #87

Merged (6 commits, Nov 22, 2024)
105 changes: 105 additions & 0 deletions crawl/content_parser.py
@@ -0,0 +1,105 @@
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class WebContentParser:
    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/50.0.2661.102 Safari/537.36'
            )
        }
        self.session = self._initialize_session()
        self.main_response = None
        self.all_page_data = []

    def _initialize_session(self):
        """Set up the session with a retry strategy."""
        retry_strategy = Retry(
            total=5,
            backoff_factor=8,
            respect_retry_after_header=False,
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)

        session = requests.Session()
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        return session

    def fetch_content(self):
        """Fetch the main content from the URL."""
        try:
            # verify=False skips TLS certificate checks (kept from the original design).
            self.main_response = self.session.get(
                self.url, verify=False, timeout=30, headers=self.headers
            )
            print(f'URL fetched: {self.url}')
            return self.main_response
        except requests.RequestException as e:
            print(f"Failed to fetch the URL: {e}")
            return None

    def parse_content(self):
        """Parse the fetched HTML content."""
        if not self.main_response:
            print("No response available to parse.")
            return []

        main_soup = BeautifulSoup(self.main_response.content, 'html.parser')
        main_element = main_soup.find('main', {'id': 'main'})
        if not main_element:
            print("No 'main' element found.")
            return []

        all_tags = main_element.find_all(['h1', 'h2', 'h3', 'p', 'blockquote', 'ul'])
        each_title_data = {}

        for tag in all_tags:
            if tag.name in ['h1', 'h2']:
                # A new heading starts a new section; flush the previous one first.
                if each_title_data:
                    self.all_page_data.append(each_title_data)
                    each_title_data = {}
                each_title_data['metadata'] = tag.text.strip()

            elif tag.name == 'h3':
                if tag.text.strip() == 'Resources':
                    each_title_data[tag.text.strip()] = ''
                else:
                    if each_title_data:
                        self.all_page_data.append(each_title_data)
                        each_title_data = {}
                    each_title_data['metadata'] = tag.text.strip()

            elif tag.name in ['p', 'blockquote']:
                num = len(each_title_data)
                key = f'content {num}'
                if tag.text.strip():
                    each_title_data[key] = tag.text.strip()

            elif tag.name == 'ul':
                text = ' '.join(
                    li.text.strip()
                    for li in tag.find_all('li', {'class': 'mdx-lists_listItem__nkqhg'})
                )
                if 'Resources' in each_title_data:
                    each_title_data['Resources'] = text
                else:
                    num = len(each_title_data)
                    key = f'content {num}'
                    if text:
                        each_title_data[key] = text

        if each_title_data:
            self.all_page_data.append(each_title_data)

        return self.all_page_data

    def get_data(self):
        """Main method to fetch and parse content."""
        self.fetch_content()
        return self.parse_content()

92 changes: 92 additions & 0 deletions crawl/main.py
@@ -0,0 +1,92 @@

import argparse
import csv
import logging

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from content_parser import WebContentParser


def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler()]
    )


def setup_http_session():
    """Build a requests session that retries failed requests."""
    retry_strategy = Retry(
        total=5,
        backoff_factor=8,
        respect_retry_after_header=False,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session


def process_urls(file_path, save_result):
    http = setup_http_session()
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/50.0.2661.102 Safari/537.36'
        )
    }

    with open(file_path, 'r') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if not row:  # Skip empty rows
                continue
            main_url = row[0]
            try:
                main_response = http.get(main_url, verify=False, timeout=30, headers=headers)
                logging.info(f'Fetched URL: {main_url}')
            except requests.RequestException as e:
                logging.error(f"Failed to fetch URL {main_url}: {e}")
                continue

            main_soup = BeautifulSoup(main_response.content, 'html.parser')
            content_root = main_soup.find('div', {'class': 'marketing-content_root__DE3hU'})
            if not content_root:
                logging.warning(f'No content root found on page: {main_url}')
                continue
            products = content_root.find_all('div', {'class': 'card-grid-block_root__yDdm_'})
            logging.info(f'Found {len(products)} products on page: {main_url}')

            all_data = []
            for product in products:
                # Each product block has an h2 title and a list of sub-page cards.
                title = product.find('h2').text
                sub_content_link = []
                all_sub_title = product.find_all('li')
                for res in all_sub_title:
                    sub_part_content = {}
                    sub_part_content['main_title'] = title
                    sub_title = res.find('span', {'class': 'card-title_text__F97Wj'}).get_text()
                    sub_part_content['sub_title'] = sub_title
                    sub_title_link = 'https://developer.hashicorp.com' + res.find('a').attrs['href']
                    sub_part_content['sub_title_link'] = sub_title_link

                    parser = WebContentParser(sub_title_link)
                    data = parser.get_data()
                    sub_part_content['all_data_info'] = data

                    logging.info(f'Parsed content for sub-title: {sub_title}')
                    sub_content_link.append(sub_part_content)
                all_data.append(sub_content_link)

            if save_result:
                # Logic to save all_data goes here (e.g., writing to a file or database)
                logging.info(f'Saving result for: {all_data}')
            else:
                print(all_data)


def main():
    setup_logging()

    parser = argparse.ArgumentParser(description='Process URLs from a CSV file.')
    parser.add_argument('--csv_path', type=str, default='./urls.csv',
                        help='Path to the CSV file containing URLs')
    # argparse's type=bool treats any non-empty string as True, so use a flag instead.
    parser.add_argument('--save_result', action='store_true',
                        help='Save the results instead of printing them')
    args = parser.parse_args()

    process_urls(args.csv_path, args.save_result)


if __name__ == '__main__':
    main()
39 changes: 39 additions & 0 deletions crawl/readme.md
@@ -0,0 +1,39 @@
# Documentation for Web Content Scraper

## Overview
This script scrapes data from a list of URLs provided in a CSV file. It fetches each page, extracts product information, and logs the operations performed. Optionally, the extracted content can be saved. The script is built on `requests`, `BeautifulSoup`, and `argparse`.
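
For a quick look at the moving parts, the parser class added in `crawl/content_parser.py` can also be used on its own. A minimal sketch (the URL is the one shipped in `urls.csv`; run it from the `crawl/` directory so the import resolves):

```python
from content_parser import WebContentParser

# Fetch one documentation page and print the heading of each parsed section.
parser = WebContentParser('https://developer.hashicorp.com/terraform/docs')
for section in parser.get_data():
    print(section.get('metadata', '(no heading)'))
```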

## Prerequisites
Make sure the following Python packages are installed:
- `requests`
- `beautifulsoup4`
- `urllib3` (installed automatically as a dependency of `requests`)

To install the dependencies, run the following command:
```sh
pip install requests beautifulsoup4
```
## How to Use

### Arguments
The script accepts the following command-line arguments:
- `--csv_path`: Path to the CSV file containing the URLs to scrape. Defaults to `./urls.csv`.
- `--save_result`: Pass this flag to save the scraped results; if omitted, the results are printed to stdout. (One possible way to persist them is sketched below.)
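
The saving step itself is left as a placeholder in `process_urls`. A minimal sketch of one way it could be wired in, assuming JSON output; the helper name `save_results` and the `results.json` path are illustrative, not part of this PR:

```python
import json

def save_results(all_data, path='results.json'):
    # Illustrative helper: serialize the scraped structure to a JSON file.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)
```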
## Running the Script
You can run the script with:

```sh
python main.py --csv_path <path_to_csv> [--save_result]
```
For example:
```sh
python main.py --csv_path ./urls.csv --save_result
```
## CSV File Format
The CSV file should contain a list of URLs, with each URL on a new line. Here is an example:
```
https://example.com/page1
https://example.com/page2
```
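
For reference, the structure printed (or saved) for each URL is a list of product groups, each containing one dictionary per sub-page. Roughly, based on the parsers in this PR (all values below are illustrative):

```python
[
    [
        {
            'main_title': 'Product name taken from the h2 heading',
            'sub_title': 'Card title',
            'sub_title_link': 'https://developer.hashicorp.com/terraform/docs',
            'all_data_info': [
                {'metadata': 'Section heading', 'content 1': 'First paragraph of the section'},
                {'metadata': 'Another heading', 'Resources': 'List items joined into one string'},
            ],
        },
    ],
]
```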

1 change: 1 addition & 0 deletions crawl/urls.csv
@@ -0,0 +1 @@
https://developer.hashicorp.com/terraform/docs