demo_crawler.py
import requests
from bs4 import BeautifulSoup
import csv

# URL of the target site's homepage (a public crawler test site stands in
# for the WordPress site here)
url = 'https://www.crawler-test.com/'

# Send a GET request to fetch the page content (with a timeout so a
# stalled connection doesn't hang the script)
response = requests.get(url, timeout=10)

if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract hyperlinks from the page, skipping anchors without an href
    links = [link.get('href') for link in soup.find_all('a') if link.get('href')]

    # Write the extracted data to a CSV file
    csv_filename = 'wp_site_links.csv'
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Links'])  # Write header

        # Write each link to the CSV file
        for link in links:
            writer.writerow([link])

    print(f"Hyperlinks have been extracted and saved to '{csv_filename}'")
else:
    print(f"Failed to fetch the page (status code {response.status_code})")