ScrapeURL.py
import requests
from bs4 import BeautifulSoup
import urllib.robotparser
import time  # for sleep between requests


def check_url(url):
    """
    Checks if a URL is working by sending a GET request with a User-Agent header.

    Args:
        url: The URL to check.

    Returns:
        True if the URL responds with a successful status code, False otherwise.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for unsuccessful status codes
        return True
    except requests.exceptions.RequestException:
        return False
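# A minimal usage sketch for check_url; the URL below is purely illustrative
# and is not referenced anywhere else in this script:
#
#   if check_url("https://example.com/some/document.pdf"):
#       print("link is reachable")
#   else:
#       print("link appears broken or unreachable")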
def find_urls(base_url, max_depth=5, visited_urls=None):
    """
    Crawls a website to find all sub-URLs using links from the HTML, with a maximum depth limit.

    Args:
        base_url: The base URL of the website.
        max_depth: The maximum depth of recursion for crawling (default: 5).
        visited_urls: Set of URLs already seen, shared across recursive calls.

    Returns:
        A list of all discovered sub-URLs.
    """
    all_urls = []
    if visited_urls is None:
        visited_urls = set()  # shared across the recursion so pages are not re-crawled

    robots_parser = urllib.robotparser.RobotFileParser()
    robots_parser.set_url(f"{base_url}/robots.txt")
    robots_parser.read()

    # Fetch the initial response
    response = requests.get(base_url, timeout=10)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
    else:
        print(f"Error retrieving {base_url}")
        return []

    # Extract links from the HTML
    for link in soup.find_all('a', href=True):
        url = link['href']
        # Resolve relative URLs that start with a slash
        if url.startswith('/'):
            url = f"{base_url}{url}"
        elif not url.startswith('http'):
            continue  # Skip non-http links (mailto:, javascript:, fragments, ...)
        # Respect robots.txt and the depth limit
        if robots_parser.can_fetch("*", url) and max_depth > 0:
            if url not in visited_urls:
                visited_urls.add(url)
                print(url)
                all_urls.append(url)
                # Recursively crawl the discovered sub-URL with reduced depth
                sub_urls = find_urls(url, max_depth - 1, visited_urls)
                all_urls.extend(sub_urls)
                time.sleep(0.1)  # Small delay between requests to be polite

    return all_urls
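# Note: the relative-link handling above only covers hrefs that start with "/".
# A more general approach (a sketch, not what find_urls does) would be
# urllib.parse.urljoin, which also resolves paths like "docs/file.pdf" or
# "../file.pdf" against the current page:
#
#   from urllib.parse import urljoin
#   absolute = urljoin(base_url, link['href'])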
def main():
    """
    Crawls a hard-coded base URL with a depth limit and checks which discovered sub-URLs are working.
    """
    base_url = "http://www.efluniversity.ac.in/images/documents/"

    # Find all URLs on the website
    all_urls = find_urls(base_url)

    # Check that each discovered URL actually responds
    working_urls = [url for url in all_urls if check_url(url)]

    print("Found URLs:")
    for url in all_urls:
        print(url)

    print("\nWorking URLs:")
    for url in working_urls:
        print(url)


if __name__ == "__main__":
    main()
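# Example of calling the crawler directly with a different starting point and a
# shallower depth (hypothetical URL, shown only to illustrate the parameters):
#
#   urls = find_urls("https://example.org/docs/", max_depth=2)
#   broken = [u for u in urls if not check_url(u)]
#   print(f"{len(broken)} broken link(s) found")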