CloneWebPage_3.py
from bs4 import BeautifulSoup
import os
from urllib.request import urlretrieve
import urllib.request
import cssutils
import logging

print('''Python script to Clone a Web Page
Author : Sai Kiran Goud
Date : 17 Dec 2019
''')

# Base URL of the website you want to clone.
# Use the site root without index.html:
# e.g. if http://xyz.com/index.html is the page you want to clone,
# set the base URL to http://xyz.com/
baseurl = 'REPLACE THIS'
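# For example (example.com is a placeholder):
#   baseurl = 'http://example.com/'
# Every downloaded file is written relative to the current working
# directory, so run the script from an empty folder.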
# reporthook for urlretrieve: print progress roughly once per megabyte.
# The last printed byte count is kept on the function object so that it
# persists across calls (a plain local would reset on every block).
def report(count, size, total):
    if count == 0:
        report.last_printed = 0
    downloaded = count * size
    if downloaded - report.last_printed > 1000000:
        report.last_printed = downloaded
        print("Downloaded {:,}/{:,} ...".format(downloaded, total))
report.last_printed = 0
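# urlretrieve invokes the hook above as report(block_count, block_size,
# total_size); total_size may be -1 when the size is unknown (e.g. the
# server sends no Content-Length).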
print("Connecting to server")
# Silence cssutils warnings about CSS constructs it cannot parse.
cssutils.log.setLevel(logging.CRITICAL)
directory = ''
opener = urllib.request.build_opener()
# Define browser-like headers, as some servers mandate them.
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
                     ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                     ('Connection', 'keep-alive')]
urllib.request.install_opener(opener)
html_doc = urllib.request.urlopen(baseurl).read()
print("Connection Success!")
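# A minimal helper capturing the download pattern the steps below repeat:
# create the file's parent directory if needed, then fetch the asset
# relative to baseurl. This is a sketch only; the steps that follow keep
# the original explicit inline version.
def fetch(path):
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    return urlretrieve(baseurl + path, path, reporthook=report)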
try:
    soup = BeautifulSoup(html_doc, 'html.parser')
    # Save the parsed page as the local index file.
    with open('index.html', 'w', encoding='utf-8') as f:
        f.write(str(soup))
    print("Initializing Index File")
    # Get all images.
    print("Process Initiated")
    print("Step 1: Getting all images.")
    a = soup.find_all('img')
    for img in a:
        try:
            # Prefer the lazy-load attribute, then fall back to src.
            if img.get('data-src'):
                directory = img['data-src']
            elif img.get('src'):
                directory = img['src']
            else:
                continue
            print('\t[+]Getting img = ' + str(directory))
            if "data:image" in directory:
                print("-------Skipped for ---------", directory)
                continue
            # Recreate the image's directory tree locally before downloading.
            if os.path.dirname(directory) and not os.path.exists(os.path.dirname(directory)):
                print("    [DIR]Creating directory")
                os.makedirs(os.path.dirname(directory))
            testfile, headers = urlretrieve(baseurl + directory, directory, reporthook=report)
        except Exception as e:
            print("Exception in IMG = ", e)
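    # Note: data: URIs embed the image bytes directly in the HTML, so they
    # need no separate download and are skipped in the loop above.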
    print('==============Done getting images!==============')
    # Get all CSS.
    print("Step 2: Getting all CSS.")
    a = soup.find_all('link')
    for tag in a:
        try:
            directory = tag.get('href')
            if not directory or ".css" not in directory:
                print("-------Skipped for ---------", directory)
                continue
            # Skip external stylesheets; only same-site files are cloned.
            if directory.startswith("http"):
                print("------Skipped for ----- ", directory)
                continue
            print('\t[+]Getting CSS = ' + str(directory))
            if "/" not in directory:
                print("\tNo directory. Saving file", directory)
            elif not os.path.exists(os.path.dirname(directory)):
                print("    [DIR]Creating directory")
                os.makedirs(os.path.dirname(directory))
            testfile, headers = urlretrieve(baseurl + directory, directory, reporthook=report)
            # Parse the downloaded stylesheet and collect every url(...)
            # reference (backgrounds, fonts, imports) to fetch as well.
            urls = list(cssutils.getUrls(cssutils.parseFile(directory)))
            if "fontawesome" in directory:
                continue
            if len(urls) != 0:
                for link in urls:
                    try:
                        if link.startswith("http") or "data:image/" in link:
                            print("------Skipped for ----- ", link)
                            continue
                        # Rewrite relative ../ prefixes so the file lands
                        # under assets/ in the local tree.
                        while "../" in link:
                            if "assets" in link:
                                link = link[3:]
                            else:
                                link = "assets/" + link[3:]
                        print('\t\t[+]Getting CSS-Image = ' + str(link))
                        if "/" not in link:
                            print("\t\tNo directory. Saving file", link)
                        elif not os.path.exists(os.path.dirname(link)):
                            print("    [DIR]Creating directory")
                            os.makedirs(os.path.dirname(link))
                        testfile, headers = urlretrieve(baseurl + link, link, reporthook=report)
                    except Exception as e:
                        print("Exception occurred in CSS-Inner for", e)
        except Exception as e:
            print("Exception in CSS = ", e)
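    # Note: cssutils.getUrls yields the url(...) values found in the parsed
    # stylesheet, which is why Step 2 also pulls files that are referenced
    # only from within CSS.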
    print('==============Done getting CSS files!==============')
    print("Step 3: Getting all JS.")
    # Get all JS.
    a = soup.find_all('script')
    for tag in a:
        try:
            if tag.get('src'):
                directory = tag['src']
            else:
                # Inline <script> blocks are already part of index.html.
                continue
            if directory.startswith("http"):
                print("------Skipped for ----- ", directory)
                continue
            print('\t[+]Getting JS = ' + str(directory))
            if os.path.dirname(directory) and not os.path.exists(os.path.dirname(directory)):
                print("    [DIR]Creating directory")
                os.makedirs(os.path.dirname(directory))
            testfile, headers = urlretrieve(baseurl + directory, directory, reporthook=report)
        except Exception as e:
            print("Exception in JS = ", e)
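    # External (http/https) scripts are skipped above, so the cloned page
    # will still load them from their original hosts when opened.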
    print('==============Done getting JS files!==============')
    print('Script executed successfully!')
except Exception as e:
    print("Exception occurred =", e)
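# Usage sketch (directory name is arbitrary): run from an empty folder so
# the cloned tree stays separate from other files:
#   mkdir site-clone && cd site-clone
#   python3 CloneWebPage_3.py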