-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathwebsite_keyword_crawl.py
109 lines (93 loc) · 3.95 KB
/
website_keyword_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#Python 3.x
#Website keyword crawler v1.1
#Created by rickvg @ https://github.com/rickvg
import queue
import re
import threading
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Queue
from urllib.parse import urljoin, urlparse
# File extensions that are never fetched (images cannot contain the keyword).
_IMAGE_SUFFIXES = (".jpg", ".png", ".bmp", ".gif")

# Shared tail of the link patterns: an opening quote, then an absolute URL,
# a root-relative/protocol-relative path, or a bare "www" host.
_LINK_TAIL = r"""['"](?:https?://|/|www)[A-Za-z0-9_\-./]+"""
_HREF_PATTERN = re.compile("href=" + _LINK_TAIL)
_SRC_PATTERN = re.compile("src=" + _LINK_TAIL)


def findkeywordlvl(strwebsiteinp, strmatch, queueget):
    """Fetch one linked resource, count keyword hits and collect its links.

    Args:
        strwebsiteinp: URL of the page on which ``strmatch`` was found. It is
            the base against which relative links are resolved.
        strmatch: A raw pattern hit such as ``href="/about`` or
            ``src='http://host/app.js`` (attribute name, ``=``, the opening
            quote, then the link itself).
        queueget: Object with a ``put`` method; ``[url, links]`` is put on it
            after a successful fetch.

    Returns:
        ``[url, links]`` where ``url`` is the resolved address that was
        fetched and ``links`` is the list of raw ``href=``/``src=`` hits found
        in it, or ``None`` when the link was skipped (image, backslash in the
        URL) or the fetch failed.

    Note:
        Reads the module-level names ``strKeyword`` (keyword to search for),
        ``strFile`` (positives log) and ``strFile2`` (errors log), which the
        script part of this file defines before starting any worker.
    """
    # Strip the attribute name, "=" and the opening quote from the raw hit.
    if strmatch.startswith("src="):
        link = strmatch[5:]
    elif strmatch.startswith("href="):
        link = strmatch[6:]
    else:
        link = strmatch

    # Skip every image type. The original condition only negated the ".jpg"
    # test, so .png/.bmp/.gif files were still downloaded.
    if link.lower().endswith(_IMAGE_SUFFIXES):
        return None

    if link.startswith("www"):
        # A bare host has no scheme; urlopen would reject it outright.
        url = "http://" + link
    else:
        # urljoin resolves "//host/x", "/path" and absolute URLs correctly
        # even when the base is a sub-page rather than the site root.
        url = urljoin(strwebsiteinp, link)

    if "\\" in url:
        return None

    try:
        print(url)
        with urllib.request.urlopen(url) as response:
            # Decode instead of str(bytes): the bytes repr escapes quotes and
            # newlines, which hides links and keyword occurrences.
            page = response.read().decode("utf-8", errors="replace")
        keyword_hits = re.findall(re.escape(strKeyword), page)
        links = _HREF_PATTERN.findall(page) + _SRC_PATTERN.findall(page)
        if keyword_hits:
            strPrint = url + " has " + str(len(keyword_hits)) + " matches with keyword: " + strKeyword + "\n"
            print(strPrint)
            strFile.write(strPrint)
        else:
            print("No matches for:", url)
        result = [url, links]
        queueget.put(result)
        return result
    except Exception as ex:
        # Best-effort crawl: one unreachable link must not stop the others,
        # so the failure is reported and logged instead of propagated.
        errormsg = "Exception {0} occurred. Reason:\n{1!r}"
        message = errormsg.format(type(ex).__name__, ex.args)
        print(message)
        # Terminate the entry so consecutive errors do not run together.
        strFile2.write(message + "\n")
        return None
# --- Script part: ask for a site, a keyword and a depth, then crawl. ---
# Level 1 scans only the entered page, level 2 also scans every link found on
# it, and level 3 additionally scans the links found on those linked pages.
strWebsite = input("Enter website (Format http://domain.com):\n")
strKeyword = input("Enter keyword to search for:\n")
intLevel = int(input("Select levels to scan. Choose 1, 2 or 3 - 3 might contain errors:\n"))

# Upper bound on simultaneous fetches. The original throttled on
# threading.active_count() < 10, a count that includes the main thread.
MAX_WORKERS = 9

# Name the logs after the host. Slicing off a fixed 7 characters only worked
# for "http://" URLs without a path; separators are replaced so the result is
# always a plain file name.
log_stem = re.sub(r"[\\/:]", "_", urlparse(strWebsite).netloc or strWebsite)
filename = log_stem + " positives.log"
filename2 = log_stem + " errors.log"

# findkeywordlvl writes to strFile / strFile2 as module globals; the with
# block guarantees both logs are closed even if the crawl raises.
with open(filename, 'w') as strFile, open(filename2, 'w') as strFile2:
    with urllib.request.urlopen(strWebsite) as response:
        # Decode rather than str(bytes) so quotes and newlines are not escaped.
        strContent = response.read().decode("utf-8", errors="replace")

    match2 = re.findall(re.escape(strKeyword), strContent)
    if match2:
        strPrint = strWebsite + " has " + str(len(match2)) + " matches with keyword: " + strKeyword + "\n"
        print(strPrint)
        strFile.write(strPrint)
    else:
        print("No matches for:", strWebsite)

    if intLevel == 1:
        print("Finished scanning website for keywords")
    elif intLevel in (2, 3):
        # An opening quote followed by an absolute URL, a root-relative or
        # protocol-relative path, or a bare "www" host.
        link_tail = r"""['"](?:https?://|/|www)[A-Za-z0-9_\-./]+"""
        match = re.findall("href=" + link_tail, strContent)
        match = match + re.findall("src=" + link_tail, strContent)

        # queue.Queue is used because all workers are threads in this process
        # and its empty() is dependable once they have finished.
        q = queue.Queue()

        # Level 2: scan every link of the start page. Leaving the with block
        # waits for all submitted fetches to complete.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            for link in match:
                pool.submit(findkeywordlvl, strWebsite, link, q)

        results = []
        while not q.empty():
            results.append(q.get_nowait())
        print(results)

        if intLevel == 3:
            # Level 3: scan every link of every page fetched at level 2.
            # The original never reset its link index between pages, so most
            # links of all pages after the first were skipped.
            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
                for page_url, page_links in results:
                    for link in page_links:
                        pool.submit(findkeywordlvl, page_url, link, q)
    else:
        print("Wrong level. Try again.")