# searchengine.py

def compute_ranks(graph):
    d = 0.8  # damping factor, recommended by Udacity
    numloops = 10
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages  # chance of landing here by a random jump
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
# numloops note: each pass of the outer loop lets rank flow one more hop along
# the links, so the per-page values shift less with every pass and settle
# toward a stable ranking; 10 passes is enough to converge for graphs this small.
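
# Quick check (illustrative; this toy graph is mine, not from the assignment):
# both 'b' and 'c' link to 'a', so 'a' should come out with the highest rank.
# >>> ranks = compute_ranks({'a': ['b'], 'b': ['a', 'c'], 'c': ['a']})
# >>> max(ranks, key=ranks.get)
# 'a'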

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    graph = {}  # maps url -> list of outlinks on that page
    index = {}
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph

cache = {}  # url -> page source; the original relied on a pre-filled cache
            # supplied by the course, so it is defined empty here

def get_page(url):
    if url in cache:
        return cache[url]
    else:
        return None

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
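
# Example (input made up for this note): given 'junk <a href="http://x.test">x</a>',
# get_next_target returns ('http://x.test', i) where i is the index of the
# closing quote, so the caller can resume scanning from page[i:].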

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
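
# Quick check (illustrative urls): href targets come back in document order.
# >>> get_all_links('<a href="http://a.test">A</a> <a href="http://b.test">B</a>')
# ['http://a.test', 'http://b.test']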

def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)
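
# Note: union mutates its first argument in place rather than returning a new
# list -- that is what lets crawl_web grow tocrawl while looping over it.
# >>> frontier = ['a']
# >>> union(frontier, ['a', 'b'])
# >>> frontier
# ['a', 'b']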

# Break the page content into words and file the url under every word.
def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)

def add_to_index(index, keyword, url):  # record url as a hit for keyword
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]  # first hit for this keyword starts a new list
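
# Example (illustrative urls): after indexing the word 'news' from two pages,
# index == {'news': ['http://a.test', 'http://b.test']}; the first call
# creates the list and later calls append to it.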

def lookup(index, keyword):  # return the list of urls that contain keyword
    if keyword in index:
        return index[keyword]
    else:
        return None
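
# Minimal end-to-end sketch (my addition; the three pages and their contents
# are invented purely for illustration): seed the cache with a tiny fake web,
# crawl it from a seed url, rank the resulting link graph, and look a word up
# in the index.
if __name__ == '__main__':
    cache.update({
        'http://a.test': 'alpha beta <a href="http://b.test">b</a>',
        'http://b.test': 'beta gamma <a href="http://a.test">a</a> '
                         '<a href="http://c.test">c</a>',
        'http://c.test': 'gamma <a href="http://a.test">a</a>',
    })
    index, graph = crawl_web('http://a.test')
    ranks = compute_ranks(graph)
    print(lookup(index, 'beta'))       # ['http://a.test', 'http://b.test']
    print(max(ranks, key=ranks.get))   # 'http://a.test' -- the most linked-to page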