-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlink_extractor.py
75 lines (56 loc) · 2 KB
/
link_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
# Base site; all scraped hrefs are site-relative and get prefixed with this.
DOMAIN = "https://en.wikipedia.org"
# id of the <ul> holding the "what links here" result items.
LINKS_LIST_ID = "mw-whatlinkshere-list"
# id of the main content container (pagination links live directly under it).
CONTENT_ID = "mw-content-text"
# CSS selector for each result row in the links list.
ARTICLE_SELECT_STR = "#{} > li".format(LINKS_LIST_ID)
# CSS selector for pagination anchors ("previous 500" / "next 500").
NEXT_LINK_SELECT_STR = "#{} > a".format(CONTENT_ID)
# Pagination anchors carry a title attribute starting with this prefix.
SPECIAL_PREFIX = "Special:WhatLinksHere"
# First results page for a given article title; namespace=0 restricts to
# articles, limit=500 is the maximum page size MediaWiki allows here.
START_LINK_FORMAT = "/w/index.php?title=Special%3AWhatLinksHere&target={title}&namespace=0&limit=500"
# Examples of the URL shape before/after MediaWiki normalised the query form:
#BEFORE = "/w/index.php?title=Special:WhatLinksHere/Gap,_Hautes-Alpes&from=537646&back=105054"
#AFTER = "/w/index.php?title=Special%3AWhatLinksHere&target=Gap%2C+Hautes-Alpes&namespace=0"
# Browser-like User-Agent: Wikipedia rejects urllib's default one.
HEADERS = {"User-Agent": "Mozilla/5.0"}
class Parser:
    """Scraper for Wikipedia's Special:WhatLinksHere result pages.

    Given an article title, fetches the first "what links here" page and
    can walk every paginated result page, collecting the titles of the
    articles that link to the target.
    """

    def __init__(self, title):
        # MediaWiki URLs use underscores where the display title has spaces.
        self.title = title.replace(" ", "_")
        # reinit() sets link/url/doc/soup in one place (was duplicated here).
        self.reinit(START_LINK_FORMAT.format(title=self.title))

    @staticmethod
    def get_url(link):
        """Turn a site-relative link into an absolute URL."""
        return DOMAIN + link

    @staticmethod
    def get_doc(url):
        """Download and return the raw HTML bytes for `url`.

        Sends the User-Agent from HEADERS: Wikipedia answers HTTP 403 to
        urllib's default Python User-Agent, so the bare urlopen(url) call
        the original used would fail.
        """
        req = Request(url, headers=HEADERS)
        with urlopen(req) as res:
            return res.read()

    @staticmethod
    def get_soup(html_doc):
        """Parse an HTML document into a BeautifulSoup tree."""
        return BeautifulSoup(html_doc, 'html.parser')

    def reinit(self, next_link):
        """Point the parser at `next_link` and refresh url, doc and soup."""
        self.link = next_link
        self.url = self.get_url(self.link)
        self.doc = self.get_doc(self.url)
        self.soup = self.get_soup(self.doc)

    def get_next_link(self, dev=False):
        """Return the relative href of the "next 500" pagination link.

        Returns None when the current page is the last one.
        """
        for anchor in self.soup.select(NEXT_LINK_SELECT_STR):
            # .get() guards against anchors with no title attribute, which
            # would have raised KeyError with anchor["title"].
            title = anchor.get("title", "")
            # Pagination anchors link back to the Special:WhatLinksHere page
            # and are labelled "next 500" / "previous 500".
            if title.startswith(SPECIAL_PREFIX) and anchor.text.startswith("next"):
                return anchor["href"]
        return None

    def get_curr_articles(self, dev=False):
        """Return the article titles listed on the current result page."""
        res = []
        for item in self.soup.select(ARTICLE_SELECT_STR):
            for child in item:
                # List items mix tags and bare text nodes; keep only anchors
                # that actually carry a title attribute.
                if child.name == "a":
                    title = child.get("title")
                    # BUGFIX: the original compared against "AdolfHitler"
                    # (no space), which can never match a real MediaWiki
                    # display title, so the exclusion never triggered.
                    if title and title != "Adolf Hitler":
                        res.append(title)
        return res

    def get_all_articles(self, dev=False):
        """Follow pagination and return titles from every result page."""
        res = []
        while True:
            res.extend(self.get_curr_articles())
            next_link = self.get_next_link()
            if next_link is None:
                return res
            self.reinit(next_link)