forked from kingname/AutoCrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
37 lines (29 loc) · 858 Bytes
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import functools

from lxml.html import fromstring, HtmlElement
from lxml.html import etree

import constants
def remove_node(node: HtmlElement):
    """
    Detach ``node`` from its parent element, if it has one.

    The tree is modified in place, so there is nothing to return.

    :param node: the element to detach
    :return: None
    """
    container = node.getparent()
    if container is None:
        # No parent to detach from (root or already-detached element).
        return
    # NOTE(review): lxml's remove() also discards the element's tail text
    # (text following its closing tag) -- confirm that is intended here.
    container.remove(node)
def clean_html(html):
    """
    Parse ``html``, strip every element whose tag is listed in
    ``constants.USELESS_TAG``, and return the remaining markup.

    :param html: HTML source to parse
    :return: the cleaned tree serialized as a pretty-printed unicode string
    """
    root = fromstring(html)
    for tag_name in constants.USELESS_TAG:
        # '//tag' matches that tag anywhere in the document.
        for unwanted in root.xpath(f'//{tag_name}'):
            remove_node(unwanted)
    return etree.tostring(root, pretty_print=True, encoding='unicode')
def retry(func):
    """
    Decorator that calls ``func`` up to three times until it succeeds.

    Every failed attempt prints the exception. If all three attempts
    raise, the wrapper returns an empty string instead of propagating
    the error.

    :param func: the callable to wrap
    :return: a wrapper that forwards its arguments to ``func`` and returns
        ``func``'s result, or ``''`` once all attempts have failed
    """
    # Preserve func's __name__/__doc__ so the decorated function stays
    # identifiable in tracebacks and introspection.
    @functools.wraps(func)
    def wrap(*args, **kwargs):
        for _ in range(3):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # Best effort: report the failure and try again.
                print(f'Error: {e}')
        # Every attempt failed; fall back to the empty-string sentinel.
        return ''
    return wrap