update getter.py #21

Open · wants to merge 1 commit into base: master
16 changes: 8 additions & 8 deletions proxypool/getter.py
@@ -35,7 +35,7 @@ def crawl_ip181(self):
         html = get_page(start_url)
         ip_adress = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
         # \s* matches whitespace, letting the pattern span line breaks
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
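
The comment on the pattern deserves a concrete illustration: \s* consumes the newline and indentation that these proxy sites put between table cells, so both <td> groups still match. A minimal sketch, using a made-up HTML fragment rather than a real page:

import re

# Hypothetical fragment shaped like the pages this crawler scrapes; the
# cells sit on separate lines, and \s* lets the pattern span those breaks.
html = '<tr class="odd">\n    <td>127.0.0.1</td>\n    <td>8080</td>'
pattern = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
print(pattern.findall(html))  # -> [('127.0.0.1', '8080')]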
@@ -48,7 +48,7 @@ def crawl_kuaidaili(self):
         ip_adress = re.compile(
             '<td data-title="IP">(.*)</td>\s*<td data-title="PORT">(\w+)</td>'
         )
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
@@ -61,7 +61,7 @@ def crawl_xicidaili(self):
             '<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn" /></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>'
         )
         # \s* matches whitespace, letting the pattern span line breaks
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
@@ -88,7 +88,7 @@ def crawl_data5u(self):
             ' <ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li class=".*">(.*?)</li></span>'
         )
         # \s* matches whitespace, letting the pattern span line breaks
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
@@ -99,7 +99,7 @@ def crawl_kxdaili(self):
         html = get_page(start_url)
         ip_adress = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
         # \s* matches whitespace, letting the pattern span line breaks
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
@@ -111,7 +111,7 @@ def crawl_premproxy(self):
         html = get_page(start_url)
         if html:
             ip_adress = re.compile('<td data-label="IP:port ">(.*?)</td>')
-            re_ip_adress = ip_adress.findall(html)
+            re_ip_adress = ip_adress.findall(str(html))
             for adress_port in re_ip_adress:
                 yield adress_port.replace(' ', '')

@@ -123,10 +123,10 @@ def crawl_xroxy(self):
         if html:
             ip_adress1 = re.compile(
                 "title='View this Proxy details'>\s*(.*).*")
-            re_ip_adress1 = ip_adress1.findall(html)
+            re_ip_adress1 = ip_adress1.findall(str(html))
             ip_adress2 = re.compile(
                 "title='Select proxies with port number .*'>(.*)</a>")
-            re_ip_adress2 = ip_adress2.findall(html)
+            re_ip_adress2 = ip_adress2.findall(str(html))
             for adress, port in zip(re_ip_adress1, re_ip_adress2):
                 adress_port = adress + ':' + port
                 yield adress_port.replace(' ', '')
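
Every hunk in this file makes the same one-line change: wrapping get_page's return value in str() before findall. A plausible reason (an assumption, since get_page itself is not shown in this diff) is that get_page can return None on a failed request, or bytes instead of text, and either one makes re.findall raise a TypeError. A sketch of the failure mode and the guard:

import re

pattern = re.compile(r'<td>(.*?)</td>')

# pattern.findall(None) raises TypeError: expected string or bytes-like
# object; pattern.findall(b'...') raises TypeError because a str pattern
# cannot be used on bytes. str() avoids both crashes: str(None) simply
# yields no matches, and str(b'...') stays searchable, though calling
# html.decode() would be the cleaner fix when the payload really is bytes.
for html in (None, b'<td>1.2.3.4</td>', '<td>1.2.3.4</td>'):
    print(pattern.findall(str(html)))  # [], then ['1.2.3.4'], then ['1.2.3.4']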
7 changes: 5 additions & 2 deletions proxypool/schedule.py
@@ -24,6 +24,7 @@ def set_raw_proxies(self, proxies):
         self._raw_proxies = proxies
         self._conn = RedisClient()

+    # use aiohttp to check proxies asynchronously
     async def test_single_proxy(self, proxy):
         """
         test one proxy; if valid, put it into usable_proxies.
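
The added comment points at the aiohttp-based asynchronous check. A minimal, self-contained sketch of that idea; the test URL and timeout here are assumptions, not values taken from this repository:

import asyncio
import aiohttp

TEST_URL = 'http://www.baidu.com'  # assumed test target, not from this diff

async def test_single_proxy(proxy):
    # Route one request through the proxy; treat HTTP 200 as "usable".
    if isinstance(proxy, bytes):
        proxy = proxy.decode('utf-8')
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(TEST_URL, proxy='http://' + proxy,
                                   timeout=aiohttp.ClientTimeout(total=10)) as resp:
                return resp.status == 200
    except Exception:
        # Connection errors, timeouts, dead proxies: all count as invalid.
        return False

# asyncio.run(test_single_proxy('127.0.0.1:8080'))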
@@ -102,8 +103,8 @@ def valid_proxy(cycle=VALID_CHECK_CYCLE):
         """
         Get half of the proxies stored in redis
         """
-        conn = RedisClient()
-        tester = ValidityTester()
+        conn = RedisClient()  # redis connection object
+        tester = ValidityTester()  # class that checks whether proxies are usable
         while True:
             print('Refreshing ip')
             count = int(0.5 * conn.queue_len)
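
With the two collaborators named in the new comments, the rest of the loop (truncated above) follows a simple shape. A hedged sketch, assuming RedisClient exposes queue_len and a get(n) that pops n proxies, and that ValidityTester pushes the ones that pass back into the pool:

import time

VALID_CHECK_CYCLE = 60  # assumed cycle length in seconds

def valid_proxy(conn, tester, cycle=VALID_CHECK_CYCLE):
    # Each pass re-tests half of the pool.
    while True:
        print('Refreshing ip')
        count = int(0.5 * conn.queue_len)
        if count == 0:
            time.sleep(cycle)       # nothing queued yet; wait for the crawler
            continue
        raw_proxies = conn.get(count)       # assumption: get(n) pops n proxies
        tester.set_raw_proxies(raw_proxies)
        tester.test()                       # valid ones are re-queued
        time.sleep(cycle)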
@@ -132,6 +133,8 @@ def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,

     def run(self):
         print('Ip processing running')
+        # run two processes: check_pool crawls proxies from the web, filters them, and stores them in the database;
+        # valid_proxy takes proxies out of the database and re-checks them
         valid_process = Process(target=Schedule.valid_proxy)
         check_process = Process(target=Schedule.check_pool)
         valid_process.start()
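
As the new comments say, run() starts the crawler and the checker in separate OS processes so neither loop can starve the other. A stripped-down, runnable sketch of the same pattern with placeholder loops standing in for the real ones:

from multiprocessing import Process
import time

def check_pool():
    # Placeholder for the real loop: crawl proxies, filter, store in the database.
    while True:
        print('check_pool: crawling new proxies')
        time.sleep(2)

def valid_proxy():
    # Placeholder for the real loop: pull proxies from the database and re-test.
    while True:
        print('valid_proxy: re-testing stored proxies')
        time.sleep(2)

if __name__ == '__main__':
    valid_process = Process(target=valid_proxy)
    check_process = Process(target=check_pool)
    valid_process.start()
    check_process.start()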