update getter.py #21

Open · wants to merge 1 commit into base: master
16 changes: 8 additions & 8 deletions proxypool/getter.py
@@ -35,7 +35,7 @@ def crawl_ip181(self):
         html = get_page(start_url)
         ip_adress = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
         # \s* matches whitespace, letting the pattern span line breaks
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
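
The comment on the pattern deserves a concrete illustration: \s* consumes the newline and indentation that these proxy sites put between table cells, so both <td> groups still match. A minimal sketch, using a made-up HTML fragment rather than a real page:

import re

# Hypothetical fragment shaped like the pages this crawler scrapes; the
# cells sit on separate lines, and \s* lets the pattern span those breaks.
html = '<tr class="odd">\n    <td>127.0.0.1</td>\n    <td>8080</td>'
pattern = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
print(pattern.findall(html))  # -> [('127.0.0.1', '8080')]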
@@ -48,7 +48,7 @@ def crawl_kuaidaili(self):
         ip_adress = re.compile(
             '<td data-title="IP">(.*)</td>\s*<td data-title="PORT">(\w+)</td>'
         )
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
@@ -61,7 +61,7 @@ def crawl_xicidaili(self):
             '<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn" /></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>'
         )
         # \s* matches whitespace, letting the pattern span line breaks
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
@@ -88,7 +88,7 @@ def crawl_data5u(self):
             ' <ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li class=".*">(.*?)</li></span>'
         )
         # \s* matches whitespace, letting the pattern span line breaks
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
@@ -99,7 +99,7 @@ def crawl_kxdaili(self):
         html = get_page(start_url)
         ip_adress = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
         # \s* matches whitespace, letting the pattern span line breaks
-        re_ip_adress = ip_adress.findall(html)
+        re_ip_adress = ip_adress.findall(str(html))
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
@@ -111,7 +111,7 @@ def crawl_premproxy(self):
         html = get_page(start_url)
         if html:
             ip_adress = re.compile('<td data-label="IP:port ">(.*?)</td>')
-            re_ip_adress = ip_adress.findall(html)
+            re_ip_adress = ip_adress.findall(str(html))
             for adress_port in re_ip_adress:
                 yield adress_port.replace(' ', '')

@@ -123,10 +123,10 @@ def crawl_xroxy(self):
         if html:
             ip_adress1 = re.compile(
                 "title='View this Proxy details'>\s*(.*).*")
-            re_ip_adress1 = ip_adress1.findall(html)
+            re_ip_adress1 = ip_adress1.findall(str(html))
             ip_adress2 = re.compile(
                 "title='Select proxies with port number .*'>(.*)</a>")
-            re_ip_adress2 = ip_adress2.findall(html)
+            re_ip_adress2 = ip_adress2.findall(str(html))
             for adress, port in zip(re_ip_adress1, re_ip_adress2):
                 adress_port = adress + ':' + port
                 yield adress_port.replace(' ', '')
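
Every hunk in this file makes the same one-line change: wrapping get_page's return value in str() before findall. A plausible reason (an assumption, since get_page itself is not shown in this diff) is that get_page can return None on a failed request, or bytes instead of text, and either one makes re.findall raise a TypeError. A sketch of the failure mode and the guard:

import re

pattern = re.compile(r'<td>(.*?)</td>')

# pattern.findall(None) raises TypeError: expected string or bytes-like
# object; pattern.findall(b'...') raises TypeError because a str pattern
# cannot be used on bytes. str() avoids both crashes: str(None) simply
# yields no matches, and str(b'...') stays searchable, though calling
# html.decode() would be the cleaner fix when the payload really is bytes.
for html in (None, b'<td>1.2.3.4</td>', '<td>1.2.3.4</td>'):
    print(pattern.findall(str(html)))  # [], then ['1.2.3.4'], then ['1.2.3.4']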
7 changes: 5 additions & 2 deletions proxypool/schedule.py
@@ -24,6 +24,7 @@ def set_raw_proxies(self, proxies):
         self._raw_proxies = proxies
         self._conn = RedisClient()

+    # use aiohttp to check proxies asynchronously
     async def test_single_proxy(self, proxy):
         """
         test one proxy; if valid, put it into usable_proxies.
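
The added comment points at the aiohttp-based asynchronous check. A minimal, self-contained sketch of that idea; the test URL and timeout here are assumptions, not values taken from this repository:

import asyncio
import aiohttp

TEST_URL = 'http://www.baidu.com'  # assumed test target, not from this diff

async def test_single_proxy(proxy):
    # Route one request through the proxy; treat HTTP 200 as "usable".
    if isinstance(proxy, bytes):
        proxy = proxy.decode('utf-8')
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(TEST_URL, proxy='http://' + proxy,
                                   timeout=aiohttp.ClientTimeout(total=10)) as resp:
                return resp.status == 200
    except Exception:
        # Connection errors, timeouts, dead proxies: all count as invalid.
        return False

# asyncio.run(test_single_proxy('127.0.0.1:8080'))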
@@ -102,8 +103,8 @@ def valid_proxy(cycle=VALID_CHECK_CYCLE):
         """
         Get half of the proxies stored in redis
         """
-        conn = RedisClient()
-        tester = ValidityTester()
+        conn = RedisClient()  # redis connection object
+        tester = ValidityTester()  # class that checks whether proxies are usable
         while True:
             print('Refreshing ip')
             count = int(0.5 * conn.queue_len)
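
With the two collaborators named in the new comments, the rest of the loop (truncated above) follows a simple shape. A hedged sketch, assuming RedisClient exposes queue_len and a get(n) that pops n proxies, and that ValidityTester pushes the ones that pass back into the pool:

import time

VALID_CHECK_CYCLE = 60  # assumed cycle length in seconds

def valid_proxy(conn, tester, cycle=VALID_CHECK_CYCLE):
    # Each pass re-tests half of the pool.
    while True:
        print('Refreshing ip')
        count = int(0.5 * conn.queue_len)
        if count == 0:
            time.sleep(cycle)       # nothing queued yet; wait for the crawler
            continue
        raw_proxies = conn.get(count)       # assumption: get(n) pops n proxies
        tester.set_raw_proxies(raw_proxies)
        tester.test()                       # valid ones are re-queued
        time.sleep(cycle)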
@@ -132,6 +133,8 @@ def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,

     def run(self):
         print('Ip processing running')
+        # run two processes: check_pool crawls proxies from the web, filters them, and stores them in the database;
+        # valid_proxy takes proxies out of the database and re-checks them
         valid_process = Process(target=Schedule.valid_proxy)
         check_process = Process(target=Schedule.check_pool)
         valid_process.start()
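
As the new comments say, run() starts the crawler and the checker in separate OS processes so neither loop can starve the other. A stripped-down, runnable sketch of the same pattern with placeholder loops standing in for the real ones:

from multiprocessing import Process
import time

def check_pool():
    # Placeholder for the real loop: crawl proxies, filter, store in the database.
    while True:
        print('check_pool: crawling new proxies')
        time.sleep(2)

def valid_proxy():
    # Placeholder for the real loop: pull proxies from the database and re-test.
    while True:
        print('valid_proxy: re-testing stored proxies')
        time.sleep(2)

if __name__ == '__main__':
    valid_process = Process(target=valid_proxy)
    check_process = Process(target=check_pool)
    valid_process.start()
    check_process.start()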