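"""Tender crawler: grab list pages, extract detail pages, and retry failures.

Resumes from the last saved extract status, walks list pages from the oldest
(highest page number) to the newest, saves each tender's detail data, and then
reprocesses any pages whose grab or extract step failed.
"""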
import sys
import time

from modules.settings import Settings
from modules.html_loader import HtmlLoader
from modules.logger import Logger
from modules.extracter import Extracter
from modules.file_helper import FileHelper


def process_detail_page(item):
    # Fetch the detail page; record a 'grab' failure if the request failed.
    soup = html_loader.get_detail_page_soup(url=item['page_url'], page=item['page_num'])
    if soup is None:
        extracter.save_failed_page(item['tender_id'], item['page_url'], 'grab',
                                   item['page_num'], 'detail', item['pubdate'])
        return False
    try:
        detail = extracter.extract_detail(soup)
        extracter.save_extracted_data(item, detail)
        logger.info('Extracted detail page data successfully')
        return True
    except Exception as e:
        # Parsing failed: record an 'extract' failure so it can be retried later.
        extracter.save_failed_page(item['tender_id'], item['page_url'], 'extract',
                                   item['page_num'], 'detail', item['pubdate'])
        logger.error('Failed to extract detail page data:\n%s' % e)
        return False


def process_list_page(lists):
    """Process every item on a list page; return True only if all succeed."""
    all_success = True
    for item in lists:
        if not process_detail_page(item):
            all_success = False
    return all_success


if __name__ == '__main__':
    html_loader = HtmlLoader()
    extracter = Extracter()
    logger = Logger()
    file_helper = FileHelper()

    # Get the status of the last extraction run (page reached, total pages seen).
    last_status = extracter.get_last_extract_status()

    # Get the first page to read the current total record/page counts.
    logger.info('Get total records from first page')
    soup = html_loader.get_page_soup(url=Settings.URL, page=1)
    if soup is None:
        logger.error('Failed to get total records from first page. Exiting data process.')
        sys.exit(100)
    record_status = extracter.extract_record_status(soup)
    if last_status:
        # Resume from the page the previous run reached.
        last_extract_page = last_status['page_num']
        total_pages_last_status = last_status['total_pages']
    else:
        # First run: start from the last (oldest) page.
        last_extract_page = record_status['total_pages']
        total_pages_last_status = record_status['total_pages']
    if last_extract_page == 1:
        # The previous run finished. New records push old ones to higher page
        # numbers, so shift the resume point and the stored page numbers by the
        # number of newly added pages.
        new_pages = record_status['total_pages'] - total_pages_last_status
        last_extract_page += new_pages
        extracter.update_tender_page_num(page_num=new_pages)
    last_page_data_in_db = extracter.get_trenders_by_page_num(page_num=last_extract_page)
    # last_page_need_grab = False
    # if last_extract_page is None:
    #     last_extract_page = record_status['total_pages']
    # else:
    #     # check whether the last page needs to be grabbed again
    #     if len(last_page_data_in_db) != Settings.PAGE_SIZE:
    #         last_page_need_grab = True
    logger.info('Grab data from page: %s' % last_extract_page)
    pages = last_extract_page
    page_array = list(range(1, pages + 1))
    page_array.reverse()
    # Walk pages from the oldest (highest page number) to the newest.
    for page in page_array:
        logger.info('Grab data of page: %s' % page)
        extracter.save_extract_status(page_num=page, total_pages=record_status['total_pages'])
        soup = html_loader.get_page_soup(url=Settings.URL, page=page)
        if soup is None:
            continue
        lists = extracter.extract_list(soup, page)
        # if page == last_extract_page:
        #     lists = [item for item in lists if item['tender_id'] not in last_page_data_in_db]
        process_list_page(lists)
    # Retry pages whose grab or extract step failed, up to RETRY_TIMES rounds.
    logger.info('Start to process the failed pages')
    for retry_times in range(1, Settings.RETRY_TIMES + 1):
        logger.info('Retry time: %s' % retry_times)
        failed_pages = extracter.get_failed_extract_pages()
        # Stop when nothing is left to retry.
        if not failed_pages or not failed_pages['data']:
            break
        for page in failed_pages['data']:
            if page['page_type'] == 'list':
                soup = html_loader.get_page_soup(url=page['page_url'], page=page['page_num'])
                if soup is None:
                    extracter.save_reprocess_status(page['page_url'], page['page_num'], False)
                    continue
                extracter.save_reprocess_status(page['page_url'], page['page_num'], True)
                lists = extracter.extract_list(soup, page['page_num'])
                process_list_page(lists)
            else:
                # Detail pages: process_detail_page handles both grab and extract retries.
                if process_detail_page(page):
                    extracter.save_reprocess_status(page['page_url'], page['page_num'], True)
                else:
                    extracter.save_reprocess_status(page['page_url'], page['page_num'], False)
        # Pause between retry rounds.
        time.sleep(3)
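    # Typical run (assuming the `modules` package is importable from the
    # working directory):
    #     python crawler.py
    # Exit code 100 signals that the first list page could not be fetched.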