-
Notifications
You must be signed in to change notification settings - Fork 26
/
YJS.py
78 lines (64 loc) · 2.5 KB
/
YJS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
######################
# Author : 高明飞
# Date : 2016-07-25
# Brief : Crawler that fetches job postings from the YingJieSheng (应届生求职网) graduate job site
######################
import WebsiteBase
import requests, time, re, logging, sqlite3
from bs4 import BeautifulSoup
class YJS(WebsiteBase.WebsiteBase):
    """Crawler hooks for the full-time job listings on www.yingjiesheng.com.

    Provides the site-specific pieces (page fetching, row extraction, field
    parsing and a location filter) on top of ``WebsiteBase.WebsiteBase``.

    NOTE(review): when and in which order the base class calls these hooks is
    not visible in this file. ``GetBrief`` reads ``self.joblocstring``, which
    is written by ``AdditionCheck``, so it presumably expects ``AdditionCheck``
    to have run on the same row first -- confirm against WebsiteBase.
    """

    def __init__(self, Name, DBName, AgentID, KeyWords, SpecialKeyWords=None):
        """Initialise the location filter state and the base crawler.

        Args:
            Name: Forwarded unchanged to the base class.
            DBName: Forwarded unchanged to the base class.
            AgentID: Forwarded unchanged to the base class.
            KeyWords: Forwarded unchanged to the base class.
            SpecialKeyWords: Optional list forwarded to the base class. When
                omitted, a fresh empty list is used for every instance.
        """
        # A literal ``[]`` default would be one list object shared by every
        # instance created without this argument.
        if SpecialKeyWords is None:
            SpecialKeyWords = []
        # Location text of the row most recently examined by AdditionCheck.
        self.joblocstring = ''
        # A posting is kept only if its location text mentions one of these.
        self.JobLoc = ['全国', '上海', '杭州', '浙江', '深圳', '广州', '南京', '苏州']
        # NOTE(review): the meaning of the positional ``True`` and ``7`` is
        # defined by WebsiteBase; 'gb2312' is presumably the page encoding.
        super().__init__(Name, DBName, AgentID, True, KeyWords, 7,
                         SpecialKeyWords, 'gb2312')

    def GetPageRange(self):
        """Return the listing page numbers to crawl (1 through 19)."""
        return range(1, 20)

    def GetMainPage(self, page):
        """Fetch one listing page.

        Args:
            page: Page number substituted into the listing URL.

        Returns:
            The ``requests`` response for that page (21 second timeout).
        """
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        url = 'http://www.yingjiesheng.com/commend-fulltime-%s.html' % page
        return requests.get(url, timeout=21, headers=header)

    def GetEnclose(self, soup):
        """Return the first ``<table>`` element found in ``soup``."""
        return soup.find('table')

    def GetTags(self, soup):
        """Return the ``<tr>`` elements in ``soup`` that have a non-empty class."""
        return [row for row in soup.find_all('tr') if row.get('class')]

    def GetTitle(self, tag):
        """Return the ``.string`` of the last child of the row's first link."""
        return tag.find('a').contents[-1].string

    def GetURL(self, tag):
        """Return the absolute URL of the row's first link.

        An ``href`` that already starts with ``http`` is returned as is;
        anything else is appended to the site root.
        """
        basecontentURL = 'http://www.yingjiesheng.com'
        contentsuffix = tag.find('a')['href']
        if contentsuffix.startswith('http'):
            return contentsuffix
        return basecontentURL + contentsuffix

    def GetPublishTime(self, tag):
        """Return the ``.string`` of the row's first ``<td class="date">``."""
        return tag.find('td', class_="date").string

    def AdditionCheck(self, tag):
        """Decide whether a row's location matches one of ``self.JobLoc``.

        The location is read from the first ``<span>`` whose style is
        ``color: #008000;``. Its text is stored in ``self.joblocstring`` (or
        ``''`` when the span is absent) as a side effect.

        Returns:
            True if the location text contains at least one entry of
            ``self.JobLoc``; False otherwise or when the span is absent.
        """
        loctag = tag.find('span', style='color: #008000;')
        if loctag is None:
            self.joblocstring = ''
            return False
        location = loctag.string
        if location is None:
            # ``.string`` is None when the span has several children; fall
            # back to the concatenated text so the checks below still work.
            location = loctag.get_text()
        self.joblocstring = location
        return any(loc in location for loc in self.JobLoc)

    def GetBrief(self, tag, keywordstring):
        """Build the brief text for a row.

        Args:
            tag: The row element; checked for a ``<span class="emphasis">``.
            keywordstring: Text appended after the location line.

        Returns:
            ``'[置顶] '`` (only if the emphasis span exists), followed by
            ``self.joblocstring``, a blank line, then ``keywordstring``.
        """
        emphasis = '[置顶] ' if tag.find('span', class_='emphasis') else ''
        return emphasis + self.joblocstring + '\r\n\r\n' + keywordstring