-
Notifications
You must be signed in to change notification settings - Fork 26
/
SJTU_Talk.py
71 lines (58 loc) · 2.39 KB
/
SJTU_Talk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
######################
# Author : 高明飞
# Date : 2016-07-23
# Brief : Crawler that fetches company recruitment (career) talk listings from the SJTU job website
######################
import WebsiteBase
import requests, time, re, logging, sqlite3
from bs4 import BeautifulSoup
class SJTU_Talk(WebsiteBase.WebsiteBase):
    """Crawler for the career-talk listing of the SJTU job website.

    Supplies the site-specific pieces (which pages to fetch, how to fetch
    them, and how to pull title / URL / brief out of one list entry) on top
    of ``WebsiteBase.WebsiteBase``.
    NOTE(review): the methods below are presumably hooks called by the base
    class; the base class is not visible here -- confirm against it.
    """

    def __init__(self, Name, DBName, AgentID, KeyWords, SpecialKeyWords=None):
        """Initialise the crawler by forwarding the settings to the base class.

        ``SpecialKeyWords`` defaults to ``None`` (treated as an empty list) so
        that instances never share one mutable default list.
        """
        if SpecialKeyWords is None:
            SpecialKeyWords = []
        # NOTE(review): the literal ``True`` and ``3`` are positional base-class
        # settings whose meaning is not visible in this file -- confirm there.
        super().__init__(Name, DBName, AgentID, True, KeyWords, 3, SpecialKeyWords)

    def GetPageRange(self):
        """Return the list of page identifiers to crawl.

        Each value is sent as the ``xjhType`` form field by ``GetMainPage``.
        """
        return ['all', 'jt', 'mt', 'bz']

    def GetMainPage(self, page):
        """POST the listing query for ``page`` and return the response.

        ``page`` is one of the values from ``GetPageRange``. The request uses
        a 21 second timeout; any ``requests`` exception propagates to the
        caller.
        """
        Formdata = {'xjhType': page}
        return requests.post(
            'http://www.job.sjtu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=searchXjhxx',
            data=Formdata, timeout=21)

    def GetEnclose(self, soup):
        """Return the ``<div class="z_newsl">`` element of ``soup``.

        Returns ``None`` when the page contains no such element.
        """
        return soup.find('div', class_='z_newsl')

    def GetTags(self, soup):
        """Return every ``<li>`` under ``soup`` except the first one.

        The first ``<li>`` is skipped (presumably the table header row --
        confirm against the page). Slicing is used instead of ``del tags[0]``
        so that a listing without any ``<li>`` yields an empty list rather
        than raising ``IndexError``.
        """
        return soup.find_all('li')[1:]

    def GetTitle(self, tag):
        """Return the text of the entry's first ``<a>`` with whitespace removed.

        ``get_text()`` gathers the text of all descendants, so links whose
        text is split over nested tags are handled without hitting a ``None``
        ``.string`` on a child.
        """
        anchor = tag.find('a')
        return re.sub(r'\s', '', anchor.get_text())

    def GetURL(self, tag):
        """Return the detail-page URL for the entry.

        The record id is extracted from the ``onclick`` attribute of the
        entry's first ``<a>`` and appended to the detail-page base URL.
        Raises ``AttributeError`` when the ``onclick`` value does not match
        the expected pattern (``re.search`` returns ``None``).
        """
        basecontentURL = 'http://www.job.sjtu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=viewXjhxx&id='
        onclick = tag.find('a')['onclick']
        # NOTE(review): the unescaped '.' matches any single character
        # (presumably the parentheses of the JS call), and the name
        # 'viewXphxx' differs from 'viewXjhxx' in the URL -- confirm against
        # the live page before changing the pattern.
        contentsuffix = re.search(r"viewXphxx.'(\w+)'.", onclick).group(1)
        return basecontentURL + contentsuffix

    def GetPublishTime(self, tag):
        """Return the publish time; always an empty string for this site."""
        return ''

    def AdditionCheck(self, tag):
        """Additional per-entry check; unused for this site, always ``True``."""
        return True

    def GetBrief(self, tag, keywordstring):
        """Return the brief text: talk time, location, then ``keywordstring``.

        The time is built from the 4th and 5th ``<div>`` of the entry and the
        location from the 3rd. Raises ``IndexError`` when the entry has fewer
        than five ``<div>`` elements.
        """
        # Query the <div> list once instead of once per field.
        divs = tag.find_all('div')
        talktime = divs[3].string + ' ' + divs[4].string
        talkloc = divs[2].string
        return talktime + '\r\n' + talkloc + '\r\n\r\n' + keywordstring