-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy patharxiv_0704-now_wAbstract.py
127 lines (99 loc) · 3.61 KB
/
arxiv_0704-now_wAbstract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
import re
import time
import secrets
import itertools
import requests
import os.path as osp
from bs4 import BeautifulSoup as bs
# Browser-like headers sent with every HTTP request.
gHeaders = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
}

# Matches an HTML comment whose body contains no angle brackets; used to
# strip comment text out of the scraped author field.
clean_pat = re.compile('<!--[^<>]*-->', re.MULTILINE)

# YYMM codes for April-December 2007, newest first ('0712' ... '0704').
months = [f'07{m:02d}' for m in range(12, 3, -1)]
def time2008_2018():
    """Build the YYMM month codes for January 2008 through May 2018.

    Every month of the years 2008-2017 is generated, and the first five
    months of 2018 are appended explicitly.

    Returns:
        list[str]: Four-character ``YYMM`` strings in descending order,
        from ``'1805'`` down to ``'0801'``.
    """
    codes = [
        f'{str(yr)[-2:]}{mo:02d}'
        for yr, mo in itertools.product(range(2008, 2018), range(1, 13))
    ]
    codes += ['1801', '1802', '1803', '1804', '1805']
    codes.sort(reverse=True)
    return codes
# Append the 2008-2018 codes after the 2007 ones; the list is processed in
# this order, so 2007 is scraped first, then 2018 backwards to 2008.
months += time2008_2018()
def get_lasttime(filename):
    """Work out where scraping of one month's output file should resume.

    The file is inspected as raw bytes and never decoded: the markers and
    the identifier column are plain ASCII, so a record containing bytes
    that are not valid UTF-8 cannot abort the resume logic.

    Args:
        filename: Path of the tab-separated output file for one month.

    Returns:
        ``1`` when the file does not exist; the string ``'done'`` when at
        least five lines of the file contain the ``'========='`` marker;
        otherwise the next sequence number to fetch, as an ``int``.

    Raises:
        IndexError, ValueError: If the last line is a record whose first
            column is not of the form ``<prefix>.<integer>``.
    """
    if not osp.exists(filename):
        return 1
    with open(filename, 'rb') as fr:
        lines = fr.readlines()

    # Marker lines are counted across the whole file, not only its tail.
    marker_count = sum(1 for line in lines if b'=========' in line)
    if marker_count >= 5:
        return 'done'

    last = lines[-1] if lines else b''
    if lines and b'===' not in last and b'---' not in last:
        # Last line is a record: continue right after its sequence number.
        first_col = last.strip().split(b'\t')[0]
        return int(first_col.split(b'.')[1]) + 1
    # Empty file, or the last line is a marker: fall back to the line count.
    return len(lines) + 1
def get_result(text,id):
    """Parse one arXiv abstract page into a tab-separated record.

    Args:
        text: HTML source of the abstract page.
        id: Identifier string written as the first column of the record.

    Returns:
        str: One of

        * ``'maybe meet antiscrapy'`` when the page has fewer than two
          ``<h1>`` elements;
        * fifty ``'='`` characters (no trailing newline) when the second
          ``<h1>`` contains ``' not found'`` or the title element is
          missing;
        * ``"doesn't exist"`` when the second ``<h1>`` contains that
          phrase;
        * otherwise ``id, title, subject, authors, abstract`` joined by
          tabs and terminated by a newline.

    Raises:
        AttributeError: If the title is present but the subject, authors
            or abstract element is missing from the page.
    """
    soup = bs(text,'lxml')

    # Explicit checks instead of a bare ``except`` so that Ctrl-C or
    # SystemExit during parsing is never misreported as anti-scraping.
    headings = soup.find_all('h1')
    if len(headings) < 2:
        return 'maybe meet antiscrapy'
    h1 = headings[1].text
    if ' not found' in h1:
        return '='*50
    if 'doesn\'t exist' in h1:
        return 'doesn\'t exist'

    title_tag = soup.find('h1',{'class':"title mathjax"})
    if title_tag is None:
        print(title_tag)
        return '='*50

    # Tabs are the column separator, so they are replaced in every scraped
    # field (not only the abstract) to keep embedded tabs from adding columns.
    title = title_tag.text.replace('\n','').replace('Title:','').replace('\t',' ')
    subject = soup.find('span',{'class':"primary-subject"}).text.replace('\t',' ')
    authors = soup.find('div',{'class':"authors"}).text.replace('\n','')
    authors = clean_pat.sub('',authors).replace('\t',' ')
    abstract = soup.find('blockquote', {'class':"abstract mathjax"}).text
    abstract = abstract.replace('\n',' ').replace('\t',' ')

    ans = '\t'.join([id,title,subject,authors,abstract]) + '\n'
    # Progress line for the console; the abstract is left out to keep it short.
    print('\t'.join([time.asctime(),id,title,subject]))
    return ans
def requestsGet(url,tryTime = 5):
    """GET *url* with randomized delays, retrying when the request raises.

    Before every attempt the function sleeps ``3 + secrets.randbelow(4)``
    seconds (3 to 6). After a failed attempt it prints the attempt index,
    the URL and the error, then sleeps 7 more seconds.

    Args:
        url: Address to fetch.
        tryTime: Maximum number of attempts.

    Returns:
        The ``requests`` response of the first attempt that does not
        raise, or ``None`` when every attempt raised or ``tryTime`` is
        zero or negative (previously that case hit an unbound local).
    """
    for attempt in range(tryTime):
        time.sleep(3+secrets.randbelow(4))
        try:
            return requests.get(url = url,headers=gHeaders,timeout = 60)
        except Exception as e:
            # Deliberately broad: this is the retry boundary, and any
            # failure is logged and retried rather than aborting the run.
            print(f'[{attempt}] [{url}] {str(e)}')
            time.sleep(7)
    return None
def _access(month):
    """Scrape the abstract pages of one arXiv month into its output file.

    Resumes from the position reported by ``get_lasttime`` and appends one
    line per processed identifier to ``paperMeta4arxiv1/arxiv-<month>.txt``:
    either a tab-separated record or a line of fifty ``'='`` characters for
    an identifier that could not be parsed as a paper.

    Args:
        month: ``YYMM`` code used as the identifier prefix.

    Returns:
        ``'done'`` when the file was already complete or when five
        identifiers in a row yielded no paper; ``None`` if the loop runs
        to its upper bound of 43000.

    Raises:
        SystemExit: When a request ultimately fails, or when five pages in
            this month were classified as ``'maybe meet antiscrapy'``.
    """
    import os  # local import: the module itself only imports os.path (as osp)

    filename = f'paperMeta4arxiv1/arxiv-{month}.txt'
    start = get_lasttime(filename)
    if start == 'done':
        return 'done'

    # Create the output directory up front so the first append cannot fail
    # after a page has already been fetched.
    os.makedirs(osp.dirname(filename), exist_ok=True)

    endCount = 0   # consecutive identifiers that produced no paper
    anti = 0       # suspected anti-scraping pages seen so far (never reset)
    # NOTE(review): arXiv documents four-digit sequence numbers for
    # identifiers before 1501 - confirm the five-digit form used here
    # resolves for those months.
    for i in range(start,43000):
        paper_id = f'{month}.{i:05d}'
        url = f'https://arxiv.org/abs/{paper_id}'
        print(url)
        response = requestsGet(url)
        # requestsGet returns None only when every attempt raised; an HTTP
        # error page is still a response and must be parsed below.
        if response is None:
            raise SystemExit('='*50+'meet anti scrapy policy or network error')
        record = get_result(response.text,paper_id)
        if record == 'doesn\'t exist':
            record = '='*50+'\n'
            endCount += 1
        elif record == 'maybe meet antiscrapy':
            anti += 1
            if anti == 5:
                raise SystemExit(f'meet anti scrapy policy,has try {anti} times')
            # Nothing is written for this identifier; move on to the next.
            continue
        elif '=====' in record:
            endCount += 1
            record += '\n'
        else:
            endCount = 0
        # The fifth consecutive miss ends the month before its marker line
        # is written.
        if endCount == 5:
            return 'done'
        # Write UTF-8 explicitly: get_lasttime reads this file back as
        # UTF-8, while the platform default encoding may differ.
        with open(filename,'a',encoding='utf-8') as fa:
            fa.write(record)
if __name__ == "__main__":
    # Script entry point: process every month code in list order.
    for month in months:
        _access(month)