arxiv_search_pdfDownload.py
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup as bs
import re
keyword = 'AND+data+preprocess'  # for a multi-word query, prefix the joined words with AND
keyword = 'segmentation'  # a single-word query (overrides the example above)
total_pattern = re.compile(r'Showing results 1 through \d+ \(of (\d+) total\)')
# fetch the first results page to read off the total number of matches
url = f'https://arxiv.org/find/grp_cs/1/ti:+{keyword}/0/1/0/all/0/1?skip=0'
res = requests.get(url=url)
totalnums = int(total_pattern.findall(res.text)[0])
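
As a quick sanity check (the count below is made up purely for illustration), the pattern captures the grand total from the results-page header:

    sample = 'Showing results 1 through 25 (of 1337 total)'
    assert total_pattern.findall(sample) == ['1337']
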
for i in range(0, totalnums, 25):  # results come 25 per page
    url = f'https://arxiv.org/find/grp_cs/1/ti:+{keyword}/0/1/0/all/0/1?skip={i}'
    res = requests.get(url=url)
    ans = bs(res.text, 'lxml')
    for block in ans.find_all('dt'):
        dt = block.find('a', {"href": re.compile('/pdf/')})['href']  # relative PDF link for the paper
        # the paper's title sits in the <dd> that follows, right after the "Title:" descriptor span
        filename = block.next_sibling.next_sibling.find('span', {'class': "descriptor"}).next_sibling.strip()
        with open(f'arxiv-{keyword.replace("+", "_")}.txt', 'a') as fa:
            fa.write(f'#{dt.split("/")[-1]}.pdf\t{filename.replace(" ", "_")}.pdf\n')
            fa.write('https://arxiv.org' + dt + '.pdf\n')
    print(f'collected {min(i + 25, totalnums)} of {totalnums} links')
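
The script only records links; it does not fetch the PDFs themselves. Below is a minimal follow-up sketch (an assumption, not part of the original script) that reads the link file back and downloads each entry, relying on the two-line-per-paper format written above and on requests and keyword already being in scope:

    import time

    def download_listed_pdfs(listfile):
        # assumed format per paper: a '#<id>.pdf\t<Title>.pdf' header line, then the PDF URL
        with open(listfile) as f:
            lines = [ln.strip() for ln in f if ln.strip()]
        for header, link in zip(lines[::2], lines[1::2]):
            name = header.split('\t')[-1]  # title-based filename (titles containing '/' would need sanitizing)
            resp = requests.get(link)
            with open(name, 'wb') as out:
                out.write(resp.content)
            time.sleep(3)  # pause between requests as a courtesy to arxiv.org

    download_listed_pdfs(f'arxiv-{keyword.replace("+", "_")}.txt')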