Skip to content

Commit

Permalink
scoap3-next: arxiv categories harvesting (#437)
Browse files Browse the repository at this point in the history
* In a lot of cases we were not getting arXiv categories,
* because the requests to arxiv.org were returning more than one entry
* The request is now made using ONLY the arXiv id, if the record has one
* Added test
* Encoded query
* Removed unnecessary test
*  ref: cern-sis/issues-scoap3#161
  • Loading branch information
ErnestaP authored May 26, 2023
1 parent 900ba4f commit 8c6afcc
Show file tree
Hide file tree
Showing 7 changed files with 209 additions and 19 deletions.
1 change: 1 addition & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pytest==4.6.4
radon==3.0.3
requests-mock==1.6.0
freezegun==0.3.14
pytest-vcr==1.0.2
20 changes: 9 additions & 11 deletions scoap3/utils/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from __future__ import absolute_import, division, print_function

import logging
from urllib import quote

from inspire_utils.record import get_value
from lxml import etree
Expand All @@ -49,12 +50,11 @@ def clean_arxiv(arxiv):
if arxiv is None:
return None

return arxiv.split(':')[-1].split('v')[0].split(' ')[0].encode('ascii').strip('"\'')
return arxiv.split(':')[-1].split('v')[0].split(' ')[0].strip('"\'')


def get_arxiv_categories_from_response_xml(xml):
entry_count = len(xml.xpath('//w3:entry', namespaces=xml_namespaces))

# make sure we have exactly one result in the xml
if entry_count != 1:
return []
Expand Down Expand Up @@ -99,17 +99,15 @@ def get_arxiv_categories(arxiv_id=None, title=None, doi=None):
query = []
if arxiv_id:
query.append('id:%s' % arxiv_id)
else:
if title:
query.append('ti:"%s"' % title.replace("-", "?"))
if doi:
query.append('doi:"%s"' % doi)

if title:
title = title.replace('-', '?').encode('ascii', 'replace')
query.append('ti:"%s"' % title)

if doi:
query.append('doi:"%s"' % doi)

request_url = url.format(' '.join(query))
encoded_quey_string = quote(' '.join(query))
request_url = url.format(encoded_quey_string)
data = requests_retry_session().get(request_url)

categories = []
if data.status_code == 200:
xml = etree.fromstring(data.content)
Expand Down
47 changes: 47 additions & 0 deletions tests/unit/utils/cassettes/test_categories_with_arxiv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
interactions:
- request:
body: null
headers:
Accept: ['*/*']
Accept-Encoding: ['gzip, deflate']
Connection: [keep-alive]
User-Agent: [python-requests/2.20.0]
method: GET
uri: http://export.arxiv.org/api/query?search_query=id%3A2111.13053
response:
body:
string: !!binary |
H4sIAAAAAAAAA51Wa2/bNhT9rl9xISDDhlWmZMdJ6tkOgrZBirZYH2m79EvASFcWW4lUScqP/fpd
UvIrczYkgOOY4rn3nPvgFcfny6qEOWojlJyESS8OAWWqMiFnk/Dz9WV0Fp5Pg3GOmAFBpZmEhbX1
iLHFYtFbDHpKz1g/jofswqoqnAYA41LIH1BozDdYrpdi7qG8Fuxng3p1bpDrtLj1i6PBS5EdDS76
SZL0kkE8HBz1T0R2WwpjaY8WxnLtfsb0u+LLW42mKa2hJwlJ1lhOQoNlHoJd1TgJeV2XIuWWomKc
hP1O2kPm1VlhS+xghaXH0wv9l5jDBydkBLuyJiIbbSX9wqv6j07UxC+8qEnsf++ImiTxmHkaTyiy
6cE02GXcvK2/v3rz5fRr862u3t78ePPuw8uKy89jRkbOtqkzbjGb9uP+IIqHUf/kOo5H/hPFx/Q9
ZmuIg6saZRvAyCrLy4+toLZyo+3utjDPe6mqWMRMjSnbAhjFzMJpMmYPuLxP51PxWma4fDJZvEe2
dXifSliszHvU7/kMnx7ZPtuuT8+H0uqV+/VQAe8M27bG/GRdsf2a9aOEPs+v4+FocDYaHH/bKxdh
6+aO2qlo0UmUJFF/eJ30R4NkNDwj9Ha/xbdddbEUvIQXhdD074vSllq9hFd5jqkFIYHDp7pAjbAQ
toBcSAoPjPgbAT1mpzvJp2mqilOwAF+RrOdorJiRRrAFAvdU8zUH7nA0UuRKV+UKNDWGpYkBxtMG
prn77mBW7ZL3nP/GILx7fQ13qpEZsUKqZCbcOXXoUlTCet7NvsoDt2499+BSaai4MSUaAznqiiwN
6TEiawW3wGewQFB3lpNS9xSXnPT4YII5SSMvaaM1VRlyriHXqtqntQW3RGTTAk3noabj7cZk55cG
opBBuqJpl2mfnFQpTYOTUucUeauSgtfcNk76n3Ir0OPXZF4sJSoL7hTVy9spORO2cd5K4DIDq7k0
bk7jr76opKXuUuzwbQVIG4Vofguo82slKTrTtkDFZ9K5Q8iQmt5Nd1CtnFSVZOn3uJyVOykWcwzW
GfYSM9I9dzCXDJLjc5HvtImrZZNaMRd2tZfXYJtXBbwsgRJFsThrxwSLQlFflMhbZW4TKHEV8JlG
7IJwPA5N0VHlUh+uxppa05fiX/mmbuOlUbttsM9w31mXkY3WRSHSAoQhr5LGxDNfCGM11cbpK3iL
/9nwjDhFGrTJpfcnHu6qHly3eJJJBpRLUW3eU6bjD6g/pKror1Qz1/9QIJ+vwCmkYtE48GCuXSth
2pRunPSCMVuf4/ZU88YWSrcLWkpe4fRT0UQ3jYQbqvSY+UctmO2iD5p+RBldUeBw+XjblxhdNnCl
mkfaXXERvSA6IPL/MnVDeZQp0b0M/PrA1cPQWaaS+/uAewH0kvjsjCUnp8c0dwenjKc8PcnGbOOv
8++vM35gTkJ6Gu5fbbJljx56godcdvcT+nKlaq8hG910VCs3hh6jfTCEml5UdCqPaW7MqNvNWnbn
blf6Qzexe2+wTiUv6QS5Gba+SllcWuYvSuxAQuosDx9ioL1DDOs8HLiqOW/76am1cE19SwCcKTqU
j8iTHyEEwjqyRQh+D//PaE2/IXyykzHrLhFj5q7P0+Af/bmUCW0LAAA=
headers:
access-control-allow-origin: ['*']
connection: [Keep-Alive]
content-encoding: [gzip]
content-length: ['1238']
content-type: [application/atom+xml; charset=UTF-8]
date: ['Fri, 26 May 2023 09:37:11 GMT']
keep-alive: ['timeout=5, max=100']
server: [Apache]
vary: ['Accept-Encoding,User-Agent']
status: {code: 200, message: OK}
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
interactions:
- request:
body: null
headers:
Accept: ['*/*']
Accept-Encoding: ['gzip, deflate']
Connection: [keep-alive]
User-Agent: [python-requests/2.20.0]
method: GET
uri: http://export.arxiv.org/api/query?search_query=doi%3A%2210.1088/1674-1137/acac6c%22
response:
body:
string: !!binary |
H4sIAAAAAAAAA62TXWvbMBSG7/MrhMC7GbYku3VTz3YIlEKv1o2GdrsJinySqPWHKimJu18/WUmb
rKw3Y2DMOdJ7zvscCeWTvqnRFrSRXVtgFlGMoBVdJdtVgWd31+EYT8pRvgSokJO2psBra1VGyG63
i3ZJ1OkViSk9J1PbNbgcIZTXsn1Caw3LNy3Xvdx6KVeSPG9Av0wMcC3Wc58EyVXVySCZBnHMaMTo
eBzE1yy9OAsZSy5czAUXqXDbQZzKal5LY12RS4zlegipixvezzWYTW2NW2FuFg11gQ3US4zsi4IC
c6VqKbh14xLuiD+7oTDx2FbaGg6ytXXL5VQ/yC36NhBm6JS3cLQZPpCSN06yp8SfeKO+HCgLn3jK
gvr4hLJgNCfe1xPIqvzrgf1YmMXX9GHc36jpr0dDF8+z+0crfs4SkRNXNNRuVMUtVGVM4ySk52Gc
3lGa+S+kZ+6fk1fJIO8UtPuJMttZXn/fA+3vODvuHq/wMhJdQ0JiFAhyFBAWMYJL1/6Dlu/t/FHc
tBX0/8ns2PC9lbTQmFvQt3wF/2zG/nQ77ekeBhleRjn6DUmlTh5IAwAA
headers:
access-control-allow-origin: ['*']
connection: [Keep-Alive]
content-encoding: [gzip]
content-length: ['441']
content-type: [application/atom+xml; charset=UTF-8]
date: ['Fri, 26 May 2023 09:36:06 GMT']
keep-alive: ['timeout=5, max=100']
server: [Apache]
vary: ['Accept-Encoding,User-Agent']
status: {code: 200, message: OK}
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
interactions:
- request:
body: null
headers:
Accept: ['*/*']
Accept-Encoding: ['gzip, deflate']
Connection: [keep-alive]
User-Agent: [python-requests/2.20.0]
method: GET
uri: http://export.arxiv.org/api/query?search_query=ti%3A%22Static%20properties%20and%20Semileptonic%20transitions%20of%20lowest%3Flying%20double%20heavy%20baryons%22
response:
body:
string: !!binary |
H4sIAAAAAAAAA51Wf2/bNhD9X5+CEOphwyZRkuPE9WQHQdMWwRA0W5KtKAoEjHSWuEqiSlL+kaLf
fUdKtuXM2ZAAjkJSx3v37p6OjE9XZUEWIBUX1dQN/cAlUCUi5VU2dW9v3nlj93TmxHOAlKBppaZu
rnU9oXS5XPrLoS9kRqMgGNEzLUp35hASF7z6QnIJ860tkyu+sKas5vRrA3J9qoDJJL+zk8HwXPPB
8GwQRdeaaZ4MoqCWogapOSicsCrF5zWUvIBai8paaMkqxTVGbkzEHB+FWILSg+G7Yo0McCEVzX0B
OMiBLdb4/57Jtd0QDaJjnt4V3Nif40RpJs0wwHHJVncSVFNohSshZkVCMXUVFHOX6HUNU5fVdcET
ZuApQ+4/Y3pcahOguS6gM8s1Ls/O5Ee+IL8brhPSZz7VfOK2nMmOMUG+pM+W9LgSMSctz1PLkrQc
iWVIOn7uD6ysf+34Te3E8psGdtzjNw2DmNqIbew8nR0s2ujo/Ob29rdypN//ea2udLl4q/hDefmg
RExxk9nb1CnTkM6iIBp6wciLjm+CYGJ/XnCEz5huTIw5sq3aXEy00Kz4ow2o1dlk93Yno9d+Ikrq
UVVDQncGNPRD6s7CmD7h8jGcTcVFlcLqxWDBHtjO4WMorqFUVyCvWAYvZ7aP1vdp8aDScm1GTxXw
XtEoCsZ+cHIcjRbDTcX2axZ5If6wZq8n4WhyNP60Vy60rVFnXOUb62DcWUfDSRSg9e59a9+q6qXq
9vrqRod7+u5JFoFUU5a4PiPkJgeiDgKqPmAKCVtbrEyKpko9swdarHWL5HRIhEnjskk5tsClkF9M
ULwiGpHmkpVg1ownRipRedgnEH2BXx3CfG0YvitFCoXv3Cqzk5E6h0qU+FeIDBtIQWqhsYAcR9by
F7IEgutJU5iYDM5ekCVTCpRjOJUsq8AAlcahtoT6HMgr8vlDCRnDgbHH6UeOz46aTy4sD6c1bqMt
eMm1jYGjqkTaJIBBK17W2GPmQpYGxATVVNwcHBj2hcoa6f3FFTjzpkpMHUmjMF1MWctdeVsHc5Zo
IW1E+LpCoAXWm2cburBKikahdwdDJZ8lz3L8vqRYkgSD/3chyZKnOm/LfI9YSW4yLU1vVgaRvHI+
Y6mSb+H3b9H3vr/e8jYtfTX65EMjibjXjFfIp2uZjpEESoBlEsAkHgPQOREYuzQEhDRFMZWVkPKk
9eTEdCPTVrSs0bmQ7QSnFUpp9onlkpH3OStQHwseU7va2tP+hoO7L5kSTQk5uRTs4QETe43SzP/T
i2kRk1TwrjXZ+YFjWyU5oO7sQWfakR8G4zENj0+OvDAcnlCWsOQ4ienWX+ffXgXslzp1cdXdvxak
Kx8XLcBTLruD135VkLbn6zZubJk2+8+J/SxJUDtYS6ML27DaM9zU8w3qxidX+RrrviXTgewB/y0a
WbECj9D5s8ANgPVP3pCjE/KjOSh/IsFwGAajDV7Pdz+JT92nHnX2Ll+s0IBeNGxuKxpWmtq7CD1Q
mjqdu08h4LtDCJuKHLgNGW/7haolN8K/QwPIhFw/J2kEmZRoBLVX5y6x7+D/Nm3gt4AvdhLT7nCN
qbkEz5x/AEzI6WwzCwAA
headers:
access-control-allow-origin: ['*']
connection: [Keep-Alive]
content-encoding: [gzip]
content-length: ['1212']
content-type: [application/atom+xml; charset=UTF-8]
date: ['Fri, 26 May 2023 09:36:06 GMT']
keep-alive: ['timeout=5, max=100']
server: [Apache]
vary: ['Accept-Encoding,User-Agent']
status: {code: 200, message: OK}
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
interactions:
- request:
body: null
headers:
Accept: ['*/*']
Accept-Encoding: ['gzip, deflate']
Connection: [keep-alive]
User-Agent: [python-requests/2.20.0]
method: GET
uri: http://export.arxiv.org/api/query?search_query=ti%3A%22Static%20properties%20and%20Semileptonic%20transitions%20of%20lowest%3Flying%20double%20heavy%20baryons%22%20doi%3A%2210.1088/1674-1137/acac6c%22
response:
body:
string: !!binary |
H4sIAAAAAAAAA51WbW/bNhD+7l9BCPWwYZMoyXlxPNlB1iJFPhRNlgTtigIBLZ0lrpKokpRtteh/
35GSbTlzNiSALPPleM/dc6c7RufrIidLkIqLcuoEnu8QKGOR8DKdOvd3l+7YOZ8NogVAQlC0VFMn
07qaULparbzVyBMypaHvH9MLLQpnNiAkynn5hWQSFltZJtd8aUVZxenXGmRzroDJOHuwk+HojebD
0cUwDG810zwehn4lRQVSc1A4YWWC71soeA6VFqWV0JKVimu03IiIBb5ysQKlh6PLvEEPcCER9TwH
HGTAlg3+z5ls7IHQ7naoge8F/ng8DC+Dk9MjNwhGpzhmMYtPYit6wpOHnBvVb3CiNJNm6OO4YOsH
CarOtcKVAAmUkE8dBfnCIbqpYOqwqsp5zIyllCFNvyKTDrVcaa5z6MQyjcuzC/mRL8mNoWVC+iRN
NZ84LT1kRw5BakifGNKjhYgFaSk5t4SQlg5iySAdFQ6uouaOArolgLbuOz+xovq9c39qJ9b9qW/H
PfengR9R65B1jSezg+Gv/jhiNzk985tVcE/v3lbial5eiw83zV8RxUPmbF0lTEMyC/1w5PrHbnhy
5/sT+7j+Eb4juhEx4khG2VI10UKz/M/WoDZjJ7vdXUKeebEoqEtVBTHdCdDAC6gzCyL6hMrHcJaK
qzKB9YvB/D2wncLHUFxDoa5BXrMUXu7ZPlpfp8WDUsvGjJ4K4FzRMPTHnn96Eh4vR5uI7ccsdAN8
MGZnk+B4cjT+tBculK0wDbnKNtL+uJMOR5PQR+ndfivfZtVLk9/tJz8q3Ev/XsoikKqLAtdnhNxl
QNRBQNUHTCBmjcVKpajLxDVnoMVqWqRBh0SYNCrrhGMxXQn5xRjFS6IRaSFZAWbNaGKkFKWLZQTR
l/jVIczXmuFeIRLIvcG9MicZqTIoRYG/XKRYX3JSCY0B5Diykr+RFRBcj+vc2GRw9owsmFKgBsan
gqUlGKDCKNTWob4P5BX5/L6AlOHAyOP0I8d355pHrqwfg1a4tTbnBdfWBo5ZJZI6BjRa8aLCErQQ
sjAgxqi65KYFodlXKq2l+4ErGCzqMjZxJLVCupiykrvwtgoWLNZCWotwu0SgJcabpxt3YR3ntULt
AzSVfJY8zfD7kmJFYjT+34EkK57orA3zHLHizDAtTelWBpG8GnzGUMXfgx/fwx99fb3lLS39bPTI
+1oSMdeMl+hPVzIHJiUwBVgqAQzxaIDOiEDbpXFASBMUE1kJCY9bTYOIbtK0TVpW60zIdoLTElNp
9ollkpG3GcsxP5Y8ona1laf9AwdPv2NK1AVk5J1g374hsbeYmtl/ajElYoKdpCtNdn7gAqDiDDDv
bB805ehw04noVl+n314q7Jc6dXDV2b9gJGsPFy3Ak32s7cv2q4Kkbb9bu7FkWvafY/tFHGPuYCxN
XtiC1bZ4E8/XmDceuc4ajPvWmQ5kD/hvUcuS5dhCF88CNwBWP3lNjk7Jz6ZR/kL80Sjwjzd4Pd19
Ep+6mT2q7B1fLNeAWjRsLjMa1praqwo9EJoqWThPIeDeIYRNRA5cloy2/UBVkpvEf0ABSIVsnkMa
QU8KFILKrTKH2D34v0Mb+C3gi5VEtGuuETXX6dngHziSiRR9CwAA
headers:
access-control-allow-origin: ['*']
connection: [Keep-Alive]
content-encoding: [gzip]
content-length: ['1236']
content-type: [application/atom+xml; charset=UTF-8]
date: ['Fri, 26 May 2023 09:36:07 GMT']
keep-alive: ['timeout=5, max=100']
server: [Apache]
vary: ['Accept-Encoding,User-Agent']
status: {code: 200, message: OK}
version: 1
33 changes: 25 additions & 8 deletions tests/unit/utils/test_arxiv.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import requests_mock
from pytest import raises
from pytest import raises, mark
import requests

from scoap3.utils.arxiv import get_arxiv_categories, clean_arxiv
from tests.responses import read_response
Expand Down Expand Up @@ -70,11 +71,27 @@ def test_extract_arxiv_with_categ():
"""
assert clean_arxiv('arXiv:1803.07217 [gr-qc]') == '1803.07217'

@mark.vcr
def test_categories_with_arxiv():
    """Look up categories for a record that has an arXiv id, a title and a DOI."""
    record_title = "Axial Chiral Vortical Effect in a Sphere with finite size effect"
    record_doi = "10.1088/1674-1137/acac6d"

    found = get_arxiv_categories(
        arxiv_id='2111.13053',
        title=record_title,
        doi=record_doi,
    )

    assert found == ['hep-th']

@mark.vcr
def test_categories_without_arxiv_just_title():
    """Look up categories for a record identified by its title alone."""
    record_title = "Static properties and Semileptonic transitions of lowest-lying double heavy baryons"

    found = get_arxiv_categories(title=record_title)

    assert found == ['hep-ph']

@mark.vcr
def test_categories_without_arxiv_just_doi():
    """Look up categories for a record identified by its DOI alone."""
    record_doi = "10.1088/1674-1137/acac6c"

    found = get_arxiv_categories(doi=record_doi)

    # No categories are expected for this DOI-only lookup.
    assert found == []

@mark.vcr
def test_categories_without_arxiv_with_title_and_doi():
    """Look up categories for a record that has a title and a DOI but no arXiv id."""
    record_doi = "10.1088/1674-1137/acac6c"
    record_title = "Static properties and Semileptonic transitions of lowest-lying double heavy baryons"

    found = get_arxiv_categories(doi=record_doi, title=record_title)

    assert found == ['hep-ph']

def test_extract_arxiv_additional_chars():
    """
    Check that cleaning an identifier with a non-ASCII quote raises UnicodeEncodeError.
    Input delivered for article: 10.1140/epjc/s10052-018-6500-y
    """
    # The trailing character is U+201C (left double quotation mark), not ASCII.
    raw_identifier = u'"1808.01899\u201c'

    with raises(UnicodeEncodeError):
        clean_arxiv(raw_identifier)

0 comments on commit 8c6afcc

Please sign in to comment.