diff --git a/requirements-test.txt b/requirements-test.txt index 29d2db9b..07551412 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -4,3 +4,4 @@ pytest==4.6.4 radon==3.0.3 requests-mock==1.6.0 freezegun==0.3.14 +pytest-vcr==1.0.2 diff --git a/scoap3/utils/arxiv.py b/scoap3/utils/arxiv.py index ef378c3a..681b632a 100644 --- a/scoap3/utils/arxiv.py +++ b/scoap3/utils/arxiv.py @@ -25,6 +25,7 @@ from __future__ import absolute_import, division, print_function import logging +from urllib import quote from inspire_utils.record import get_value from lxml import etree @@ -49,12 +50,11 @@ def clean_arxiv(arxiv): if arxiv is None: return None - return arxiv.split(':')[-1].split('v')[0].split(' ')[0].encode('ascii').strip('"\'') + return arxiv.split(':')[-1].split('v')[0].split(' ')[0].strip('"\'') def get_arxiv_categories_from_response_xml(xml): entry_count = len(xml.xpath('//w3:entry', namespaces=xml_namespaces)) - # make sure we have exactly one result in the xml if entry_count != 1: return [] @@ -99,17 +99,15 @@ def get_arxiv_categories(arxiv_id=None, title=None, doi=None): query = [] if arxiv_id: query.append('id:%s' % arxiv_id) + else: + if title: + query.append('ti:"%s"' % title.replace("-", "?")) + if doi: + query.append('doi:"%s"' % doi) - if title: - title = title.replace('-', '?').encode('ascii', 'replace') - query.append('ti:"%s"' % title) - - if doi: - query.append('doi:"%s"' % doi) - - request_url = url.format(' '.join(query)) + encoded_quey_string = quote(' '.join(query)) + request_url = url.format(encoded_quey_string) data = requests_retry_session().get(request_url) - categories = [] if data.status_code == 200: xml = etree.fromstring(data.content) diff --git a/tests/unit/utils/cassettes/test_categories_with_arxiv.yaml b/tests/unit/utils/cassettes/test_categories_with_arxiv.yaml new file mode 100644 index 00000000..d6910532 --- /dev/null +++ b/tests/unit/utils/cassettes/test_categories_with_arxiv.yaml @@ -0,0 +1,47 @@ +interactions: +- request: + body: null + headers: + Accept: ['*/*'] + Accept-Encoding: ['gzip, deflate'] + Connection: [keep-alive] + User-Agent: [python-requests/2.20.0] + method: GET + uri: http://export.arxiv.org/api/query?search_query=id%3A2111.13053 + response: + body: + string: !!binary | + H4sIAAAAAAAAA51Wa2/bNhT9rl9xISDDhlWmZMdJ6tkOgrZBirZYH2m79EvASFcWW4lUScqP/fpd + UvIrczYkgOOY4rn3nPvgFcfny6qEOWojlJyESS8OAWWqMiFnk/Dz9WV0Fp5Pg3GOmAFBpZmEhbX1 + iLHFYtFbDHpKz1g/jofswqoqnAYA41LIH1BozDdYrpdi7qG8Fuxng3p1bpDrtLj1i6PBS5EdDS76 + SZL0kkE8HBz1T0R2WwpjaY8WxnLtfsb0u+LLW42mKa2hJwlJ1lhOQoNlHoJd1TgJeV2XIuWWomKc + hP1O2kPm1VlhS+xghaXH0wv9l5jDBydkBLuyJiIbbSX9wqv6j07UxC+8qEnsf++ImiTxmHkaTyiy + 6cE02GXcvK2/v3rz5fRr862u3t78ePPuw8uKy89jRkbOtqkzbjGb9uP+IIqHUf/kOo5H/hPFx/Q9 + ZmuIg6saZRvAyCrLy4+toLZyo+3utjDPe6mqWMRMjSnbAhjFzMJpMmYPuLxP51PxWma4fDJZvEe2 + dXifSliszHvU7/kMnx7ZPtuuT8+H0uqV+/VQAe8M27bG/GRdsf2a9aOEPs+v4+FocDYaHH/bKxdh + 6+aO2qlo0UmUJFF/eJ30R4NkNDwj9Ha/xbdddbEUvIQXhdD074vSllq9hFd5jqkFIYHDp7pAjbAQ + toBcSAoPjPgbAT1mpzvJp2mqilOwAF+RrOdorJiRRrAFAvdU8zUH7nA0UuRKV+UKNDWGpYkBxtMG + prn77mBW7ZL3nP/GILx7fQ13qpEZsUKqZCbcOXXoUlTCet7NvsoDt2499+BSaai4MSUaAznqiiwN + 6TEiawW3wGewQFB3lpNS9xSXnPT4YII5SSMvaaM1VRlyriHXqtqntQW3RGTTAk3noabj7cZk55cG + opBBuqJpl2mfnFQpTYOTUucUeauSgtfcNk76n3Ir0OPXZF4sJSoL7hTVy9spORO2cd5K4DIDq7k0 + bk7jr76opKXuUuzwbQVIG4Vofguo82slKTrTtkDFZ9K5Q8iQmt5Nd1CtnFSVZOn3uJyVOykWcwzW + GfYSM9I9dzCXDJLjc5HvtImrZZNaMRd2tZfXYJtXBbwsgRJFsThrxwSLQlFflMhbZW4TKHEV8JlG + 7IJwPA5N0VHlUh+uxppa05fiX/mmbuOlUbttsM9w31mXkY3WRSHSAoQhr5LGxDNfCGM11cbpK3iL + /9nwjDhFGrTJpfcnHu6qHly3eJJJBpRLUW3eU6bjD6g/pKror1Qz1/9QIJ+vwCmkYtE48GCuXSth + 2pRunPSCMVuf4/ZU88YWSrcLWkpe4fRT0UQ3jYQbqvSY+UctmO2iD5p+RBldUeBw+XjblxhdNnCl + mkfaXXERvSA6IPL/MnVDeZQp0b0M/PrA1cPQWaaS+/uAewH0kvjsjCUnp8c0dwenjKc8PcnGbOOv + 8++vM35gTkJ6Gu5fbbJljx56godcdvcT+nKlaq8hG910VCs3hh6jfTCEml5UdCqPaW7MqNvNWnbn + blf6Qzexe2+wTiUv6QS5Gba+SllcWuYvSuxAQuosDx9ioL1DDOs8HLiqOW/76am1cE19SwCcKTqU + j8iTHyEEwjqyRQh+D//PaE2/IXyykzHrLhFj5q7P0+Af/bmUCW0LAAA= + headers: + access-control-allow-origin: ['*'] + connection: [Keep-Alive] + content-encoding: [gzip] + content-length: ['1238'] + content-type: [application/atom+xml; charset=UTF-8] + date: ['Fri, 26 May 2023 09:37:11 GMT'] + keep-alive: ['timeout=5, max=100'] + server: [Apache] + vary: ['Accept-Encoding,User-Agent'] + status: {code: 200, message: OK} +version: 1 diff --git a/tests/unit/utils/cassettes/test_categories_without_arxiv_just_doi.yaml b/tests/unit/utils/cassettes/test_categories_without_arxiv_just_doi.yaml new file mode 100644 index 00000000..94e5f096 --- /dev/null +++ b/tests/unit/utils/cassettes/test_categories_without_arxiv_just_doi.yaml @@ -0,0 +1,33 @@ +interactions: +- request: + body: null + headers: + Accept: ['*/*'] + Accept-Encoding: ['gzip, deflate'] + Connection: [keep-alive] + User-Agent: [python-requests/2.20.0] + method: GET + uri: http://export.arxiv.org/api/query?search_query=doi%3A%2210.1088/1674-1137/acac6c%22 + response: + body: + string: !!binary | + H4sIAAAAAAAAA62TXWvbMBSG7/MrhMC7GbYku3VTz3YIlEKv1o2GdrsJinySqPWHKimJu18/WUmb + rKw3Y2DMOdJ7zvscCeWTvqnRFrSRXVtgFlGMoBVdJdtVgWd31+EYT8pRvgSokJO2psBra1VGyG63 + i3ZJ1OkViSk9J1PbNbgcIZTXsn1Caw3LNy3Xvdx6KVeSPG9Av0wMcC3Wc58EyVXVySCZBnHMaMTo + eBzE1yy9OAsZSy5czAUXqXDbQZzKal5LY12RS4zlegipixvezzWYTW2NW2FuFg11gQ3US4zsi4IC + c6VqKbh14xLuiD+7oTDx2FbaGg6ytXXL5VQ/yC36NhBm6JS3cLQZPpCSN06yp8SfeKO+HCgLn3jK + gvr4hLJgNCfe1xPIqvzrgf1YmMXX9GHc36jpr0dDF8+z+0crfs4SkRNXNNRuVMUtVGVM4ySk52Gc + 3lGa+S+kZ+6fk1fJIO8UtPuJMttZXn/fA+3vODvuHq/wMhJdQ0JiFAhyFBAWMYJL1/6Dlu/t/FHc + tBX0/8ns2PC9lbTQmFvQt3wF/2zG/nQ77ekeBhleRjn6DUmlTh5IAwAA + headers: + access-control-allow-origin: ['*'] + connection: [Keep-Alive] + content-encoding: [gzip] + content-length: ['441'] + content-type: [application/atom+xml; charset=UTF-8] + date: ['Fri, 26 May 2023 09:36:06 GMT'] + keep-alive: ['timeout=5, max=100'] + server: [Apache] + vary: ['Accept-Encoding,User-Agent'] + status: {code: 200, message: OK} +version: 1 diff --git a/tests/unit/utils/cassettes/test_categories_without_arxiv_just_title.yaml b/tests/unit/utils/cassettes/test_categories_without_arxiv_just_title.yaml new file mode 100644 index 00000000..7a7eea8c --- /dev/null +++ b/tests/unit/utils/cassettes/test_categories_without_arxiv_just_title.yaml @@ -0,0 +1,47 @@ +interactions: +- request: + body: null + headers: + Accept: ['*/*'] + Accept-Encoding: ['gzip, deflate'] + Connection: [keep-alive] + User-Agent: [python-requests/2.20.0] + method: GET + uri: http://export.arxiv.org/api/query?search_query=ti%3A%22Static%20properties%20and%20Semileptonic%20transitions%20of%20lowest%3Flying%20double%20heavy%20baryons%22 + response: + body: + string: !!binary | + H4sIAAAAAAAAA51Wf2/bNhD9X5+CEOphwyZRkuPE9WQHQdMWwRA0W5KtKAoEjHSWuEqiSlL+kaLf + fUdKtuXM2ZAAjkJSx3v37p6OjE9XZUEWIBUX1dQN/cAlUCUi5VU2dW9v3nlj93TmxHOAlKBppaZu + rnU9oXS5XPrLoS9kRqMgGNEzLUp35hASF7z6QnIJ860tkyu+sKas5vRrA3J9qoDJJL+zk8HwXPPB + 8GwQRdeaaZ4MoqCWogapOSicsCrF5zWUvIBai8paaMkqxTVGbkzEHB+FWILSg+G7Yo0McCEVzX0B + OMiBLdb4/57Jtd0QDaJjnt4V3Nif40RpJs0wwHHJVncSVFNohSshZkVCMXUVFHOX6HUNU5fVdcET + ZuApQ+4/Y3pcahOguS6gM8s1Ls/O5Ee+IL8brhPSZz7VfOK2nMmOMUG+pM+W9LgSMSctz1PLkrQc + iWVIOn7uD6ysf+34Te3E8psGdtzjNw2DmNqIbew8nR0s2ujo/Ob29rdypN//ea2udLl4q/hDefmg + RExxk9nb1CnTkM6iIBp6wciLjm+CYGJ/XnCEz5huTIw5sq3aXEy00Kz4ow2o1dlk93Yno9d+Ikrq + UVVDQncGNPRD6s7CmD7h8jGcTcVFlcLqxWDBHtjO4WMorqFUVyCvWAYvZ7aP1vdp8aDScm1GTxXw + XtEoCsZ+cHIcjRbDTcX2axZ5If6wZq8n4WhyNP60Vy60rVFnXOUb62DcWUfDSRSg9e59a9+q6qXq + 9vrqRod7+u5JFoFUU5a4PiPkJgeiDgKqPmAKCVtbrEyKpko9swdarHWL5HRIhEnjskk5tsClkF9M + ULwiGpHmkpVg1ownRipRedgnEH2BXx3CfG0YvitFCoXv3Cqzk5E6h0qU+FeIDBtIQWqhsYAcR9by + F7IEgutJU5iYDM5ekCVTCpRjOJUsq8AAlcahtoT6HMgr8vlDCRnDgbHH6UeOz46aTy4sD6c1bqMt + eMm1jYGjqkTaJIBBK17W2GPmQpYGxATVVNwcHBj2hcoa6f3FFTjzpkpMHUmjMF1MWctdeVsHc5Zo + IW1E+LpCoAXWm2cburBKikahdwdDJZ8lz3L8vqRYkgSD/3chyZKnOm/LfI9YSW4yLU1vVgaRvHI+ + Y6mSb+H3b9H3vr/e8jYtfTX65EMjibjXjFfIp2uZjpEESoBlEsAkHgPQOREYuzQEhDRFMZWVkPKk + 9eTEdCPTVrSs0bmQ7QSnFUpp9onlkpH3OStQHwseU7va2tP+hoO7L5kSTQk5uRTs4QETe43SzP/T + i2kRk1TwrjXZ+YFjWyU5oO7sQWfakR8G4zENj0+OvDAcnlCWsOQ4ienWX+ffXgXslzp1cdXdvxak + Kx8XLcBTLruD135VkLbn6zZubJk2+8+J/SxJUDtYS6ML27DaM9zU8w3qxidX+RrrviXTgewB/y0a + WbECj9D5s8ANgPVP3pCjE/KjOSh/IsFwGAajDV7Pdz+JT92nHnX2Ll+s0IBeNGxuKxpWmtq7CD1Q + mjqdu08h4LtDCJuKHLgNGW/7haolN8K/QwPIhFw/J2kEmZRoBLVX5y6x7+D/Nm3gt4AvdhLT7nCN + qbkEz5x/AEzI6WwzCwAA + headers: + access-control-allow-origin: ['*'] + connection: [Keep-Alive] + content-encoding: [gzip] + content-length: ['1212'] + content-type: [application/atom+xml; charset=UTF-8] + date: ['Fri, 26 May 2023 09:36:06 GMT'] + keep-alive: ['timeout=5, max=100'] + server: [Apache] + vary: ['Accept-Encoding,User-Agent'] + status: {code: 200, message: OK} +version: 1 diff --git a/tests/unit/utils/cassettes/test_categories_without_arxiv_with_title_and_doi.yaml b/tests/unit/utils/cassettes/test_categories_without_arxiv_with_title_and_doi.yaml new file mode 100644 index 00000000..ffeab702 --- /dev/null +++ b/tests/unit/utils/cassettes/test_categories_without_arxiv_with_title_and_doi.yaml @@ -0,0 +1,47 @@ +interactions: +- request: + body: null + headers: + Accept: ['*/*'] + Accept-Encoding: ['gzip, deflate'] + Connection: [keep-alive] + User-Agent: [python-requests/2.20.0] + method: GET + uri: http://export.arxiv.org/api/query?search_query=ti%3A%22Static%20properties%20and%20Semileptonic%20transitions%20of%20lowest%3Flying%20double%20heavy%20baryons%22%20doi%3A%2210.1088/1674-1137/acac6c%22 + response: + body: + string: !!binary | + H4sIAAAAAAAAA51WbW/bNhD+7l9BCPWwYZMoyXlxPNlB1iJFPhRNlgTtigIBLZ0lrpKokpRtteh/ + 35GSbTlzNiSALPPleM/dc6c7RufrIidLkIqLcuoEnu8QKGOR8DKdOvd3l+7YOZ8NogVAQlC0VFMn + 07qaULparbzVyBMypaHvH9MLLQpnNiAkynn5hWQSFltZJtd8aUVZxenXGmRzroDJOHuwk+HojebD + 0cUwDG810zwehn4lRQVSc1A4YWWC71soeA6VFqWV0JKVimu03IiIBb5ysQKlh6PLvEEPcCER9TwH + HGTAlg3+z5ls7IHQ7naoge8F/ng8DC+Dk9MjNwhGpzhmMYtPYit6wpOHnBvVb3CiNJNm6OO4YOsH + CarOtcKVAAmUkE8dBfnCIbqpYOqwqsp5zIyllCFNvyKTDrVcaa5z6MQyjcuzC/mRL8mNoWVC+iRN + NZ84LT1kRw5BakifGNKjhYgFaSk5t4SQlg5iySAdFQ6uouaOArolgLbuOz+xovq9c39qJ9b9qW/H + PfengR9R65B1jSezg+Gv/jhiNzk985tVcE/v3lbial5eiw83zV8RxUPmbF0lTEMyC/1w5PrHbnhy + 5/sT+7j+Eb4juhEx4khG2VI10UKz/M/WoDZjJ7vdXUKeebEoqEtVBTHdCdDAC6gzCyL6hMrHcJaK + qzKB9YvB/D2wncLHUFxDoa5BXrMUXu7ZPlpfp8WDUsvGjJ4K4FzRMPTHnn96Eh4vR5uI7ccsdAN8 + MGZnk+B4cjT+tBculK0wDbnKNtL+uJMOR5PQR+ndfivfZtVLk9/tJz8q3Ev/XsoikKqLAtdnhNxl + QNRBQNUHTCBmjcVKpajLxDVnoMVqWqRBh0SYNCrrhGMxXQn5xRjFS6IRaSFZAWbNaGKkFKWLZQTR + l/jVIczXmuFeIRLIvcG9MicZqTIoRYG/XKRYX3JSCY0B5Diykr+RFRBcj+vc2GRw9owsmFKgBsan + gqUlGKDCKNTWob4P5BX5/L6AlOHAyOP0I8d355pHrqwfg1a4tTbnBdfWBo5ZJZI6BjRa8aLCErQQ + sjAgxqi65KYFodlXKq2l+4ErGCzqMjZxJLVCupiykrvwtgoWLNZCWotwu0SgJcabpxt3YR3ntULt + AzSVfJY8zfD7kmJFYjT+34EkK57orA3zHLHizDAtTelWBpG8GnzGUMXfgx/fwx99fb3lLS39bPTI + +1oSMdeMl+hPVzIHJiUwBVgqAQzxaIDOiEDbpXFASBMUE1kJCY9bTYOIbtK0TVpW60zIdoLTElNp + 9ollkpG3GcsxP5Y8ona1laf9AwdPv2NK1AVk5J1g374hsbeYmtl/ajElYoKdpCtNdn7gAqDiDDDv + bB805ehw04noVl+n314q7Jc6dXDV2b9gJGsPFy3Ak32s7cv2q4Kkbb9bu7FkWvafY/tFHGPuYCxN + XtiC1bZ4E8/XmDceuc4ajPvWmQ5kD/hvUcuS5dhCF88CNwBWP3lNjk7Jz6ZR/kL80Sjwjzd4Pd19 + Ep+6mT2q7B1fLNeAWjRsLjMa1praqwo9EJoqWThPIeDeIYRNRA5cloy2/UBVkpvEf0ABSIVsnkMa + QU8KFILKrTKH2D34v0Mb+C3gi5VEtGuuETXX6dngHziSiRR9CwAA + headers: + access-control-allow-origin: ['*'] + connection: [Keep-Alive] + content-encoding: [gzip] + content-length: ['1236'] + content-type: [application/atom+xml; charset=UTF-8] + date: ['Fri, 26 May 2023 09:36:07 GMT'] + keep-alive: ['timeout=5, max=100'] + server: [Apache] + vary: ['Accept-Encoding,User-Agent'] + status: {code: 200, message: OK} +version: 1 diff --git a/tests/unit/utils/test_arxiv.py b/tests/unit/utils/test_arxiv.py index 8233c9b6..5592106a 100644 --- a/tests/unit/utils/test_arxiv.py +++ b/tests/unit/utils/test_arxiv.py @@ -1,5 +1,6 @@ import requests_mock -from pytest import raises +from pytest import raises, mark +import requests from scoap3.utils.arxiv import get_arxiv_categories, clean_arxiv from tests.responses import read_response @@ -70,11 +71,27 @@ def test_extract_arxiv_with_categ(): """ assert clean_arxiv('arXiv:1803.07217 [gr-qc]') == '1803.07217' +@mark.vcr +def test_categories_with_arxiv(): + """Test extraction arXiv categories from arXiv api.""" + categories = get_arxiv_categories(arxiv_id='2111.13053', title="Axial Chiral Vortical Effect in a Sphere with finite size effect", doi="10.1088/1674-1137/acac6d") + assert categories == ['hep-th'] + +@mark.vcr +def test_categories_without_arxiv_just_title(): + """Test extraction arXiv categories from arXiv api.""" + categories = get_arxiv_categories(title="Static properties and Semileptonic transitions of lowest-lying double heavy baryons") + assert categories == ['hep-ph'] + +@mark.vcr +def test_categories_without_arxiv_just_doi(): + """Test extraction arXiv categories from arXiv api.""" + categories = get_arxiv_categories(doi="10.1088/1674-1137/acac6c") + assert categories == [] + +@mark.vcr +def test_categories_without_arxiv_with_title_and_doi(): + """Test extraction arXiv categories from arXiv api.""" + categories = get_arxiv_categories(doi="10.1088/1674-1137/acac6c", title="Static properties and Semileptonic transitions of lowest-lying double heavy baryons") + assert categories == ['hep-ph'] -def test_extract_arxiv_additional_chars(): - """ - Test getting clean arXiv identifier with additional chars. - Delivered for article: 10.1140/epjc/s10052-018-6500-y - """ - with raises(UnicodeEncodeError): - clean_arxiv(u'"1808.01899\u201c')