Skip to content

Commit

Permalink
APS: remove duplicated affiliation IDs (affids)
Browse files: browse the repository at this point in the history
  • Loading branch information
ErnestaP committed Dec 6, 2023
1 parent 804b21d commit 9dd8cde
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 7 deletions.
2 changes: 1 addition & 1 deletion hepcrawl/extractors/aps_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def _get_authors_and_collab(self, article, dois):
author_affiliations = []
if 'affiliations' in article and 'affiliationIds' in author:
affiliations = build_dict(article['affiliations'], 'id')
for aff_id in author['affiliationIds']:
for aff_id in set(author['affiliationIds']):
if aff_id in affiliations:
author_affiliations.append({'value': affiliations[aff_id]['name']})

Expand Down
1 change: 1 addition & 0 deletions tests/responses/aps/aps_single_response.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
{
"surname":"Alemi",
"affiliationIds":[
"a1",
"a1"
],
"type":"Person",
Expand Down
19 changes: 13 additions & 6 deletions tests/test_aps.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@

from scrapy.http import TextResponse
from hepcrawl.spiders import aps_spider
from hepcrawl.parsers import aps_apr
from .responses import fake_response_from_file



@pytest.fixture
def results():
"""Return results generator from the APS spider."""
Expand Down Expand Up @@ -58,7 +60,8 @@ def test_abstract(results):

def test_title(results):
"""Test extracting title."""
titles = ("You can run, you can hide: The epidemiology and statistical mechanics of zombies",)
titles = (
"You can run, you can hide: The epidemiology and statistical mechanics of zombies",)
for title, record in zip(titles, results):
assert 'title' in record
assert record['title'] == title
Expand Down Expand Up @@ -154,18 +157,21 @@ def test_authors(results):
expected_results = (
dict(
affiliation='Laboratory of Atomic and Solid State Physics, Cornell University, Ithaca, New York 14853, USA',
author_full_names=['Alemi, Alexander A.', 'Bierbaum, Matthew', 'Myers, Christopher R.', 'Sethna, James P.']
author_full_names=['Alemi, Alexander A.', 'Bierbaum, Matthew',
'Myers, Christopher R.', 'Sethna, James P.']
),
)
for expected, record in zip(expected_results, results):
assert 'authors' in record
assert len(record['authors']) == len(expected['author_full_names'])

record_full_names = [author['full_name'] for author in record['authors']]
assert set(expected['author_full_names']) == set(record_full_names) # assert that we have the same list of authors
record_full_names = [author['full_name']
for author in record['authors']]
# assert that we have the same list of authors
assert set(expected['author_full_names']) == set(record_full_names)
for author in record['authors']:
assert author['affiliations'][0]['value'] == expected['affiliation']

for affiliation in author['affiliations']:
assert affiliation['value'] == expected['affiliation']

def test_copyrights(results):
"""Test extracting copyright."""
Expand Down Expand Up @@ -198,3 +204,4 @@ def test_doctype(results):
for expected, record in zip(expected_results, results):
assert 'journal_doctype' in record
assert record['journal_doctype'] == expected

0 comments on commit 9dd8cde

Please sign in to comment.