From c41f697b1807334a1c078d85a9142e962fd74127 Mon Sep 17 00:00:00 2001
From: ErnestaP
Date: Thu, 7 Dec 2023 13:11:44 +0100
Subject: [PATCH] APS: duplicated affids removal

An author whose `affiliationIds` list repeats an id (see the "a1", "a1"
fixture entry added below) must yield that affiliation only once.

The ids are de-duplicated while keeping their first-occurrence order.
Iterating a bare `set()` would also drop the duplicates, but a set has no
defined iteration order, so the affiliations of an author with several
distinct ids could come out in any order. `test_authors` compares the
author list, including the two affiliations of "Myers, Christopher R.",
as an ordered list, so the order has to be deterministic.

---
NOTE(review): leading whitespace of the context lines below was
reconstructed; confirm it against the target tree before applying.
NOTE(review): the `index` line of aps_parser.py is carried over unchanged
and no longer matches the resulting blob (only relevant for `git am -3`).

 hepcrawl/extractors/aps_parser.py            |  2 +-
 tests/responses/aps/aps_single_response.json |  1 +
 tests/test_aps.py                            | 40 ++++++++++++--------
 3 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/hepcrawl/extractors/aps_parser.py b/hepcrawl/extractors/aps_parser.py
index b90ea3d..bb6cb3a 100644
--- a/hepcrawl/extractors/aps_parser.py
+++ b/hepcrawl/extractors/aps_parser.py
@@ -110,7 +110,7 @@ def _get_authors_and_collab(self, article, dois):
                 author_affiliations = []
                 if 'affiliations' in article and 'affiliationIds' in author:
                     affiliations = build_dict(article['affiliations'], 'id')
-                    for aff_id in author['affiliationIds']:
+                    for aff_id in sorted(set(author['affiliationIds']), key=author['affiliationIds'].index):
                         if aff_id in affiliations:
                             author_affiliations.append({'value': affiliations[aff_id]['name']})
 
diff --git a/tests/responses/aps/aps_single_response.json b/tests/responses/aps/aps_single_response.json
index f32611a..655a1f9 100644
--- a/tests/responses/aps/aps_single_response.json
+++ b/tests/responses/aps/aps_single_response.json
@@ -64,6 +64,7 @@
             {
                "surname":"Alemi",
                "affiliationIds":[
+                  "a1",
                   "a1"
                ],
                "type":"Person",
diff --git a/tests/test_aps.py b/tests/test_aps.py
index 5c6af6c..a210e4f 100644
--- a/tests/test_aps.py
+++ b/tests/test_aps.py
@@ -58,7 +58,8 @@ def test_abstract(results):
 
 def test_title(results):
     """Test extracting title."""
-    titles = ("You can run, you can hide: The epidemiology and statistical mechanics of zombies",)
+    titles = (
+        "You can run, you can hide: The epidemiology and statistical mechanics of zombies",)
     for title, record in zip(titles, results):
         assert 'title' in record
         assert record['title'] == title
@@ -151,20 +152,29 @@ def test_publication_info(results):
 
 def test_authors(results):
     """Test authors."""
-    expected_results = (
-        dict(
-            affiliation='Laboratory of Atomic and Solid State Physics, Cornell University, Ithaca, New York 14853, USA',
-            author_full_names=['Alemi, Alexander A.', 'Bierbaum, Matthew', 'Myers, Christopher R.', 'Sethna, James P.']
-        ),
-    )
-    for expected, record in zip(expected_results, results):
-        assert 'authors' in record
-        assert len(record['authors']) == len(expected['author_full_names'])
-
-        record_full_names = [author['full_name'] for author in record['authors']]
-        assert set(expected['author_full_names']) == set(record_full_names)  # assert that we have the same list of authors
-        for author in record['authors']:
-            assert author['affiliations'][0]['value'] == expected['affiliation']
+    expected_results = [{'affiliations': [{'value': u'Laboratory of Atomic and Solid State Physics, Cornell University, Ithaca, New York 14853, USA'}],
+                         'full_name': u'Alemi, Alexander A.',
+                         'given_names': u'Alexander A.',
+                         'raw_name': u'Alexander A. Alemi',
+                         'surname': u'Alemi'},
+                        {'affiliations': [{'value': u'Laboratory of Atomic and Solid State Physics, Cornell University, Ithaca, New York 14853, USA'}],
+                         'full_name': u'Bierbaum, Matthew',
+                         'given_names': u'Matthew',
+                         'raw_name': u'Matthew Bierbaum',
+                         'surname': u'Bierbaum'},
+                        {'affiliations': [{'value': u'Laboratory of Atomic and Solid State Physics, Cornell University, Ithaca, New York 14853, USA'},
+                                          {'value': u'Institute of Biotechnology, Cornell University, Ithaca, New York 14853, USA'}],
+                         'full_name': u'Myers, Christopher R.',
+                         'given_names': u'Christopher R.',
+                         'raw_name': u'Christopher R. Myers',
+                         'surname': u'Myers'},
+                        {'affiliations': [{'value': u'Laboratory of Atomic and Solid State Physics, Cornell University, Ithaca, New York 14853, USA'}],
+                         'full_name': u'Sethna, James P.',
+                         'given_names': u'James P.',
+                         'raw_name': u'James P. Sethna',
+                         'surname': u'Sethna'}]
+
+    assert results[0]['authors'] == expected_results
 
 
 def test_copyrights(results):