APS: remove duplicated affiliation IDs

SCOAP3 · Dec 6, 2023 · 9dd8cde · 9dd8cde
1 parent 804b21d
commit 9dd8cde
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 7 deletions.
diff --git a/hepcrawl/extractors/aps_parser.py b/hepcrawl/extractors/aps_parser.py
@@ -110,7 +110,7 @@ def _get_authors_and_collab(self, article, dois):
                 author_affiliations = []
                 if 'affiliations' in article and 'affiliationIds' in author:
                     affiliations = build_dict(article['affiliations'], 'id')
-                    for aff_id in author['affiliationIds']:
+                    for aff_id in set(author['affiliationIds']):
                         if aff_id in affiliations:
                             author_affiliations.append({'value': affiliations[aff_id]['name']})
 

diff --git a/tests/responses/aps/aps_single_response.json b/tests/responses/aps/aps_single_response.json
@@ -64,6 +64,7 @@
             {
                "surname":"Alemi",
                "affiliationIds":[
+                  "a1",
                   "a1"
                ],
                "type":"Person",

diff --git a/tests/test_aps.py b/tests/test_aps.py
@@ -13,9 +13,11 @@
 
 from scrapy.http import TextResponse
 from hepcrawl.spiders import aps_spider
+from hepcrawl.parsers import aps_apr
 from .responses import fake_response_from_file
 
 
+
 @pytest.fixture
 def results():
     """Return results generator from the APS spider."""
@@ -58,7 +60,8 @@ def test_abstract(results):
 
 def test_title(results):
     """Test extracting title."""
-    titles = ("You can run, you can hide: The epidemiology and statistical mechanics of zombies",)
+    titles = (
+        "You can run, you can hide: The epidemiology and statistical mechanics of zombies",)
     for title, record in zip(titles, results):
         assert 'title' in record
         assert record['title'] == title
@@ -154,18 +157,21 @@ def test_authors(results):
     expected_results = (
         dict(
             affiliation='Laboratory of Atomic and Solid State Physics, Cornell University, Ithaca, New York 14853, USA',
-            author_full_names=['Alemi, Alexander A.', 'Bierbaum, Matthew', 'Myers, Christopher R.', 'Sethna, James P.']
+            author_full_names=['Alemi, Alexander A.', 'Bierbaum, Matthew',
+                               'Myers, Christopher R.', 'Sethna, James P.']
         ),
     )
     for expected, record in zip(expected_results, results):
         assert 'authors' in record
         assert len(record['authors']) == len(expected['author_full_names'])
 
-        record_full_names = [author['full_name'] for author in record['authors']]
-        assert set(expected['author_full_names']) == set(record_full_names)  # assert that we have the same list of authors
+        record_full_names = [author['full_name']
+                             for author in record['authors']]
+        # assert that we have the same list of authors
+        assert set(expected['author_full_names']) == set(record_full_names)
         for author in record['authors']:
-            assert author['affiliations'][0]['value'] == expected['affiliation']
-
+            for affiliation in  author['affiliations']:
+                assert affiliation['value'] == expected['affiliation']
 
 def test_copyrights(results):
     """Test extracting copyright."""
@@ -198,3 +204,4 @@ def test_doctype(results):
     for expected, record in zip(expected_results, results):
         assert 'journal_doctype' in record
         assert record['journal_doctype'] == expected
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -64,6 +64,7 @@ @@
                 {
                    "surname":"Alemi",
                    "affiliationIds":[
+                      "a1",
                       "a1"
                    ],
                    "type":"Person",
@@ Expand Down @@