Skip to content

Commit

Permalink
Handle records with a dict rather than an array affiliation.
Browse files Browse the repository at this point in the history
Issue: #51
  • Loading branch information
dspinellis committed Jun 30, 2024
1 parent 13286a0 commit c5b040f
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 13 deletions.
7 changes: 7 additions & 0 deletions src/alexandria3k/data_sources/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ def current_row_value(self):
self.item_index
]
self.json_data = json.loads(json_string)
# Record 10.17031/637b5e4a8d3ae of file 10.17031/part_00001.jsonl
# and others have affiliation as a dict, rather than an array
# containing a dict. Detect and fix.
for relation in ["creators", "contributors"]:
for creator in self.json_data[relation]:
if isinstance(creator.get("affiliation"), dict):
creator["affiliation"] = [creator["affiliation"]]
self.cached_json_item_index = self.item_index
return self.json_data

Expand Down
Binary file modified tests/data/datacite.tar.gz
Binary file not shown.
26 changes: 13 additions & 13 deletions tests/data_sources/test_datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,19 @@ def tearDownClass(cls):
os.unlink(DATABASE_PATH)

def test_counts(self):
self.assertEqual(self.record_count("dc_works"), 7)
self.assertEqual(self.record_count("dc_work_creators"), 26)
self.assertEqual(self.record_count("dc_works"), 9)
self.assertEqual(self.record_count("dc_work_creators"), 29)
self.assertEqual(self.record_count("dc_creator_name_identifiers"), 8)
self.assertEqual(self.record_count("dc_creator_affiliations"), 14)
self.assertEqual(self.record_count("dc_work_titles"), 7)
self.assertEqual(self.record_count("dc_work_subjects"), 25)
self.assertEqual(self.record_count("dc_work_contributors"), 0)
self.assertEqual(self.record_count("dc_creator_affiliations"), 17)
self.assertEqual(self.record_count("dc_work_titles"), 9)
self.assertEqual(self.record_count("dc_work_subjects"), 27)
self.assertEqual(self.record_count("dc_work_contributors"), 1)
self.assertEqual(self.record_count("dc_contributor_name_identifiers"), 0)
self.assertEqual(self.record_count("dc_contributor_affiliations"), 0)
self.assertEqual(self.record_count("dc_contributor_affiliations"), 1)
self.assertEqual(self.record_count("dc_work_dates"), 7)
self.assertEqual(self.record_count("dc_work_related_identifiers"), 20)
self.assertEqual(self.record_count("dc_work_descriptions"), 10)
self.assertEqual(self.record_count("dc_work_geo_locations"), 0)
self.assertEqual(self.record_count("dc_work_descriptions"), 12)
self.assertEqual(self.record_count("dc_work_geo_locations"), 2)
self.assertEqual(self.record_count("dc_work_funding_references"), 2)


Expand All @@ -79,13 +79,13 @@ def test_counts(self):
"""(SELECT DISTINCT work_id
FROM dc_work_creators)"""
),
7,
9,
)

self.assertEqual(self.record_count(
"""(SELECT DISTINCT container_id FROM dc_works)"""
),
3,
4,
)

def test_work_contents(self):
Expand Down Expand Up @@ -229,7 +229,7 @@ def tearDownClass(cls):
os.unlink(DATABASE_PATH)

def test_counts(self):
self.assertEqual(self.record_count("dc_works"), 7)
self.assertEqual(self.record_count("dc_works"), 9)

def test_no_extra_fields(self):
with self.assertRaises(sqlite3.OperationalError):
Expand Down Expand Up @@ -262,7 +262,7 @@ def tearDownClass(cls):
os.unlink(DATABASE_PATH)

def test_counts(self):
self.assertEqual(self.record_count("dc_works"), 4)
self.assertEqual(self.record_count("dc_works"), 6)

def test_no_extra_fields(self):
with self.assertRaises(sqlite3.OperationalError):
Expand Down

0 comments on commit c5b040f

Please sign in to comment.