Refactor JSONLD subject extraction out into a utility function

neurobagel · Oct 1, 2024 · 01d5943
1 parent cbef45c
commit 01d5943
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 14 deletions.
diff --git a/bagel/cli.py b/bagel/cli.py
@@ -12,7 +12,8 @@
 from bagel.derivatives_utils import PROC_STATUS_COLS
 from bagel.utility import (
     extract_and_validate_jsonld_dataset,
-    get_subjects_missing_from_pheno_data,
+    extract_subs_from_jsonld_dataset,
+    get_subs_missing_from_pheno_data,
 )
 
 # TODO: Coordinate with Nipoppy about what we want to name this
@@ -267,13 +268,10 @@ def bids(
         jsonld, jsonld_path
     )
 
-    pheno_subject_dict = {
-        pheno_subject.hasLabel: pheno_subject
-        for pheno_subject in getattr(pheno_dataset, "hasSamples")
-    }
+    pheno_subject_dict = extract_subs_from_jsonld_dataset(pheno_dataset)
 
     # TODO: Revert to using Layout.get_subjects() to get BIDS subjects once pybids performance is improved
-    unique_bids_subjects = get_subjects_missing_from_pheno_data(
+    unique_bids_subjects = get_subs_missing_from_pheno_data(
         subjects=butil.get_bids_subjects_simple(bids_dir),
         pheno_subjects=pheno_subject_dict.keys(),
     )
@@ -439,13 +437,10 @@ def derivatives(
     )
 
     # Extract subjects from the JSONLD
-    jsonld_subject_dict = {
-        subject.hasLabel: subject
-        for subject in getattr(jsonld_dataset, "hasSamples")
-    }
+    jsonld_subject_dict = extract_subs_from_jsonld_dataset(jsonld_dataset)
 
     # Check that all subjects in the processing status file are found in the JSONLD
-    unique_derivatives_subs = get_subjects_missing_from_pheno_data(
+    unique_derivatives_subs = get_subs_missing_from_pheno_data(
         subjects=status_df[PROC_STATUS_COLS["participant"]].unique(),
         pheno_subjects=jsonld_subject_dict.keys(),
     )

diff --git a/bagel/tests/test_utility.py b/bagel/tests/test_utility.py
@@ -11,7 +11,10 @@
 import bagel.file_utils as futil
 import bagel.pheno_utils as putil
 from bagel import mappings, models
-from bagel.utility import get_subjects_missing_from_pheno_data
+from bagel.utility import (
+    extract_subs_from_jsonld_dataset,
+    get_subs_missing_from_pheno_data,
+)
 
 
 @pytest.fixture
@@ -442,7 +445,7 @@ def test_get_subjects_missing_from_pheno_data(bids_list, missing_subs):
     # due to using set operations
     assert (
         sorted(
-            get_subjects_missing_from_pheno_data(
+            get_subs_missing_from_pheno_data(
                 pheno_subjects=pheno_list, subjects=bids_list
             )
         )
@@ -555,6 +558,24 @@ def test_unsupported_tsv_encoding_raises_informative_error(test_data, capsys):
     assert "Failed to decode the input file" in captured.err
 
 
+def test_extract_subs_from_jsonld_dataset(
+    test_data_upload_path, load_test_json
+):
+    """Test that subjects are correctly extracted from a JSONLD dataset."""
+    dataset = load_test_json(
+        test_data_upload_path / "example_synthetic.jsonld"
+    )
+    dataset.pop("@context")
+    subjects = extract_subs_from_jsonld_dataset(
+        models.Dataset.parse_obj(dataset)
+    )
+
+    assert len(subjects) == 5
+    assert all(
+        isinstance(subject, models.Subject) for subject in subjects.values()
+    )
+
+
 def test_pipeline_uris_are_loaded():
     """Test that pipeline URIs are loaded from the pipeline-catalog submodule."""
 

diff --git a/bagel/utility.py b/bagel/utility.py
@@ -7,7 +7,7 @@
 from bagel import models
 
 
-def get_subjects_missing_from_pheno_data(
+def get_subs_missing_from_pheno_data(
     subjects: Iterable, pheno_subjects: Iterable
 ) -> list:
     """Check a list of subject IDs and return any not found in the provided phenotypic subject list."""
@@ -42,3 +42,13 @@ def extract_and_validate_jsonld_dataset(
         raise typer.Exit(code=1) from err
 
     return context, jsonld_dataset
+
+
+def extract_subs_from_jsonld_dataset(dataset: models.Dataset) -> dict:
+    """
+    Return a dictionary of subjects for a given Neurobagel dataset from JSONLD data,
+    where keys are subject labels and values are the subject objects.
+    """
+    return {
+        subject.hasLabel: subject for subject in getattr(dataset, "hasSamples")
+    }