Skip to content

Commit

Permalink
refactor out jsonld subject extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
alyssadai committed Oct 1, 2024
1 parent cbef45c commit 01d5943
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 14 deletions.
17 changes: 6 additions & 11 deletions bagel/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from bagel.derivatives_utils import PROC_STATUS_COLS
from bagel.utility import (
extract_and_validate_jsonld_dataset,
get_subjects_missing_from_pheno_data,
extract_subs_from_jsonld_dataset,
get_subs_missing_from_pheno_data,
)

# TODO: Coordinate with Nipoppy about what we want to name this
Expand Down Expand Up @@ -267,13 +268,10 @@ def bids(
jsonld, jsonld_path
)

pheno_subject_dict = {
pheno_subject.hasLabel: pheno_subject
for pheno_subject in getattr(pheno_dataset, "hasSamples")
}
pheno_subject_dict = extract_subs_from_jsonld_dataset(pheno_dataset)

# TODO: Revert to using Layout.get_subjects() to get BIDS subjects once pybids performance is improved
unique_bids_subjects = get_subjects_missing_from_pheno_data(
unique_bids_subjects = get_subs_missing_from_pheno_data(
subjects=butil.get_bids_subjects_simple(bids_dir),
pheno_subjects=pheno_subject_dict.keys(),
)
Expand Down Expand Up @@ -439,13 +437,10 @@ def derivatives(
)

# Extract subjects from the JSONLD
jsonld_subject_dict = {
subject.hasLabel: subject
for subject in getattr(jsonld_dataset, "hasSamples")
}
jsonld_subject_dict = extract_subs_from_jsonld_dataset(jsonld_dataset)

# Check that all subjects in the processing status file are found in the JSONLD
unique_derivatives_subs = get_subjects_missing_from_pheno_data(
unique_derivatives_subs = get_subs_missing_from_pheno_data(
subjects=status_df[PROC_STATUS_COLS["participant"]].unique(),
pheno_subjects=jsonld_subject_dict.keys(),
)
Expand Down
25 changes: 23 additions & 2 deletions bagel/tests/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
import bagel.file_utils as futil
import bagel.pheno_utils as putil
from bagel import mappings, models
from bagel.utility import get_subjects_missing_from_pheno_data
from bagel.utility import (
extract_subs_from_jsonld_dataset,
get_subs_missing_from_pheno_data,
)


@pytest.fixture
Expand Down Expand Up @@ -442,7 +445,7 @@ def test_get_subjects_missing_from_pheno_data(bids_list, missing_subs):
# due to using set operations
assert (
sorted(
get_subjects_missing_from_pheno_data(
get_subs_missing_from_pheno_data(
pheno_subjects=pheno_list, subjects=bids_list
)
)
Expand Down Expand Up @@ -555,6 +558,24 @@ def test_unsupported_tsv_encoding_raises_informative_error(test_data, capsys):
assert "Failed to decode the input file" in captured.err


def test_extract_subs_from_jsonld_dataset(
    test_data_upload_path, load_test_json
):
    """Check that the example synthetic JSONLD yields 5 extracted subjects, all models.Subject instances."""
    jsonld_path = test_data_upload_path / "example_synthetic.jsonld"
    jsonld_contents = load_test_json(jsonld_path)
    # Remove the "@context" entry before parsing the rest into the Dataset model
    jsonld_contents.pop("@context")

    parsed_dataset = models.Dataset.parse_obj(jsonld_contents)
    extracted = extract_subs_from_jsonld_dataset(parsed_dataset)

    assert len(extracted) == 5
    for extracted_subject in extracted.values():
        assert isinstance(extracted_subject, models.Subject)


def test_pipeline_uris_are_loaded():
"""Test that pipeline URIs are loaded from the pipeline-catalog submodule."""

Expand Down
12 changes: 11 additions & 1 deletion bagel/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from bagel import models


def get_subjects_missing_from_pheno_data(
def get_subs_missing_from_pheno_data(
subjects: Iterable, pheno_subjects: Iterable
) -> list:
"""Check a list of subject IDs and return any not found in the provided phenotypic subject list."""
Expand Down Expand Up @@ -42,3 +42,13 @@ def extract_and_validate_jsonld_dataset(
raise typer.Exit(code=1) from err

Check warning on line 42 in bagel/utility.py

View check run for this annotation

Codecov / codecov/patch

bagel/utility.py#L42

Added line #L42 was not covered by tests

return context, jsonld_dataset


def extract_subs_from_jsonld_dataset(dataset: models.Dataset) -> dict:
    """
    Build a lookup of the subjects in a Neurobagel dataset parsed from JSONLD data.

    Every entry of the dataset's `hasSamples` attribute is stored under its `hasLabel`
    value, so the result maps subject labels to subject objects. If two entries share
    a label, the one listed later replaces the earlier one.
    """
    subjects_by_label = {}
    for sample in dataset.hasSamples:
        subjects_by_label[sample.hasLabel] = sample
    return subjects_by_label

0 comments on commit 01d5943

Please sign in to comment.