Skip to content

Commit

Permalink
[FIX] Add validators for BIDS-Neurobagel categorical column levels al…
Browse files Browse the repository at this point in the history
…ignment (#160)

* add example with json missing BIDS 'Levels' attr

* add new example12 to e2e bagel pheno test

* add basic util + smoke test for checking presence of BIDS levels

* expand warning for missing bids levels

* add example with BIDS-Neurobagel mismatched levels

* add validator util + test that BIDS/Neurobagel categorical levels mismatch raises warning

---------

Co-authored-by: Alyssa Dai <[email protected]>
Co-authored-by: Sebastian Urchs <[email protected]>
  • Loading branch information
alyssadai and surchs authored Jun 15, 2023
1 parent d18f0b2 commit 0ed0e4d
Show file tree
Hide file tree
Showing 7 changed files with 334 additions and 6 deletions.
41 changes: 40 additions & 1 deletion bagel/pheno_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,35 @@ def get_transformed_values(
return transf_val[0]


# TODO: Check all columns and then return list of offending columns' names
def categorical_cols_have_bids_levels(data_dict: dict) -> bool:
    """
    Report whether every categorical column carries a BIDS "Levels" entry.

    Returns False as soon as a column accepted by is_column_categorical() has
    no top-level "Levels" key (or has it set to None), and True otherwise,
    including when the data dictionary is empty.
    """
    return all(
        not is_column_categorical(col_name, data_dict)
        or col_attrs.get("Levels") is not None
        for col_name, col_attrs in data_dict.items()
    )


def get_mismatched_categorical_levels(data_dict: dict) -> list:
    """
    Collect the categorical columns whose BIDS "Levels" are not all covered
    by the column's Neurobagel annotations.

    A column is reported when at least one key of its top-level (BIDS)
    "Levels" is found neither among the keys of its Neurobagel
    "Annotations" -> "Levels" nor in its "Annotations" -> "MissingValues".
    The comparison is one-directional: Neurobagel levels without a BIDS
    counterpart do not cause a column to be reported, and a column without
    any BIDS "Levels" is treated as having none.
    """
    flagged_cols = []
    for col_name, col_attrs in data_dict.items():
        if not is_column_categorical(col_name, data_dict):
            continue

        annotations = col_attrs["Annotations"]
        neurobagel_values = list(annotations["Levels"].keys())
        neurobagel_values = neurobagel_values + annotations.get(
            "MissingValues", []
        )

        bids_levels = set(col_attrs.get("Levels", {}).keys())
        if bids_levels.difference(neurobagel_values):
            flagged_cols.append(col_name)

    return flagged_cols


def are_not_missing(columns: list, row: pd.Series, data_dict: dict) -> bool:
"""
Checks that all values in the specified columns are not missing values. This is mainly useful
Expand Down Expand Up @@ -224,7 +253,7 @@ def find_undefined_cat_col_values(
all_undefined_values = {}
for col, attr in data_dict.items():
if is_column_categorical(col, data_dict):
known_values = list(attr["Levels"].keys()) + attr[
known_values = list(attr["Annotations"]["Levels"].keys()) + attr[
"Annotations"
].get("MissingValues", [])
unknown_values = []
Expand Down Expand Up @@ -300,6 +329,16 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
"Please make sure that only one column is annotated for participant and session IDs."
)

if not categorical_cols_have_bids_levels(data_dict):
warnings.warn(
"The data dictionary contains at least one column that looks categorical but lacks a BIDS 'Levels' attribute."
)

if mismatched_cols := get_mismatched_categorical_levels(data_dict):
warnings.warn(
f"The data dictionary contains columns with mismatched levels between the BIDS and Neurobagel annotations: {mismatched_cols}"
)

if not are_inputs_compatible(data_dict, pheno_df):
raise LookupError(
"The provided data dictionary and phenotypic file are individually valid, "
Expand Down
10 changes: 6 additions & 4 deletions bagel/tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ Example inputs to the CLI
| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail |
| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning |
| synthetic | valid, has `participant` and `session` IDs corresponding to the [`synthetic` example BIDS dataset](https://github.com/bids-standard/bids-examples/tree/master/synthetic) | valid | pass |
| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail |
| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail |
| 12 | valid, same as example 2 | valid, based on example 2 but missing the BIDS `"Levels"` attribute for the `group` column | pass, with warning |
| 13 | valid, same as example_synthetic | valid, based on example_synthetic but with mismatched BIDS and Neurobagel levels for the `pheno_sex` column | pass, with warning |

`* this is expected to fail until we enable multiple participant_ID handling`.

## Example expected CLI outputs

| Example | Description |
| ----------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| example_synthetic.jsonld | Sample output of `bagel pheno` command on the `synthetic` example inputs. Contains subject-level annotated phenotypic attributes obtained by essentially applying the data dictionary (.json) to the original tabular data (.tsv). |
| Example | Description |
|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| example_synthetic.jsonld | Sample output of `bagel pheno` command on the `synthetic` example inputs. Contains subject-level annotated phenotypic attributes obtained by essentially applying the data dictionary (.json) to the original tabular data (.tsv). |
| example_synthetic_pheno-bids.jsonld | Sample output of `bagel bids` command on the example_synthetic.jsonld and the [BIDS `synthetic` example dataset directory](https://github.com/bids-standard/bids-examples/tree/master/synthetic). Contains subject-level annotated imaging metadata (extracted directly from the BIDS dataset structure) on top of the subject-level annotated phenotypic attributes. |
75 changes: 75 additions & 0 deletions bagel/tests/data/example12.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
}
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
}
}
},
"group": {
"Description": "Group variable",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "purl:NCIT_C94342",
"Label": "Healthy Control"
}
}
}
},
"sex": {
"Description": "Sex variable",
"Levels": {
"M": "Male",
"F": "Female"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Sex",
"Label": "Sex"
},
"Levels": {
"M": {
"TermURL": "snomed:248153007",
"Label": "Male"
},
"F": {
"TermURL": "snomed:248152002",
"Label": "Female"
}
}
}
},
"participant_age": {
"Description": "Age of the participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:iso8601",
"Label": "A period of time defined according to the ISO8601 standard"
}
}
}
}
5 changes: 5 additions & 0 deletions bagel/tests/data/example12.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
participant_id session_id group sex participant_age
sub-01 ses-01 PAT M "P20Y6M"
sub-01 ses-02 PAT M "P20Y8M"
sub-02 ses-01 CTRL F "P25Y8M"
sub-02 ses-02 CTRL F "P26Y4M"
128 changes: 128 additions & 0 deletions bagel/tests/data/example13.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
}
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
}
}
},
"pheno_age": {
"Description": "Age of the participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:euro",
"Label": "writing the time with a comma - why not"
},
"MissingValues": ["NA"]
}
},
"pheno_sex": {
"Description": "Sex variable",
"Levels": {
"M": "Male",
"F": "Female",
"missing": "Missing sex",
"O": "Other unimportant level"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Sex",
"Label": "Sex"
},
"Levels": {
"M": {
"TermURL": "snomed:248153007",
"Label": "Male"
},
"F": {
"TermURL": "snomed:248152002",
"Label": "Female"
}
},
"MissingValues": ["missing"]
}
},

"pheno_group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject",
"NA": "Missing group"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "purl:NCIT_C94342",
"Label": "Healthy Control"
}
},
"MissingValues": ["NA"]
}
},
"tool1_item1": {
"Description": "item 1 scores for tool1",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"tool1_item2": {
"Description": "item 2 scores for tool1",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"tool2_item1": {
"Description": "item 1 scores for tool2",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["not completed"]
}
}
}
11 changes: 11 additions & 0 deletions bagel/tests/data/example13.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
participant_id session_id pheno_age pheno_sex pheno_group tool1_item1 tool1_item2 tool2_item1
sub-01 ses-01 34,1 F CTRL good far hello
sub-01 ses-02 35,3 F CTRL bad near world
sub-02 ses-01 NA M PAT ok missing hello
sub-02 ses-02 39,0 M PAT good middle friends
sub-03 ses-01 22,1 missing NA bad near not completed
sub-03 ses-02 23,2 missing PAT ok far hello
sub-04 ses-01 21,1 F CTRL missing missing hello
sub-04 ses-02 22,3 F CTRL good middle friends
sub-05 ses-01 42,5 M PAT bad near friends
sub-05 ses-02 43,2 M PAT good far world
70 changes: 69 additions & 1 deletion bagel/tests/test_cli_pheno.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,15 @@


@pytest.mark.parametrize(
"example", ["example2", "example4", "example6", "example_synthetic"]
"example",
[
"example2",
"example4",
"example6",
"example12",
"example13",
"example_synthetic",
],
)
def test_pheno_valid_inputs_run_successfully(
runner, test_data, tmp_path, example
Expand Down Expand Up @@ -123,6 +131,66 @@ def test_invalid_portal_uris_produces_error(
)


def test_missing_bids_levels_raises_warning(
    runner,
    test_data,
    tmp_path,
):
    """
    Running `bagel pheno` on example12, whose data dictionary has a
    categorical column without a BIDS 'Levels' attribute, should emit exactly
    one warning, and that warning should mention the missing attribute.
    """
    cli_args = [
        "pheno",
        "--pheno",
        test_data / "example12.tsv",
        "--dictionary",
        test_data / "example12.json",
        "--output",
        tmp_path,
        "--name",
        "testing dataset",
    ]

    with pytest.warns(UserWarning) as caught:
        runner.invoke(bagel, cli_args, catch_exceptions=False)

    assert len(caught) == 1
    warning_text = str(caught[0].message.args[0])
    assert (
        "looks categorical but lacks a BIDS 'Levels' attribute"
        in warning_text
    )


def test_bids_neurobagel_levels_mismatch_raises_warning(
    runner,
    test_data,
    tmp_path,
):
    """
    Running `bagel pheno` on example13, whose data dictionary has a BIDS
    level for `pheno_sex` that the Neurobagel annotations do not cover, should
    emit exactly one warning, and that warning should name the column.
    """
    cli_args = [
        "pheno",
        "--pheno",
        test_data / "example13.tsv",
        "--dictionary",
        test_data / "example13.json",
        "--output",
        tmp_path,
        "--name",
        "testing dataset",
    ]
    expected_substrings = (
        "columns with mismatched levels",
        "['pheno_sex']",
    )

    with pytest.warns(UserWarning) as caught:
        runner.invoke(bagel, cli_args, catch_exceptions=False)

    assert len(caught) == 1
    warning_text = str(caught[0].message.args[0])
    assert all(
        substring in warning_text for substring in expected_substrings
    )


def test_unused_missing_values_raises_warning(
runner,
test_data,
Expand Down

0 comments on commit 0ed0e4d

Please sign in to comment.