diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index e495939..da337ba 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -193,6 +193,35 @@ def get_transformed_values( return transf_val[0] +# TODO: Check all columns and then return list of offending columns' names +def categorical_cols_have_bids_levels(data_dict: dict) -> bool: + for col, attrs in data_dict.items(): + if ( + is_column_categorical(col, data_dict) + and attrs.get("Levels") is None + ): + return False + + return True + + +def get_mismatched_categorical_levels(data_dict: dict) -> list: + """ + Returns list of any categorical columns from a data dictionary that have different entries + for the "Levels" key between the column's BIDS and Neurobagel annotations. + """ + mismatched_cols = [] + for col, attrs in data_dict.items(): + if is_column_categorical(col, data_dict): + known_levels = list(attrs["Annotations"]["Levels"].keys()) + attrs[ + "Annotations" + ].get("MissingValues", []) + if set(attrs.get("Levels", {}).keys()).difference(known_levels): + mismatched_cols.append(col) + + return mismatched_cols + + def are_not_missing(columns: list, row: pd.Series, data_dict: dict) -> bool: """ Checks that all values in the specified columns are not missing values. This is mainly useful @@ -224,7 +253,7 @@ def find_undefined_cat_col_values( all_undefined_values = {} for col, attr in data_dict.items(): if is_column_categorical(col, data_dict): - known_values = list(attr["Levels"].keys()) + attr[ + known_values = list(attr["Annotations"]["Levels"].keys()) + attr[ "Annotations" ].get("MissingValues", []) unknown_values = [] @@ -300,6 +329,16 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: "Please make sure that only one column is annotated for participant and session IDs." ) + if not categorical_cols_have_bids_levels(data_dict): + warnings.warn( + "The data dictionary contains at least one column that looks categorical but lacks a BIDS 'Levels' attribute." + ) + + if mismatched_cols := get_mismatched_categorical_levels(data_dict): + warnings.warn( + f"The data dictionary contains columns with mismatched levels between the BIDS and Neurobagel annotations: {mismatched_cols}" + ) + if not are_inputs_compatible(data_dict, pheno_df): raise LookupError( "The provided data dictionary and phenotypic file are individually valid, " diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md index 8b35025..a14268f 100644 --- a/bagel/tests/data/README.md +++ b/bagel/tests/data/README.md @@ -16,13 +16,15 @@ Example inputs to the CLI | 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail | | 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning | | synthetic | valid, has `participant` and `session` IDs corresponding to the [`synthetic` example BIDS dataset](https://github.com/bids-standard/bids-examples/tree/master/synthetic) | valid | pass | -| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail | +| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail | +| 12 | Valid, same as example 2 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column | Pass, with warning | +| 13 | Valid, same as example_synthetic | Valid, based on example_synthetic but with mismatched levels for group column | Pass, with warning | `* this is expected to fail until we enable multiple participant_ID handling`. ## Example expected CLI outputs -| Example | Description | -| ----------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| example_synthetic.jsonld | Sample output of `bagel pheno` command on the `synthetic` example inputs. Contains subject-level annotated phenotypic attributes obtained by essentially applying the data dictionary (.json) to the original tabular data (.tsv). | +| Example | Description | +|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| example_synthetic.jsonld | Sample output of `bagel pheno` command on the `synthetic` example inputs. Contains subject-level annotated phenotypic attributes obtained by essentially applying the data dictionary (.json) to the original tabular data (.tsv). | | example_synthetic_pheno-bids.jsonld | Sample output of `bagel bids` command on the example_synthetic.jsonld and the [BIDS `synthetic` example dataset directory](https://github.com/bids-standard/bids-examples/tree/master/synthetic). Contains subject-level annotated imaging metadata (extracted directly from the BIDS dataset structure) on top of the subject-level annotated phenotypic attributes. | diff --git a/bagel/tests/data/example12.json b/bagel/tests/data/example12.json new file mode 100644 index 0000000..e9b7d3a --- /dev/null +++ b/bagel/tests/data/example12.json @@ -0,0 +1,75 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier" + } + } + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:SessionID", + "Label": "Unique session identifier" + } + } + }, + "group": { + "Description": "Group variable", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PAT": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "purl:NCIT_C94342", + "Label": "Healthy Control" + } + } + } + }, + "sex": { + "Description": "Sex variable", + "Levels": { + "M": "Male", + "F": "Female" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Sex", + "Label": "Sex" + }, + "Levels": { + "M": { + "TermURL": "snomed:248153007", + "Label": "Male" + }, + "F": { + "TermURL": "snomed:248152002", + "Label": "Female" + } + } + } + }, + "participant_age": { + "Description": "Age of the participant", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age" + }, + "Transformation": { + "TermURL": "nb:iso8601", + "Label": "A period of time defined according to the ISO8601 standard" + } + } + } +} \ No newline at end of file diff --git a/bagel/tests/data/example12.tsv b/bagel/tests/data/example12.tsv new file mode 100644 index 0000000..705ad3b --- /dev/null +++ b/bagel/tests/data/example12.tsv @@ -0,0 +1,5 @@ +participant_id session_id group sex participant_age +sub-01 ses-01 PAT M "P20Y6M" +sub-01 ses-02 PAT M "P20Y8M" +sub-02 ses-01 CTRL F "P25Y8M" +sub-02 ses-02 CTRL F "P26Y4M" diff --git a/bagel/tests/data/example13.json b/bagel/tests/data/example13.json new file mode 100644 index 0000000..f240156 --- /dev/null +++ b/bagel/tests/data/example13.json @@ -0,0 +1,128 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier" + } + } + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:SessionID", + "Label": "Unique session identifier" + } + } + }, + "pheno_age": { + "Description": "Age of the participant", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age" + }, + "Transformation": { + "TermURL": "nb:euro", + "Label": "writing the time with a comma - why not" + }, + "MissingValues": ["NA"] + } + }, + "pheno_sex": { + "Description": "Sex variable", + "Levels": { + "M": "Male", + "F": "Female", + "missing": "Missing sex", + "O": "Other unimportant level" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Sex", + "Label": "Sex" + }, + "Levels": { + "M": { + "TermURL": "snomed:248153007", + "Label": "Male" + }, + "F": { + "TermURL": "snomed:248152002", + "Label": "Female" + } + }, + "MissingValues": ["missing"] + } + }, + + "pheno_group": { + "Description": "Group variable", + "Levels": { + "PAT": "Patient", + "CTRL": "Control subject", + "NA": "Missing group" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PAT": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "purl:NCIT_C94342", + "Label": "Healthy Control" + } + }, + "MissingValues": ["NA"] + } + }, + "tool1_item1": { + "Description": "item 1 scores for tool1", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogatlas:1234", + "Label": "Imaginary tool" + }, + "MissingValues": ["missing"] + } + }, + "tool1_item2": { + "Description": "item 2 scores for tool1", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogatlas:1234", + "Label": "Imaginary tool" + }, + "MissingValues": ["missing"] + } + }, + "tool2_item1": { + "Description": "item 1 scores for tool2", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogatlas:4321", + "Label": "A different imaginary tool" + }, + "MissingValues": ["not completed"] + } + } +} \ No newline at end of file diff --git a/bagel/tests/data/example13.tsv b/bagel/tests/data/example13.tsv new file mode 100644 index 0000000..675aa6b --- /dev/null +++ b/bagel/tests/data/example13.tsv @@ -0,0 +1,11 @@ +participant_id session_id pheno_age pheno_sex pheno_group tool1_item1 tool1_item2 tool2_item1 +sub-01 ses-01 34,1 F CTRL good far hello +sub-01 ses-02 35,3 F CTRL bad near world +sub-02 ses-01 NA M PAT ok missing hello +sub-02 ses-02 39,0 M PAT good middle friends +sub-03 ses-01 22,1 missing NA bad near not completed +sub-03 ses-02 23,2 missing PAT ok far hello +sub-04 ses-01 21,1 F CTRL missing missing hello +sub-04 ses-02 22,3 F CTRL good middle friends +sub-05 ses-01 42,5 M PAT bad near friends +sub-05 ses-02 43,2 M PAT good far world \ No newline at end of file diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index 414b8e6..39d316d 100644 --- a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -4,7 +4,15 @@ @pytest.mark.parametrize( - "example", ["example2", "example4", "example6", "example_synthetic"] + "example", + [ + "example2", + "example4", + "example6", + "example12", + "example13", + "example_synthetic", + ], ) def test_pheno_valid_inputs_run_successfully( runner, test_data, tmp_path, example @@ -123,6 +131,66 @@ def test_invalid_portal_uris_produces_error( ) +def test_missing_bids_levels_raises_warning( + runner, + test_data, + tmp_path, +): + with pytest.warns(UserWarning) as w: + runner.invoke( + bagel, + [ + "pheno", + "--pheno", + test_data / "example12.tsv", + "--dictionary", + test_data / "example12.json", + "--output", + tmp_path, + "--name", + "testing dataset", + ], + catch_exceptions=False, + ) + + assert len(w) == 1 + assert "looks categorical but lacks a BIDS 'Levels' attribute" in str( + w[0].message.args[0] + ) + + +def test_bids_neurobagel_levels_mismatch_raises_warning( + runner, + test_data, + tmp_path, +): + with pytest.warns(UserWarning) as w: + runner.invoke( + bagel, + [ + "pheno", + "--pheno", + test_data / "example13.tsv", + "--dictionary", + test_data / "example13.json", + "--output", + tmp_path, + "--name", + "testing dataset", + ], + catch_exceptions=False, + ) + + assert len(w) == 1 + assert all( + warn_substring in str(w[0].message.args[0]) + for warn_substring in [ + "columns with mismatched levels", + "['pheno_sex']", + ] + ) + + def test_unused_missing_values_raises_warning( runner, test_data,