diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index 98cd401..26142af 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -9,11 +9,13 @@ import jsonschema import pandas as pd import pydantic +import typer from pandas import DataFrame from typer import BadParameter from bagel import dictionary_models, mappings, models from bagel.mappings import COGATLAS, NB, NCIT, NIDM, SNOMED +from bagel.utility import file_encoding_error_message DICTIONARY_SCHEMA = dictionary_models.DataDictionary.schema() @@ -42,9 +44,21 @@ def validate_portal_uri(portal: str) -> Optional[str]: def load_pheno(input_p: Path) -> pd.DataFrame | None: """Load a .tsv pheno file and do some basic validation.""" if input_p.suffix == ".tsv": - pheno_df: DataFrame = pd.read_csv( - input_p, sep="\t", keep_default_na=False, dtype=str - ) + try: + pheno_df: DataFrame = pd.read_csv( + input_p, + sep="\t", + keep_default_na=False, + dtype=str, + encoding="utf-8", + ) + except UnicodeDecodeError as e: + # TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed + typer.echo( + file_encoding_error_message(input_p), + err=True, + ) + raise typer.Exit(code=1) from e if pheno_df.shape[1] > 1: return pheno_df @@ -352,9 +366,21 @@ def validate_data_dict(data_dict: dict) -> None: try: jsonschema.validate(data_dict, DICTIONARY_SCHEMA) except jsonschema.ValidationError as e: + # TODO: When *every* item in an input JSON is not schema valid, + # jsonschema.validate will raise a ValidationError for only *one* item among them. + # Weirdly, the item that is chosen can vary, possibly due to jsonschema internally processing/validating + # items in an inconsistent order. + # You can reproduce this by running `bagel pheno` on the example_invalid input pair. + # This isn't necessarily a problem as the error will not be wrong, but if we care about + # returning ALL invalid items, we may want to use something like a Draft7Validator instance instead. + # NOTE: If the validation error occurs at the root level (i.e., the entire JSON object fails), + # e.path may be empty. We have a backup descriptor "Entire document" for the offending item in this case. raise ValueError( - "The provided data dictionary is not a valid Neurobagel data dictionary. " - "Make sure that each annotated column contains an 'Annotations' key." + "The provided data dictionary is not a valid Neurobagel data dictionary.\n" + "Details:\n" + f"Entry that failed validation: {e.path[-1] if e.path else 'Entire document'}\n" + f"{e.message}\n" + "Tip: Make sure that each annotated column contains an 'Annotations' key." ) from e if get_annotated_columns(data_dict) == []: diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md index f3c9b10..37b039d 100644 --- a/bagel/tests/data/README.md +++ b/bagel/tests/data/README.md @@ -26,6 +26,8 @@ Example inputs to the CLI | 19 | Example with two columns about diagnosis | Valid | pass | | 20 | Valid, based on example 19 but contains multiple annotated columns about age and sex | Valid | pass | | 21 | Valid, based on example 2 but contains a dash in a column name | Valid | pass | +| iso88591 | invalid, ISO-8859-1 encoded and contains French characters in the age column | invalid, ISO-8859-1 encoded and contains French characters in the age column | fail | +| invalid_json | - | not valid JSON, contains trailing comma after `group` key-value pair | fail | `* this is expected to fail until we enable multiple participant_ID handling`. diff --git a/bagel/tests/data/example_invalid_json.json b/bagel/tests/data/example_invalid_json.json new file mode 100644 index 0000000..42f14e5 --- /dev/null +++ b/bagel/tests/data/example_invalid_json.json @@ -0,0 +1,45 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier" + }, + "Identifies": "participant" + } + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:SessionID", + "Label": "Unique session identifier" + }, + "Identifies": "session" + } + }, + "group": { + "Description": "Group variable", + "Levels": { + "PAT": "Patient", + "CTRL": "Control subject" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PAT": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "ncit:C94342", + "Label": "Healthy Control" + } + } + } + }, +} \ No newline at end of file diff --git a/bagel/tests/data/example_iso88591.json b/bagel/tests/data/example_iso88591.json new file mode 100644 index 0000000..59a8f87 --- /dev/null +++ b/bagel/tests/data/example_iso88591.json @@ -0,0 +1,35 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier" + }, + "Identifies": "participant" + } + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:SessionID", + "Label": "Unique session identifier" + }, + "Identifies": "session" + } + }, + "âge_des_participants": { + "Description": "âge du participant", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age" + }, + "Transformation": { + "TermURL": "nb:FromISO8601", + "Label": "A period of time defined according to the ISO8601 standard" + } + } + } +} \ No newline at end of file diff --git a/bagel/tests/data/example_iso88591.tsv b/bagel/tests/data/example_iso88591.tsv new file mode 100644 index 0000000..d715f52 --- /dev/null +++ b/bagel/tests/data/example_iso88591.tsv @@ -0,0 +1,5 @@ +participant_id session_id âge_des_participants +sub-01 ses-01 "P20Y6M" +sub-01 ses-02 "P20Y8M" +sub-02 ses-01 "P25Y8M" +sub-02 ses-02 "P26Y4M" diff --git a/bagel/tests/test_utility.py b/bagel/tests/test_utility.py index 9b382e3..a76e93b 100644 --- a/bagel/tests/test_utility.py +++ b/bagel/tests/test_utility.py @@ -4,11 +4,13 @@ import pandas as pd import pytest +import typer from bids import BIDSLayout import bagel.bids_utils as butil import bagel.pheno_utils as putil from bagel import mappings +from bagel.utility import load_json @pytest.fixture @@ -60,29 +62,73 @@ def test_all_used_namespaces_have_urls( ), f"The namespace '{prefix}' was used in the data dictionary, but was not defined in the @context." -def test_schema_invalid_column_raises_error(): - partial_data_dict = ( - { - "participant_id": { - "Description": "A participant ID", - "Annotations": { - "IsAbout": { - "TermURL": "nb:ParticipantID", - "Label": "Unique participant identifier", - } +@pytest.mark.parametrize( + "partial_data_dict, invalid_column_name", + [ + # sex column missing Levels + ( + { + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier", + }, + "Identifies": "participant", + }, + }, + "sex": { + "Description": "Participant sex", + "Annotations": { + "IsAbout": {"TermURL": "nb:Sex", "Label": ""} + }, }, }, - "sex": { - "Description": "Participant sex", - "Annotations": {"IsAbout": {"TermURL": "nb:Sex", "Label": ""}}, + "sex", + ), + # age column missing Transformation + ( + { + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier", + }, + "Identifies": "participant", + }, + }, + "age": { + "Description": "Participant age", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age", + } + }, + }, }, - }, - ) - + "age", + ), + ], +) +def test_schema_invalid_column_raises_error( + partial_data_dict, invalid_column_name +): + """ + Test that when an input data dictionary contains a schema invalid column annotation, + an informative error is raised which includes the name of the offending column. + """ with pytest.raises(ValueError) as e: putil.validate_data_dict(partial_data_dict) - assert "not a valid Neurobagel data dictionary" in str(e.value) + for substring in [ + "not a valid Neurobagel data dictionary", + invalid_column_name, + ]: + assert substring in str(e.value) def test_get_columns_that_are_about_concept(test_data, load_test_json): @@ -474,3 +520,30 @@ def test_get_session_path_when_session_missing(bids_sub_id): assert session_path.endswith(f"sub-{bids_sub_id}") assert Path(session_path).is_absolute() assert Path(session_path).is_dir() + + +@pytest.mark.parametrize( + "unreadable_json,expected_message", + [ + ("example_iso88591.json", "Failed to decode the input file"), + ("example_invalid_json.json", "not valid JSON"), + ], +) +def test_failed_json_reading_raises_informative_error( + test_data, unreadable_json, expected_message, capsys +): + """Test that when there is an issue reading an input JSON file, the CLI exits with an informative error message.""" + with pytest.raises(typer.Exit): + load_json(test_data / unreadable_json) + captured = capsys.readouterr() + + assert expected_message in captured.err + + +def test_unsupported_tsv_encoding_raises_informative_error(test_data, capsys): + """Test that given an input phenotypic TSV with an unsupported encoding, the CLI exits with an informative error message.""" + with pytest.raises(typer.Exit): + putil.load_pheno(test_data / "example_iso88591.tsv") + captured = capsys.readouterr() + + assert "Failed to decode the input file" in captured.err diff --git a/bagel/utility.py b/bagel/utility.py index 71d0a97..0665f13 100644 --- a/bagel/utility.py +++ b/bagel/utility.py @@ -4,10 +4,38 @@ import typer +def file_encoding_error_message(input_p: Path) -> str: + """Return a message for when a file cannot be read due to encoding issues.""" + return typer.style( + f"Failed to decode the input file {input_p}. " + "Please ensure that both your phenotypic .tsv file and .json data dictionary have UTF-8 encoding.\n" + "Tip: Need help converting your file? Try a tool like iconv (http://linux.die.net/man/1/iconv) or https://www.freeformatter.com/convert-file-encoding.html.", + fg=typer.colors.RED, + ) + + def load_json(input_p: Path) -> dict: """Load a user-specified json type file.""" - with open(input_p, "r") as f: - return json.load(f) + try: + with open(input_p, "r", encoding="utf-8") as f: + return json.load(f) + except UnicodeDecodeError as e: + # TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed + typer.echo( + file_encoding_error_message(input_p), + err=True, + ) + raise typer.Exit(code=1) from e + except json.JSONDecodeError as e: + typer.echo( + typer.style( + f"The provided data dictionary {input_p} is not valid JSON. " + "Please provide a valid JSON file.", + fg=typer.colors.RED, + ), + err=True, + ) + raise typer.Exit(code=1) from e def check_overwrite(output: Path, overwrite: bool):