Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] More user-friendly handling of input decoding/reading errors #337

Merged
merged 9 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 31 additions & 5 deletions bagel/pheno_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
import jsonschema
import pandas as pd
import pydantic
import typer
from pandas import DataFrame
from typer import BadParameter

from bagel import dictionary_models, mappings, models
from bagel.mappings import COGATLAS, NB, NCIT, NIDM, SNOMED
from bagel.utility import file_encoding_error_message

DICTIONARY_SCHEMA = dictionary_models.DataDictionary.schema()

Expand Down Expand Up @@ -42,9 +44,21 @@ def validate_portal_uri(portal: str) -> Optional[str]:
def load_pheno(input_p: Path) -> pd.DataFrame | None:
"""Load a .tsv pheno file and do some basic validation."""
if input_p.suffix == ".tsv":
pheno_df: DataFrame = pd.read_csv(
input_p, sep="\t", keep_default_na=False, dtype=str
)
try:
pheno_df: DataFrame = pd.read_csv(
input_p,
sep="\t",
keep_default_na=False,
dtype=str,
encoding="utf-8",
)
except UnicodeDecodeError as e:
# TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed
typer.echo(
file_encoding_error_message(input_p),
err=True,
)
raise typer.Exit(code=1) from e
alyssadai marked this conversation as resolved.
Show resolved Hide resolved

if pheno_df.shape[1] > 1:
return pheno_df
Expand Down Expand Up @@ -352,9 +366,21 @@ def validate_data_dict(data_dict: dict) -> None:
try:
jsonschema.validate(data_dict, DICTIONARY_SCHEMA)
except jsonschema.ValidationError as e:
# TODO: When *every* item in an input JSON is not schema valid,
# jsonschema.validate will raise a ValidationError for only *one* item among them.
alyssadai marked this conversation as resolved.
Show resolved Hide resolved
# Weirdly, the item that is chosen can vary, possibly due to jsonschema internally processing/validating
# items in an inconsistent order.
# You can reproduce this by running `bagel pheno` on the example_invalid input pair.
# This isn't necessarily a problem as the error will not be wrong, but if we care about
# returning ALL invalid items, we may want to use something like a Draft7Validator instance instead.
alyssadai marked this conversation as resolved.
Show resolved Hide resolved
# NOTE: If the validation error occurs at the root level (i.e., the entire JSON object fails),
# e.path may be empty. We have a backup descriptor "Entire document" for the offending item in this case.
raise ValueError(
"The provided data dictionary is not a valid Neurobagel data dictionary. "
"Make sure that each annotated column contains an 'Annotations' key."
"The provided data dictionary is not a valid Neurobagel data dictionary.\n"
"Details:\n"
f"Entry that failed validation: {e.path[-1] if e.path else 'Entire document'}\n"
f"{e.message}\n"
"Tip: Make sure that each annotated column contains an 'Annotations' key."
) from e

if get_annotated_columns(data_dict) == []:
Expand Down
2 changes: 2 additions & 0 deletions bagel/tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ Example inputs to the CLI
| 19 | Example with two columns about diagnosis | Valid | pass |
| 20 | Valid, based on example 19 but contains multiple annotated columns about age and sex | Valid | pass |
| 21 | Valid, based on example 2 but contains a dash in a column name | Valid | pass |
| iso88591 | invalid, ISO-8859-1 encoded and contains French characters in the age column | invalid, ISO-8859-1 encoded and contains French characters in the age column | fail |
| invalid_json | - | not valid JSON, contains trailing comma after `group` key-value pair | fail |

`* this is expected to fail until we enable multiple participant_ID handling`.

Expand Down
45 changes: 45 additions & 0 deletions bagel/tests/data/example_invalid_json.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
},
"Identifies": "participant"
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
},
"Identifies": "session"
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "ncit:C94342",
"Label": "Healthy Control"
}
}
}
},
}
35 changes: 35 additions & 0 deletions bagel/tests/data/example_iso88591.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{

Check warning on line 1 in bagel/tests/data/example_iso88591.json

View workflow job for this annotation

GitHub Actions / Check for spelling errors

Cannot decode file using encoding "utf-8"
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
},
"Identifies": "participant"
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
},
"Identifies": "session"
}
},
"âge_des_participants": {
"Description": "âge du participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:FromISO8601",
"Label": "A period of time defined according to the ISO8601 standard"
}
}
}
}
5 changes: 5 additions & 0 deletions bagel/tests/data/example_iso88591.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
participant_id session_id âge_des_participants

Check warning on line 1 in bagel/tests/data/example_iso88591.tsv

View workflow job for this annotation

GitHub Actions / Check for spelling errors

Cannot decode file using encoding "utf-8"
sub-01 ses-01 "P20Y6M"
sub-01 ses-02 "P20Y8M"
sub-02 ses-01 "P25Y8M"
sub-02 ses-02 "P26Y4M"
102 changes: 85 additions & 17 deletions bagel/tests/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@

import pandas as pd
import pytest
import typer
from bids import BIDSLayout

import bagel.bids_utils as butil
import bagel.pheno_utils as putil
from bagel import mappings
from bagel.utility import load_json


@pytest.fixture
Expand Down Expand Up @@ -60,29 +62,68 @@ def test_all_used_namespaces_have_urls(
), f"The namespace '{prefix}' was used in the data dictionary, but was not defined in the @context."


def test_schema_invalid_column_raises_error():
partial_data_dict = (
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier",
}
@pytest.mark.parametrize(
"partial_data_dict, invalid_item",
alyssadai marked this conversation as resolved.
Show resolved Hide resolved
[
# sex column missing Levels
(
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier",
},
"Identifies": "participant",
},
},
"sex": {
"Description": "Participant sex",
"Annotations": {
"IsAbout": {"TermURL": "nb:Sex", "Label": ""}
},
},
},
"sex": {
"Description": "Participant sex",
"Annotations": {"IsAbout": {"TermURL": "nb:Sex", "Label": ""}},
"sex",
),
# age column missing Transformation
(
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier",
},
"Identifies": "participant",
},
},
"age": {
"Description": "Participant age",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age",
}
},
},
},
},
)

"age",
),
],
)
def test_schema_invalid_column_raises_error(partial_data_dict, invalid_item):
"""
Test that when an input data dictionary contains a schema invalid column annotation,
an informative error is raised which includes the name of the offending column.
"""
with pytest.raises(ValueError) as e:
putil.validate_data_dict(partial_data_dict)

assert "not a valid Neurobagel data dictionary" in str(e.value)
for substring in ["not a valid Neurobagel data dictionary", invalid_item]:
assert substring in str(e.value)


def test_get_columns_that_are_about_concept(test_data, load_test_json):
Expand Down Expand Up @@ -474,3 +515,30 @@ def test_get_session_path_when_session_missing(bids_sub_id):
assert session_path.endswith(f"sub-{bids_sub_id}")
assert Path(session_path).is_absolute()
assert Path(session_path).is_dir()


@pytest.mark.parametrize(
"unreadable_json,expected_message",
[
("example_iso88591.json", "Failed to decode the input file"),
("example_invalid_json.json", "not valid JSON"),
],
)
def test_failed_json_reading_raises_informative_error(
test_data, unreadable_json, expected_message, capsys
):
"""Test that when there is an issue reading an input JSON file, the CLI exits with an informative error message."""
with pytest.raises(typer.Exit):
load_json(test_data / unreadable_json)
captured = capsys.readouterr()

assert expected_message in captured.err


def test_unsupported_tsv_encoding_raises_informative_error(test_data, capsys):
"""Test that given an input phenotypic TSV with an unsupported encoding, the CLI exits with an informative error message."""
with pytest.raises(typer.Exit):
putil.load_pheno(test_data / "example_iso88591.tsv")
captured = capsys.readouterr()

assert "Failed to decode the input file" in captured.err
32 changes: 30 additions & 2 deletions bagel/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,38 @@
import typer


def file_encoding_error_message(input_p: Path) -> str:
"""Return a message for when a file cannot be read due to encoding issues."""
return typer.style(
f"Failed to decode the input file {input_p}. "
"Please ensure that both your phenotypic .tsv file and .json data dictionary have UTF-8 encoding.\n"
"Tip: Need help converting your file? Try a tool like iconv (http://linux.die.net/man/1/iconv) or https://www.freeformatter.com/convert-file-encoding.html.",
fg=typer.colors.RED,
)


def load_json(input_p: Path) -> dict:
"""Load a user-specified json type file."""
with open(input_p, "r") as f:
return json.load(f)
try:
with open(input_p, "r", encoding="utf-8") as f:
return json.load(f)
except UnicodeDecodeError as e:
# TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed
typer.echo(
file_encoding_error_message(input_p),
err=True,
)
raise typer.Exit(code=1) from e
except json.JSONDecodeError as e:
typer.echo(
typer.style(
f"The provided data dictionary {input_p} is not valid JSON. "
"Please provide a valid JSON file.",
fg=typer.colors.RED,
),
alyssadai marked this conversation as resolved.
Show resolved Hide resolved
err=True,
)
raise typer.Exit(code=1) from e


def check_overwrite(output: Path, overwrite: bool):
Expand Down