neurobagel · alyssadai · Aug 16, 2024 · Aug 14, 2024 · Aug 14, 2024 · Aug 14, 2024
diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
@@ -9,11 +9,13 @@
 import jsonschema
 import pandas as pd
 import pydantic
+import typer
 from pandas import DataFrame
 from typer import BadParameter
 
 from bagel import dictionary_models, mappings, models
 from bagel.mappings import COGATLAS, NB, NCIT, NIDM, SNOMED
+from bagel.utility import file_encoding_error_message
 
 DICTIONARY_SCHEMA = dictionary_models.DataDictionary.schema()
 
@@ -42,9 +44,21 @@ def validate_portal_uri(portal: str) -> Optional[str]:
 def load_pheno(input_p: Path) -> pd.DataFrame | None:
     """Load a .tsv pheno file and do some basic validation."""
     if input_p.suffix == ".tsv":
-        pheno_df: DataFrame = pd.read_csv(
-            input_p, sep="\t", keep_default_na=False, dtype=str
-        )
+        try:
+            pheno_df: DataFrame = pd.read_csv(
+                input_p,
+                sep="\t",
+                keep_default_na=False,
+                dtype=str,
+                encoding="utf-8",
+            )
+        except UnicodeDecodeError as e:
+            # TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed
+            typer.echo(
+                file_encoding_error_message(input_p),
+                err=True,
+            )
+            raise typer.Exit(code=1) from e
 
         if pheno_df.shape[1] > 1:
             return pheno_df
@@ -352,9 +366,21 @@ def validate_data_dict(data_dict: dict) -> None:
     try:
         jsonschema.validate(data_dict, DICTIONARY_SCHEMA)
     except jsonschema.ValidationError as e:
+        # TODO: When *every* item in an input JSON is not schema valid,
+        # jsonschema.validate will raise a ValidationError for only *one* item among them.
+        # Weirdly, the item that is chosen can vary, possibly due to jsonschema internally processing/validating
+        # items in an inconsistent order.
+        # You can reproduce this by running `bagel pheno` on the example_invalid input pair.
+        # This isn't necessarily a problem as the error will not be wrong, but if we care about
+        # returning ALL invalid items, we may want to use something like a Draft7Validator instance instead.
+        # NOTE: If the validation error occurs at the root level (i.e., the entire JSON object fails),
+        # e.path may be empty. We have a backup descriptor "Entire document" for the offending item in this case.
         raise ValueError(
-            "The provided data dictionary is not a valid Neurobagel data dictionary. "
-            "Make sure that each annotated column contains an 'Annotations' key."
+            "The provided data dictionary is not a valid Neurobagel data dictionary.\n"
+            "Details:\n"
+            f"Entry that failed validation: {e.path[-1] if e.path else 'Entire document'}\n"
+            f"{e.message}\n"
+            "Tip: Make sure that each annotated column contains an 'Annotations' key."
         ) from e
 
     if get_annotated_columns(data_dict) == []:

diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md
@@ -26,6 +26,8 @@ Example inputs to the CLI
 | 19 | Example with two columns about diagnosis | Valid | pass |
 | 20 | Valid, based on example 19 but contains multiple annotated columns about age and sex | Valid | pass |
 | 21 | Valid, based on example 2 but contains a dash in a column name | Valid | pass |
+| iso88591 | invalid, ISO-8859-1 encoded and contains French characters in the age column | invalid, ISO-8859-1 encoded and contains French characters in the age column | fail |
+| invalid_json | - | not valid JSON, contains trailing comma after `group` key-value pair | fail |
 
 `* this is expected to fail until we enable multiple participant_ID handling`.
 

diff --git a/bagel/tests/data/example_invalid_json.json b/bagel/tests/data/example_invalid_json.json
@@ -0,0 +1,45 @@
+{
+  "participant_id": {
+    "Description": "A participant ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:ParticipantID",
+        "Label": "Unique participant identifier"
+      },
+      "Identifies": "participant"
+    }
+  },
+  "session_id": {
+    "Description": "A session ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:SessionID",
+        "Label": "Unique session identifier"
+      },
+      "Identifies": "session"
+    }
+  },
+  "group": {
+    "Description": "Group variable",
+    "Levels": {
+      "PAT": "Patient",
+      "CTRL": "Control subject"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PAT": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "CTRL": {
+          "TermURL": "ncit:C94342",
+          "Label": "Healthy Control"
+        }
+      }
+    }
+  },
+}
diff --git a/bagel/tests/data/example_iso88591.json b/bagel/tests/data/example_iso88591.json
@@ -0,0 +1,35 @@
+{
+  "participant_id": {
+    "Description": "A participant ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:ParticipantID",
+        "Label": "Unique participant identifier"
+      },
+      "Identifies": "participant"
+    }
+  },
+  "session_id": {
+    "Description": "A session ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:SessionID",
+        "Label": "Unique session identifier"
+      },
+      "Identifies": "session"
+    }
+  },
+  "âge_des_participants": {
+    "Description": "âge du participant",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Age",
+        "Label": "Chronological age"
+      },
+      "Transformation": {
+        "TermURL": "nb:FromISO8601",
+        "Label": "A period of time defined according to the ISO8601 standard"
+      }
+    }
+  }
+}
diff --git a/bagel/tests/data/example_iso88591.tsv b/bagel/tests/data/example_iso88591.tsv
@@ -0,0 +1,5 @@
+participant_id	session_id	âge_des_participants
+sub-01	ses-01	"P20Y6M"
+sub-01	ses-02	"P20Y8M"
+sub-02	ses-01	"P25Y8M"
+sub-02	ses-02	"P26Y4M"
diff --git a/bagel/tests/test_utility.py b/bagel/tests/test_utility.py
@@ -4,11 +4,13 @@
 
 import pandas as pd
 import pytest
+import typer
 from bids import BIDSLayout
 
 import bagel.bids_utils as butil
 import bagel.pheno_utils as putil
 from bagel import mappings
+from bagel.utility import load_json
 
 
 @pytest.fixture
@@ -60,29 +62,68 @@ def test_all_used_namespaces_have_urls(
         ), f"The namespace '{prefix}' was used in the data dictionary, but was not defined in the @context."
 
 
-def test_schema_invalid_column_raises_error():
-    partial_data_dict = (
-        {
-            "participant_id": {
-                "Description": "A participant ID",
-                "Annotations": {
-                    "IsAbout": {
-                        "TermURL": "nb:ParticipantID",
-                        "Label": "Unique participant identifier",
-                    }
+@pytest.mark.parametrize(
+    "partial_data_dict, invalid_item",
+    [
+        # sex column missing Levels
+        (
+            {
+                "participant_id": {
+                    "Description": "A participant ID",
+                    "Annotations": {
+                        "IsAbout": {
+                            "TermURL": "nb:ParticipantID",
+                            "Label": "Unique participant identifier",
+                        },
+                        "Identifies": "participant",
+                    },
+                },
+                "sex": {
+                    "Description": "Participant sex",
+                    "Annotations": {
+                        "IsAbout": {"TermURL": "nb:Sex", "Label": ""}
+                    },
                 },
             },
-            "sex": {
-                "Description": "Participant sex",
-                "Annotations": {"IsAbout": {"TermURL": "nb:Sex", "Label": ""}},
+            "sex",
+        ),
+        # age column missing Transformation
+        (
+            {
+                "participant_id": {
+                    "Description": "A participant ID",
+                    "Annotations": {
+                        "IsAbout": {
+                            "TermURL": "nb:ParticipantID",
+                            "Label": "Unique participant identifier",
+                        },
+                        "Identifies": "participant",
+                    },
+                },
+                "age": {
+                    "Description": "Participant age",
+                    "Annotations": {
+                        "IsAbout": {
+                            "TermURL": "nb:Age",
+                            "Label": "Chronological age",
+                        }
+                    },
+                },
             },
-        },
-    )
-
+            "age",
+        ),
+    ],
+)
+def test_schema_invalid_column_raises_error(partial_data_dict, invalid_item):
+    """
+    Test that when an input data dictionary contains a schema invalid column annotation,
+    an informative error is raised which includes the name of the offending column.
+    """
     with pytest.raises(ValueError) as e:
         putil.validate_data_dict(partial_data_dict)
 
-    assert "not a valid Neurobagel data dictionary" in str(e.value)
+    for substring in ["not a valid Neurobagel data dictionary", invalid_item]:
+        assert substring in str(e.value)
 
 
 def test_get_columns_that_are_about_concept(test_data, load_test_json):
@@ -474,3 +515,30 @@ def test_get_session_path_when_session_missing(bids_sub_id):
     assert session_path.endswith(f"sub-{bids_sub_id}")
     assert Path(session_path).is_absolute()
     assert Path(session_path).is_dir()
+
+
+@pytest.mark.parametrize(
+    "unreadable_json,expected_message",
+    [
+        ("example_iso88591.json", "Failed to decode the input file"),
+        ("example_invalid_json.json", "not valid JSON"),
+    ],
+)
+def test_failed_json_reading_raises_informative_error(
+    test_data, unreadable_json, expected_message, capsys
+):
+    """Test that when there is an issue reading an input JSON file, the CLI exits with an informative error message."""
+    with pytest.raises(typer.Exit):
+        load_json(test_data / unreadable_json)
+    captured = capsys.readouterr()
+
+    assert expected_message in captured.err
+
+
+def test_unsupported_tsv_encoding_raises_informative_error(test_data, capsys):
+    """Test that given an input phenotypic TSV with an unsupported encoding, the CLI exits with an informative error message."""
+    with pytest.raises(typer.Exit):
+        putil.load_pheno(test_data / "example_iso88591.tsv")
+    captured = capsys.readouterr()
+
+    assert "Failed to decode the input file" in captured.err
diff --git a/bagel/utility.py b/bagel/utility.py
@@ -4,10 +4,38 @@
 import typer
 
 
+def file_encoding_error_message(input_p: Path) -> str:
+    """Return a message for when a file cannot be read due to encoding issues."""
+    return typer.style(
+        f"Failed to decode the input file {input_p}. "
+        "Please ensure that both your phenotypic .tsv file and .json data dictionary have UTF-8 encoding.\n"
+        "Tip: Need help converting your file? Try a tool like iconv (http://linux.die.net/man/1/iconv) or https://www.freeformatter.com/convert-file-encoding.html.",
+        fg=typer.colors.RED,
+    )
+
+
 def load_json(input_p: Path) -> dict:
     """Load a user-specified json type file."""
-    with open(input_p, "r") as f:
-        return json.load(f)
+    try:
+        with open(input_p, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except UnicodeDecodeError as e:
+        # TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed
+        typer.echo(
+            file_encoding_error_message(input_p),
+            err=True,
+        )
+        raise typer.Exit(code=1) from e
+    except json.JSONDecodeError as e:
+        typer.echo(
+            typer.style(
+                f"The provided data dictionary {input_p} is not valid JSON. "
+                "Please provide a valid JSON file.",
+                fg=typer.colors.RED,
+            ),
+            err=True,
+        )
+        raise typer.Exit(code=1) from e
 
 
 def check_overwrite(output: Path, overwrite: bool):