diff --git a/dandischema/metadata.py b/dandischema/metadata.py index 21999ea..4c81325 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -21,6 +21,7 @@ from .utils import ( TransitionalGenerateJsonSchema, _ensure_newline, + sanitize_value, strip_top_level_optional, version2tuple, ) @@ -310,7 +311,7 @@ def migrate( def _get_samples(value: dict, stats: _stats_type, hierarchy: Any) -> _stats_type: if "sampleType" in value: sampletype = value["sampleType"]["name"] - obj = value["identifier"].replace("_", "-") + obj = sanitize_value(value["identifier"]) if obj not in stats[sampletype]: stats[sampletype].append(obj) if "wasDerivedFrom" in value: @@ -354,7 +355,7 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None: if value["species"] not in stats["species"]: stats["species"].append(value["species"]) if value.get("identifier", None): - subject = value["identifier"].replace("_", "-") + subject = sanitize_value(value["identifier"]) if subject not in stats["subjects"]: stats["subjects"].append(subject) diff --git a/dandischema/tests/test_utils.py b/dandischema/tests/test_utils.py index 75a62d4..71d9e7a 100644 --- a/dandischema/tests/test_utils.py +++ b/dandischema/tests/test_utils.py @@ -2,7 +2,13 @@ import pytest -from ..utils import _ensure_newline, name2title, strip_top_level_optional, version2tuple +from ..utils import ( + _ensure_newline, + name2title, + sanitize_value, + strip_top_level_optional, + version2tuple, +) @pytest.mark.parametrize( @@ -72,3 +78,13 @@ def test_newline() -> None: ) def test_strip_top_level_optional(input_: type, expected_output: type) -> None: assert strip_top_level_optional(input_) == expected_output + + +def test_sanitize_value() -> None: + # . 
# Characters that are never allowed in a sanitized value.  "_" matters most:
# it is the separator between _key-value entries (as in BIDS).
_NON_COMPLIANT_RE = re.compile(r"[_*\\/<>:|\"'?%@;,\s]")


def sanitize_value(value: str, field: str = "non-extension", sub: str = "-") -> str:
    """Replace all "non-compliant" characters in `value` with `sub`

    Of particular importance is _ which we use, as in BIDS, to separate
    _key-value entries.  It is not sanitizing to BIDS level of clarity though.
    In BIDS only alphanumerics are allowed, and here we only replace some known
    to be offending symbols with `sub`.

    When `field` is not "extension", we also replace ".".

    Based on dandi.organize._sanitize_value.

    Parameters
    ----------
    value
        The string to sanitize.
    field
        Name of the field the value belongs to.  Only the exact value
        "extension" changes behavior: "." is then left untouched.
    sub
        Replacement inserted for every offending character.  It is always
        inserted verbatim: backslashes and group references such as ``\\1``
        or ``\\g<0>`` carry no special meaning.

    Returns
    -------
    str
        `value` with each offending character replaced by `sub`.

    .. versionchanged:: 0.8.3

        ``sanitize_value`` added
    """
    # A callable replacement makes ``re.sub`` insert `sub` literally.  Passing
    # the string itself would have it parsed as a replacement template, so a
    # `sub` containing a backslash would either raise ``re.error`` or expand a
    # group reference -- unlike the plain ``str.replace`` used for "." below.
    value = _NON_COMPLIANT_RE.sub(lambda _match: sub, value)
    if field != "extension":
        value = value.replace(".", sub)
    return value


def test_sanitize_value() -> None:
    # "." is kept in an extension but replaced everywhere else
    assert sanitize_value("_.ext", "extension") == "-.ext"
    assert sanitize_value("_.ext", "unrelated") == "--ext"
    assert sanitize_value("_.ext") == "--ext"
    assert sanitize_value("A;B") == "A-B"
    assert sanitize_value("A\\/B") == "A--B"
    assert sanitize_value("A\"'B") == "A--B"
    # `sub` is inserted literally, even when it looks like a regex template
    assert sanitize_value("a_b.c", sub="\\") == "a\\b\\c"
    assert sanitize_value("a_b", sub=r"\g<0>") == r"a\g<0>b"