Skip to content

Commit

Permalink
Merge pull request #175 from dandi/enh-sanitize-value
Browse files Browse the repository at this point in the history
ENH: adopt sanitize_value from dandi-cli and use for sanitization of identifier
  • Loading branch information
yarikoptic authored Aug 13, 2024
2 parents 1110194 + 03748b9 commit 6fc16b8
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 3 deletions.
5 changes: 3 additions & 2 deletions dandischema/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from .utils import (
TransitionalGenerateJsonSchema,
_ensure_newline,
sanitize_value,
strip_top_level_optional,
version2tuple,
)
Expand Down Expand Up @@ -310,7 +311,7 @@ def migrate(
def _get_samples(value: dict, stats: _stats_type, hierarchy: Any) -> _stats_type:
if "sampleType" in value:
sampletype = value["sampleType"]["name"]
obj = value["identifier"].replace("_", "-")
obj = sanitize_value(value["identifier"])
if obj not in stats[sampletype]:
stats[sampletype].append(obj)
if "wasDerivedFrom" in value:
Expand Down Expand Up @@ -354,7 +355,7 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None:
if value["species"] not in stats["species"]:
stats["species"].append(value["species"])
if value.get("identifier", None):
subject = value["identifier"].replace("_", "-")
subject = sanitize_value(value["identifier"])
if subject not in stats["subjects"]:
stats["subjects"].append(subject)

Expand Down
18 changes: 17 additions & 1 deletion dandischema/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@

import pytest

from ..utils import _ensure_newline, name2title, strip_top_level_optional, version2tuple
from ..utils import (
_ensure_newline,
name2title,
sanitize_value,
strip_top_level_optional,
version2tuple,
)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -72,3 +78,13 @@ def test_newline() -> None:
)
def test_strip_top_level_optional(input_: type, expected_output: type) -> None:
assert strip_top_level_optional(input_) == expected_output


def test_sanitize_value() -> None:
    """Exercise ``sanitize_value`` on dots, separators, slashes and quotes."""
    # Each case is (positional arguments, expected sanitized result).
    cases = [
        # "." is kept only when the field is "extension"; it is replaced
        # for any other field name and for the default field.
        (("_.ext", "extension"), "-.ext"),
        (("_.ext", "unrelated"), "--ext"),
        (("_.ext",), "--ext"),
        (("A;B",), "A-B"),
        (("A\\/B",), "A--B"),
        (("A\"'B",), "A--B"),
    ]
    for args, expected in cases:
        assert sanitize_value(*args) == expected
22 changes: 22 additions & 0 deletions dandischema/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,25 @@ def strip_top_level_optional(type_: Any) -> Any:
else:
# `type_` is not an Optional
return type_


def sanitize_value(value: str, field: str = "non-extension", sub: str = "-") -> str:
    """Replace all "non-compliant" characters with `sub` (``-`` by default).

    Of particular importance is ``_`` which we use, as in BIDS, to separate
    _key-value entries.  It is not sanitizing to BIDS level of clarity though.
    In BIDS only alphanumerics are allowed, and here we only replace some known
    to be offending symbols with `sub`.

    When `field` is not "extension", we also replace ".".

    Based on dandi.organize._sanitize_value.

    Parameters
    ----------
    value
        The string to sanitize.
    field
        Name of the field the value belongs to.  Only the exact value
        ``"extension"`` changes behavior: in that case "." is left intact.
    sub
        Replacement inserted verbatim for every offending character.  It is
        never interpreted as a regular-expression replacement template, so
        backslashes and ``\\g<...>`` sequences are kept literally.

    Returns
    -------
    str
        `value` with each offending character replaced by `sub`.

    .. versionchanged:: 0.8.3

        ``sanitize_value`` added
    """
    # A callable replacement makes `re.sub` insert `sub` as-is.  Passing the
    # string directly would treat it as a template, so a backslash or group
    # reference in `sub` would raise `re.error` or be expanded -- inconsistent
    # with the literal `str.replace` used for "." below.
    value = re.sub(r"[_*\\/<>:|\"'?%@;,\s]", lambda _match: sub, value)
    if field != "extension":
        value = value.replace(".", sub)
    return value

0 comments on commit 6fc16b8

Please sign in to comment.