From ebb4b46b9226bbb0c810b7df6c6718e54edbd6cd Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 6 Apr 2023 17:35:29 -0400 Subject: [PATCH 1/3] Adopt sanitize_value from dandi-cli --- dandischema/tests/test_utils.py | 18 +++++++++++++++++- dandischema/utils.py | 22 ++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/dandischema/tests/test_utils.py b/dandischema/tests/test_utils.py index 75a62d4..71d9e7a 100644 --- a/dandischema/tests/test_utils.py +++ b/dandischema/tests/test_utils.py @@ -2,7 +2,13 @@ import pytest -from ..utils import _ensure_newline, name2title, strip_top_level_optional, version2tuple +from ..utils import ( + _ensure_newline, + name2title, + sanitize_value, + strip_top_level_optional, + version2tuple, +) @pytest.mark.parametrize( @@ -72,3 +78,13 @@ def test_newline() -> None: ) def test_strip_top_level_optional(input_: type, expected_output: type) -> None: assert strip_top_level_optional(input_) == expected_output + + +def test_sanitize_value() -> None: + # . is not sanitized in extension but elsewhere + assert sanitize_value("_.ext", "extension") == "-.ext" + assert sanitize_value("_.ext", "unrelated") == "--ext" + assert sanitize_value("_.ext") == "--ext" + assert sanitize_value("A;B") == "A-B" + assert sanitize_value("A\\/B") == "A--B" + assert sanitize_value("A\"'B") == "A--B" diff --git a/dandischema/utils.py b/dandischema/utils.py index c73167c..e660acf 100644 --- a/dandischema/utils.py +++ b/dandischema/utils.py @@ -114,3 +114,25 @@ def strip_top_level_optional(type_: Any) -> Any: else: # `type_` is not an Optional return type_ + + +def sanitize_value(value: str, field: str = "non-extension", sub: str = "-") -> str: + """Replace all "non-compliant" characters with - + + Of particular importance is _ which we use, as in BIDS, to separate + _key-value entries. It is not sanitizing to BIDS level of clarity though. 
+ In BIDS only alphanumerics are allowed, and here we only replace some known + to be offending symbols with `sub`. + + When `field` is not "extension", we also replace ".". + + Based on dandi.organize._sanitize_value. + + .. versionchanged:: 0.8.3 + + ``sanitize_value`` added + """ + value = re.sub(r"[_*\\/<>:|\"'?%@;]", sub, value) + if field != "extension": + value = value.replace(".", sub) + return value From 61c374d56286154406720fa9ebd6436460dc2be5 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 6 Apr 2023 18:15:06 -0400 Subject: [PATCH 2/3] ENH: use sanitize_value helper instead of ad hoc replacement of _ This seems to close #172 as well --- dandischema/metadata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dandischema/metadata.py b/dandischema/metadata.py index 21999ea..4c81325 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -21,6 +21,7 @@ from .utils import ( TransitionalGenerateJsonSchema, _ensure_newline, + sanitize_value, strip_top_level_optional, version2tuple, ) @@ -310,7 +311,7 @@ def migrate( def _get_samples(value: dict, stats: _stats_type, hierarchy: Any) -> _stats_type: if "sampleType" in value: sampletype = value["sampleType"]["name"] - obj = value["identifier"].replace("_", "-") + obj = sanitize_value(value["identifier"]) if obj not in stats[sampletype]: stats[sampletype].append(obj) if "wasDerivedFrom" in value: @@ -354,7 +355,7 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None: if value["species"] not in stats["species"]: stats["species"].append(value["species"]) if value.get("identifier", None): - subject = value["identifier"].replace("_", "-") + subject = sanitize_value(value["identifier"]) if subject not in stats["subjects"]: stats["subjects"].append(subject) From 03748b958560d753e99c33671814b48acbd87a35 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 13 Aug 2024 10:53:17 -0400 Subject: [PATCH 3/3] More sanitization of entities like 
done in dandi-cli --- dandischema/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dandischema/utils.py b/dandischema/utils.py index e660acf..f82fac4 100644 --- a/dandischema/utils.py +++ b/dandischema/utils.py @@ -132,7 +132,7 @@ def sanitize_value(value: str, field: str = "non-extension", sub: str = "-") -> ``sanitize_value`` added """ - value = re.sub(r"[_*\\/<>:|\"'?%@;]", sub, value) + value = re.sub(r"[_*\\/<>:|\"'?%@;,\s]", sub, value) if field != "extension": value = value.replace(".", sub) return value