From dffc328b2dbe5d9a24ed79c6c9c9778fcb7de335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0imon=20Mandl=C3=ADk?= Date: Mon, 28 Oct 2024 18:40:27 +0100 Subject: [PATCH] fixed a Unicode related bug in string shortening --- Project.toml | 2 +- docs/src/api/schema.md | 4 ++-- docs/src/manual/schema_inference.md | 2 +- src/JsonGrinder.jl | 2 +- src/schema/leaf.jl | 5 +++-- src/switches.jl | 22 +++++++++++----------- test/extractor.jl | 6 +++--- test/leaf_entry.jl | 10 +++++++--- 8 files changed, 29 insertions(+), 24 deletions(-) diff --git a/Project.toml b/Project.toml index 827842cc..9177f3e3 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "JsonGrinder" uuid = "d201646e-a9c0-11e8-1063-23b139159713" authors = ["pevnak ", "Matej Racinsky ", "Simon Mandlik "] -version = "2.5.5" +version = "2.6.0" [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" diff --git a/docs/src/api/schema.md b/docs/src/api/schema.md index fa6e97eb..f515cc9e 100644 --- a/docs/src/api/schema.md +++ b/docs/src/api/schema.md @@ -22,6 +22,6 @@ ArrayEntry JsonGrinder.max_values JsonGrinder.max_values! -JsonGrinder.max_string_length -JsonGrinder.max_string_length! +JsonGrinder.max_string_codeunits +JsonGrinder.max_string_codeunits! ``` diff --git a/docs/src/manual/schema_inference.md b/docs/src/manual/schema_inference.md index 7b72cbf2..8627bff9 100644 --- a/docs/src/manual/schema_inference.md +++ b/docs/src/manual/schema_inference.md @@ -125,7 +125,7 @@ the `categorical_limit` argument. Similarly, [`JsonGrinder.jl`](https://github.com/CTUAvastLab/JsonGrinder.jl) also shortens strings that are too long before saving them to schema. This can be governed with the -[`JsonGrinder.max_string_length`](@ref) parameter. +[`JsonGrinder.max_string_codeunits`](@ref) parameter. ## Preprocessing diff --git a/src/JsonGrinder.jl b/src/JsonGrinder.jl index 0c359110..6b7414c6 100644 --- a/src/JsonGrinder.jl +++ b/src/JsonGrinder.jl @@ -18,7 +18,7 @@ const FloatType = Float32 include("switches.jl") @compat public max_values, max_values! -@compat public max_string_length, max_string_length! +@compat public max_string_codeunits, max_string_codeunits! include("exceptions.jl") diff --git a/src/schema/leaf.jl b/src/schema/leaf.jl index a0f4abc9..e8d0f510 100644 --- a/src/schema/leaf.jl +++ b/src/schema/leaf.jl @@ -12,8 +12,9 @@ LeafEntry(::Type{<:Real}) = LeafEntry(Dict{Real, Int}(), 0) LeafEntry(::Type{<:AbstractString}) = LeafEntry(Dict{String, Int}(), 0) function shorten_string(v::AbstractString) - length(v) ≤ max_string_length() && return v - join((v[1:max_string_length()], length(v), bytes2hex(sha1(v))), "_") + l = ncodeunits(v) + l ≤ max_string_codeunits() && return v + join((v[1:thisind(v, max_string_codeunits())], l, bytes2hex(sha1(v))), "_") end update!(e::LeafEntry{Real}, v::Real) = _update_leaf!(e, v) diff --git a/src/switches.jl b/src/switches.jl index 29e13c25..c319f38d 100644 --- a/src/switches.jl +++ b/src/switches.jl @@ -25,29 +25,29 @@ function max_values!(n::Int; persist=false) end end -const _max_string_length = Ref{Int}(@load_preference("max_string_len", 10_000)) +const _max_string_codeunits = Ref{Int}(@load_preference("max_string_len", 10_000)) """ - JsonGrinder.max_string_length!(n::Int) + JsonGrinder.max_string_codeunits!(n::Int) -Get the current value of the `max_string_length` parameter. +Get the current value of the `max_string_codeunits` parameter. -See also: [`JsonGrinder.max_string_length!`](@ref). +See also: [`JsonGrinder.max_string_codeunits!`](@ref). """ -max_string_length() = _max_string_length[] +max_string_codeunits() = _max_string_codeunits[] """ - JsonGrinder.max_string_length!(n::Int; persist=false) + JsonGrinder.max_string_codeunits!(n::Int; persist=false) -Set the value of the `max_string_length` parameter. +Set the value of the `max_string_codeunits` parameter. Set `persist=true` to persist this setting between sessions. -See also: [`JsonGrinder.max_string_length`](@ref). +See also: [`JsonGrinder.max_string_codeunits`](@ref). """ -function max_string_length!(n::Int; persist=false) - _max_string_length[] = n +function max_string_codeunits!(n::Int; persist=false) + _max_string_codeunits[] = n if persist - @set_preferences!("max_string_length" => n) + @set_preferences!("max_string_codeunits" => n) end end diff --git a/test/extractor.jl b/test/extractor.jl index 50e767f3..e7b46793 100644 --- a/test/extractor.jl +++ b/test/extractor.jl @@ -123,8 +123,8 @@ end end @testset "long strings trimming" begin - max_string_length = JsonGrinder.max_string_length() - JsonGrinder.max_string_length!(2) + max_string_codeunits = JsonGrinder.max_string_codeunits() + JsonGrinder.max_string_codeunits!(2) s1 = "foo" s2 = "foa" @@ -134,7 +134,7 @@ end @test ext(s2).data ≈ [0, 1, 0] @test ext("bar").data ≈ [0, 0, 1] end - JsonGrinder.max_string_length!(max_string_length) + JsonGrinder.max_string_codeunits!(max_string_codeunits) end end diff --git a/test/leaf_entry.jl b/test/leaf_entry.jl index 1bad26c6..a85637a4 100644 --- a/test/leaf_entry.jl +++ b/test/leaf_entry.jl @@ -104,8 +104,8 @@ end end @testset "LeafEntry max_string_len" begin - max_string_len = JsonGrinder.max_string_length() - JsonGrinder.max_string_length!(3) + max_string_len = JsonGrinder.max_string_codeunits() + JsonGrinder.max_string_codeunits!(3) @test JsonGrinder.shorten_string("a") == "a" @test JsonGrinder.shorten_string("foo") == "foo" @@ -124,7 +124,11 @@ end @test !haskey(e.counts, "foo bar") @test !haskey(e.counts, "barbaz") - JsonGrinder.max_string_length!(max_string_len) + JsonGrinder.max_string_codeunits!(4) + @test JsonGrinder.shorten_string("αβ") == "αβ" + @test JsonGrinder.shorten_string("αβγ") == "αβ_6_a33e01a96d92c1643ebb4774c1b10d7d9d4b6b6f" + + JsonGrinder.max_string_codeunits!(max_string_len) end @testset "DictEntry update!" begin