Skip to content

Commit

Permalink
fixed a Unicode related bug in string shortening
Browse files Browse the repository at this point in the history
  • Loading branch information
simonmandlik committed Oct 28, 2024
1 parent 1b16ac1 commit dffc328
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 24 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "JsonGrinder"
uuid = "d201646e-a9c0-11e8-1063-23b139159713"
authors = ["pevnak <[email protected]>", "Matej Racinsky <[email protected]>", "Simon Mandlik <[email protected]>"]
version = "2.5.5"
version = "2.6.0"

[deps]
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
Expand Down
4 changes: 2 additions & 2 deletions docs/src/api/schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ ArrayEntry
JsonGrinder.max_values
JsonGrinder.max_values!
JsonGrinder.max_string_length
JsonGrinder.max_string_length!
JsonGrinder.max_string_codeunits
JsonGrinder.max_string_codeunits!
```
2 changes: 1 addition & 1 deletion docs/src/manual/schema_inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ the `categorical_limit` argument.

Similarly, [`JsonGrinder.jl`](https://github.com/CTUAvastLab/JsonGrinder.jl) also shortens strings
that are too long before saving them to schema. This can be governed with the
[`JsonGrinder.max_string_length`](@ref) parameter.
[`JsonGrinder.max_string_codeunits`](@ref) parameter.

## Preprocessing

Expand Down
2 changes: 1 addition & 1 deletion src/JsonGrinder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ const FloatType = Float32

include("switches.jl")
@compat public max_values, max_values!
@compat public max_string_length, max_string_length!
@compat public max_string_codeunits, max_string_codeunits!

include("exceptions.jl")

Expand Down
5 changes: 3 additions & 2 deletions src/schema/leaf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ LeafEntry(::Type{<:Real}) = LeafEntry(Dict{Real, Int}(), 0)
LeafEntry(::Type{<:AbstractString}) = LeafEntry(Dict{String, Int}(), 0)

function shorten_string(v::AbstractString)
length(v) ≤ max_string_length() && return v
join((v[1:max_string_length()], length(v), bytes2hex(sha1(v))), "_")
l = ncodeunits(v)
l ≤ max_string_codeunits() && return v
join((v[1:thisind(v, max_string_codeunits())], l, bytes2hex(sha1(v))), "_")
end

update!(e::LeafEntry{Real}, v::Real) = _update_leaf!(e, v)
Expand Down
22 changes: 11 additions & 11 deletions src/switches.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,29 @@ function max_values!(n::Int; persist=false)
end
end

const _max_string_length = Ref{Int}(@load_preference("max_string_len", 10_000))
const _max_string_codeunits = Ref{Int}(@load_preference("max_string_len", 10_000))

"""
JsonGrinder.max_string_length!(n::Int)
JsonGrinder.max_string_codeunits()
Get the current value of the `max_string_length` parameter.
Get the current value of the `max_string_codeunits` parameter.
See also: [`JsonGrinder.max_string_length!`](@ref).
See also: [`JsonGrinder.max_string_codeunits!`](@ref).
"""
max_string_length() = _max_string_length[]
max_string_codeunits() = _max_string_codeunits[]

"""
JsonGrinder.max_string_length!(n::Int; persist=false)
JsonGrinder.max_string_codeunits!(n::Int; persist=false)
Set the value of the `max_string_length` parameter.
Set the value of the `max_string_codeunits` parameter.
Set `persist=true` to persist this setting between sessions.
See also: [`JsonGrinder.max_string_length`](@ref).
See also: [`JsonGrinder.max_string_codeunits`](@ref).
"""
function max_string_length!(n::Int; persist=false)
_max_string_length[] = n
function max_string_codeunits!(n::Int; persist=false)
_max_string_codeunits[] = n
if persist
@set_preferences!("max_string_length" => n)
@set_preferences!("max_string_codeunits" => n)
end
end
6 changes: 3 additions & 3 deletions test/extractor.jl
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ end
end

@testset "long strings trimming" begin
max_string_length = JsonGrinder.max_string_length()
JsonGrinder.max_string_length!(2)
max_string_codeunits = JsonGrinder.max_string_codeunits()
JsonGrinder.max_string_codeunits!(2)
s1 = "foo"
s2 = "foa"

Expand All @@ -134,7 +134,7 @@ end
@test ext(s2).data ≈ [0, 1, 0]
@test ext("bar").data ≈ [0, 0, 1]
end
JsonGrinder.max_string_length!(max_string_length)
JsonGrinder.max_string_codeunits!(max_string_codeunits)
end
end

Expand Down
10 changes: 7 additions & 3 deletions test/leaf_entry.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ end
end

@testset "LeafEntry max_string_len" begin
max_string_len = JsonGrinder.max_string_length()
JsonGrinder.max_string_length!(3)
max_string_len = JsonGrinder.max_string_codeunits()
JsonGrinder.max_string_codeunits!(3)

@test JsonGrinder.shorten_string("a") == "a"
@test JsonGrinder.shorten_string("foo") == "foo"
Expand All @@ -124,7 +124,11 @@ end
@test !haskey(e.counts, "foo bar")
@test !haskey(e.counts, "barbaz")

JsonGrinder.max_string_length!(max_string_len)
JsonGrinder.max_string_codeunits!(4)
@test JsonGrinder.shorten_string("αβ") == "αβ"
@test JsonGrinder.shorten_string("αβγ") == "αβ_6_a33e01a96d92c1643ebb4774c1b10d7d9d4b6b6f"

JsonGrinder.max_string_codeunits!(max_string_len)
end

@testset "DictEntry update!" begin
Expand Down

2 comments on commit dffc328

@simonmandlik
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/118241

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v2.6.0 -m "<description of version>" dffc328b2dbe5d9a24ed79c6c9c9778fcb7de335
git push origin v2.6.0

Please sign in to comment.