diff --git a/src/normalize.jl b/src/normalize.jl
index bb03292..de4b94e 100644
--- a/src/normalize.jl
+++ b/src/normalize.jl
@@ -62,13 +62,23 @@ end
 
 ### replace
 
-struct ReplaceNormalizer{T<:AbstractTokenization, P<:Pair} <: SentenceNormalizer{T}
+struct SentenceReplaceNormalizer{T<:AbstractTokenization, P<:Pair} <: SentenceNormalizer{T}
     base::T
     pattern::P
 end
-ReplaceNormalizer(pattern) = ReplaceNormalizer(DefaultTokenization(), pattern)
+SentenceReplaceNormalizer(pattern) = SentenceReplaceNormalizer(DefaultTokenization(), pattern)
 
-normalizer(t::ReplaceNormalizer) = Base.Fix2(replace, t.pattern)
+normalizer(t::SentenceReplaceNormalizer) = Base.Fix2(replace, t.pattern)
+
+struct WordReplaceNormalizer{T<:AbstractTokenization, P<:Pair} <: WordNormalizer{T}
+    base::T
+    pattern::P
+end
+WordReplaceNormalizer(pattern) = WordReplaceNormalizer(DefaultTokenization(), pattern)
+
+normalizer(t::WordReplaceNormalizer) = Base.Fix2(replace, t.pattern)
+
+const ReplaceNormalizer = SentenceReplaceNormalizer
 
 ### Codemap
 
diff --git a/src/split.jl b/src/split.jl
index 892afa5..a150319 100644
--- a/src/split.jl
+++ b/src/split.jl
@@ -3,9 +3,9 @@ struct EachSplitTokenization{S} <: BaseTokenization
 end
 
 @static if VERSION < v"1.8"
-    splitting(t::EachSplitTokenization, s::SentenceStage) = split(getvalue(s), t.splitter)
+    splitting(t::EachSplitTokenization, s::SentenceStage) = split(getvalue(s), t.splitter; keepempty = false)
 else
-    splitting(t::EachSplitTokenization, s::SentenceStage) = eachsplit(getvalue(s), t.splitter)
+    splitting(t::EachSplitTokenization, s::SentenceStage) = eachsplit(getvalue(s), t.splitter; keepempty = false)
 end
 
 struct EachMatchTokenization <: BaseTokenization
diff --git a/test/runtests.jl b/test/runtests.jl
index e57121f..e224079 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -16,6 +16,7 @@ using TextEncodeBase: AbstractTokenizer, AbstractTokenization,
     WordTokenization, EachSplitTokenization, EachMatchTokenization,
     IndexedTokenization, MatchTokenization, UnicodeNormalizer,
     CodeNormalizer, CodeUnMap,
+    SentenceReplaceNormalizer, WordReplaceNormalizer,
     TokenStages, Document, Sentence, Word, Token, Batch
 using TextEncodeBase: getvalue, getmeta, updatevalue,
     with_head_tail, trunc_and_pad, trunc_or_pad, nested2batch, nestedcall
@@ -137,6 +138,17 @@ end
         @test map(getvalue, tkr(sentence)) ==
             map(x->replace(x, r"\d+"=>"NUMBER"), nltk_word_tokenize(sentence.x))
         @test tkr(word) == [Token(word.x)]
+
+        tkr1 = FlatTokenizer(SentenceReplaceNormalizer(r"(.+)"=>s"--\1"))
+        @test map(getvalue, tkr1(document)) ==
+            mapfoldl(nltk_word_tokenize ∘ Base.Fix1(*, "--"), append!, split_sentences(document.x))
+        @test map(getvalue, tkr1(sentence)) == nltk_word_tokenize("--" * sentence.x)
+        @test tkr1(word) == [Token("--" * word.x)]
+        tkr2 = FlatTokenizer(WordReplaceNormalizer(r"(.+)"=>s"--\1"))
+        @test map(getvalue, tkr2(document)) == map(
+            Base.Fix1(*, "--"), mapfoldl(nltk_word_tokenize, append!, split_sentences(document.x)))
+        @test map(getvalue, tkr2(sentence)) == map(Base.Fix1(*, "--"), nltk_word_tokenize(sentence.x))
+        @test tkr2(word) == [Token("--" * word.x)]
     end
 
     @testset "code normalizer" begin
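
A minimal usage sketch of the two normalizers added above, inferred from the new tests: SentenceReplaceNormalizer rewrites the whole sentence before it is split into words, WordReplaceNormalizer rewrites each word after splitting, and ReplaceNormalizer remains as an alias for the sentence-level version. The input strings and the commented results below are illustrative assumptions rather than output taken from the test suite; FlatTokenizer, Sentence, and getvalue are the same TextEncodeBase names the tests import.

using TextEncodeBase: FlatTokenizer, SentenceReplaceNormalizer, WordReplaceNormalizer,
    ReplaceNormalizer, Sentence, getvalue

# Sentence-level replace runs before word splitting, so the substitution's
# "--" prefix lands only once, at the start of the sentence.
tkr_sentence = FlatTokenizer(SentenceReplaceNormalizer(r"(.+)" => s"--\1"))
map(getvalue, tkr_sentence(Sentence("hello world")))  # expected: ["--hello", "world"]

# Word-level replace runs after splitting, so every word is rewritten.
tkr_word = FlatTokenizer(WordReplaceNormalizer(r"(.+)" => s"--\1"))
map(getvalue, tkr_word(Sentence("hello world")))      # expected: ["--hello", "--world"]

# Existing code keeps working: ReplaceNormalizer is now an alias for
# SentenceReplaceNormalizer, per the `const` declaration in the diff above.
tkr_old = FlatTokenizer(ReplaceNormalizer(r"\d+" => "NUMBER"))
map(getvalue, tkr_old(Sentence("call 911")))          # expected: ["call", "NUMBER"]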