Skip to content

Commit

Permalink
add word replace normalizer
Browse files Browse the repository at this point in the history
  • Loading branch information
chengchingwen committed Nov 9, 2022
1 parent bb2f267 commit faedcaf
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 5 deletions.
16 changes: 13 additions & 3 deletions src/normalize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,23 @@ end

### replace

"""
    SentenceReplaceNormalizer(pattern::Pair)
    SentenceReplaceNormalizer(base::AbstractTokenization, pattern::Pair)

Sentence-level normalizer that rewrites each sentence with
`replace(sentence, pattern)`. With the single-argument form, `base`
defaults to `DefaultTokenization()`.
"""
struct SentenceReplaceNormalizer{T<:AbstractTokenization, P<:Pair} <: SentenceNormalizer{T}
    base::T
    pattern::P
end
SentenceReplaceNormalizer(pattern) = SentenceReplaceNormalizer(DefaultTokenization(), pattern)

# Partial application: `Fix2(replace, p)` is equivalent to `s -> replace(s, p)`.
normalizer(t::SentenceReplaceNormalizer) = Base.Fix2(replace, t.pattern)

"""
    WordReplaceNormalizer(pattern::Pair)
    WordReplaceNormalizer(base::AbstractTokenization, pattern::Pair)

Word-level normalizer that rewrites each word with `replace(word, pattern)`.
With the single-argument form, `base` defaults to `DefaultTokenization()`.
"""
struct WordReplaceNormalizer{B<:AbstractTokenization, R<:Pair} <: WordNormalizer{B}
    base::B
    pattern::R
end
WordReplaceNormalizer(pattern) = WordReplaceNormalizer(DefaultTokenization(), pattern)

function normalizer(t::WordReplaceNormalizer)
    # Partially apply the stored pattern: behaves as `w -> replace(w, t.pattern)`.
    return Base.Fix2(replace, t.pattern)
end

# Backward-compatible alias: `ReplaceNormalizer` was the original name of the
# sentence-level replace normalizer before the word-level variant was added.
const ReplaceNormalizer = SentenceReplaceNormalizer

### Codemap

Expand Down
4 changes: 2 additions & 2 deletions src/split.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ struct EachSplitTokenization{S} <: BaseTokenization
end

# `eachsplit` (lazy) was introduced in Julia 1.8; fall back to eager `split`
# on older versions. Both drop empty substrings via `keepempty = false`.
@static if VERSION < v"1.8"
    # Empty tokens are discarded so consecutive delimiters don't yield "" words.
    splitting(t::EachSplitTokenization, s::SentenceStage) = split(getvalue(s), t.splitter; keepempty = false)
else
    splitting(t::EachSplitTokenization, s::SentenceStage) = eachsplit(getvalue(s), t.splitter; keepempty = false)
end

struct EachMatchTokenization <: BaseTokenization
Expand Down
12 changes: 12 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ using TextEncodeBase: AbstractTokenizer, AbstractTokenization,
WordTokenization, EachSplitTokenization, EachMatchTokenization,
IndexedTokenization, MatchTokenization,
UnicodeNormalizer, CodeNormalizer, CodeUnMap,
SentenceReplaceNormalizer, WordReplaceNormalizer,
TokenStages, Document, Sentence, Word, Token, Batch
using TextEncodeBase: getvalue, getmeta, updatevalue,
with_head_tail, trunc_and_pad, trunc_or_pad, nested2batch, nestedcall
Expand Down Expand Up @@ -137,6 +138,17 @@ end
@test map(getvalue, tkr(sentence)) ==
map(x->replace(x, r"\d+"=>"NUMBER"), nltk_word_tokenize(sentence.x))
@test tkr(word) == [Token(word.x)]

tkr1 = FlatTokenizer(SentenceReplaceNormalizer(r"(.+)"=>s"--\1"))
@test map(getvalue, tkr1(document)) ==
mapfoldl(nltk_word_tokenize ∘ Base.Fix1(*, "--"), append!, split_sentences(document.x))
@test map(getvalue, tkr1(sentence)) == nltk_word_tokenize("--" * sentence.x)
@test tkr1(word) == [Token("--" * word.x)]
tkr2 = FlatTokenizer(WordReplaceNormalizer(r"(.+)"=>s"--\1"))
@test map(getvalue, tkr2(document)) == map(
Base.Fix1(*, "--"), mapfoldl(nltk_word_tokenize, append!, split_sentences(document.x)))
@test map(getvalue, tkr2(sentence)) == map(Base.Fix1(*, "--"), nltk_word_tokenize(sentence.x))
@test tkr2(word) == [Token("--" * word.x)]
end

@testset "code normalizer" begin
Expand Down

0 comments on commit faedcaf

Please sign in to comment.