add match tokenization
chengchingwen committed Jan 11, 2022
1 parent 49947e9 commit b3f2f4c
Showing 5 changed files with 159 additions and 6 deletions.
5 changes: 3 additions & 2 deletions src/TextEncodeBase.jl
@@ -30,9 +30,10 @@ struct DefaultTokenization <: AbstractTokenization end

tokenization(::AbstractTokenizer) = DefaultTokenization()

include("./utils.jl")
include("./base.jl")
include("./indexed.jl")
# include("./match.jl")
include("tkrs.jl")
include("./match.jl")
include("./tkrs.jl")

end
19 changes: 15 additions & 4 deletions src/base.jl
@@ -39,8 +39,16 @@ Document(x) = Document(x, nothing)
Sentence(x) = Sentence(x, nothing)
SubSentence(x) = SubSentence(x, nothing)
Word(x) = Word(x, nothing)
SubWord(x) = SubWord(x, nothing)
Token(x) = Token(x, nothing)

updatemeta(x::Document, meta) = Document(x.x, meta)
updatemeta(x::Sentence, meta) = Sentence(x.x, meta)
updatemeta(x::SubSentence, meta) = SubSentence(x.x, meta)
updatemeta(x::Word, meta) = Word(x.x, meta)
updatemeta(x::SubWord, meta) = SubWord(x.x, meta)
updatemeta(x::Token, meta) = Token(x.x, meta)
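
These `updatemeta` methods give every stage a uniform way to swap in new metadata without copying the wrapped payload. A minimal sketch of the intended use (the `NamedTuple` metadata here is illustrative, not part of this diff):

w = Word("foo")                     # meta defaults to nothing
w = updatemeta(w, (word_id = 1,))   # Word("foo", (word_id = 1,))
w.meta.word_id                      # 1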

function Base.show(io::IO, t::TokenStages)
print(io, typeof(t).name.name)
vs = filter(!isnothing, ntuple(i->getfield(t, i), fieldcount(typeof(t))))
@@ -70,13 +78,16 @@ let ATR = AbstractTokenizer, AT = AbstractTokenization
# [tokenization dispatch] default behavior on specific stages, marking the splitting results for further tokenization
global @inline tokenize(::AT, ::DocumentStage, x) = Sentence(x)
global @inline tokenize(::AT, ::SentenceStage, x) = Token(x)
# [tokenization dispatch] default behavior on subsentence stage, also marking the pieces as tokens
global @inline tokenize(::AT, ::SubSentenceStage, x) = Token(x)
# [tokenization dispatch] by default, skip if the splitting result is already wrapped
global @inline tokenize(::AT, ::TokenStages, x::TokenStages) = x

# [full dispatch, default to ignore tokenizer] the outermost API, but these stages are usually unsplittable
global @inline tokenize(::ATR, ::AT, w::WordStage) = [Token(w.x)]
global @inline tokenize(::ATR, ::AT, w::SubWordStage) = [Token(w.x)]
global @inline tokenize(::ATR, ::AT, t::TokenStage) = [t]
global @inline tokenize(tkr::ATR, t::AT, s::Union{WordStage, SubWordStage, TokenStage}) = tokenize(t, s)
# [tokenization dispatch] default behavior for unsplittable types
global @inline tokenize(::AT, w::WordStage) = [Token(w.x)]
global @inline tokenize(::AT, w::SubWordStage) = [Token(w.x)]
global @inline tokenize(::AT, t::TokenStage) = [t]
# [full dispatch] the outermost API: split the input and recursively tokenize the results; ignore if the input is empty
global @inline tokenize(tkr::ATR, t::AT, x::TokenStages) = tokenize_procedure(tkr, t, x)
end
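
Read together, the methods above form a dispatch table: sentence pieces get wrapped as tokens, unsplittable stages collapse to a token list, and already-wrapped results pass through untouched. A sketch of what they imply, using only methods from this hunk:

t = DefaultTokenization()
tokenize(t, Sentence("foo bar"), "foo")   # Token("foo"): sentence pieces become tokens
tokenize(t, Word("foo"))                  # [Token("foo")]: words are unsplittable
tokenize(t, Sentence("ab"), Token("a"))   # Token("a"): already wrapped, passed through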
54 changes: 54 additions & 0 deletions src/match.jl
@@ -0,0 +1,54 @@
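# tokenization that keeps substrings matching any of `patterns` intact as single tokens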
struct MatchTokenization <: AbstractTokenization
patterns::Vector{Regex}
end

splitting(t::MatchTokenization, s::SentenceStage) = collect(Tuple{Bool, SubString}, matchsplits(t.patterns, s.x))

@inline tokenize(t::MatchTokenization, s::SentenceStage, (istoken, x)) = istoken ? Token(x, s.meta) : SubSentence(x, s.meta)
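
For a sentence, `splitting` yields `(istoken, substring)` pairs (via `matchsplits` from src/utils.jl) and `tokenize` freezes the matched spans so they are never split further. A quick sketch:

t = MatchTokenization([r"\d+"])
s = Sentence("abc 123")
splitting(t, s)                 # [(false, "abc "), (true, "123")]
tokenize(t, s, (true, "123"))   # Token("123", nothing): the match skips further splitting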


struct IndexedMatchTokenization <: AbstractTokenization
patterns::Vector{Regex}
end

@inline splitting(t::IndexedMatchTokenization, s::SentenceStage) = splitting(MatchTokenization(t.patterns), s)
@inline splitting(::IndexedMatchTokenization, s::TokenStages, x) = splitting(IndexedTokenization(), s, x)

function splitting(::IndexedMatchTokenization, s::SubSentenceStage, x)
lastid = length(x)
!isnothing(s.meta.rsibling) && (s.meta.rsibling[] = lastid + s.meta.offset[])
return enumerate(x)
end

function splitting(::IndexedMatchTokenization, s::SentenceStage, x)
tokenoffset = map(Base.RefValue, 0:length(x)-1)
RV = Base.RefValue{Int}
v = Tuple{RV, Tuple{Bool, SubString}, Union{RV, Nothing}}[]
for ((i, sp), offset) in zip(enumerate(x), tokenoffset)
push!(v, (offset, sp, i == lastindex(x) ? nothing : tokenoffset[i+1]))
end
return v
end

function tokenize(::IndexedMatchTokenization, s::SentenceStage, (offset, (istoken, x), rsibling))
meta = merge(s.meta, (offset = offset, rsibling = rsibling))
return istoken ? Token(x, meta) : SubSentence(x, meta)
end

@inline tokenize(::IndexedMatchTokenization, d::DocumentStage, x) = tokenize(IndexedTokenization(), d, x)

function tokenize(::IndexedMatchTokenization, s::SubSentenceStage, (i, x))
offset = s.meta.offset[]
meta = Base.structdiff(s.meta, NamedTuple{(:offset, :rsibling)})
return Token(x, merge(meta, (token_id = i+offset,)))
end

function tokenize(::IndexedMatchTokenization, x::TokenStage)
if haskey(x.meta, :offset) && haskey(x.meta, :rsibling)
cid = x.meta.offset[]+1
x.meta.rsibling[] = cid
meta = Base.structdiff(x.meta, NamedTuple{(:offset, :rsibling)})
return [updatemeta(x, merge(meta, (token_id = cid,)))]
end
return [x]
end
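
The index bookkeeping above works by sharing `Ref` cells between neighboring pieces: each piece writes the id of its last token into its right sibling's `offset`, so numbering continues across match boundaries. The handoff in isolation (a sketch of the logic in `tokenize(::IndexedMatchTokenization, ::TokenStage)`):

offset, rsibling = Ref(0), Ref(0)
cid = offset[] + 1    # this token's id: one past the previous piece's last id
rsibling[] = cid      # tell the right sibling where its numbering starts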
17 changes: 17 additions & 0 deletions src/tkrs.jl
@@ -1,7 +1,24 @@
struct MixedTokenization{T <: Tuple} <: AbstractTokenization
ts::T
end
MixedTokenization(ts...) = MixedTokenization(ts)

"tokenizer that run the default behavior"
struct NaiveTokenizer <: AbstractTokenizer end

"default behavior but counting the index"
struct NaiveIndexedTokenizer <: AbstractTokenizer end
tokenization(::NaiveIndexedTokenizer) = IndexedTokenization()

"default behavior but don't split some pattern"
struct NaiveMatchTokenizer <: AbstractTokenizer
patterns::Vector{Regex}
end
tokenization(tkr::NaiveMatchTokenizer) = MatchTokenization(tkr.patterns)

"default behavior but counting index and don't split some pattern"
struct NaiveIndexedMatchTokenizer <: AbstractTokenizer
patterns::Vector{Regex}
end
tokenization(tkr::NaiveIndexedMatchTokenizer) = IndexedMatchTokenization(tkr.patterns)
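
A hypothetical end-to-end use of the new tokenizers, assuming the default sentence splitting (defined elsewhere in the package, not in this diff) breaks text into words:

tkr = NaiveMatchTokenizer([r"\d+"])
tokenize(tkr, Sentence("a 123 b"))   # roughly [Token("a"), Token("123"), Token("b")]; "123" is never split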

70 changes: 70 additions & 0 deletions src/utils.jl
@@ -0,0 +1,70 @@
# match utils

struct MatchSplitIterator
t::Regex
s::Union{String, SubString}
end
Base.eltype(::Type{MatchSplitIterator}) = Tuple{Bool, SubString}
Base.IteratorSize(::Type{MatchSplitIterator}) = Base.SizeUnknown()

function Base.iterate(itr::MatchSplitIterator, (r, i, e) = (nothing, firstindex(itr.s), lastindex(itr.s)))
i > e && return nothing
t, s = itr.t, itr.s
if !isnothing(r)
ri, re = first(r), last(r)
j = isempty(r) ? first(r) : last(r)
v = (true, SubString(s, ri, re))
return v, j > e ? (nothing, i, -1) : (nothing, @inbounds(nextind(s, j)), e)
end

r = findnext(itr.t, itr.s, i)
if isnothing(r)
return (false, SubString(s, i, e)), (nothing, i, -1)
end

ri, re = first(r), last(r)
if i != ri
return (false, SubString(s, i, @inbounds(prevind(s, ri)))), (r, i, e)
else
j = isempty(r) ? first(r) : last(r)
v = (true, SubString(s, ri, re))
return v, j > e ? (nothing, i, -1) : (nothing, @inbounds(nextind(s, j)), e)
end
end
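
The iterator alternates unmatched and matched pieces; for example:

collect(MatchSplitIterator(r"\d+", "abc123def"))
# 3-element Vector{Tuple{Bool, SubString}}:
#  (false, "abc")
#  (true, "123")
#  (false, "def")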

matchsplit(t, s) = matchsplit!(Tuple{Bool, SubString}[], t, s)
function matchsplit!(found, t, s)
i, e = firstindex(s), lastindex(s)

while true
r = findnext(t, s, i)
if isnothing(r)
push!(found, (false, SubString(s, i, e)))
break
end

ri, re = first(r), last(r)
i != ri && push!(found, (false, @inbounds SubString(s, i, prevind(s, ri))))
push!(found, (true, SubString(s, ri, re)))

j = isempty(r) ? first(r) : last(r)
j > e && break
@inbounds i = nextind(s, j)
i > e && break
end
return found
end
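
`matchsplit` is the eager counterpart, collecting the same pieces into a vector:

matchsplit(r"\d+", "abc123def")   # [(false, "abc"), (true, "123"), (false, "def")]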

function matchsplits(patterns, x)
m, ms = first(patterns), @view patterns[2:end]
sp = MatchSplitIterator(m, x)

for m in ms
iters = Iterators.map(sp) do (istoken, s)
istoken ? ((istoken, s) for _ = 1:1) : MatchSplitIterator(m, s)
end
sp = Iterators.Flatten(iters)
end
return sp
end
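
Patterns apply in order: the first pattern splits the whole string, and each later pattern only sees the pieces still unmatched, so earlier patterns take precedence. For example:

collect(matchsplits([r"\d+", r"[a-z]+"], "ab12CD"))
# [(true, "ab"), (true, "12"), (false, "CD")]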
