Commit b3f2f4c (parent 49947e9): 5 changed files with 159 additions and 6 deletions.
@@ -0,0 +1,54 @@
struct MatchTokenization <: AbstractTokenization
    patterns::Vector{Regex}
end

# Split the sentence around the patterns; matched spans are tagged true.
splitting(t::MatchTokenization, s::SentenceStage) = collect(Tuple{Bool, SubString}, matchsplits(t.patterns, s.x))

# Matched spans become Tokens directly; the text in between is re-wrapped as SubSentences.
@inline tokenize(t::MatchTokenization, s::SentenceStage, (istoken, x)) = istoken ? Token(x, s.meta) : SubSentence(x, s.meta)


struct IndexedMatchTokenization <: AbstractTokenization
    patterns::Vector{Regex}
end

@inline splitting(t::IndexedMatchTokenization, s::SentenceStage) = splitting(MatchTokenization(t.patterns), s)
@inline splitting(::IndexedMatchTokenization, s::TokenStages, x) = splitting(IndexedTokenization(), s, x)

function splitting(::IndexedMatchTokenization, s::SubSentenceStage, x)
    # Report this sub-sentence's last token index to the right sibling so its
    # token ids continue from here.
    lastid = length(x)
    !isnothing(s.meta.rsibling) && (s.meta.rsibling[] = lastid + s.meta.offset[])
    return enumerate(x)
end

function splitting(::IndexedMatchTokenization, s::SentenceStage, x)
    # Pair each segment with its own offset Ref and a Ref to the next segment's
    # offset (nothing for the last segment).
    tokenoffset = map(Base.RefValue, 0:length(x)-1)
    RV = Base.RefValue{Int}
    v = Tuple{RV, Tuple{Bool, SubString}, Union{RV, Nothing}}[]
    for ((i, sp), offset) in zip(enumerate(x), tokenoffset)
        push!(v, (offset, sp, i == lastindex(x) ? nothing : tokenoffset[i+1]))
    end
    return v
end

function tokenize(::IndexedMatchTokenization, s::SentenceStage, (offset, (istoken, x), rsibling))
    meta = merge(s.meta, (offset = offset, rsibling = rsibling))
    return istoken ? Token(x, meta) : SubSentence(x, meta)
end

@inline tokenize(::IndexedMatchTokenization, d::DocumentStage, x) = tokenize(IndexedTokenization(), d, x)

function tokenize(::IndexedMatchTokenization, s::SubSentenceStage, (i, x))
    offset = s.meta.offset[]
    meta = Base.structdiff(s.meta, NamedTuple{(:offset, :rsibling)})
    return Token(x, merge(meta, (token_id = i + offset,)))
end

function tokenize(::IndexedMatchTokenization, x::TokenStage)
    if haskey(x.meta, :offset) && haskey(x.meta, :rsibling)
        # A matched pattern counts as one token at offset + 1; pass that index on
        # to the right sibling (the last segment has no sibling, hence the guard).
        cid = x.meta.offset[] + 1
        isnothing(x.meta.rsibling) || (x.meta.rsibling[] = cid)
        meta = Base.structdiff(x.meta, NamedTuple{(:offset, :rsibling)})
        return [updatemeta(x, merge(meta, (token_id = cid,)))]
    end
    return [x]
end
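
The indexed variant threads token ids across the segments produced by the pattern match: every segment created at the sentence stage carries its own offset `Base.RefValue` plus a reference to the next segment's offset (`rsibling`), and whichever stage finishes tokenizing a segment writes its last token index into that sibling. A standalone sketch of the same RefValue chaining (illustrative only, not code from this commit):

# Illustrative sketch: shared RefValues let each segment tell its right
# neighbour where its token ids should continue.
segments = ["a b c", "[TOKEN]", "d e"]                   # pretend "[TOKEN]" was a pattern match
offsets  = map(Base.RefValue, zeros(Int, length(segments)))

for (i, seg) in enumerate(segments)
    ntokens = seg == "[TOKEN]" ? 1 : length(split(seg))  # tokens this segment contributes
    i < length(segments) && (offsets[i+1][] = offsets[i][] + ntokens)
end

getindex.(offsets)   # [0, 3, 4]: ids in segment 2 continue after 3, in segment 3 after 4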
@@ -1,7 +1,24 @@
struct MixedTokenization{T <: Tuple} <: AbstractTokenization
    ts::T
end
MixedTokenization(ts...) = MixedTokenization(ts)

"tokenizer that runs the default behavior"
struct NaiveTokenizer <: AbstractTokenizer end

"default behavior, but with token indices counted"
struct NaiveIndexedTokenizer <: AbstractTokenizer end
tokenization(::NaiveIndexedTokenizer) = IndexedTokenization()

"default behavior, but without splitting the given patterns"
struct NaiveMatchTokenizer <: AbstractTokenizer
    patterns::Vector{Regex}
end
tokenization(tkr::NaiveMatchTokenizer) = MatchTokenization(tkr.patterns)

"default behavior, but with indices counted and without splitting the given patterns"
struct NaiveIndexedMatchTokenizer <: AbstractTokenizer
    patterns::Vector{Regex}
end
tokenization(tkr::NaiveIndexedMatchTokenizer) = IndexedMatchTokenization(tkr.patterns)
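
Each `Naive*Tokenizer` is only a thin selector for the corresponding tokenization, so construction is just the following (the patterns below are made-up examples):

patterns = [r"\[[A-Z]+\]", r"\d+"]                     # spans to keep intact

tokenization(NaiveIndexedTokenizer())                  # IndexedTokenization()
tokenization(NaiveMatchTokenizer(patterns))            # MatchTokenization(patterns)
tokenization(NaiveIndexedMatchTokenizer(patterns))     # IndexedMatchTokenization(patterns)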
@@ -0,0 +1,70 @@
# match utils

# Iterate over a string, alternating between spans that match the pattern
# (tagged true) and the text in between (tagged false).
struct MatchSplitIterator
    t::Regex
    s::Union{String, SubString}
end
Base.eltype(::Type{MatchSplitIterator}) = Tuple{Bool, SubString}
Base.IteratorSize(::Type{MatchSplitIterator}) = Base.SizeUnknown()

function Base.iterate(itr::MatchSplitIterator, (r, i, e) = (nothing, firstindex(itr.s), lastindex(itr.s)))
    i > e && return nothing
    t, s = itr.t, itr.s
    if !isnothing(r)
        # A pending match from the previous step: emit it and advance past it.
        ri, re = first(r), last(r)
        j = isempty(r) ? first(r) : last(r)
        v = (true, SubString(s, ri, re))
        return v, j > e ? (nothing, i, -1) : (nothing, @inbounds(nextind(s, j)), e)
    end

    r = findnext(itr.t, itr.s, i)
    if isnothing(r)
        # No more matches: the rest of the string is one unmatched span.
        return (false, SubString(s, i, e)), (nothing, i, -1)
    end

    ri, re = first(r), last(r)
    if i != ri
        # Emit the unmatched text before the match; keep the match for the next step.
        return (false, SubString(s, i, @inbounds(prevind(s, ri)))), (r, i, e)
    else
        j = isempty(r) ? first(r) : last(r)
        v = (true, SubString(s, ri, re))
        return v, j > e ? (nothing, i, -1) : (nothing, @inbounds(nextind(s, j)), e)
    end
end

# Eager version: collect all (ismatch, substring) pairs into a vector.
matchsplit(t, s) = matchsplit!(Tuple{Bool, SubString}[], t, s)
function matchsplit!(found, t, s)
    i, e = firstindex(s), lastindex(s)

    while true
        r = findnext(t, s, i)
        if isnothing(r)
            push!(found, (false, SubString(s, i, e)))
            break
        end

        ri, re = first(r), last(r)
        i != ri && push!(found, (false, @inbounds SubString(s, i, prevind(s, ri))))
        push!(found, (true, SubString(s, ri, re)))

        j = isempty(r) ? first(r) : last(r)
        j > e && break
        @inbounds i = nextind(s, j)
        i > e && break
    end
    return found
end
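
Both the lazy iterator and the eager `matchsplit` cover the whole input with `(ismatch, substring)` pairs; the expected output below is read off the code above, not taken from a package run:

collect(MatchSplitIterator(r"\d+", "abc123def"))
# Tuple{Bool, SubString}[(false, "abc"), (true, "123"), (false, "def")]

matchsplit(r"\d+", "abc123def")     # same pairs, built eagerly via matchsplit!

`matchsplits` below chains one such iterator per pattern so that several patterns can be protected at once.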

# Chain one MatchSplitIterator per pattern: each later pattern is only applied
# to the spans that earlier patterns left unmatched.
function matchsplits(patterns, x)
    m, ms = first(patterns), @view patterns[2:end]
    sp = MatchSplitIterator(m, x)

    for m in ms
        iters = Iterators.map(sp) do (istoken, s)
            istoken ? ((istoken, s) for _ = 1:1) : MatchSplitIterator(m, s)
        end
        sp = Iterators.Flatten(iters)
    end
    return sp
end
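
With several patterns, each later pattern only sees the spans the earlier ones left unmatched; again the expected result is derived from the code above, not from a run:

sp = matchsplits([r"\d+", r"[a-z]+"], "abc123def!")
collect(sp)
# (true, "abc"), (true, "123"), (true, "def"), (false, "!")
# "abc" and "def" are claimed by the second pattern; "!" matches neither.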