Skip to content

Commit

Permalink
regex fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
drizk1 committed Oct 14, 2023
1 parent f408510 commit b4805bd
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 26 deletions.
Binary file modified .DS_Store
Binary file not shown.
Empty file added docs/src/index.md
Empty file.
33 changes: 20 additions & 13 deletions src/TidierText.jl
Original file line number Diff line number Diff line change
Expand Up @@ -53,36 +53,39 @@ function bind_tf_idf(df::DataFrame, term_col::Symbol, document_col::Symbol, n_co
return df_copy
end

function regex_tokenizer(text::String, pattern="\\s+")
return split(text, Regex(pattern))
"""
    regex_tokenizer(text::String, pattern="\\s+")

Split `text` into tokens on every match of the regular expression `pattern`
(given as a string, default: runs of whitespace). Each token is stripped of
surrounding whitespace and empty tokens are dropped.

Returns a vector of substrings of `text`.
"""
function regex_tokenizer(text::String, pattern="\\s+")
    # Default must be `\s+`, not `\s*`: a pattern that matches the empty
    # string splits between every pair of characters, fragmenting the text
    # into single-character tokens.
    tokens = split(text, Regex(pattern), keepempty=false)
    stripped_tokens = strip.(tokens)
    # Drop tokens that were pure whitespace before stripping.
    return filter(!isempty, stripped_tokens)
end


function character_tokenizer(text::String; to_lower=false, strip_non_alphanum=false)
"""
    character_tokenizer(text::String; to_lower=true, strip_non_alphanum=false)

Tokenize `text` into its individual characters.

Lowercases first when `to_lower` is set; when `strip_non_alphanum` is set,
characters that are neither word characters nor whitespace are removed
before splitting (whitespace characters are kept).

Returns a `Vector{Char}`.
"""
function character_tokenizer(text::String; to_lower=true, strip_non_alphanum=false)
    processed = to_lower ? lowercase(text) : text
    if strip_non_alphanum
        processed = replace(processed, r"[^\w\s]" => "")
    end
    return collect(processed)
end


function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=false)
"""
    ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=true)

Build word n-grams of length `n` from `text`. Punctuation is removed, the
text is optionally lowercased, and consecutive words are joined with a
single space.

Returns a `Vector{String}`; empty when `text` has fewer than `n` words.
"""
function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=true)
    to_lower && (text = lowercase(text))
    cleaned = replace(text, r"[^\w\s]" => "")
    # Split on runs of whitespace (`\s+`, keepempty=false) rather than a
    # single `\s`: otherwise consecutive spaces produce empty tokens that
    # leak into the n-grams.
    tokens = split(cleaned, r"\s+", keepempty=false)
    return [join(tokens[i:i+n-1], " ") for i in 1:length(tokens)-n+1]
end

function punctuation_space_tokenize(text::String; to_lower=false)
"""
    punctuation_space_tokenize(text::String; to_lower=true)

Tokenize `text` into words: punctuation (non-word, non-whitespace
characters) is removed, the text is optionally lowercased, then split on
whitespace.

Returns a vector of substrings with no empty tokens.
"""
function punctuation_space_tokenize(text::String; to_lower=true)
    to_lower && (text = lowercase(text))
    # Split on `\s+` with keepempty=false so that runs of whitespace (or
    # leading/trailing spaces) do not yield empty tokens, as `r"\s"` did.
    return split(replace(text, r"[^\w\s]" => ""), r"\s+", keepempty=false)
end

function unnest_tokens(df::DataFrame, output_col::Symbol, input_col::Symbol,
tokenizer::Function;
to_lower::Bool=false)
texts = df[!, input_col]
tokenizer::Function;
to_lower::Bool=true)
texts = df[!, input_col]

if to_lower
texts = lowercase.(texts)
texts = lowercase.(texts)
end

token_list = tokenizer.(texts)
Expand All @@ -94,13 +97,17 @@ function unnest_tokens(df::DataFrame, output_col::Symbol, input_col::Symbol,
repeat_indices = Vector{Int}(undef, sum(repeat_lengths))
counter = 1
@inbounds for i in eachindex(repeat_lengths)
repeat_indices[counter:counter+repeat_lengths[i]-1] .= i
counter += repeat_lengths[i]
repeat_indices[counter:counter+repeat_lengths[i]-1] .= i
counter += repeat_lengths[i]
end

new_df = df[repeat_indices, :]
new_df[!, output_col] = flat_token_list



#end

return new_df
end

Expand All @@ -122,7 +129,7 @@ end



function unnest_characters(df::DataFrame, output_col::Symbol, input_col::Symbol; to_lower::Bool=false, strip_non_alphanum=false)
"""
    unnest_characters(df::DataFrame, output_col::Symbol, input_col::Symbol;
                      to_lower::Bool=true, strip_non_alphanum=false)

Explode the text in `input_col` of `df` into one row per character, written
to `output_col`. Delegates to `unnest_tokens` with `character_tokenizer`,
forwarding the `to_lower` and `strip_non_alphanum` options.
"""
function unnest_characters(df::DataFrame, output_col::Symbol, input_col::Symbol; to_lower::Bool=true, strip_non_alphanum=false)
    # Extra positional args from unnest_tokens are ignored by the tokenizer.
    char_splitter = (text, _...) -> character_tokenizer(text; to_lower=to_lower, strip_non_alphanum=strip_non_alphanum)
    return unnest_tokens(df, output_col, input_col, char_splitter; to_lower=to_lower)
end

Expand Down
25 changes: 12 additions & 13 deletions src/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -161,17 +161,16 @@ julia> @unnest_regex(df, word, text, "the")
─────┼───────────────────────────────────
1 │ 1 The quick brown fox jumps.
2 │ 2 One column and
3 │ 2 one row?
3 │ 2 one row?
julia> @unnest_regex(df, word, text, "the", to_lower = true)
4×2 DataFrame
Row │ doc word
│ Int64 SubStrin…
─────┼────────────────────────────────
1 │ 1
2 │ 1 quick brown fox jumps.
3 │ 2 one column and
4 │ 2 one row?
3×2 DataFrame
Row │ doc word
│ Int64 SubStrin…
─────┼───────────────────────────────
1 │ 1 quick brown fox jumps.
2 │ 2 one column and
3 │ 2 one row?
```
"""

Expand All @@ -196,7 +195,7 @@ Creates n-grams from the text in `input_col` of `df`, outputting the result to `
julia> using DataFrames;
df = DataFrame(text = ["The quick brown fox jumps.", "The sun rises in the east."], doc = [1, 2]);
julia> @unnest_ngrams(df, term, text, 2)
julia> @unnest_ngrams(df, term, text, 2, to_lower = false)
9×2 DataFrame
Row │ doc term
│ Int64 String
Expand All @@ -211,16 +210,16 @@ julia> @unnest_ngrams(df, term, text, 2)
8 │ 2 in the
9 │ 2 the east
julia> @unnest_ngrams(df, term, text, 2, to_lower = true)
julia> @unnest_ngrams(df, term, text, 2)
9×2 DataFrame
Row │ doc term
│ Int64 String
─────┼────────────────────
1 │ 1 the quick
1 │ 1 The quick
2 │ 1 quick brown
3 │ 1 brown fox
4 │ 1 fox jumps
5 │ 2 the sun
5 │ 2 The sun
6 │ 2 sun rises
7 │ 2 rises in
8 │ 2 in the
Expand Down

0 comments on commit b4805bd

Please sign in to comment.