From b4805bd2069118c5caf1f636040e00ffb0c8964f Mon Sep 17 00:00:00 2001
From: drizk1
Date: Sat, 14 Oct 2023 12:36:39 -0400
Subject: [PATCH] regex fixes

---
 .DS_Store         | Bin 6148 -> 6148 bytes
 docs/src/index.md |   0
 src/TidierText.jl |  33 ++++++++++++++++++++-------------
 src/docstrings.jl |  25 ++++++++++++-------------
 4 files changed, 32 insertions(+), 26 deletions(-)
 create mode 100644 docs/src/index.md

diff --git a/.DS_Store b/.DS_Store
index aea61649a5cba8693613f7785d9620dd982725e7..d93642fd883721c058c3ca7a43c8cf6f3c3b2234 100644
GIT binary patch
delta 149

diff --git a/src/TidierText.jl b/src/TidierText.jl
--- a/src/TidierText.jl
+++ b/src/TidierText.jl
 x != "", stripped_tokens)
 end
-
-function character_tokenizer(text::String; to_lower=false, strip_non_alphanum=false)
+function character_tokenizer(text::String; to_lower=true, strip_non_alphanum=false)
     to_lower && (text = lowercase(text))
     strip_non_alphanum && (text = replace(text, r"[^\w\s]" => ""))
     return collect(text)
 end
 
-function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=false)
+function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=true)
     to_lower && (text = lowercase(text))
     tokens = split(replace(text, r"[^\w\s]" => ""), r"\s")
     return [join(tokens[i:i+n-1], " ") for i in 1:length(tokens)-n+1]
 end
 
-function punctuation_space_tokenize(text::String; to_lower=false)
+function punctuation_space_tokenize(text::String; to_lower=true)
     to_lower && (text = lowercase(text))
     return split(replace(text, r"[^\w\s]" => ""), r"\s")
 end
 
 function unnest_tokens(df::DataFrame, output_col::Symbol, input_col::Symbol,
-    tokenizer::Function;
-    to_lower::Bool=false)
-    texts = df[!, input_col]
+        tokenizer::Function;
+        to_lower::Bool=true)
+texts = df[!, input_col]
     if to_lower
-        texts = lowercase.(texts)
+    texts = lowercase.(texts)
     end
 
     token_list = tokenizer.(texts)
@@ -94,13 +97,17 @@ function unnest_tokens(df::DataFrame, output_col::Symbol, input_col::Symbol,
     repeat_indices = Vector{Int}(undef, sum(repeat_lengths))
     counter = 1
     @inbounds for i in eachindex(repeat_lengths)
-        repeat_indices[counter:counter+repeat_lengths[i]-1] .= i
-        counter += repeat_lengths[i]
+    repeat_indices[counter:counter+repeat_lengths[i]-1] .= i
+    counter += repeat_lengths[i]
     end
 
     new_df = df[repeat_indices, :]
     new_df[!, output_col] = flat_token_list
+
+
+    #end
+
     return new_df
 end
 
@@ -122,7 +129,7 @@ end
 
 
-function unnest_characters(df::DataFrame, output_col::Symbol, input_col::Symbol; to_lower::Bool=false, strip_non_alphanum=false)
+function unnest_characters(df::DataFrame, output_col::Symbol, input_col::Symbol; to_lower::Bool=true, strip_non_alphanum=false)
     return unnest_tokens(df, output_col, input_col, (text, args...) -> character_tokenizer(text; to_lower=to_lower, strip_non_alphanum=strip_non_alphanum); to_lower=to_lower)
 end
 
diff --git a/src/docstrings.jl b/src/docstrings.jl
index 4b80868..e38fbbb 100644
--- a/src/docstrings.jl
+++ b/src/docstrings.jl
@@ -161,17 +161,16 @@ julia> @unnest_regex(df, word, text, "the")
 ─────┼───────────────────────────────────
    1 │     1  The quick brown fox jumps.
    2 │     2  One column and
-   3 │     2  one row? 
+   3 │     2  one row?
 
 julia> @unnest_regex(df, word, text, "the", to_lower = true)
-4×2 DataFrame
- Row │ doc    word
-     │ Int64  SubStrin…
-─────┼────────────────────────────────
-   1 │     1
-   2 │     1   quick brown fox jumps.
-   3 │     2  one column and
-   4 │     2  one row?
+3×2 DataFrame
+ Row │ doc    word
+     │ Int64  SubStrin…
+─────┼───────────────────────────────
+   1 │     1   quick brown fox jumps.
+   2 │     2  one column and
+   3 │     2  one row?
 
 ```
 """
@@ -196,7 +195,7 @@ Creates n-grams from the text in `input_col` of `df`, outputting the result to `
 
 julia> using DataFrames; df = DataFrame(text = ["The quick brown fox jumps.", "The sun rises in the east."], doc = [1, 2]);
 
-julia> @unnest_ngrams(df, term, text, 2)
+julia> @unnest_ngrams(df, term, text, 2, to_lower = false)
 9×2 DataFrame
  Row │ doc    term
      │ Int64  String
 ─────┼────────────────────
    1 │     1  The quick
    2 │     1  quick brown
    3 │     1  brown fox
    4 │     1  fox jumps
    5 │     2  The sun
    6 │     2  sun rises
    7 │     2  rises in
    8 │     2  in the
    9 │     2  the east
 
-julia> @unnest_ngrams(df, term, text, 2, to_lower = true)
+julia> @unnest_ngrams(df, term, text, 2)
 9×2 DataFrame
  Row │ doc    term
      │ Int64  String
 ─────┼────────────────────
-   1 │     1  the quick
+   1 │     1  The quick
    2 │     1  quick brown
    3 │     1  brown fox
    4 │     1  fox jumps
-   5 │     2  the sun
+   5 │     2  The sun
    6 │     2  sun rises
    7 │     2  rises in
    8 │     2  in the
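
For reference, a minimal standalone sketch of the default change these hunks make: `ngram_tokenizer` now lowercases by default, which is why the first docstring example adds `to_lower = false` to keep the capitalized bigrams. The function body below is copied from the `+` lines above so it runs with plain Base Julia; it is illustrative only, not the package's exported API.

```julia
# Standalone copy of the patched ngram_tokenizer (same body as the + lines above).
function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=true)
    to_lower && (text = lowercase(text))                    # new default: lowercase first
    tokens = split(replace(text, r"[^\w\s]" => ""), r"\s")  # strip punctuation, split on whitespace
    return [join(tokens[i:i+n-1], " ") for i in 1:length(tokens)-n+1]
end

ngram_tokenizer("The quick brown fox jumps.")
# ["the quick", "quick brown", "brown fox", "fox jumps"]

ngram_tokenizer("The quick brown fox jumps."; to_lower = false)
# ["The quick", "quick brown", "brown fox", "fox jumps"]
```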