diff --git a/.DS_Store b/.DS_Store
index aea6164..d93642f 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 0000000..e69de29
diff --git a/src/TidierText.jl b/src/TidierText.jl
index 6f033ee..a93576e 100644
--- a/src/TidierText.jl
+++ b/src/TidierText.jl
@@ -53,36 +53,39 @@ function bind_tf_idf(df::DataFrame, term_col::Symbol, document_col::Symbol, n_co
     return df_copy
 end
 
-function regex_tokenizer(text::String, pattern="\\s+")
-    return split(text, Regex(pattern))
+function regex_tokenizer(text::String, pattern="\\s*")
+    tokens = split(text, Regex(pattern), keepempty=false)
+    stripped_tokens = strip.(tokens)
+    #println("Original: ", repr(text)) # Debug line
+    #println("Tokens after stripping: ", stripped_tokens) # Debug line
+    return filter(x -> x != "", stripped_tokens)
 end
-
-function character_tokenizer(text::String; to_lower=false, strip_non_alphanum=false)
+function character_tokenizer(text::String; to_lower=true, strip_non_alphanum=false)
     to_lower && (text = lowercase(text))
     strip_non_alphanum && (text = replace(text, r"[^\w\s]" => ""))
     return collect(text)
 end
 
-function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=false)
+function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=true)
     to_lower && (text = lowercase(text))
     tokens = split(replace(text, r"[^\w\s]" => ""), r"\s")
     return [join(tokens[i:i+n-1], " ") for i in 1:length(tokens)-n+1]
 end
 
-function punctuation_space_tokenize(text::String; to_lower=false)
+function punctuation_space_tokenize(text::String; to_lower=true)
     to_lower && (text = lowercase(text))
     return split(replace(text, r"[^\w\s]" => ""), r"\s")
 end
 
 function unnest_tokens(df::DataFrame, output_col::Symbol, input_col::Symbol,
-    tokenizer::Function;
-    to_lower::Bool=false)
-    texts = df[!, input_col]
+        tokenizer::Function;
+        to_lower::Bool=true)
+texts = df[!, input_col]
     if to_lower
-        texts = lowercase.(texts)
+            texts = lowercase.(texts)
     end
 
     token_list = tokenizer.(texts)
@@ -94,13 +97,17 @@ function unnest_tokens(df::DataFrame, output_col::Symbol, input_col::Symbol,
     repeat_indices = Vector{Int}(undef, sum(repeat_lengths))
     counter = 1
     @inbounds for i in eachindex(repeat_lengths)
-        repeat_indices[counter:counter+repeat_lengths[i]-1] .= i
-        counter += repeat_lengths[i]
+            repeat_indices[counter:counter+repeat_lengths[i]-1] .= i
+            counter += repeat_lengths[i]
     end
 
     new_df = df[repeat_indices, :]
     new_df[!, output_col] = flat_token_list
+
+
+    #end
+
     return new_df
 end
 
@@ -122,7 +129,7 @@
 
 
 
-function unnest_characters(df::DataFrame, output_col::Symbol, input_col::Symbol; to_lower::Bool=false, strip_non_alphanum=false)
+function unnest_characters(df::DataFrame, output_col::Symbol, input_col::Symbol; to_lower::Bool=true, strip_non_alphanum=false)
     return unnest_tokens(df, output_col, input_col, (text, args...) -> character_tokenizer(text; to_lower=to_lower, strip_non_alphanum=strip_non_alphanum); to_lower=to_lower)
 end
diff --git a/src/docstrings.jl b/src/docstrings.jl
index 4b80868..e38fbbb 100644
--- a/src/docstrings.jl
+++ b/src/docstrings.jl
@@ -161,17 +161,16 @@ julia> @unnest_regex(df, word, text, "the")
 ─────┼───────────────────────────────────
    1 │     1  The quick brown fox jumps.
    2 │     2  One column and
-   3 │     2  one row?
+   3 │     2  one row?
 
 julia> @unnest_regex(df, word, text, "the", to_lower = true)
-4×2 DataFrame
- Row │ doc    word
-     │ Int64  SubStrin…
-─────┼────────────────────────────────
-   1 │     1
-   2 │     1  quick brown fox jumps.
-   3 │     2  one column and
-   4 │     2  one row?
+3×2 DataFrame
+ Row │ doc    word
+     │ Int64  SubStrin…
+─────┼───────────────────────────────
+   1 │     1  quick brown fox jumps.
+   2 │     2  one column and
+   3 │     2  one row?
 ```
 """
@@ -196,7 +195,7 @@ Creates n-grams from the text in `input_col` of `df`, outputting the result to `
 
 julia> using DataFrames; df = DataFrame(text = ["The quick brown fox jumps.", "The sun rises in the east."], doc = [1, 2]);
 
-julia> @unnest_ngrams(df, term, text, 2)
+julia> @unnest_ngrams(df, term, text, 2, to_lower = false)
 9×2 DataFrame
  Row │ doc    term
      │ Int64  String
 ─────┼────────────────────
    1 │     1  The quick
    2 │     1  quick brown
    3 │     1  brown fox
    4 │     1  fox jumps
    5 │     2  The sun
    6 │     2  sun rises
    7 │     2  rises in
    8 │     2  in the
    9 │     2  the east
 
-julia> @unnest_ngrams(df, term, text, 2, to_lower = true)
+julia> @unnest_ngrams(df, term, text, 2)
 9×2 DataFrame
  Row │ doc    term
      │ Int64  String
 ─────┼────────────────────
-   1 │     1  the quick
+   1 │     1  The quick
    2 │     1  quick brown
    3 │     1  brown fox
    4 │     1  fox jumps
-   5 │     2  the sun
+   5 │     2  The sun
    6 │     2  sun rises
    7 │     2  rises in
    8 │     2  in the
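As a quick sanity check of the tokenizer change above, here is a minimal sketch in plain base Julia that mirrors the new `regex_tokenizer` body (split with `keepempty=false`, strip, then drop empty strings). It does not call TidierText itself, and `regex_tokenizer_sketch` is a hypothetical stand-in, so it only illustrates why the `@unnest_regex(df, word, text, "the", to_lower = true)` docstring example shrinks from 4 rows to 3.

```julia
# Sketch of the new regex_tokenizer logic from the diff above (not the package API):
# split on the pattern, drop empty tokens, strip surrounding whitespace, filter leftovers.
function regex_tokenizer_sketch(text::AbstractString, pattern::AbstractString="\\s*")
    tokens = split(text, Regex(pattern), keepempty=false)
    stripped_tokens = strip.(tokens)
    return filter(x -> x != "", stripped_tokens)
end

text = lowercase("The quick brown fox jumps.")   # doc 1 from the docstring example

# Old behaviour: split keeps the empty leading token, so doc 1 contributes 2 rows.
old_tokens = split(text, Regex("the"))           # ["", " quick brown fox jumps."]

# New behaviour: the empty token is dropped and whitespace stripped, so doc 1 contributes 1 row.
new_tokens = regex_tokenizer_sketch(text, "the") # ["quick brown fox jumps."]

println(length(old_tokens), " => ", length(new_tokens))  # 2 => 1
```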