From b4805bd2069118c5caf1f636040e00ffb0c8964f Mon Sep 17 00:00:00 2001
From: drizk1
Date: Sat, 14 Oct 2023 12:36:39 -0400
Subject: [PATCH] regex fixes

---
 .DS_Store         | Bin 6148 -> 6148 bytes
 docs/src/index.md |   0
 src/TidierText.jl |  33 ++++++++++++++++++++-------------
 src/docstrings.jl |  25 ++++++++++++-------------
 4 files changed, 32 insertions(+), 26 deletions(-)
 create mode 100644 docs/src/index.md

diff --git a/.DS_Store b/.DS_Store
index aea61649a5cba8693613f7785d9620dd982725e7..d93642fd883721c058c3ca7a43c8cf6f3c3b2234 100644
GIT binary patch
delta 149

diff --git a/src/TidierText.jl b/src/TidierText.jl
--- a/src/TidierText.jl
+++ b/src/TidierText.jl
 x != "", stripped_tokens)
 end
-
-function character_tokenizer(text::String; to_lower=false, strip_non_alphanum=false)
+function character_tokenizer(text::String; to_lower=true, strip_non_alphanum=false)
     to_lower && (text = lowercase(text))
     strip_non_alphanum && (text = replace(text, r"[^\w\s]" => ""))
     return collect(text)
 end
 
-function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=false)
+function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=true)
     to_lower && (text = lowercase(text))
     tokens = split(replace(text, r"[^\w\s]" => ""), r"\s")
     return [join(tokens[i:i+n-1], " ") for i in 1:length(tokens)-n+1]
 end
 
-function punctuation_space_tokenize(text::String; to_lower=false)
+function punctuation_space_tokenize(text::String; to_lower=true)
     to_lower && (text = lowercase(text))
     return split(replace(text, r"[^\w\s]" => ""), r"\s")
 end
 
 function unnest_tokens(df::DataFrame, output_col::Symbol, input_col::Symbol,
-    tokenizer::Function;
-    to_lower::Bool=false)
-    texts = df[!, input_col]
+        tokenizer::Function;
+        to_lower::Bool=true)
+texts = df[!, input_col]
     if to_lower
-        texts = lowercase.(texts)
+    texts = lowercase.(texts)
     end
 
     token_list = tokenizer.(texts)
@@ -94,13 +97,17 @@ function unnest_tokens(df::DataFrame, output_col::Symbol, input_col::Symbol,
     repeat_indices = Vector{Int}(undef, sum(repeat_lengths))
     counter = 1
     @inbounds for i in eachindex(repeat_lengths)
-        repeat_indices[counter:counter+repeat_lengths[i]-1] .= i
-        counter += repeat_lengths[i]
+    repeat_indices[counter:counter+repeat_lengths[i]-1] .= i
+    counter += repeat_lengths[i]
     end
 
     new_df = df[repeat_indices, :]
     new_df[!, output_col] = flat_token_list
+
+
+    #end
+
     return new_df
 end
 
@@ -122,7 +129,7 @@ end
 
 
-function unnest_characters(df::DataFrame, output_col::Symbol, input_col::Symbol; to_lower::Bool=false, strip_non_alphanum=false)
+function unnest_characters(df::DataFrame, output_col::Symbol, input_col::Symbol; to_lower::Bool=true, strip_non_alphanum=false)
     return unnest_tokens(df, output_col, input_col, (text, args...) -> character_tokenizer(text; to_lower=to_lower, strip_non_alphanum=strip_non_alphanum); to_lower=to_lower)
 end
 
diff --git a/src/docstrings.jl b/src/docstrings.jl
index 4b80868..e38fbbb 100644
--- a/src/docstrings.jl
+++ b/src/docstrings.jl
@@ -161,17 +161,16 @@ julia> @unnest_regex(df, word, text, "the")
 ─────┼───────────────────────────────────
    1 │     1  The quick brown fox jumps.
    2 │     2  One column and
-   3 │     2  one row? 
+   3 │     2  one row?
 
 julia> @unnest_regex(df, word, text, "the", to_lower = true)
-4×2 DataFrame
- Row │ doc    word
-     │ Int64  SubStrin…
-─────┼────────────────────────────────
-   1 │     1
-   2 │     1   quick brown fox jumps.
-   3 │     2  one column and
-   4 │     2  one row?
+3×2 DataFrame
+ Row │ doc    word
+     │ Int64  SubStrin…
+─────┼───────────────────────────────
+   1 │     1   quick brown fox jumps.
+   2 │     2  one column and
+   3 │     2  one row?
 
 ```
 """
@@ -196,7 +195,7 @@ Creates n-grams from the text in `input_col` of `df`, outputting the result to `
 
 julia> using DataFrames; df = DataFrame(text = ["The quick brown fox jumps.", "The sun rises in the east."], doc = [1, 2]);
 
-julia> @unnest_ngrams(df, term, text, 2)
+julia> @unnest_ngrams(df, term, text, 2, to_lower = false)
 9×2 DataFrame
  Row │ doc    term
      │ Int64  String
 ─────┼────────────────────
    1 │     1  The quick
    2 │     1  quick brown
    3 │     1  brown fox
    4 │     1  fox jumps
    5 │     2  The sun
    6 │     2  sun rises
    7 │     2  rises in
    8 │     2  in the
    9 │     2  the east
 
-julia> @unnest_ngrams(df, term, text, 2, to_lower = true)
+julia> @unnest_ngrams(df, term, text, 2)
 9×2 DataFrame
  Row │ doc    term
      │ Int64  String
 ─────┼────────────────────
-   1 │     1  the quick
+   1 │     1  The quick
    2 │     1  quick brown
    3 │     1  brown fox
    4 │     1  fox jumps
-   5 │     2  the sun
+   5 │     2  The sun
    6 │     2  sun rises
    7 │     2  rises in
    8 │     2  in the
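
For reference, a minimal standalone sketch of the default change these hunks make: `ngram_tokenizer` now lowercases by default, which is why the first docstring example adds `to_lower = false` to keep the capitalized bigrams. The function body below is copied from the `+` lines above so it runs with plain Base Julia; it is illustrative only, not the package's exported API.

```julia
# Standalone copy of the patched ngram_tokenizer (same body as the + lines above).
function ngram_tokenizer(text::String; n::Int=2, to_lower::Bool=true)
    to_lower && (text = lowercase(text))                    # new default: lowercase first
    tokens = split(replace(text, r"[^\w\s]" => ""), r"\s")  # strip punctuation, split on whitespace
    return [join(tokens[i:i+n-1], " ") for i in 1:length(tokens)-n+1]
end

ngram_tokenizer("The quick brown fox jumps.")
# ["the quick", "quick brown", "brown fox", "fox jumps"]

ngram_tokenizer("The quick brown fox jumps."; to_lower = false)
# ["The quick", "quick brown", "brown fox", "fox jumps"]
```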