TidierOrg · drizk1 · Dec 18, 2024 · Dec 18, 2024 · Dec 26, 2024 · Dec 28, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,8 @@
 # TidierData.jl updates
 
+## v0.16.3
+- Bugfix: `@summary` no longer errors with non-numeric columns. Instead, it only reports non-numeric summary stats on non-numeric columns. Summary column names are now lowercase and snake_case.
+
 ## v0.16.2 - 2024-09-03
 - Bugfix: `@slice_min` and `@slice_max` respect the `n` argument
 - Adds `@head`

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.16.2"
+version = "0.16.3"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"

diff --git a/src/docstrings.jl b/src/docstrings.jl
@@ -1026,21 +1026,38 @@ julia> @chain df @pull(2)
 
 const docstring_left_join =
 """
-    @left_join(df1, df2, [by])
+    @left_join(df1, df2, [join_by])
 
 Perform a left join on `df1` and `df2` with an optional `by`.
 
 # Arguments
 - `df1`: A DataFrame.
 - `df2`: A DataFrame.
+- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`.
 - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.
 
 # Examples
 ```jldoctest
 julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);
 
 julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);
-
+
+julia> @left_join(df1, df2, join_by(a == a))
+2×3 DataFrame
+ Row │ a       b      c       
+     │ String  Int64  Int64?  
+─────┼────────────────────────
+   1 │ a           1        3
+   2 │ b           2  missing 
+
+julia> @left_join(df1, df2, join_by("a"=="a"))
+2×3 DataFrame
+ Row │ a       b      c       
+     │ String  Int64  Int64?  
+─────┼────────────────────────
+   1 │ a           1        3
+   2 │ b           2  missing 
+
 julia> @left_join(df1, df2)
 2×3 DataFrame
  Row │ a       b      c       
@@ -1085,21 +1102,38 @@ julia> @left_join(df1, df2, "a" = "a")
 
 const docstring_right_join =
 """
-    @right_join(df1, df2, [by])
+    @right_join(df1, df2, [join_by])
 
 Perform a right join on `df1` and `df2` with an optional `by`.
 
 # Arguments
 - `df1`: A DataFrame.
 - `df2`: A DataFrame.
+- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`.
 - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.
 
 # Examples
 ```jldoctest
 julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);
 
 julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);
-
+
+julia> @right_join(df1, df2, join_by(a == a))
+2×3 DataFrame
+ Row │ a       b        c     
+     │ String  Int64?   Int64 
+─────┼────────────────────────
+   1 │ a             1      3
+   2 │ c       missing      4
+
+julia> @right_join(df1, df2, join_by("a"=="a"))
+2×3 DataFrame
+ Row │ a       b        c     
+     │ String  Int64?   Int64 
+─────┼────────────────────────
+   1 │ a             1      3
+   2 │ c       missing      4
+
 julia> @right_join(df1, df2)
 2×3 DataFrame
  Row │ a       b        c     
@@ -1144,21 +1178,36 @@ julia> @right_join(df1, df2, "a" = "a")
 
 const docstring_inner_join =
 """
-    @inner_join(df1, df2, [by])
+    @inner_join(df1, df2, [join_by])
 
 Perform a inner join on `df1` and `df2` with an optional `by`.
 
 # Arguments
 - `df1`: A DataFrame.
 - `df2`: A DataFrame.
+- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`.
 - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.
 
 # Examples
 ```jldoctest
 julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);
 
 julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);
-
+
+julia> @inner_join(df1, df2, join_by(a == a))
+1×3 DataFrame
+ Row │ a       b      c     
+     │ String  Int64  Int64 
+─────┼──────────────────────
+   1 │ a           1      3
+
+julia> @inner_join(df1, df2, join_by("a"=="a"))
+1×3 DataFrame
+ Row │ a       b      c     
+     │ String  Int64  Int64 
+─────┼──────────────────────
+   1 │ a           1      3
+
 julia> @inner_join(df1, df2)
 1×3 DataFrame
  Row │ a       b      c     
@@ -1198,21 +1247,41 @@ julia> @inner_join(df1, df2, "a" = "a")
 
 const docstring_full_join =
 """
-    @full_join(df1, df2, [by])
+    @full_join(df1, df2, [join_by])
 
 Perform a full join on `df1` and `df2` with an optional `by`.
 
 # Arguments
 - `df1`: A DataFrame.
 - `df2`: A DataFrame.
+- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`.
+
 - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.
 
 # Examples
 ```jldoctest
 julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);
 
 julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);
-
+
+julia> @full_join(df1, df2, join_by(a == a))
+3×3 DataFrame
+ Row │ a       b        c       
+     │ String  Int64?   Int64?  
+─────┼──────────────────────────
+   1 │ a             1        3
+   2 │ b             2  missing 
+   3 │ c       missing        4
+
+julia> @full_join(df1, df2, join_by("a"=="a"))
+3×3 DataFrame
+ Row │ a       b        c       
+     │ String  Int64?   Int64?  
+─────┼──────────────────────────
+   1 │ a             1        3
+   2 │ b             2  missing 
+   3 │ c       missing        4
+
 julia> @full_join(df1, df2)
 3×3 DataFrame
  Row │ a       b        c       
@@ -1262,21 +1331,37 @@ julia> @full_join(df1, df2, "a" = "a")
 
 const docstring_anti_join =
 """
-    @anti_join(df1, df2, [by])
+    @anti_join(df1, df2, [join_by])
 
 Perform an anti-join on `df1` and `df2` with an optional `by`.
 
 # Arguments
 - `df1`: A DataFrame.
 - `df2`: A DataFrame.
+- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`.
+
 - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.
 
 # Examples
 ```jldoctest
 julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);
 
 julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);
-
+
+julia> @anti_join(df1, df2, join_by(a == a))
+1×2 DataFrame
+ Row │ a       b     
+     │ String  Int64 
+─────┼───────────────
+   1 │ b           2
+
+julia> @anti_join(df1, df2, join_by("a"=="a"))
+1×2 DataFrame
+ Row │ a       b     
+     │ String  Int64 
+─────┼───────────────
+   1 │ b           2
+
 julia> @anti_join(df1, df2)
 1×2 DataFrame
  Row │ a       b     
@@ -1316,13 +1401,15 @@ julia> @anti_join(df1, df2, "a" = "a")
 
 const docstring_semi_join =
 """
-    @semi_join(df1, df2, [by])
+    @semi_join(df1, df2, [join_by])
 
 Perform an semi-join on `df1` and `df2` with an optional `by`.
 
 # Arguments
 - `df1`: A DataFrame.
 - `df2`: A DataFrame.
+- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`.
+
 - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.
 
 # Examples
@@ -1331,6 +1418,20 @@ julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);
 
 julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);
 
+julia> @semi_join(df1, df2, join_by(a == a))
+1×2 DataFrame
+ Row │ a       b     
+     │ String  Int64 
+─────┼───────────────
+   1 │ a           1
+
+julia> @semi_join(df1, df2, join_by("a"=="a"))
+1×2 DataFrame
+ Row │ a       b     
+     │ String  Int64 
+─────┼───────────────
+   1 │ a           1
+
 julia> @semi_join(df1, df2)
 1×2 DataFrame
  Row │ a       b     
@@ -2415,7 +2516,8 @@ For numerical columns, returns a dataframe with the Q1,Q3, min, max, mean, media
 julia> df = DataFrame(a = [1, 2, 3, 4, 5],
                       b = [missing, 7, 8, 9, 10],
                       c = [11, missing, 13, 14, missing],
-                      d = [16, 17, 18, 19, 20]);
+                      d = [16.1, 17.2, 18.3, 19.4, 20.5],
+                      e = ["a", "a", "a", "a", "a"]);
 
 julia> @summary(df);
 

diff --git a/src/parsing.jl b/src/parsing.jl
@@ -216,7 +216,7 @@ function parse_join_by(tidy_expr::Union{Expr,Symbol,String})
   tidy_expr, found_n, found_row_number = parse_interpolation(tidy_expr)
 
   src = Union{Expr,QuoteNode}[] # type can be either a QuoteNode or a expression containing a selection helper function
-
+  
   if @capture(tidy_expr, expr_Symbol)
     push!(src, QuoteNode(expr))
   elseif @capture(tidy_expr, expr_String)
@@ -229,6 +229,18 @@ function parse_join_by(tidy_expr::Union{Expr,Symbol,String})
     lhs = QuoteNode(Symbol(lhs))
     rhs = QuoteNode(Symbol(rhs))
     push!(src, :($lhs => $rhs))
+  elseif tidy_expr isa Expr && tidy_expr.head == :call && tidy_expr.args[1] == :join_by
+    for arg in tidy_expr.args[2:end]
+        if arg isa Expr && arg.head == :call && arg.args[1] == Symbol("==")
+            lhs_arg = arg.args[2]
+            rhs_arg = arg.args[3]
+            push!(src, :($(QuoteNode(lhs_arg)) => $(QuoteNode(rhs_arg))))
+        elseif isa(arg, Symbol)
+          push!(src, :($(QuoteNode(arg)) => $(QuoteNode(arg))))
+        else
+          push!(src, arg)
+        end
+    end
   else
     @capture(tidy_expr, (args__,))
     for arg in args

diff --git a/src/separate_unite.jl b/src/separate_unite.jl
@@ -211,8 +211,8 @@ function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimite
       for row in eachrow(temp_df)
           value = row[column]
           # Handle missing values and non-string types
-          if ismissing(value) || typeof(value) != String
-              push!(expanded_data[column], [value])
+          if ismissing(value) || !(value isa AbstractString)
+            push!(expanded_data[column], [value])
           else
               push!(expanded_data[column], split(value, delimiter))
           end

diff --git a/src/summary.jl b/src/summary.jl
@@ -3,18 +3,37 @@ function summary_stats(df::DataFrame)
     summary_data = []
     for column in colnames
         col = df[:, column]
-        col_nonmissing = collect(skipmissing(col))
-        push!(summary_data, (
-            Column = column,
-            Min = minimum(col_nonmissing),
-            Q1 = quantile(col_nonmissing, 0.25),
-            Median = median(col_nonmissing),
-            Mean = mean(col_nonmissing),
-            Q3 = quantile(col_nonmissing, 0.75),
-            Max = maximum(col_nonmissing),
-            Count = length(col_nonmissing),
-            Missing_Count = count(ismissing, col)
-        ))
+        if eltype(col) <: Union{Number, Missing}
+            col_nonmissing = collect(skipmissing(col))
+            push!(summary_data, (
+                column = column,
+                min = minimum(col_nonmissing),
+                q1 = quantile(col_nonmissing, 0.25),
+                median = median(col_nonmissing),
+                mean = mean(col_nonmissing),
+                q3 = quantile(col_nonmissing, 0.75),
+                max = maximum(col_nonmissing),
+                non_missing_values = length(col_nonmissing),
+                missing_values = count(ismissing, col),
+                total_values = length(col),
+                unique_values = length(unique(col_nonmissing))
+            ))
+        else
+            col_nonmissing = collect(skipmissing(col))
+            push!(summary_data, (
+                column = column,
+                min = nothing,
+                q1 = nothing,
+                median = nothing,
+                mean = nothing,
+                q3 = nothing,
+                max = nothing,
+                non_missing_values = length(col_nonmissing),
+                missing_values = count(ismissing, col),
+                total_values = length(col),
+                unique_values = length(unique(col_nonmissing))
+            ))
+        end
     end
     return DataFrame(summary_data)
 end