diff --git a/NEWS.md b/NEWS.md index d6cab94f..001262e9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,10 @@ # TidierData.jl updates -## v1.0.0 - 2023-07-28 +## v0.9.0 - 2023-08-04 +- Exposed `not_vectorized[]` as a package global variable so that the user or other packages can modify it +- Added `@separate`, `@unite`, and `@summary` + +## v0.8.0 - 2023-07-28 - `Tidier.jl` cloned and changed to `TidierData.jl` ## v0.7.7 - 2023-07-15 diff --git a/Project.toml b/Project.toml index 23cec51a..cbbca53c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.8.0" +version = "0.9.0" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/README.md b/README.md index 0f7de2d4..ab4f4faa 100644 --- a/README.md +++ b/README.md @@ -89,8 +89,10 @@ TidierData.jl currently supports the following top-level macros: - `@left_join()`, `@right_join()`, `@inner_join()`, and `@full_join()` - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` +- `@separate()` and `@unite()` - `@drop_na()` - `@clean_names()` (as in R's `janitor::clean_names()` function) +- `@summary()` (as in R's `summary()` function) TidierData.jl also supports the following helper functions: diff --git a/docs/examples/UserGuide/sep_unite.jl b/docs/examples/UserGuide/sep_unite.jl new file mode 100644 index 00000000..be6e6da5 --- /dev/null +++ b/docs/examples/UserGuide/sep_unite.jl @@ -0,0 +1,22 @@ +# Following the tidyverse syntax, the `@separate()` macro in `TidierData.jl` separates a single column into multiple columns. This is particularly useful for splitting a column containing delimited values into individual columns. 
+ +using TidierData + +df = DataFrame(a = ["1-1", "2-2", "3-3-3"]); + +# ## Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter + +@chain df begin + @separate(a, (b, c, d), "-") +end + +# The `@unite` macro brings together multiple columns into one, separating the values with a user-specified delimiter + +# ## Here, the `@unite` macro combines the "b", "c", and "d" columns into a single new "new_col" column using the "/" delimiter + +df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]); + +@chain df begin + @unite(new_col, (b, c, d), "/") +end + diff --git a/docs/examples/UserGuide/summary.jl b/docs/examples/UserGuide/summary.jl new file mode 100644 index 00000000..3b70eeaa --- /dev/null +++ b/docs/examples/UserGuide/summary.jl @@ -0,0 +1,28 @@ +# The `@summary()` macro in `TidierData.jl` provides a concise way to compute summary statistics on data. Similar to its R counterpart, it will provide the mean, median, Q1, Q3, minimum, maximum, and number of missing values in a numerical column or columns. + +# ## Summary for the whole dataframe + +using TidierData + +df = DataFrame( A = [1, 2, 3, 4, 5], B = [missing, 7, 8, 9, 10], C = [11, missing, 13, 14, missing], D = [16, 17, 18, 19, 20]); + +@chain df begin + @summary() +end + +@summary(df) + +# ## You can specify columns for which you want to compute the summary. This is useful if the DataFrame has a large number of columns and you're interested in only a subset of them. 
+ +@chain df begin + @summary(B) +end + +@summary(df, B) + +# ## or for a range of columns + +@chain df begin + @select(B:D) + @summary() # you can also write this @summary(2:4) +end \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index af6ff619..2e95a8a9 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -101,8 +101,10 @@ TidierData.jl currently supports the following top-level macros: - `@left_join()`, `@right_join()`, `@inner_join()`, and `@full_join()` - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` + - `@separate()` and `@unite()` - `@drop_na()` - `@clean_names()` (as in R's `janitor::clean_names()` function) + - `@summary()` (as in R's `summary()` function) ``` TidierData.jl also supports the following helper functions: diff --git a/src/TidierData.jl b/src/TidierData.jl index ce5af439..33dd0280 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -9,7 +9,7 @@ using Reexport # Exporting `Cols` because `summarize(!!vars, funs))` with multiple interpolated # columns requires `Cols()` to be nested within `Cols()`, so `Cols` needs to be exported. 
-@reexport using DataFrames: DataFrame, Cols, describe, nrow, proprow +@reexport using DataFrames: DataFrame, Cols, describe, nrow, proprow, Not, Between, select @reexport using Chain @reexport using Statistics @reexport using ShiftedArrays: lag, lead @@ -17,7 +17,8 @@ using Reexport export TidierData_set, across, desc, n, row_number, starts_with, ends_with, matches, if_else, case_when, ntile, as_float, as_integer, as_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, - @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_na, @glimpse + @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_na, @glimpse, @separate, + @unite, @summary # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? @@ -39,6 +40,8 @@ include("pseudofunctions.jl") include("helperfunctions.jl") include("ntile.jl") include("type_conversions.jl") +include("separate_unite.jl") +include("summary.jl") # Function to set global variables """ diff --git a/src/docstrings.jl b/src/docstrings.jl index 231841e2..275d06a3 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -1988,4 +1988,93 @@ julia> as_string(1.5) julia> as_string(missing) missing ``` +""" + +const docstring_separate = +""" + @separate(df, From, Into, Separator) + +Separate a string column into multiple new columns based on a specified delimiter + +# Arguments +- `df`: A DataFrame +- `From`: Column that will be split +- `Into`: New column names, supports [] or () +- `Separator`: the string or character on which to split + +# Examples +```jldoctest +julia> df = DataFrame(a = ["1-1", "2-2", "3-3-3"]); + +julia> @separate(df, a, [b, c, d], "-") +3×3 DataFrame + Row │ b c d + │ SubStrin… SubStrin… SubStrin…? 
+─────┼────────────────────────────────── + 1 │ 1 1 missing + 2 │ 2 2 missing + 3 │ 3 3 3 + +julia> @chain df begin + @separate(a, (b, c, d), "-") + end +3×3 DataFrame + Row │ b c d + │ SubStrin… SubStrin… SubStrin…? +─────┼────────────────────────────────── + 1 │ 1 1 missing + 2 │ 2 2 missing + 3 │ 3 3 3 +``` +""" + +const docstring_unite = +""" + @unite(df, new_col, from_cols, sep) + +Unite multiple columns into one new column using a specific delimiter + +# Arguments +- `df`: A DataFrame +- `new_col`: New column that will receive the combination +- `from_cols`: Column names that it will combine, supports [] or () +- `sep`: the string or character that will separate the values in the new column + +# Examples +```jldoctest +julia> df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]); + +julia> @unite(df, new_col, (b, c, d), "-") +3×4 DataFrame + Row │ b c d new_col + │ String String String? String +─────┼────────────────────────────────── + 1 │ 1 1 missing 1-1 + 2 │ 2 2 missing 2-2 + 3 │ 3 3 3 3-3-3 +``` +""" + +const docstring_summary = +""" + @summary(df, cols...) + +For numerical columns, returns a dataframe with the Q1, Q3, min, max, mean, median, and number of missing values + +# Arguments +- `df`: A DataFrame +- `cols`: columns on which summary will be performed. 
This is an optional argument, without which summary will be performed on all numerical columns
+
+# Examples
+```jldoctest
+julia> df = DataFrame( A = [1, 2, 3, 4, 5], B = [missing, 7, 8, 9, 10], C = [11, missing, 13, 14, missing], D = [16, 17, 18, 19, 20]);
+
+julia> @summary(df);
+
+julia> @summary(df, (B:D));
+
+julia> @chain df begin
+       @summary(B:D)
+       end;
+```
 """
\ No newline at end of file
diff --git a/src/separate_unite.jl b/src/separate_unite.jl
new file mode 100644
index 00000000..bd08d3fd
--- /dev/null
+++ b/src/separate_unite.jl
@@ -0,0 +1,80 @@
+# Return `arr[index]` if `index` does not exceed `length(arr)`, otherwise `default_value`.
+function safe_getindex(arr, index, default_value="")
+    return index <= length(arr) ? arr[index] : default_value
+end
+
+# Split each value of the string column `col` on `sep` and store the pieces in new columns
+# named after `into` (rows with fewer pieces get `missing`). Returns a new DataFrame
+# without the source column; `df` itself is not modified.
+function separate(df::DataFrame, col::Symbol, into::Vector{Symbol},
+                  sep::Union{AbstractString, AbstractChar, Regex})
+    new_df = df[:, :]
+    # A missing value cannot be split; it contributes no pieces and so becomes all `missing`
+    pieces = map(x -> ismissing(x) ? SubString{String}[] : split(x, sep), new_df[!, col])
+    # `init=0` keeps a DataFrame without rows from failing on an empty reduction
+    max_cols = maximum(length, pieces; init=0)
+
+    if length(into) < max_cols
+        error("Not enough names provided in `into` for all split columns.")
+    end
+
+    for i in 1:max_cols
+        # `!` replaces an existing column of the same name instead of writing into it
+        new_df[!, into[i]] = map(x -> safe_getindex(x, i, missing), pieces)
+    end
+
+    # Drop the source column unless its name was reused for one of the new columns
+    return col in into[1:max_cols] ? new_df : select(new_df, Not(col))
+end
+
+"""
+$docstring_separate
+"""
+macro separate(df, from, into, sep)
+    from = QuoteNode(from)
+
+    # `into` may be written as a tuple `(b, c)` or as a vector `[b, c]`
+    if @capture(into, (args__,))
+    elseif @capture(into, [args__])
+    else
+        throw(ArgumentError("`into` must be a tuple or a vector of column names"))
+    end
+
+    args = QuoteNode.(args)
+
+    # Escape `sep` so that a separator held in a variable resolves in the caller's scope
+    return quote
+        separate($(esc(df)), $from, Symbol[$(args...)], $(esc(sep)))
+    end
+end
+
+# Join the values of `cols` row by row into the new column `new_col_name`, skipping missing
+# values and placing `sep` between them. Returns a new DataFrame; `df` is not modified.
+function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol},
+               sep::Union{AbstractString, AbstractChar}="_")
+    new_df = df[:, :]
+    # `df[!, cols]` selects the columns without copying them
+    new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[!, cols])]
+    return new_df
+end
+
+"""
+$docstring_unite
+"""
+macro unite(df, new_col, from_cols, sep)
+    new_col = QuoteNode(new_col)
+
+    # `from_cols` may be written as a tuple `(b, c)` or as a vector `[b, c]`
+    if @capture(from_cols, (args__,))
+    elseif @capture(from_cols, [args__])
+    else
+        throw(ArgumentError("`from_cols` must be a tuple or a vector of column names"))
+    end
+
+    args = QuoteNode.(args)
+
+    # Escape `sep` so that a separator held in a variable resolves in the caller's scope
+    return quote
+        unite($(esc(df)), $new_col, Symbol[$(args...)], $(esc(sep)))
+    end
+end
diff --git a/src/summary.jl
b/src/summary.jl
new file mode 100644
index 00000000..e98bdec5
--- /dev/null
+++ b/src/summary.jl
@@ -0,0 +1,50 @@
+# Apply `f` to `values`, or return `missing` when `values` is empty: the reductions used
+# by `summary_stats` are undefined for a column without any non-missing value.
+summary_or_missing(f, values) = isempty(values) ? missing : f(values)
+
+# Build a DataFrame with one row of summary statistics (minimum, quartiles, median, mean,
+# maximum, number of non-missing and of missing values) per numerical column of `df`.
+# Columns holding anything other than real numbers are skipped.
+function summary_stats(df::DataFrame)
+    # Left untyped on purpose: rows can differ in field types (e.g. `missing` statistics),
+    # and `DataFrame` works out the column types from the collected rows.
+    summary_data = []
+    for column in names(df)
+        col = df[!, column]  # read-only access, so no copy of the column is needed
+        col_nonmissing = collect(skipmissing(col))
+        # The statistics below are only defined for real numbers
+        all(x -> x isa Real, col_nonmissing) || continue
+        push!(summary_data, (
+            Column = column,
+            Min = summary_or_missing(minimum, col_nonmissing),
+            Q1 = summary_or_missing(v -> quantile(v, 0.25), col_nonmissing),
+            Median = summary_or_missing(median, col_nonmissing),
+            Mean = summary_or_missing(mean, col_nonmissing),
+            Q3 = summary_or_missing(v -> quantile(v, 0.75), col_nonmissing),
+            Max = summary_or_missing(maximum, col_nonmissing),
+            Count = length(col_nonmissing),
+            Missing_Count = count(ismissing, col)
+        ))
+    end
+    return DataFrame(summary_data)
+end
+
+"""
+$docstring_summary
+"""
+macro summary(df, cols...)
+    # Without column arguments the whole DataFrame is summarized
+    if isempty(cols)
+        return quote
+            summary_stats($(esc(df)))
+        end
+    end
+
+    # Otherwise the columns are first narrowed down with `select`, using the expressions
+    # that `parse_tidy` builds from the given column arguments
+    selected_cols = [parse_tidy(col) for col in cols]
+    return quote
+        _selected_df = select($(esc(df)), $(selected_cols...))
+        summary_stats(_selected_df)
+    end
+end
+