diff --git a/NEWS.md b/NEWS.md index 83745563..8febdb2b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # TidierData.jl updates +## v0.14.4 - 2023-12-30 +- Adds `@unnest_wider()` +- Adds `@unnest_longer()` +- Adds `@nest()` +- Fixes tidy selection in `@unite()` + ## v0.14.3 - 2023-12-22 - Adds support for interpolation and tidy selection in `@fill_missing` - Fixes tidy selection in `@separate_rows()` diff --git a/Project.toml b/Project.toml index 20b11173..2bc6ccb4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.14.3" +version = "0.14.4" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/README.md b/README.md index ec736967..66b021f1 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ TidierData.jl currently supports the following top-level macros: - `@pivot_wider()` and `@pivot_longer()` - `@separate()`, `@separate_rows()`, and `@unite()` - `@drop_missing()` and `@fill_missing()` +- `@unnest_longer()`, `@unnest_wider()`, and `@nest()` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) diff --git a/docs/examples/UserGuide/nesting.jl b/docs/examples/UserGuide/nesting.jl new file mode 100644 index 00000000..585f7c7e --- /dev/null +++ b/docs/examples/UserGuide/nesting.jl @@ -0,0 +1,71 @@ +# ## `@nest` + +# Nest columns into a dataframe nested into a new column + +using TidierData + +df4 = DataFrame(x = ["a", "b", "a", "b", "C", "a"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7) + +nested_df = @nest(df4, n2 = starts_with("a"), n3 = y:yz) + +# To return to the original dataframe, you can unnest wider and then longer. + +@chain nested_df begin + @unnest_wider(n3:n2) + @unnest_longer(y:ab) +end + +# Or you can unnest longer and then wider. 
+ +@chain nested_df begin + @unnest_longer(n3:n2) + @unnest_wider(n3:n2) +end + +# ## `@unnest_longer` + +# `@unnest_longer` adds one row per entry of an array or dataframe, lengthening the dataframe by flattening the column or columns. + +df = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]); + +@chain df begin + @unnest_longer(y) +end + +# If there are rows with empty arrays, `keep_empty` will prevent these rows from being dropped. `indices_include` will add a new column for each flattened column that logs the position of each entry in the array. + +@chain df begin + @unnest_longer(y, keep_empty = true, indices_include = true) +end + +# ## `@unnest_wider` + +# `@unnest_wider` will widen one or more columns of Dicts, Arrays, NamedTuples or DataFrames into multiple columns. + +df2 = DataFrame( + name = ["Zaki", "Farida"], + attributes = [ + Dict("age" => 25, "city" => "New York"), + Dict("age" => 30, "city" => "Los Angeles")]); + +@chain df2 begin + @unnest_wider(attributes) +end + + +# ## Unnesting nested Dataframes with different lengths which contain arrays + +df3 = DataFrame( + x = 1:3, + y = Any[ + DataFrame(), + DataFrame(a = ["A"], b = [14]), + DataFrame(a = ["A", "B", "C"], b = [13, 12, 11], c = [4, 4, 4]) + ] +) +# `df3` contains dataframes with different widths that also contain arrays. Chaining together `@unnest_wider` and `@unnest_longer` will first unnest the columns into arrays and then fully unnest them. 
+ +@chain df3 begin + @unnest_wider(y) + @unnest_longer(a:c, keep_empty = true) +end \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 751c5499..a745da80 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -132,6 +132,7 @@ nav: - "Binding" : "examples/generated/UserGuide/binding.md" - "Pivoting": "examples/generated/UserGuide/pivots.md" - "Separating" : "examples/generated/UserGuide/sep_unite.md" + - "Nesting" : "examples/generated/UserGuide/nesting.md" - "@summary" : "examples/generated/UserGuide/summary.md" - "Column names": "examples/generated/UserGuide/column_names.md" - "Interpolation" : "examples/generated/UserGuide/interpolation.md" diff --git a/docs/src/index.md b/docs/src/index.md index 6d7540fa..7425e442 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -104,7 +104,8 @@ TidierData.jl currently supports the following top-level macros: - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` - `@separate()`, `@separate_rows()`, and `@unite()` - - `@drop_missing()` and `@fill_missing` + - `@drop_missing()` and `@fill_missing()` + - `@unnest_longer()`, `@unnest_wider()`, and `@nest()` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) ``` diff --git a/src/TidierData.jl b/src/TidierData.jl index c64f9538..c56d6a4f 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -20,7 +20,8 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @anti_join, @semi_join, @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate, - @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows + @unite, @summary, 
@fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows, + @unnest_longer, @unnest_wider, @nest # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? @@ -51,6 +52,7 @@ include("separate_unite.jl") include("summary.jl") include("is_type.jl") include("missings.jl") +include("nests.jl") # Function to set global variables """ diff --git a/src/docstrings.jl b/src/docstrings.jl index 10b2e3e8..a904b81a 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -28,7 +28,7 @@ This function should only be called inside of TidierData.jl macros. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @summarize(across(b, minimum)) @@ -98,7 +98,7 @@ This function should only be called inside of TidierData.jl macros. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @select(where(is_number)) @@ -203,7 +203,7 @@ Select variables in a DataFrame. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df @select(a, b, c) 5×3 DataFrame @@ -360,7 +360,7 @@ Create a new DataFrame with only computed columns. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @transmute(d = b + c) @@ -390,7 +390,7 @@ to rename and select columns. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @rename(d = b, e = c) @@ -421,7 +421,7 @@ rows as `df`. 
# Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @mutate(d = b + c, b_minus_mean_b = b - mean(b)) @@ -508,7 +508,7 @@ Create a new DataFrame with one row that aggregating all observations from the i # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @summarize(mean_b = mean(b), median_b = median(b)) @@ -560,7 +560,7 @@ Subset a DataFrame and return a copy of DataFrame where specified conditions are # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @filter(b >= mean(b)) @@ -608,7 +608,7 @@ sets of `cols`. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @group_by(a) @@ -653,7 +653,7 @@ If this is applied to a `GroupedDataFrame`, then it removes the grouping. If thi # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @group_by(a) @@ -3077,4 +3077,242 @@ julia> @separate_rows(df, b:d, ";" ) 5 │ 3 dd 5 11 6 │ 3 ee 6 12 ``` -""" \ No newline at end of file +""" + +const docstring_unnest_wider = +""" + @unnest_wider(df, columns, names_sep=nothing) + +Unnest specified columns of arrays, dictionaries, dataframes, or named tuples into a wider dataframe with individual columns. + +# Arguments +- `df`: A DataFrame. +- `columns`: Columns to be unnested. These columns should contain arrays, dictionaries, dataframes, or named tuples. Dictionary keys will be converted to column names. +- `names_sep`: An optional string to specify the separator for creating new column names. If not provided, defaults to no separator. 
+ +# Examples +```jldoctest +julia> df = DataFrame(name = ["Zaki", "Farida"], attributes = [ + Dict("age" => 25, "city" => "New York"), + Dict("age" => 30, "city" => "Los Angeles")]); + +julia> @unnest_wider(df, attributes) +2×3 DataFrame + Row │ name city age + │ String String Int64 +─────┼──────────────────────────── + 1 │ Zaki New York 25 + 2 │ Farida Los Angeles 30 + +julia> df2 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) +2×3 DataFrame + Row │ a b c + │ Int64 Array… Array… +─────┼─────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 [3, 4] [7, 8] + +julia> @unnest_wider(df2, b:c, names_sep = "_") +2×5 DataFrame + Row │ a b_1 b_2 c_1 c_2 + │ Int64 Int64 Int64 Int64 Int64 +─────┼─────────────────────────────────── + 1 │ 1 1 2 5 6 + 2 │ 2 3 4 7 8 +``` +""" + +const docstring_unnest_longer = +""" + @unnest_longer(df, columns, indices_include=false, keep_empty=false) + +Unnest arrays in columns from a DataFrame to create a longer DataFrame with one row for each entry of the array. + +# Arguments +- `df`: A DataFrame. +- `columns`: Columns to unnest. Can be a single column or a range of columns, provided their arrays have matching lengths in each row. +- `indices_include`: Optional. When set to `true`, adds an index column for each unnested column, which logs the position of each array entry. +- `keep_empty`: Optional. When set to `true`, rows with empty arrays are kept, not skipped, and unnested as missing. 
+ +# Examples +```jldoctest +julia> df = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) +2×3 DataFrame + Row │ a b c + │ Int64 Array… Array… +─────┼─────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 [3, 4] [7, 8] + +julia> @unnest_longer(df, 2) +4×3 DataFrame + Row │ a b c + │ Int64 Int64 Array… +─────┼────────────────────── + 1 │ 1 1 [5, 6] + 2 │ 1 2 [5, 6] + 3 │ 2 3 [7, 8] + 4 │ 2 4 [7, 8] + +julia> @unnest_longer(df, b:c, indices_include=true) +4×5 DataFrame + Row │ a b c b_id c_id + │ Int64 Int64 Int64 Int64 Int64 +─────┼─────────────────────────────────── + 1 │ 1 1 5 1 1 + 2 │ 1 2 6 2 2 + 3 │ 2 3 7 1 1 + 4 │ 2 4 8 2 2 + +julia> df2 = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]) +4×2 DataFrame + Row │ x y + │ Int64 Array… +─────┼───────────────────── + 1 │ 1 Any[] + 2 │ 2 Any[1, 2, 3] + 3 │ 3 Any[4, 5] + 4 │ 4 Any[] + +julia> @unnest_longer(df2, y, keep_empty = true) +7×2 DataFrame + Row │ x y + │ Int64 Any +─────┼──────────────── + 1 │ 1 missing + 2 │ 2 1 + 3 │ 2 2 + 4 │ 2 3 + 5 │ 3 4 + 6 │ 3 5 + 7 │ 4 missing +``` +""" + +const docstring_nest = +""" + @nest(df, new_column = nesting_columns) + +Multiple columns are nested into one or more new columns in a DataFrame. 
+# Arguments +- `df`: A DataFrame +- `new_column`: New column name +- `nesting_columns`: Columns to be nested into the new_column +# Examples +```jldoctest +julia> df = DataFrame(a = repeat('a':'e', inner = 3), + b = 1:15, + c_1 = 16:30, + c_2 = 31:45); + +julia> @nest(df, data = b:c_2) +5×2 DataFrame + Row │ a data + │ Char DataFrame +─────┼───────────────────── + 1 │ a 3×3 DataFrame + 2 │ b 3×3 DataFrame + 3 │ c 3×3 DataFrame + 4 │ d 3×3 DataFrame + 5 │ e 3×3 DataFrame + +julia> @nest(df, data_1 = b, data_2 = starts_with("c")) +5×3 DataFrame + Row │ a data_1 data_2 + │ Char DataFrame DataFrame +─────┼──────────────────────────────────── + 1 │ a 3×1 DataFrame 3×2 DataFrame + 2 │ b 3×1 DataFrame 3×2 DataFrame + 3 │ c 3×1 DataFrame 3×2 DataFrame + 4 │ d 3×1 DataFrame 3×2 DataFrame + 5 │ e 3×1 DataFrame 3×2 DataFrame + +julia> @chain df begin + @nest(data = b:c_2) + @unnest_longer(data) + end +15×2 DataFrame + Row │ a data + │ Char NamedTup… +─────┼──────────────────────────────────── + 1 │ a (b = 1, c_1 = 16, c_2 = 31) + 2 │ a (b = 2, c_1 = 17, c_2 = 32) + 3 │ a (b = 3, c_1 = 18, c_2 = 33) + 4 │ b (b = 4, c_1 = 19, c_2 = 34) + 5 │ b (b = 5, c_1 = 20, c_2 = 35) + 6 │ b (b = 6, c_1 = 21, c_2 = 36) + 7 │ c (b = 7, c_1 = 22, c_2 = 37) + 8 │ c (b = 8, c_1 = 23, c_2 = 38) + 9 │ c (b = 9, c_1 = 24, c_2 = 39) + 10 │ d (b = 10, c_1 = 25, c_2 = 40) + 11 │ d (b = 11, c_1 = 26, c_2 = 41) + 12 │ d (b = 12, c_1 = 27, c_2 = 42) + 13 │ e (b = 13, c_1 = 28, c_2 = 43) + 14 │ e (b = 14, c_1 = 29, c_2 = 44) + 15 │ e (b = 15, c_1 = 30, c_2 = 45) + +julia> @chain df begin + @nest(data = b:c_2) + @unnest_wider(data) + end +5×4 DataFrame + Row │ a b c_1 c_2 + │ Char Any Any Any +─────┼──────────────────────────────────────────────── + 1 │ a [1, 2, 3] [16, 17, 18] [31, 32, 33] + 2 │ b [4, 5, 6] [19, 20, 21] [34, 35, 36] + 3 │ c [7, 8, 9] [22, 23, 24] [37, 38, 39] + 4 │ d [10, 11, 12] [25, 26, 27] [40, 41, 42] + 5 │ e [13, 14, 15] [28, 29, 30] [43, 44, 45] + +julia> @chain df begin + 
@nest(data = -a) + @unnest_wider(data) # wider first + @unnest_longer(-a) # then longer + end +15×4 DataFrame + Row │ a b c_1 c_2 + │ Char Int64 Int64 Int64 +─────┼─────────────────────────── + 1 │ a 1 16 31 + 2 │ a 2 17 32 + 3 │ a 3 18 33 + 4 │ b 4 19 34 + 5 │ b 5 20 35 + 6 │ b 6 21 36 + 7 │ c 7 22 37 + 8 │ c 8 23 38 + 9 │ c 9 24 39 + 10 │ d 10 25 40 + 11 │ d 11 26 41 + 12 │ d 12 27 42 + 13 │ e 13 28 43 + 14 │ e 14 29 44 + 15 │ e 15 30 45 + +julia> @chain df begin + @nest(data = -a) + @unnest_longer(data) # longer first + @unnest_wider(-a) # then wider + end +15×4 DataFrame + Row │ a b c_2 c_1 + │ Char Int64 Int64 Int64 +─────┼─────────────────────────── + 1 │ a 1 31 16 + 2 │ a 2 32 17 + 3 │ a 3 33 18 + 4 │ b 4 34 19 + 5 │ b 5 35 20 + 6 │ b 6 36 21 + 7 │ c 7 37 22 + 8 │ c 8 38 23 + 9 │ c 9 39 24 + 10 │ d 10 40 25 + 11 │ d 11 41 26 + 12 │ d 12 42 27 + 13 │ e 13 43 28 + 14 │ e 14 44 29 + 15 │ e 15 45 30 +``` +""" diff --git a/src/nests.jl b/src/nests.jl new file mode 100644 index 00000000..061aab89 --- /dev/null +++ b/src/nests.jl @@ -0,0 +1,333 @@ +function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::Union{String, Nothing}=nothing) + is_grouped = df isa GroupedDataFrame + grouping_columns = is_grouped ? groupcols(df) : Symbol[] + df_copy = copy(is_grouped ? parent(df) : df) + + cols_expr = cols isa Expr ? (cols,) : cols + column_symbols = names(df_copy, Cols(cols_expr...)) + + for col in column_symbols + col_type = typeof(df_copy[1, col]) + + if col_type <: DataFrame + # Handling DataFrames + nested_col_names = unique([name for i in 1:nrow(df_copy) for name in names(df_copy[i, col])]) + + for nested_col in nested_col_names + new_col_name = names_sep === nothing ? 
nested_col : Symbol(string(col, names_sep, nested_col)) + combined_nested_col = Any[missing for _ in 1:nrow(df_copy)] + + for row in 1:nrow(df_copy) + nested_df = df_copy[row, col] + if nested_col in names(nested_df) # membership test; also safe when the nested DataFrame has columns but zero rows + combined_nested_col[row] = nested_df[!, nested_col] + # Extract single value if there's only one element + if length(combined_nested_col[row]) == 1 + combined_nested_col[row] = combined_nested_col[row][1] + end + end + end + df_copy[!, new_col_name] = combined_nested_col + end + elseif col_type <: NamedTuple || col_type <: Union{NamedTuple, Missing} + # Handling NamedTuples and missing values + keys_set = Set{Symbol}() + for item in df_copy[!, col] + if item !== missing + union!(keys_set, keys(item)) + end + end + + for key in keys_set + new_col_name = names_sep === nothing ? key : Symbol(string(col, names_sep, key)) + df_copy[!, new_col_name] = [item !== missing ? get(item, key, missing) : missing for item in df_copy[!, col]] + end + + + elseif col_type <: Dict + keys_set = Set{String}() + for item in df_copy[!, col] + union!(keys_set, keys(item)) + end + + for key in keys_set + new_col_name = names_sep === nothing ? Symbol(key) : Symbol(string(col, names_sep, key)) + df_copy[!, new_col_name] = get.(df_copy[!, col], key, missing) # keys_set is the union over all rows, so a row lacking this key yields missing instead of a KeyError + end + + elseif col_type <: Array + n = length(first(df_copy[!, col])) + for i in 1:n + new_col_name = names_sep === nothing ? Symbol(string(col, i)) : Symbol(string(col, names_sep, i)) + df_copy[!, new_col_name] = getindex.(df_copy[!, col], i) + end + + else + error("Column $col contains neither dictionaries nor arrays nor DataFrames") + end + + select!(df_copy, Not(col)) + end + + if is_grouped + df_copy = groupby(df_copy, grouping_columns) + end + + return df_copy +end + +""" +$docstring_unnest_wider +""" +macro unnest_wider(df, exprs...) 
+ names_sep = :(nothing) + if length(exprs) >= 2 && isa(exprs[end], Expr) && exprs[end].head == :(=) && exprs[end].args[1] == :names_sep + names_sep = esc(exprs[end].args[2]) + exprs = exprs[1:end-1] + end + + interpolated_exprs = parse_interpolation.(exprs) + tidy_exprs = [parse_tidy(i[1]) for i in interpolated_exprs] + + df_expr = quote + unnest_wider($(esc(df)), [$(tidy_exprs...)], names_sep=$names_sep) + end + + return df_expr +end + +function unnest_longer(df::Union{DataFrame, GroupedDataFrame}, cols; indices_include::Union{Nothing, Bool}=nothing, keep_empty::Bool=false) + is_grouped = df isa GroupedDataFrame + grouping_columns = is_grouped ? groupcols(df) : Symbol[] + df_copy = copy(is_grouped ? parent(df) : df) + + cols_expr = cols isa Expr ? (cols,) : cols + column_symbols = names(df_copy, Cols(cols_expr...)) + + # Preprocess columns + for col in column_symbols + df_copy[!, col] = [ismissing(x) ? (keep_empty ? [missing] : missing) : + isa(x, DataFrame) ? (nrow(x) > 0 ? Tables.rowtable(x) : (keep_empty ? [missing] : [])) : + isempty(x) ? (keep_empty ? [missing] : x) : + x for x in df_copy[!, col]] + end + + # Apply filter if keep_empty is false + if !keep_empty + df_copy = filter(row -> !any(ismissing, [row[col] for col in column_symbols]), df_copy) + end + # Flatten the dataframe + flattened_df = flatten(df_copy, column_symbols) + + if indices_include === true + for col in column_symbols + col_indices = Symbol(string(col), "_id") + indices = [j for i in 1:nrow(df_copy) for j in 1:length(df_copy[i, col])] + flattened_df[!, col_indices] = indices + end + end + + if is_grouped + flattened_df = groupby(flattened_df, grouping_columns) + end + + return flattened_df +end + +""" +$docstring_unnest_longer +""" +macro unnest_longer(df, exprs...) 
+ indices_include = :(nothing) + keep_empty = :(false) + + named_args = filter(e -> isa(e, Expr) && e.head == :(=), exprs) + for arg in named_args + if arg.args[1] == :indices_include + indices_include = esc(arg.args[2]) + elseif arg.args[1] == :keep_empty + keep_empty = esc(arg.args[2]) + end + end + column_exprs = filter(e -> !(isa(e, Expr) && e.head == :(=)), exprs) + + interpolated_exprs = parse_interpolation.(column_exprs) + tidy_exprs = [parse_tidy(i[1]) for i in interpolated_exprs] + + df_expr = quote + unnest_longer($(esc(df)), [$(tidy_exprs...)], indices_include=$indices_include, keep_empty = $keep_empty) + end + + return df_expr +end + + +function nest_pairs(df; kwargs...) + df_copy = copy(df) + nested_dataframes = Dict() + grouping_columns = names(df) + + # Determine grouping columns based on all specified column sets + for (_, cols) in kwargs + if isa(cols, Expr) && cols.head == :(:) && length(cols.args) == 2 + start_col, end_col = cols.args + start_idx = findfirst(==(start_col), names(df)) + end_idx = findfirst(==(end_col), names(df)) + if isnothing(start_idx) || isnothing(end_idx) + throw(ArgumentError("Column range $cols is invalid")) + end + cols = names(df)[start_idx:end_idx] + elseif isa(cols, Symbol) + cols = [cols] + end + + column_symbols = names(df, Cols(cols)) + grouping_columns = setdiff(grouping_columns, column_symbols) + end + + # Group once, in order of first appearance (sort=false): the nested frames are later matched by position to the rows of unique(df[:, grouping_columns]) + grouped_df = groupby(df_copy, grouping_columns; sort=false) + + # Nest each specified set of columns based on the single grouped DataFrame + for (new_col_name, cols) in kwargs + if isa(cols, Expr) && cols.head == :(:) && length(cols.args) == 2 + start_col, end_col = cols.args + start_idx = findfirst(==(start_col), names(df)) + end_idx = findfirst(==(end_col), names(df)) + cols = names(df)[start_idx:end_idx] + elseif isa(cols, Symbol) + cols = [cols] + end + + column_symbols = names(df, Cols(cols)) + nested_dataframes[new_col_name] = [DataFrame(select(sub_df, 
column_symbols)) for sub_df in grouped_df] + end + + # Creating a new DataFrame with all grouping columns + unique_groups = unique(df[:, grouping_columns]) + new_df = DataFrame(unique_groups) + + # Aligning and adding the nested DataFrame columns + for (new_col_name, nested_df_list) in nested_dataframes + aligned_nested_df = [nested_df_list[i] for i in 1:nrow(new_df)] + new_df[!, new_col_name] = aligned_nested_df + end + + return new_df +end + +# For groups. Its a little bit slow i think but it works. +# I am not sure if this is something that could ungroup -> regroup +# so for now I have opted for the safer strategy +function nest_pairs(gdf::GroupedDataFrame; kwargs...) + group_cols = groupcols(gdf) + results = [] + for group in gdf + # Convert the group to a DataFrame + df_group = DataFrame(group) + processed_group = nest_pairs(df_group; kwargs...) + push!(results, processed_group) + end + combined_df = vcat(results...) + return groupby(combined_df, group_cols) +end + + +""" +$docstring_nest +""" +macro nest(df, args...) 
+ kwargs_exprs = [] + + for arg in args + if isa(arg, Expr) && arg.head == :(=) + key = esc(arg.args[1]) # Extract and escape the key (the name of the new nested column) + # Ranges are special-cased below: `a:b` is turned into a `Between` selector instead of going through parse_tidy + # Check if the argument is a range expression + if isa(arg.args[2], Expr) && arg.args[2].head == :(:) && length(arg.args[2].args) == 2 + # Handle range expressions as Between selectors + first_col, last_col = arg.args[2].args + value_expr = Expr(:call, :Between, esc(first_col), esc(last_col)) + else + # Apply parse_interpolation and parse_tidy to the value + interpolated_value, _, _ = parse_interpolation(arg.args[2]) + tidy_value = parse_tidy(interpolated_value) + + # Tuple or vector literals are rebuilt as a vector of quoted elements; anything else is passed through as-is + if @capture(tidy_value, (args__,)) || @capture(tidy_value, [args__]) + args = QuoteNode.(args) # NOTE(review): `args` here is the @capture binding, which shadows the macro's varargs; the enclosing loop appears to keep iterating the original tuple - confirm + value_expr = :[$(args...)] + else + value_expr = tidy_value + end + end + + # Construct the keyword argument expression + push!(kwargs_exprs, Expr(:kw, key, value_expr)) + else + println("Argument is not recognized as a keyword argument: ", arg) + end + end + + # Construct the function call to nest_pairs with keyword arguments + return quote + nest_pairs($(esc(df)), $(kwargs_exprs...)) + end +end + + +#function nest_by(df::DataFrame; by, key = :data) +# by_expr = by isa Expr ? (by,) : (by,) +# by_symbols = names(df, Cols(by_expr...)) + +# cols_to_nest = setdiff(names(df), by_symbols) + +# nested_data = map(eachrow(df)) do row +# [row[c] for c in cols_to_nest] +# end + +# nested_df = DataFrame() +# for sym in by_symbols +# nested_df[!, sym] = df[!, sym] +# end +# nested_df[!, key] = nested_data +# +# return nested_df +#end + +#""" +#$docstring_nest_by +#""" +#macro nest_by(df, args...) 
+# if length(args) == 2 +# by_cols, new_col = args +# new_col_quoted = QuoteNode(new_col) +# elseif length(args) == 1 +# by_cols = args[1] +# new_col_quoted = :(:data) +# else +# error("Incorrect number of arguments provided to @nest") +# end +# +# interpolated_by_cols, _, _ = parse_interpolation(by_cols) +# interpolated_by_cols = parse_tidy(interpolated_by_cols) +# +# if @capture(interpolated_by_cols, (first_col:last_col)) +# by_cols_expr = :($(first_col):$(last_col)) +# elseif @capture(interpolated_by_cols, (args__,)) || @capture(interpolated_by_cols, [args__]) +# args = QuoteNode.(args) +# by_cols_expr = :[$(args...)] +# else +# by_cols_expr = quote +# if typeof($interpolated_by_cols) <: Tuple +# collect(Symbol.($interpolated_by_cols)) +# else +# $interpolated_by_cols +# end +# end +# end +# +# return quote +# nest_by($(esc(df)), by = $by_cols_expr, key = $new_col_quoted) +# end +#end \ No newline at end of file diff --git a/src/separate_unite.jl b/src/separate_unite.jl index be34947e..ebcf42d3 100644 --- a/src/separate_unite.jl +++ b/src/separate_unite.jl @@ -56,29 +56,32 @@ $docstring_unite macro unite(df, new_col, from_cols, sep) new_col_quoted = QuoteNode(new_col) interpolated_from_cols, _, _ = parse_interpolation(from_cols) + interpolated_from_cols = parse_tidy(interpolated_from_cols) - if @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__]) - args = QuoteNode.(args) - from_cols_expr = :[$(args...)] + if @capture(interpolated_from_cols, (first_col:last_col)) + from_cols_expr = :($(first_col):$(last_col)) + elseif @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__]) + args = QuoteNode.(args) + from_cols_expr = :[$(args...)] else - from_cols_expr = quote - if typeof($interpolated_from_cols) <: Tuple - collect(Symbol.($interpolated_from_cols)) - - else - $interpolated_from_cols - end - end + from_cols_expr = quote + if typeof($interpolated_from_cols) <: Tuple + 
collect(Symbol.($interpolated_from_cols)) + else + $interpolated_from_cols + end + end end - return quote - unite($(esc(df)), $new_col_quoted, $(from_cols_expr), $(esc(sep))) + unite($(esc(df)), $new_col_quoted, [$(from_cols_expr)], $(esc(sep))) end end -function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_") +function unite(df::DataFrame, new_col_name::Symbol, columns, sep::String="_") new_df = df[:, :] - new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])] + cols_expr = columns isa Expr ? (columns,) : columns + column_symbols = names(df, Cols(cols_expr...)) + new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, column_symbols])] return new_df end