diff --git a/NEWS.md b/NEWS.md index f2568f1..f221043 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,8 +1,9 @@ # TidierData.jl updates -## v0.16.2 - 2024-08-05 +## v0.16.2 - 2024-09-03 - Bugfix: `@slice_min` and `@slice_max` respect the `n` argument - Adds `@head` +- Adds `extra` argument for `@separate()` and `remove` argument for `@unite()` ## v0.16.1 - 2024-06-09 - Adds support for tuples and vectors as arguments to select multiple columns. Prefixing tuples/vectors with a `-` or `!` will exclude the selected columns. diff --git a/src/TidierData.jl b/src/TidierData.jl index 0507199..58f1a0c 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -28,7 +28,7 @@ const code = Ref{Bool}(false) # output DataFrames.jl code? const log = Ref{Bool}(false) # output tidylog output? (not yet implemented) # The global do-not-vectorize "list" -const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr]) +const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr, :cat_other, :cat_replace_missing, :cat_recode]) # The global do-not-escape "list" # `in`, `∈`, and `∉` should be vectorized in 
auto-vec but not escaped diff --git a/src/docstrings.jl b/src/docstrings.jl index c037167..ae20886 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2305,15 +2305,16 @@ missing const docstring_separate = """ - @separate(df, From, Into, Separator) + @separate(df, from, into, sep, extra = "merge") Separate a string column into multiple new columns based on a specified delimiter # Arguments - `df`: A DataFrame -- `From`: Column that will be split -- `Into`: New column names, supports [] or () -- `Separator`: the string or chacater on which to split +- `from`: Column that will be split +- `into`: New column names, supports [] or () +- `sep`: the string or character on which to split +- `extra`: one of "merge" (default), "warn", or "drop". If `into` has fewer names than there are split pieces, `extra` determines whether the additional pieces are merged into the final column ("merge") or dropped ("drop"). "warn" drops them and generates a warning message. # Examples ```jldoctest @@ -2338,12 +2339,33 @@ julia> @chain df begin 1 │ 1 1 missing 2 │ 2 2 missing 3 │ 3 3 3 + +julia> @separate(df, a, (b, c), "-") +3×2 DataFrame + Row │ b c + │ SubStrin… String +─────┼─────────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 3-3 + +julia> @chain df begin + @separate(a, (b, c), "-", extra = "drop") + end +3×2 DataFrame + Row │ b c + │ SubStrin… SubStrin… +─────┼────────────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 3 + ``` """ const docstring_unite = """ - @unite(df, new_cols, from_cols, sep) + @unite(df, new_cols, from_cols, sep, remove = true) Unite multiple columns into one new column using a specific delimiter @@ -2351,13 +2373,23 @@ Unite multiple columns into one new column using a specific delimiter - `df`: A DataFrame - `new_col`: New column that will receive the combination - `from_cols`: Column names that it will combine, supports [] or () -- `sep`: the string or character that will seprate the values in the new column +- `sep`: the string or character that will separate the values in the new column +- `remove`: 
defaults to `true`, removes input columns from data frame # Examples ```jldoctest julia> df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]); julia> @unite(df, new_col, (b, c, d), "-") +3×1 DataFrame + Row │ new_col + │ String +─────┼───────── + 1 │ 1-1 + 2 │ 2-2 + 3 │ 3-3-3 + +julia> @unite(df, new_col, (b, c, d), "-", remove = false) 3×4 DataFrame Row │ b c d new_col │ String String String? String @@ -3112,14 +3144,14 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a) const docstring_separate_rows = """ - separate_rows(df, columns..., delimiter) + separate_rows(df, columns..., sep) Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter. # Arguments - `df`: A DataFrame - `columns`: A column or multiple columns to be split. Can be a mix of integers and column names. -- `delimiter`: The string or character or regular expression used to split the column values. +- `sep`: The string or character or regular expression used to split the column values. # Examples ```jldoctest @@ -3135,7 +3167,7 @@ julia> df = DataFrame(a = 1:3, 2 │ 2 aa;bb;cc 2;3;4 8;9;10 3 │ 3 dd;ee 5;6 11;12 -julia> @separate_rows(df, 2, 4, ";" ) +julia> @separate_rows(df, 2, 4, ";") 6×4 DataFrame Row │ a b c d │ Int64 SubStrin… String SubStrin… @@ -3147,7 +3179,7 @@ julia> @separate_rows(df, 2, 4, ";" ) 5 │ 3 dd 5;6 11 6 │ 3 ee 5;6 12 -julia> @separate_rows(df, b:d, ";" ) +julia> @separate_rows(df, b:d, ";") 6×4 DataFrame Row │ a b c d │ Int64 SubStrin… SubStrin… SubStrin… @@ -3163,7 +3195,7 @@ julia> @separate_rows(df, b:d, ";" ) const docstring_unnest_wider = """ - @unnest_wider(df, columns, names_sep=) + @unnest_wider(df, columns, names_sep) Unnest specified columns of arrays or dictionaries into wider format dataframe with individual columns. 
"""
$docstring_separate
"""
macro separate(df, from, into, sep, args...)
    # Optional trailing arguments reach the macro as `name = value` expressions.
    # `extra` is the only one supported; anything else is rejected at expansion
    # time so that a misspelled option is not silently ignored.
    extra = "merge"
    for arg in args
        if arg isa Expr && arg.head == :(=) && arg.args[1] == :extra
            extra = arg.args[2]
        else
            throw(ArgumentError(
                "Unsupported argument in `@separate`: `$arg`. " *
                "The only optional argument is `extra`."
            ))
        end
    end

    from_quoted = QuoteNode(from)

    interpolated_into, _, _ = parse_interpolation(into)

    # A literal tuple/vector of bare names becomes a vector of quoted symbols.
    # Anything else is evaluated at run time, with a `Vector{String}` converted
    # to symbols. (`cols` is used as the capture name so the vararg `args` is
    # not rebound.)
    if @capture(interpolated_into, (cols__,)) || @capture(interpolated_into, [cols__])
        quoted_cols = QuoteNode.(cols)
        into_expr = :[$(quoted_cols...)]
    else
        into_expr = quote
            if typeof($interpolated_into) <: Vector{String}
                Symbol.($interpolated_into)
            else
                $interpolated_into
            end
        end
    end

    return quote
        separate($(esc(df)), $(from_quoted), $(into_expr), $(esc(sep)); extra = $(esc(extra)))
    end
end

# Split the string column `col` of `df` on `sep` into the columns named in `into`
# and return a new DataFrame without `col`; `df` itself is not modified.
#
# `extra` controls what happens when a row yields more pieces than `into` has names:
#   - "merge": the surplus is kept, unsplit, in the last `into` column;
#   - "drop":  surplus pieces are discarded;
#   - "warn":  like "drop", but a warning is logged.
# Any other value throws an `ArgumentError`.
#
# Rows that yield fewer pieces than `into` has names are padded with `missing`,
# so every name in `into` always produces a column.
function separate(df::DataFrame, col::Symbol, into::Vector{Symbol},
                  sep::Union{Regex, String}; extra::String = "merge")
    if !(extra in ("merge", "warn", "drop"))
        throw(ArgumentError(
            "`extra` must be \"merge\", \"warn\", or \"drop\"; got \"$extra\"."
        ))
    end

    new_df = df[:, :]
    source = new_df[:, col]
    n_into = length(into)

    # A full split is needed to learn how many pieces each row produces.
    pieces = map(x -> split(x, sep), source)
    max_pieces = isempty(pieces) ? 0 : maximum(length, pieces)
    overflow = n_into < max_pieces

    if overflow && extra == "warn"
        @warn "Dropping extra split parts that don't fit into the provided `into` columns."
    end
    merge_last = overflow && extra == "merge"

    for (i, name) in enumerate(into)
        if merge_last && i == n_into
            # Re-split with a piece limit so the last piece is the untouched
            # remainder of the original string. Joining the surplus pieces with
            # `sep` would be wrong for a `Regex` separator, because `join` would
            # insert the printed form of the regex.
            new_df[:, name] = map(source) do x
                parts = split(x, sep; limit = n_into)
                length(parts) >= n_into ? String(parts[n_into]) : missing
            end
        else
            new_df[:, name] = map(p -> safe_getindex(p, i, missing), pieces)
        end
    end

    new_df = select(new_df, Not(col))

    return new_df
end


"""
$docstring_unite
"""
macro unite(df, new_col, from_cols, sep, args...)
    # Optional trailing arguments reach the macro as `name = value` expressions.
    # `remove` is the only one supported; anything else is rejected at expansion
    # time so that a misspelled option is not silently ignored.
    remove = true
    for arg in args
        if arg isa Expr && arg.head == :(=) && arg.args[1] == :remove
            remove = arg.args[2]
        else
            throw(ArgumentError(
                "Unsupported argument in `@unite`: `$arg`. " *
                "The only optional argument is `remove`."
            ))
        end
    end

    new_col_quoted = QuoteNode(new_col)
    interpolated_from_cols, _, _ = parse_interpolation(from_cols)
    interpolated_from_cols = parse_tidy(interpolated_from_cols)

    # Three accepted shapes for the input columns: a `first:last` range, a
    # literal tuple/vector of bare names, or an arbitrary expression that is
    # evaluated at run time (a `Tuple` is converted to a vector of symbols).
    if @capture(interpolated_from_cols, (first_col:last_col))
        from_cols_expr = :($(first_col):$(last_col))
    elseif @capture(interpolated_from_cols, (cols__,)) || @capture(interpolated_from_cols, [cols__])
        quoted_cols = QuoteNode.(cols)
        from_cols_expr = :[$(quoted_cols...)]
    else
        from_cols_expr = quote
            if typeof($interpolated_from_cols) <: Tuple
                collect(Symbol.($interpolated_from_cols))
            else
                $interpolated_from_cols
            end
        end
    end

    return quote
        unite($(esc(df)), $new_col_quoted, [$(from_cols_expr)], $(esc(sep)); remove = $(esc(remove)))
    end
end


# Join the values of `columns` row by row, skipping `missing`, with `sep` between
# them, and store the result in `new_col_name` of a copy of `df`.
# With `remove = true` (the default) the input columns are dropped from the result.
function unite(df::DataFrame, new_col_name::Symbol, columns, sep::String="_"; remove::Bool=true)
    new_df = df[:, :]
    cols_expr = columns isa Expr ? (columns,) : columns
    column_symbols = names(df, Cols(cols_expr...))

    # `!` replaces (or adds) the whole column on the copy, so reusing the name of
    # an existing column works even when that column has a non-string eltype.
    new_df[!, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, column_symbols])]

    if remove
        # `names` returns strings. Keep the united column even when it reuses
        # the name of one of the inputs.
        to_drop = filter(!=(String(new_col_name)), column_symbols)
        new_df = select(new_df, Not(to_drop))
    end

    return new_df
end


"""
$docstring_separate_rows
"""