diff --git a/Project.toml b/Project.toml index c4b8da7d..5e3b7ced 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.12.2" +version = "0.13.0" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/README.md b/README.md index c9b2840b..949f1b55 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,8 @@ To support R-style programming, TidierData.jl is implemented using macros. TidierData.jl currently supports the following top-level macros: - `@glimpse()` -- `@select()`, `@rename()`, and `@distinct()` +- `@select()` and `@distinct()` +- `@rename()` and `@rename_with()` - `@mutate()` and `@transmute()` - `@summarize()` and `@summarise()` - `@filter()` @@ -106,6 +107,7 @@ TidierData.jl also supports the following helper functions: - `everything()`, `starts_with()`, `ends_with()`, `matches()`, and `contains()` - `as_float()`, `as_integer()`, and `as_string()` - `is_float()`, `is_integer()`, and `is_string()` +- `missing_if()` and `replace_missing()` See the documentation [Home](https://tidierorg.github.io/TidierData.jl/latest/) page for a guide on how to get started, or the [Reference](https://tidierorg.github.io/TidierData.jl/latest/reference/) page for a detailed guide to each of the macros and functions. diff --git a/docs/examples/UserGuide/fill_missing.jl b/docs/examples/UserGuide/fill_missing.jl index a1b572bb..71d3a96c 100644 --- a/docs/examples/UserGuide/fill_missing.jl +++ b/docs/examples/UserGuide/fill_missing.jl @@ -34,3 +34,18 @@ end @fill_missing(a, "down") end +# ## `replace_missing()` +# The `replace_missing` function facilitates the replacement of `missing` values with a specified replacement. + +@chain df begin + @mutate(b = replace_missing(b, 2)) +end + +# ## `missing_if()` +# The `missing_if` function is used to introduce `missing` values under specific conditions. 
+ +@chain df begin + @mutate(b = missing_if(b, 5)) +end + +# Neither `missing_if` nor `replace_missing` is type-specific: both accept values of any type. \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 5ed7eff3..3c443d89 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -90,7 +90,8 @@ TidierData.jl currently supports the following top-level macros: ```@raw html !!! example "Top-level macros:" - `@glimpse()` - - `@select()`, `@rename()`, and `@distinct()` + - `@select()` and `@distinct()` + - `@rename()` and `@rename_with()` - `@mutate()` and `@transmute()` - `@summarize()` and `@summarise()` - `@filter()` @@ -120,6 +121,7 @@ TidierData.jl also supports the following helper functions: - `everything()`, `starts_with()`, `ends_with()`, `matches()`, and `contains()` - `as_float()`, `as_integer()`, and `as_string()` - `is_float()`, `is_integer()`, and `is_string()` + - `missing_if()` and `replace_missing()` ``` See the [Reference](https://tidierorg.github.io/TidierData.jl/latest/reference/) page for a detailed guide to each of the macros and functions. 
diff --git a/src/TidierData.jl b/src/TidierData.jl index 7cac6701..833176ba 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -16,10 +16,10 @@ using Reexport @reexport using ShiftedArrays: lag, lead export TidierData_set, across, desc, n, row_number, everything, starts_with, ends_with, matches, if_else, case_when, ntile, - as_float, as_integer, as_string, is_float, is_integer, is_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, + as_float, as_integer, as_string, is_float, is_integer, is_string, missing_if, replace_missing, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate, - @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max + @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @rename_with # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? @@ -608,4 +608,55 @@ macro glimpse(df, width = 80) return df_expr end +""" +$docstring_rename_with +""" +macro rename_with(df, fn, exprs...) 
+ interpolated_exprs = parse_interpolation.(exprs) + + tidy_exprs = [i[1] for i in interpolated_exprs] + any_found_n = any([i[2] for i in interpolated_exprs]) + any_found_row_number = any([i[3] for i in interpolated_exprs]) + + tidy_exprs = parse_tidy.(tidy_exprs) + df_expr = quote + local df_copy = copy($(esc(df))) + + if $any_found_n + if df_copy isa GroupedDataFrame + transform!(df_copy, nrow => :TidierData_n; ungroup = false) + else + transform!(df_copy, nrow => :TidierData_n) + end + end + + if $any_found_row_number + if df_copy isa GroupedDataFrame + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + else + transform!(df_copy, eachindex => :TidierData_row_number) + end + end + + local columns_to_rename + if isempty($(esc(exprs))) + columns_to_rename = names(df_copy) + else + columns_to_rename = names(select(copy(df_copy), $(tidy_exprs...))) + end + + local renames = filter(p -> p.first in columns_to_rename, Pair.(names(df_copy), $(esc(fn)).(names(df_copy)))) + + if df_copy isa GroupedDataFrame + rename!(df_copy, renames; ungroup = false) + else + rename!(df_copy, renames) + end + + df_copy + end + + return df_expr +end + end \ No newline at end of file diff --git a/src/docstrings.jl b/src/docstrings.jl index 5fc9a5e0..0cc2516c 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2567,4 +2567,110 @@ julia> @chain df begin 2 │ missing 0.3 missing 3 │ 0.2 2.0 0.2 ``` +""" +const docstring_missing_if = +""" + missing_if(x, value) + +Replace a specific `value` with `missing` in `x`. + +## Arguments +- `x`: The input value which can be of any type. If `x` is already `missing` or equals `value`, the function will return `missing`. Otherwise, it returns `x` unaltered. +- `value`: The specific value to be checked against. 
+ +## Examples +```jldoctest +julia> df = DataFrame( + a = [1, missing, 3, 4], + b = ["apple", "apple", "banana", "cherry"] + ); + +julia> @chain df begin + @mutate(a = missing_if(a, 4), + b = missing_if(b, "apple")) + end +4×2 DataFrame + Row │ a b + │ Int64? String? +─────┼────────────────── + 1 │ 1 missing + 2 │ missing missing + 3 │ 3 banana + 4 │ missing cherry +``` +""" + +const docstring_replace_missing = +""" + replace_missing(x, replacement) + +Replace `missing` values in `x` with a specified `replacement` value. + +# Arguments +- `x`: The input value which can be of any type. If `x` is `missing`, the function will return `replacement`. Otherwise, it returns `x` unaltered. +- `replacement`: The value to replace `missing` with in `x`. + +# Examples +```jldoctest +julia> df = DataFrame( + a = [1, missing, 3, 4], + b = [4, 5, missing, 8] + ); + +julia> @chain df begin + @mutate(a = replace_missing(a, 100), + b = replace_missing(b, 35)) + end +4×2 DataFrame + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 1 4 + 2 │ 100 5 + 3 │ 3 35 + 4 │ 4 8 +``` +""" + +const docstring_rename_with = +""" + @rename_with(df, fn, exprs...) + +Rename the chosen columns by applying a function to their names. + +# Arguments +- `df`: a DataFrame +- `fn`: the function to apply to each selected column name (such as `str_remove_all` from TidierStrings) +- `exprs`: One or more unquoted variable names separated by commas. Variable names +can also be used as their positions in the data, like `x:y`, to select +a range of variables. Variable names can also be chosen with `starts_with()`. Defaults to all columns if empty. 
+ +# Examples +```jldoctest +julia> function str_remove_all(column, pattern::String) + if ismissing(column) + return column + end + patterns = split(pattern, '|') + for p in patterns + column = replace(column, strip(p) => "") + end + return column + end; + +julia> df = DataFrame( + term_a = ["apple", "banana", "cherry"], + document_a = ["doc_1", "doc2", "doc3"], + _n_ = [1, 2, 3] + ); + +julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a) +3×3 DataFrame + Row │ term_a document _n_ + │ String String Int64 +─────┼───────────────────────── + 1 │ apple doc_1 1 + 2 │ banana doc2 2 + 3 │ cherry doc3 3 +``` """ \ No newline at end of file diff --git a/src/missings.jl b/src/missings.jl index 3226d362..6af74d42 100644 --- a/src/missings.jl +++ b/src/missings.jl @@ -115,4 +115,14 @@ macro fill_missing(df, args...) fill_missing($(esc(df)), [$(cols_quoted...)], $method) end end -end \ No newline at end of file +end + +""" +$docstring_missing_if +""" +missing_if(x, value) = ismissing(x) ? x : (x == value ? missing : x) + +""" +$docstring_replace_missing +""" +replace_missing(x, replacement) = ismissing(x) ? 
replacement : x diff --git a/src/separate_unite.jl b/src/separate_unite.jl index 22817003..b8dbec3a 100644 --- a/src/separate_unite.jl +++ b/src/separate_unite.jl @@ -61,14 +61,24 @@ end $docstring_unite """ macro unite(df, new_col, from_cols, sep) - new_col = QuoteNode(new_col) - - if @capture(from_cols, (args__,)) - elseif @capture(from_cols, [args__]) + new_col_quoted = QuoteNode(new_col) + interpolated_from_cols, _, _ = parse_interpolation(from_cols) + + if @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__]) + args = QuoteNode.(args) + from_cols_expr = :[$(args...)] + else + from_cols_expr = quote + if typeof($interpolated_from_cols) <: Tuple + collect(Symbol.($interpolated_from_cols)) + + else + $interpolated_from_cols + end + end end - - args = QuoteNode.(args) - var_expr = quote - unite($(esc(df)), $new_col, [$(args...)], $sep) + + return quote + unite($(esc(df)), $new_col_quoted, $(from_cols_expr), $(esc(sep))) end -end +end \ No newline at end of file