diff --git a/NEWS.md b/NEWS.md index ba13ad1b..31e26729 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # TidierData.jl updates +## v0.12.0 - 2023-09-10 +- Fixes `!!` interpolation so that it works using normal Julia scoping rules. It no longer uses `Main.eval()` in the implementation. The way interpolation works contains some breaking changes, and the documentation has been updated accordingly. +- Fixes name conflict with `Cleaner.rename()` and `DataFrames.rename()` +- Adds `categorical()` to array of non-vectorized functions. + ## v0.11.0 - 2023-08-22 - Add `@fill_missing()`, `@slice_sample()`, `is_float()`, `is_integer()`, `is_string()` - Rename `@drop_na()` to `@drop_missing()` to be consistent with Julia data types. diff --git a/Project.toml b/Project.toml index 761ad800..3d052d13 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.11.0" +version = "0.12.0" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/docs/examples/UserGuide/interpolation.jl b/docs/examples/UserGuide/interpolation.jl index 6c7a11d4..4a48ae3d 100644 --- a/docs/examples/UserGuide/interpolation.jl +++ b/docs/examples/UserGuide/interpolation.jl @@ -1,10 +1,10 @@ -# The `!!` ("bang bang") operator can be used to interpolate values of variables from the global environment into your code. This operator is borrowed from the R `rlang` package. At some point, we may switch to using native Julia interpolation, but for a variety of reasons that introduce some complexity with native interpolation, we plan to continue to support `!!` interpolation. +# The `!!` ("bang bang") operator can be used to interpolate values of variables from the parent environment into your code. This operator is borrowed from the R `rlang` package. 
At some point, we may switch to using native Julia interpolation, but for a variety of reasons that introduce some complexity with native interpolation, we plan to continue to support `!!` interpolation. # To interpolate multiple variables, the `rlang` R package uses the `!!!` "triple bang" operator. However, in `TidierData.jl`, the `!!` "bang bang" operator can be used to interpolate either single or multiple values as shown in the examples below. -# Since the `!!` operator can only access variables in the global environment, we will set these variables in a somewhat roundabout way for the purposes of documentation. However, in interactive use, you can simply write `myvar = :b` instead of wrapping this code inside of an `@eval()` macro as is done here. +# Note: You can only interpolate values from variables in the parent environment. If you would like to interpolate column names, you have two options: you can either use `across()` or you can use `@aside` with `@pull()` to create variables in the parent environment containing the values of those columns which can then be accessed using interpolation. -# Note: `myvar = :b`, `myvar = (:a, :b)`, and `myvar = [:a, :b]` all refer to *columns* with those names. On the other hand, `myvar = "b"`, `myvar = ("a", "b")` and `myvar = ["a", "b"]` will interpolate those *values*. See below for examples. +# `myvar = :b`, `myvar = (:a, :b)`, and `myvar = [:a, :b]` all refer to *columns* with those names. On the other hand, `myvar = "b"`, `myvar = ("a", "b")` and `myvar = ["a", "b"]` will interpolate those *values*. See below for examples. 
using TidierData @@ -14,31 +14,23 @@ df = DataFrame(a = string.(repeat('a':'e', inner = 2)), # ## Select the column (because `myvar` contains a symbol) -@eval(Main, myvar = :b) +myvar = :b @chain df begin @select(!!myvar) end -# ## Select multiple variables (tuple of symbols) - -@eval(Main, myvars_tuple = (:a, :b)) - -@chain df begin - @select(!!myvars_tuple) -end - # ## Select multiple variables (vector of symbols) -@eval(Main, myvars_vector = [:a, :b]) +myvars = [:a, :b] @chain df begin - @select(!!myvars_vector) + @select(!!myvars) end -# ## Filter rows containing the *value* of `myvar_string` (because `myvar_string` does) +# ## Filter rows containing the *value* of `myvar_string` -@eval(Main, myvar_string = "b") +myvar_string = "b" @chain df begin @filter(a == !!myvar_string) @@ -48,15 +40,15 @@ end # Note that for `in` to work here, we have to wrap it in `[]` because otherwise, the string will be converted into a collection of characters, which are a different data type. -@eval(Main, myvar_string = "b") +myvar_string = "b" @chain df begin @filter(a in [!!myvar_string]) end -# ## You can also use this for a tuple or vector of strings. +# ## You can also use this for a vector (or tuple) of strings. -@eval(Main, myvars_string = ("a", "b")) +myvars_string = ["a", "b"] @chain df begin @filter(a in !!myvars_string) @@ -64,15 +56,29 @@ end # ## Mutate one variable -@eval(Main, myvar = :b) +# Remember: You cannot interpolate column names into `@mutate()` expressions. However, you *can* create a temporary variable containing the values of the column in question *or* you can use `@mutate()` with `across()`. + +# ### Option 1: Create a temporary variable containing the values of the column. + +myvar = :b @chain df begin - @mutate(!!myvar = !!myvar + 1) + @aside(myvar_values = @pull(_, !!myvar)) + @mutate(d = !!myvar_values + 1) +end + +# ### Option 2: Use `@mutate()` with `across()` + +# Note: when using `across()`, anonymous functions are not vectorized. 
This is intentional to allow users to specify their function exactly as desired. + +@chain df begin + @mutate(across(!!myvar, x -> x .+ 1)) + @rename(d = b_function) end # ## Summarize across one variable -@eval(Main, myvar = :b) +myvar = :b @chain df begin @summarize(across(!!myvar, mean)) @@ -80,29 +86,38 @@ end # ## Summarize across multiple variables -@eval(Main, myvars_tuple = (:b, :c)) +myvars = [:b, :c] @chain df begin - @summarize(across(!!myvars_tuple, (mean, minimum, maximum))) + @summarize(across(!!myvars, (mean, minimum, maximum))) +end + +# ## Group by one interpolated variable + +myvar = :a + +@chain df begin + @group_by(!!myvar) + @summarize(c = mean(c)) end # ## Group by multiple interpolated variables -@eval(Main, myvars_tuple = (:a, :b)) +myvars = [:a, :b] @chain df begin - @group_by(!!myvars_tuple) + @group_by(!!myvars) @summarize(c = mean(c)) end +# Notice that `df` remains grouped by `a` because the `@summarize()` peeled off one layer of grouping. + # ## Global constants -# Because global constants like `pi` exist in the `Main` module, they can also be accessed using interpolation. For example, let's calculate the area of circles with a radius of 1 up to 5. +# You can also use `!!` interpolation to access global variables like `pi`. df = DataFrame(radius = 1:5) -# We can interpolate `pi` (from the `Main` module) to help with this. - @chain df begin @mutate(area = !!pi * radius^2) end @@ -112,28 +127,26 @@ end # While interpolation using `!!` is concise and handy, it's not required. You can also access user-defined globals and global constant variables using the following syntax: @chain df begin - @mutate(area = Main.pi * radius^2) + @mutate(area = esc(pi) * radius^2) end -# The key lesson with interpolation is that any bare unquoted variable is assumed to refer to a column name in the DataFrame. If you are referring to any variable outside of the DataFrame, you need to either use `!!variable` or `Main.variable` syntax to refer to this variable. 
- -# ## There's one other situation when `!!` interpolation may not work correctly: inside a `for` loop. +# Since we know that `pi` is defined in the `Main` module, we can also access it using `Main.pi`. -# This is only a problem if the variable being interpolated using `!!` is the iterator. Because macros as expanded during *parsing* of the code (before it is compiled), the expanded code contains the last value of the global variable *before* the loop is run and does not update with each iteration of the loop. +@chain df begin + @mutate(area = Main.pi * radius^2) +end -# To get around this, we can use `@eval(Main, variable)` inside our code, where `variable` refers to the iterator. Let's show a simple example of this where we print out each column one at a time using a `for` loop. +# The key lesson with interpolation is that any bare unquoted variable is assumed to refer to a column name in the DataFrame. If you are referring to any variable outside of the DataFrame, you need to either use `!!variable`, `esc(variable)`, or `[Module_name_here].variable` syntax to refer to this variable. -# We first need to initialize the global variable using `global_col = Symbol()`. +# Note: You can use `!!` interpolation anywhere, including inside of functions and loops. -# ```julia -# global_col = Symbol() -# for col in [:a, :b, :c] -# global global_col = col -# @chain df begin -# @select(@eval(Main, global_col)) -# println -# end -# end -# ``` +df = DataFrame(a = string.(repeat('a':'e', inner = 2)), + b = [1,1,1,2,2,2,3,3,3,4], + c = 11:20) -# The reason this works is because the `@eval()` macro inside `@select()` is not evaluated right away (unlike `!!`) but rather is evaluated at a later stage and thus is updated with each iteration. Instead of using the `@eval()` macro, we could instead have instead written `Main.eval(:global_col)`, which is functionally the same. 
\ No newline at end of file +for col in [:b, :c] + @chain df begin + @summarize(across(!!col, mean)) + println + end +end diff --git a/src/TidierData.jl b/src/TidierData.jl index f62f2e35..e6a0f141 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -5,7 +5,7 @@ using MacroTools using Chain using Statistics using StatsBase # primarily for `sample()` -using Cleaner +import Cleaner # changed from `using Cleaner` because of name conflict with `DataFrames.rename()` using Reexport # Exporting `Cols` because `summarize(!!vars, funs))` with multiple interpolated @@ -26,7 +26,7 @@ const code = Ref{Bool}(false) # output DataFrames.jl code? const log = Ref{Bool}(false) # output tidylog output? (not yet implemented) # Expose the global do-not-vectorize "list" -const not_vectorized = Ref{Vector{Symbol}}([:Ref, :Set, :Cols, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :as_categorical, :is_categorical]) +const not_vectorized = Ref{Vector{Symbol}}([:esc, :Ref, :Set, :Cols, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical]) # Includes include("docstrings.jl") diff --git a/src/clean_names.jl b/src/clean_names.jl index c79d4ecb..8366b738 100644 --- a/src/clean_names.jl +++ b/src/clean_names.jl @@ -11,13 +11,13 @@ macro clean_names(df, case) @chain $(esc(df)) begin DataFrame # remove grouping - polish_names(_; style = style) + 
Cleaner.polish_names(_; style = style) DataFrame # convert back to DataFrame groupby(col_names; sort = true) # regroup end else @chain $(esc(df)) begin - polish_names(_; style = style) + Cleaner.polish_names(_; style = style) DataFrame # convert back to DataFrame end end @@ -33,13 +33,13 @@ macro clean_names(df) @chain $(esc(df)) begin DataFrame # remove grouping - polish_names + Cleaner.polish_names DataFrame # convert back to DataFrame groupby(col_names; sort = true) # regroup end else @chain $(esc(df)) begin - polish_names + Cleaner.polish_names DataFrame # convert back to DataFrame end end diff --git a/src/parsing.jl b/src/parsing.jl index 6b4c76b3..ae250efc 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -99,13 +99,13 @@ function parse_pivot_arg(tidy_expr::Union{Expr,Symbol,Number}) end # Not exported -function parse_function(lhs::Symbol, rhs::Expr; autovec::Bool=true, subset::Bool=false) +function parse_function(lhs::Union{Symbol, Expr}, rhs::Expr; autovec::Bool=true, subset::Bool=false) lhs = QuoteNode(lhs) src = Symbol[] MacroTools.postwalk(rhs) do x - if @capture(x, (fn_(args__)) | (fn_.(args__))) + if @capture(x, (fn_(args__)) | (fn_.(args__))) && fn != :esc args = args[isa.(args, Symbol)] push!(src, args...) 
end @@ -226,7 +226,9 @@ function parse_group_by(tidy_expr::Union{Expr,Symbol}) return :(Cols($(args...),)) elseif @capture(tidy_expr, lhs_ = rhs_) return QuoteNode(lhs) - else + elseif tidy_expr isa Expr + return tidy_expr + else # if it's a Symbol return QuoteNode(tidy_expr) end end @@ -332,7 +334,7 @@ function parse_escape_function(rhs_expr::Union{Expr,Symbol}) if @capture(x, fn_(args__)) # `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped - if fn in [:in :∈ :∉ :Ref :Set :Cols :(:) :∘ :across :desc :mean :std :var :median :first :last :minimum :maximum :sum :length :skipmissing :quantile :passmissing :startswith :contains :endswith] + if fn in [:esc :in :∈ :∉ :Ref :Set :Cols :(:) :∘ :across :desc :mean :std :var :median :first :last :minimum :maximum :sum :length :skipmissing :quantile :passmissing :startswith :contains :endswith] return x elseif contains(string(fn), r"[^\W0-9]\w*$") # valid variable name return :($(esc(fn))($(args...))) @@ -340,7 +342,7 @@ function parse_escape_function(rhs_expr::Union{Expr,Symbol}) return x end elseif @capture(x, fn_.(args__)) - if fn in [:in :∈ :∉ :Ref :Set :Cols :(:) :∘ :across :desc :mean :std :var :median :first :last :minimum :maximum :sum :length :skipmissing :quantile :passmissing :startswith :contains :endswith] + if fn in [:esc :in :∈ :∉ :Ref :Set :Cols :(:) :∘ :across :desc :mean :std :var :median :first :last :minimum :maximum :sum :length :skipmissing :quantile :passmissing :startswith :contains :endswith] return x elseif contains(string(fn), r"[^\W0-9]\w*$") # valid variable name return :($(esc(fn)).($(args...))) @@ -361,19 +363,7 @@ function parse_interpolation(var_expr::Union{Expr,Symbol,Number,String}; summari var_expr = MacroTools.postwalk(var_expr) do x if @capture(x, !!variable_Symbol) - variable = Main.eval(variable) - if variable isa AbstractString - return variable # Strings are now treated as Strings and not columns - elseif variable isa Symbol - return variable - else # Tuple or Vector 
of columns - if variable[1] isa Symbol - variable = QuoteNode.(variable) - return :(Cols($(variable...),)) - else - return variable - end - end + return esc(variable) # `hello` in Julia is converted to Core.@cmd("hello") # Since MacroTools is unable to match this pattern, we can directly # evaluate the expression to see if it matches. If it does, the 3rd argument @@ -396,6 +386,8 @@ function parse_interpolation(var_expr::Union{Expr,Symbol,Number,String}; summari else return :($fn()) end + elseif @capture(x, esc(variable_)) + return esc(variable) end return x end