diff --git a/NEWS.md b/NEWS.md index 1235619d..42336414 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # TidierData.jl updates +## v0.14.0 - 2023-12-12 +- Update parsing engine so that non-function reserved names from the Base and Core modules (like `missing`, `pi`, and `Real`) are auto-escaped now, with the exception of names in the not_escaped[] array, which are never escaped +- Add `collect()` to not_vectorized[] array + ## v0.13.5 - 2023-12-05 - `@summarize()` and `@summarise()` now perform auto-vectorization in the same way as `@mutate()`, meaning that the top-level macros are now all consistent in their treatment of auto-vectorization. - Update documentation to describe new auto-vectorization behavior and give an example of how to modify the `TidierData.not_vectorized[]` array. diff --git a/Project.toml b/Project.toml index bbb390d7..d305da79 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.13.5" +version = "0.14.0" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/docs/examples/UserGuide/interpolation.jl b/docs/examples/UserGuide/interpolation.jl index 4a48ae3d..da4d9d49 100644 --- a/docs/examples/UserGuide/interpolation.jl +++ b/docs/examples/UserGuide/interpolation.jl @@ -122,21 +122,21 @@ df = DataFrame(radius = 1:5) @mutate(area = !!pi * radius^2) end -# ## Alternative interpolation syntax - -# While interpolation using `!!` is concise and handy, it's not required. 
You can also access user-defined globals and global constant variables using the following syntax: +# As of v0.14.0, global constants defined within the Base or Core modules (like `missing`, `pi`, and `Real`) can be directly referenced without any `!!`. @chain df begin - @mutate(area = esc(pi) * radius^2) + @mutate(area = pi * radius^2) end +# ## Alternative interpolation syntax + # Since we know that `pi` is defined in the `Main` module, we can also access it using `Main.pi`. @chain df begin @mutate(area = Main.pi * radius^2) end -# The key lesson with interpolation is that any bare unquoted variable is assumed to refer to a column name in the DataFrame. If you are referring to any variable outside of the DataFrame, you need to either use `!!variable`, `esc(variable)`, or `[Module_name_here].variable` syntax to refer to this variable. +# The key lesson with interpolation is that any bare unquoted variable is assumed to refer to a column name in the DataFrame. If you are referring to any variable outside of the DataFrame, you need to either use `!!variable` or `[Module_name_here].variable` syntax to refer to this variable. # Note: You can use `!!` interpolation anywhere, including inside of functions and loops. diff --git a/src/TidierData.jl b/src/TidierData.jl index 326f0343..af840daa 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -25,8 +25,12 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end const code = Ref{Bool}(false) # output DataFrames.jl code? const log = Ref{Bool}(false) # output tidylog output? 
(not yet implemented) -# Expose the global do-not-vectorize "list" -const not_vectorized = Ref{Vector{Symbol}}([:esc, :Ref, :Set, :Cols, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical]) +# The global do-not-vectorize "list" +const not_vectorized = Ref{Vector{Symbol}}([:esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical]) + +# The global do-not-escape "list" +# `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped +const not_escaped = Ref{Vector{Symbol}}([:esc, :in, :∈, :∉, :Ref, :Set, :Cols, :collect, :(:), :∘, :(=>), :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :startswith, :contains, :endswith]) # Includes include("docstrings.jl") diff --git a/src/parsing.jl b/src/parsing.jl index fb9b81e2..9d222a3f 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -331,10 +331,30 @@ end # Not exported function parse_escape_function(rhs_expr::Union{Expr,Symbol}) rhs_expr = MacroTools.postwalk(rhs_expr) do x - if @capture(x, fn_(args__)) - # `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped - if fn in [:esc :in :∈ :∉ :Ref :Set :Cols :(:) :∘ :across :desc :mean :std :var :median :first :last :minimum :maximum :sum :length :skipmissing :quantile 
:passmissing :startswith :contains :endswith] + # If it's already escaped, make sure it needs to remain escaped + if @capture(x, esc(variable_Symbol)) + if hasproperty(Base, variable) && !(typeof(getproperty(Base, variable)) <: Function) + # Remove the escaping if referring to a constant value like Base.pi + return variable + elseif @capture(x, variable_Symbol) && hasproperty(Core, variable) && !(typeof(getproperty(Core, variable)) <: Function) + # Remove the escaping if referring to a data type like Core.Int64 + return variable + elseif variable in not_escaped[] + return variable + elseif contains(string(variable), r"[^\W0-9]\w*$") # valid variable name + return esc(variable) + else + return variable + end + elseif @capture(x, fn_(args__)) + if hasproperty(Base, fn) && typeof(getproperty(Base, fn)) <: Function + return x + elseif hasproperty(Core, fn) && typeof(getproperty(Core, fn)) <: Function + return x + elseif hasproperty(Statistics, fn) && typeof(getproperty(Statistics, fn)) <: Function + return x + elseif fn in not_escaped[] return x elseif contains(string(fn), r"[^\W0-9]\w*$") # valid variable name return :($(esc(fn))($(args...))) @@ -366,6 +386,10 @@ function parse_interpolation(var_expr::Union{Expr,Symbol,Number,String}; summari var_expr = MacroTools.postwalk(var_expr) do x if @capture(x, !!variable_Symbol) return esc(variable) + # If a variable has already been escaped and marked with a `!!` (e.g., `!!pi`), + # then it won't be re-escaped. + elseif @capture(x, !!expr_) + return expr # `hello` in Julia is converted to Core.@cmd("hello") # Since MacroTools is unable to match this pattern, we can directly # evaluate the expression to see if it matches. 
If it does, the 3rd argument @@ -389,7 +413,20 @@ function parse_interpolation(var_expr::Union{Expr,Symbol,Number,String}; summari return :($fn()) end elseif @capture(x, esc(variable_)) - return esc(variable) + return esc(variable) + # Escape any native Julia symbols that come from the Base or Core packages + # This includes :missing but also includes all data types (e.g., :Real, :String, etc.) + # To refer to a column named String, you can use `String` (in backticks) + elseif @capture(x, variable_Symbol) + if variable in not_escaped[] + return variable + elseif hasproperty(Base, variable) && !(typeof(getproperty(Base, variable)) <: Function) + return esc(variable) + elseif @capture(x, variable_Symbol) && hasproperty(Core, variable) && !(typeof(getproperty(Core, variable)) <: Function) + return esc(variable) + else + return variable + end end return x end