From ca10c469c043f5ad2eeaf2e8679e6666fbbad0b5 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Tue, 12 Dec 2023 00:19:26 -0500 Subject: [PATCH 1/3] Create list of escaped symbols, and populate with `:missing` for now. --- NEWS.md | 3 +++ Project.toml | 2 +- src/TidierData.jl | 5 ++++- src/parsing.jl | 4 +++- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1235619d..ab822882 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # TidierData.jl updates +## v0.13.6 - 2023-12-12 +- Adds support for auto-escaped symbols. E.g., in `@mutate(b = missing)`, `missing` is now assumed to refer to `missing` values rather than a column named `missing`. We will need to further populate this list. + ## v0.13.5 - 2023-12-05 - `@summarize()` and `@summarise()` now perform auto-vectorization in the same way as `@mutate()`, meaning that the top-level macros are now all consistent in their treatment of auto-vectorization. - Update documentation to describe new auto-vectorization behavior and give an example of how to modify the `TidierData.not_vectorized[]` array. diff --git a/Project.toml b/Project.toml index bbb390d7..f3bd70a2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.13.5" +version = "0.13.6" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/src/TidierData.jl b/src/TidierData.jl index 326f0343..7498c432 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -25,9 +25,12 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end const code = Ref{Bool}(false) # output DataFrames.jl code? const log = Ref{Bool}(false) # output tidylog output? 
(not yet implemented) -# Expose the global do-not-vectorize "list" +# The global do-not-vectorize "list" const not_vectorized = Ref{Vector{Symbol}}([:esc, :Ref, :Set, :Cols, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical]) +# The global set of symbols to escape +const escaped_symbols = Ref{Vector{Symbol}}([:missing]) + # Includes include("docstrings.jl") include("parsing.jl") diff --git a/src/parsing.jl b/src/parsing.jl index fb9b81e2..8cc5cd1b 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -389,7 +389,9 @@ function parse_interpolation(var_expr::Union{Expr,Symbol,Number,String}; summari return :($fn()) end elseif @capture(x, esc(variable_)) - return esc(variable) + return esc(variable) + elseif @capture(x, variable_Symbol) && variable in escaped_symbols[] + return esc(variable) end return x end From a7713c74055e75669f0e09dda0c7f4dd85c0c489 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Tue, 12 Dec 2023 03:30:48 -0500 Subject: [PATCH 2/3] Major update to parsing engine to support auto-escaping across all reserved names in the Base and Core modules. --- NEWS.md | 5 +++-- Project.toml | 2 +- src/TidierData.jl | 7 ++++--- src/parsing.jl | 45 ++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 48 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index ab822882..42336414 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,8 @@ # TidierData.jl updates -## v0.13.6 - 2023-12-12 -- Adds support for auto-escaped symbols. E.g., in `@mutate(b = missing)`, `missing` is now assumed to refer to `missing` values rather than a column named `missing`. We will need to further populate this list. 
+## v0.14.0 - 2023-12-12 +- Update parsing engine so that non-function reserved names from the Base and Core modules (like `missing`, `pi`, and `Real`) are auto-escaped now, with the exception of names in the `not_escaped[]` array, which are never escaped. +- Add `collect()` to the `not_vectorized[]` array. ## v0.13.5 - 2023-12-05 - `@summarize()` and `@summarise()` now perform auto-vectorization in the same way as `@mutate()`, meaning that the top-level macros are now all consistent in their treatment of auto-vectorization. diff --git a/Project.toml b/Project.toml index f3bd70a2..d305da79 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.13.6" +version = "0.14.0" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/src/TidierData.jl b/src/TidierData.jl index 7498c432..af840daa 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -26,10 +26,11 @@ const code = Ref{Bool}(false) # output DataFrames.jl code? const log = Ref{Bool}(false) # output tidylog output? 
(not yet implemented) # The global do-not-vectorize "list" -const not_vectorized = Ref{Vector{Symbol}}([:esc, :Ref, :Set, :Cols, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical]) +const not_vectorized = Ref{Vector{Symbol}}([:esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical]) -# The global set of symbols to escape -const escaped_symbols = Ref{Vector{Symbol}}([:missing]) +# The global do-not-escape "list" +# `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped +const not_escaped = Ref{Vector{Symbol}}([:esc, :in, :∈, :∉, :Ref, :Set, :Cols, :collect, :(:), :∘, :(=>), :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :startswith, :contains, :endswith]) # Includes include("docstrings.jl") diff --git a/src/parsing.jl b/src/parsing.jl index 8cc5cd1b..9d222a3f 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -331,10 +331,30 @@ end # Not exported function parse_escape_function(rhs_expr::Union{Expr,Symbol}) rhs_expr = MacroTools.postwalk(rhs_expr) do x - if @capture(x, fn_(args__)) - # `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped - if fn in [:esc :in :∈ :∉ :Ref :Set :Cols :(:) :∘ :across :desc :mean :std :var :median :first :last :minimum 
:maximum :sum :length :skipmissing :quantile :passmissing :startswith :contains :endswith] + # If it's already escaped, make sure it needs to remain escaped + if @capture(x, esc(variable_Symbol)) + if hasproperty(Base, variable) && !(typeof(getproperty(Base, variable)) <: Function) + # Remove the escaping if referring to a constant value like Base.pi + return variable + elseif @capture(x, variable_Symbol) && hasproperty(Core, variable) && !(typeof(getproperty(Core, variable)) <: Function) + # Remove the escaping if referring to a data type like Core.Int64 + return variable + elseif variable in not_escaped[] + return variable + elseif contains(string(variable), r"[^\W0-9]\w*$") # valid variable name + return esc(variable) + else + return variable + end + elseif @capture(x, fn_(args__)) + if hasproperty(Base, fn) && typeof(getproperty(Base, fn)) <: Function + return x + elseif hasproperty(Core, fn) && typeof(getproperty(Core, fn)) <: Function + return x + elseif hasproperty(Statistics, fn) && typeof(getproperty(Statistics, fn)) <: Function + return x + elseif fn in not_escaped[] return x elseif contains(string(fn), r"[^\W0-9]\w*$") # valid variable name return :($(esc(fn))($(args...))) @@ -366,6 +386,10 @@ function parse_interpolation(var_expr::Union{Expr,Symbol,Number,String}; summari var_expr = MacroTools.postwalk(var_expr) do x if @capture(x, !!variable_Symbol) return esc(variable) + # If a variable has already been escaped and marked with a `!!` (e.g., `!!pi`), + # then it won't be re-escaped. + elseif @capture(x, !!expr_) + return expr # `hello` in Julia is converted to Core.@cmd("hello") # Since MacroTools is unable to match this pattern, we can directly # evaluate the expression to see if it matches. 
If it does, the 3rd argument @@ -390,8 +414,19 @@ function parse_interpolation(var_expr::Union{Expr,Symbol,Number,String}; summari end elseif @capture(x, esc(variable_)) return esc(variable) - elseif @capture(x, variable_Symbol) && variable in escaped_symbols[] - return esc(variable) + # Escape any native Julia symbols that come from the Base or Core packages + # This includes :missing but also includes all data types (e.g., :Real, :String, etc.) + # To refer to a column named String, you can use `String` (in backticks) + elseif @capture(x, variable_Symbol) + if variable in not_escaped[] + return variable + elseif hasproperty(Base, variable) && !(typeof(getproperty(Base, variable)) <: Function) + return esc(variable) + elseif @capture(x, variable_Symbol) && hasproperty(Core, variable) && !(typeof(getproperty(Core, variable)) <: Function) + return esc(variable) + else + return variable + end end return x end From 171d6af955d83ba9a9f942f6aebe4adb4b202226 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Tue, 12 Dec 2023 03:44:26 -0500 Subject: [PATCH 3/3] Fixed documentation error. --- docs/examples/UserGuide/interpolation.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/examples/UserGuide/interpolation.jl b/docs/examples/UserGuide/interpolation.jl index 4a48ae3d..da4d9d49 100644 --- a/docs/examples/UserGuide/interpolation.jl +++ b/docs/examples/UserGuide/interpolation.jl @@ -122,21 +122,21 @@ df = DataFrame(radius = 1:5) @mutate(area = !!pi * radius^2) end -# ## Alternative interpolation syntax - -# While interpolation using `!!` is concise and handy, it's not required. 
You can also access user-defined globals and global constant variables using the following syntax: +# As of v0.14.0, global constants defined within the Base or Core modules (like `missing`, `pi`, and `Real`) can be directly referenced without any `!!`. @chain df begin - @mutate(area = esc(pi) * radius^2) + @mutate(area = pi * radius^2) end +# ## Alternative interpolation syntax + # Since we know that `pi` is defined in the `Main` module, we can also access it using `Main.pi`. @chain df begin @mutate(area = Main.pi * radius^2) end -# The key lesson with interpolation is that any bare unquoted variable is assumed to refer to a column name in the DataFrame. If you are referring to any variable outside of the DataFrame, you need to either use `!!variable`, `esc(variable)`, or `[Module_name_here].variable` syntax to refer to this variable. +# The key lesson with interpolation is that any bare unquoted variable is assumed to refer to a column name in the DataFrame. If you are referring to any variable outside of the DataFrame, you need to either use `!!variable` or `[Module_name_here].variable` syntax to refer to this variable. # Note: You can use `!!` interpolation anywhere, including inside of functions and loops.