diff --git a/NEWS.md b/NEWS.md index f8f5a60c..f2568f1b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # TidierData.jl updates +## v0.16.2 - 2024-08-05 +- Bugfix: `@slice_min` and `@slice_max` respect the `n` argument +- Adds `@head` + ## v0.16.1 - 2024-06-09 - Adds support for tuples and vectors as arguments to select multiple columns. Prefixing tuples/vectors with a `-` or `!` will exclude the selected columns. - The `:` selector from Julia is now available and equivalent to `everything()` diff --git a/Project.toml b/Project.toml index edfcf379..d7d9e33a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.16.1" +version = "0.16.2" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/README.md b/README.md index dd7d71a5..37128e51 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ To support R-style programming, TidierData.jl is implemented using macros. TidierData.jl currently supports the following top-level macros: -- `@glimpse()` +- `@glimpse()` and `@head()` - `@select()` and `@distinct()` - `@rename()` and `@rename_with()` - `@mutate()` and `@transmute()` diff --git a/docs/src/index.md b/docs/src/index.md index 9066dd39..d904f719 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -84,7 +84,7 @@ TidierData.jl currently supports the following top-level macros: ```@raw html !!! 
example "Top-level macros:" - - `@glimpse()` + - `@glimpse()` and `@head()` - `@select()` and `@distinct()` - `@rename()` and `@rename_with()` - `@mutate()` and `@transmute()` diff --git a/src/TidierData.jl b/src/TidierData.jl index c43b5553..0507199e 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -21,7 +21,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @anti_join, @semi_join, @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate, @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows, - @unnest_longer, @unnest_wider, @nest, @relocate + @unnest_longer, @unnest_wider, @nest, @relocate, @head # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? @@ -688,4 +688,24 @@ macro rename_with(df, fn, exprs...) return df_expr end +""" +$docstring_head +""" +macro head(df, exprs=6) + return quote + local df_input = $(esc(df)) + local n = $(esc(exprs)) # number of rows to keep (per group when grouped) + + if df_input isa GroupedDataFrame + grouped_result = combine(df_input) do sdf + first(sdf, n) + end + DataFrames.groupby(grouped_result, DataFrames.groupcols(df_input)) # re-group via the public accessor rather than the internal `cols` field + else + first(df_input, n) # `first` already returns a new DataFrame, so no full copy beforehand + end + end +end + + end \ No newline at end of file diff --git a/src/docstrings.jl b/src/docstrings.jl index 7a08f6bc..c0371679 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2834,14 +2834,15 @@ julia> @chain df begin 1 │ 5.0 7.0 5.0 julia> @chain df begin - @slice_max(b, with_ties = false, n = 2) + @slice_max(b, n = 3) end -2×3 DataFrame +3×3 DataFrame Row │ a b c │ Float64? Float64? Float64? 
─────┼────────────────────────────── 1 │ 5.0 7.0 5.0 2 │ 6.0 7.0 6.0 + 3 │ 1.0 6.0 1.0 julia> @chain df begin @slice_max(b, prop = 0.5, missing_rm = true) @@ -2897,15 +2898,15 @@ julia> @chain df begin 1 │ missing 0.3 0.2 julia> @chain df begin - @slice_min(b, with_ties = true, n = 1) - end -2×3 DataFrame - Row │ a b c - │ Float64? Float64? Float64? -─────┼─────────────────────────────── - 1 │ missing 0.3 0.2 - 2 │ missing 0.3 missing - + @slice_min(b, n = 3) + end +3×3 DataFrame + Row │ a b c + │ Float64? Float64? Float64? +─────┼──────────────────────────────── + 1 │ missing 0.3 0.2 + 2 │ missing 0.3 missing + 3 │ 0.2 2.0 0.2 julia> @chain df begin @slice_min(b, prop = 0.5, missing_rm = true) @@ -2950,7 +2951,7 @@ julia> @chain df begin 3 │ missing missing 0.2 julia> @chain df begin - @slice_head(prop = .25) + @slice_head(prop = 0.25) end 2×3 DataFrame Row │ a b c @@ -2991,7 +2992,7 @@ julia> @chain df begin 3 │ 6.0 7.0 6.0 julia> @chain df begin - @slice_tail(prop = .25) + @slice_tail(prop = 0.25) end 2×3 DataFrame Row │ a b c @@ -3461,3 +3462,72 @@ julia> @relocate(df, B:C) # bring columns to the front 5 │ 10 E 5 C 5 E ``` """ + +const docstring_head = +""" + @head(df, value) +Shows the first n rows of the data frame or of each group in a grouped data frame. + +# Arguments +- `df`: The data frame. +- `value`: number of rows to be returned. Defaults to 6 if left blank. + +# Examples +``` +julia> df = DataFrame(a = vcat(repeat(["a"], inner = 4), + repeat(["b"], inner = 4)), + b = 1:8) +8×2 DataFrame + Row │ a b + │ String Int64 +─────┼─────────────── + 1 │ a 1 + 2 │ a 2 + 3 │ a 3 + 4 │ a 4 + 5 │ b 5 + 6 │ b 6 + 7 │ b 7 + 8 │ b 8 + +julia> @head(df, 3) +3×2 DataFrame + Row │ a b + │ String 
Int64 +─────┼──────────────── + 1 │ a 1 + 2 │ a 2 + 3 │ a 3 + +julia> @head(df) +6×2 DataFrame + Row │ a b + │ String Int64 +─────┼─────────────── + 1 │ a 1 + 2 │ a 2 + 3 │ a 3 + 4 │ a 4 + 5 │ b 5 + 6 │ b 6 + +julia> @chain df begin + @group_by a + @head 2 + end +GroupedDataFrame with 2 groups based on key: a +First Group (2 rows): a = "a" + Row │ a b + │ String Int64 +─────┼─────────────── + 1 │ a 1 + 2 │ a 2 +⋮ +Last Group (2 rows): a = "b" + Row │ a b + │ String Int64 +─────┼─────────────── + 1 │ b 5 + 2 │ b 6 +``` +""" \ No newline at end of file diff --git a/src/slice.jl b/src/slice.jl index a6ada20e..089e08f6 100644 --- a/src/slice.jl +++ b/src/slice.jl @@ -75,172 +75,229 @@ end $docstring_slice_max """ macro slice_max(df, exprs...) - exprs = parse_blocks(exprs...) + exprs = parse_blocks(exprs...) + + expr_dict = Dict() + column = nothing + missing_rm = true + with_ties = true + arranged = false + n = 1 # default value for n + + for expr in exprs + if @capture(expr, lhs_ = rhs_) + expr_dict[lhs] = rhs + if lhs == :missing_rm + missing_rm = rhs + elseif lhs == :prop + arranged = true + elseif lhs == :n + n = rhs # Capture n if provided + end + else + column = expr + end + end + + if haskey(expr_dict, :with_ties) + with_ties = expr_dict[:with_ties] + end + + if column === nothing + throw(ArgumentError("No column provided")) + end + + return quote + grouping_cols = Symbol[] + if $(esc(df)) isa DataFrames.GroupedDataFrame + grouping_cols = DataFrames.groupcols($(esc(df))) + end + temp_df = if $missing_rm + @filter($(esc(df)), !ismissing($column)) + else + $(esc(df)) + end + + if temp_df isa DataFrames.GroupedDataFrame + result_dfs = [] + for sdf in temp_df + max_value_rows = nrow(@filter(sdf, $column == maximum(skipmissing($(column))))) + selected_df = if haskey($expr_dict, :prop) + prop_val = $expr_dict[:prop] + if prop_val < 0.0 || prop_val > 1.0 + throw(ArgumentError("Prop value should be between 0 and 1")) + end + num_rows = floor(Int, nrow(sdf) * prop_val) + 
if $with_ties && num_rows > max_value_rows + first(@arrange(sdf, desc($column)), num_rows) + elseif $with_ties && num_rows < max_value_rows + first(@arrange(sdf, desc($column)), max_value_rows) + else + first(@arrange(sdf, desc($column)), num_rows) + end + else + if $with_ties && $n > max_value_rows + first(@arrange(sdf, desc($column)), $n) + elseif $with_ties && $n < max_value_rows && $n != 1 + first(@arrange(sdf, desc($column)), max_value_rows) + elseif $with_ties && $n < max_value_rows && $n == 1 + first(@arrange(sdf, desc($column)), max_value_rows) + elseif !$with_ties && $n < max_value_rows + first(@arrange(sdf, desc($column)), $n) + else + first(@arrange(sdf, desc($column)), $n) + end + end + push!(result_dfs, selected_df) + end + temp_df = vcat(result_dfs...) + temp_df = DataFrames.groupby(temp_df, grouping_cols) + else + max_value_rows = nrow(@filter(temp_df, $column == maximum(skipmissing($column)))) + temp_df = if haskey($expr_dict, :prop) + prop_val = $expr_dict[:prop] + if prop_val < 0.0 || prop_val > 1.0 + throw(ArgumentError("Prop value should be between 0 and 1")) + end + num_rows = floor(Int, nrow(temp_df) * prop_val) + if $with_ties && num_rows > max_value_rows + first(@arrange(temp_df, desc($column)), num_rows) + elseif $with_ties && num_rows < max_value_rows + first(@arrange(temp_df, desc($column)), max_value_rows) + else + first(@arrange(temp_df, desc($column)), num_rows) + end + else + if $with_ties && $n > max_value_rows + first(@arrange(temp_df, desc($column)), $n) + elseif $with_ties && $n < max_value_rows && $n != 1 + first(@arrange(temp_df, desc($column)), max_value_rows) + elseif $with_ties && $n < max_value_rows && $n == 1 + first(@arrange(temp_df, desc($column)), max_value_rows) + else # every remaining case (ties disabled, or n == max_value_rows) takes exactly n rows + first(@arrange(temp_df, desc($column)), $n) + end + end + end + + temp_df + end - expr_dict = Dict() - column = nothing - missing_rm = true - with_ties = true - arranged = false - for expr in exprs - if @capture(expr, 
lhs_ = rhs_) - expr_dict[lhs] = rhs - if lhs == :missing_rm - missing_rm = rhs - elseif lhs == :prop - arranged = true - end - else - column = expr - end - end - if haskey(expr_dict, :with_ties) - with_ties = expr_dict[:with_ties] - end - if column === nothing - throw(ArgumentError("No column provided")) - end - return quote - grouping_cols = Symbol[] - if $(esc(df)) isa DataFrames.GroupedDataFrame - grouping_cols = DataFrames.groupcols($(esc(df))) - end - temp_df = if $arranged - if $missing_rm - @chain $(esc(df)) begin - @filter(!ismissing($column)) - @arrange(desc($column)) - end - else - @chain $(esc(df)) begin - @arrange(desc($column)) - end - end - else - @filter($(esc(df)), $column == maximum(skipmissing($column))) - end - if temp_df isa DataFrames.GroupedDataFrame - result_dfs = [] - for sdf in temp_df - local prop_val - if haskey($expr_dict, :prop) - prop_val = $expr_dict[:prop] - if prop_val < 0.0 || prop_val > 1.0 - throw(ArgumentError("Prop value should be between 0 and 1")) - end - num_rows = floor(Int, nrow(sdf) * prop_val) - push!(result_dfs, first(sdf, num_rows)) - elseif $with_ties - push!(result_dfs, sdf) - else - n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1 - push!(result_dfs, first(sdf, n)) - end - end - temp_df = vcat(result_dfs...) - temp_df = DataFrames.groupby(temp_df, grouping_cols) - else - local prop_val - if haskey($expr_dict, :prop) - prop_val = $expr_dict[:prop] - if prop_val < 0.0 || prop_val > 1.0 - throw(ArgumentError("Prop value should be between 0 and 1")) - end - num_rows = floor(Int, nrow(temp_df) * prop_val) - temp_df = first(temp_df, num_rows) - elseif !$with_ties - n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1 - temp_df = first(temp_df, n) - end - temp_df - end - end end + """ $docstring_slice_min """ macro slice_min(df, exprs...) - exprs = parse_blocks(exprs...) + exprs = parse_blocks(exprs...) 
+ + expr_dict = Dict() + column = nothing + missing_rm = true + with_ties = true + arranged = false + n = 1 # default value for n + + for expr in exprs + if @capture(expr, lhs_ = rhs_) + expr_dict[lhs] = rhs + if lhs == :missing_rm + missing_rm = rhs + elseif lhs == :prop + arranged = true + elseif lhs == :n + n = rhs # Capture n if provided + end + else + column = expr + end + end + + if haskey(expr_dict, :with_ties) + with_ties = expr_dict[:with_ties] + end + + if column === nothing + throw(ArgumentError("No column provided")) + end + + return quote + grouping_cols = Symbol[] + if $(esc(df)) isa DataFrames.GroupedDataFrame + grouping_cols = DataFrames.groupcols($(esc(df))) + end + temp_df = if $missing_rm + @filter($(esc(df)), !ismissing($column)) + else + $(esc(df)) + end + + if temp_df isa DataFrames.GroupedDataFrame + result_dfs = [] + for sdf in temp_df + max_value_rows = nrow(@filter(sdf, $column == minimum(skipmissing($(column))))) + selected_df = if haskey($expr_dict, :prop) + prop_val = $expr_dict[:prop] + if prop_val < 0.0 || prop_val > 1.0 + throw(ArgumentError("Prop value should be between 0 and 1")) + end + num_rows = floor(Int, nrow(sdf) * prop_val) + if $with_ties && num_rows > max_value_rows + first(@arrange(sdf, ($column)), num_rows) + elseif $with_ties && num_rows < max_value_rows + first(@arrange(sdf, ($column)), max_value_rows) + else + first(@arrange(sdf, ($column)), num_rows) + end + else + if $with_ties && $n > max_value_rows + first(@arrange(sdf, ($column)), $n) + elseif $with_ties && $n < max_value_rows && $n != 1 + first(@arrange(sdf, ($column)), max_value_rows) + elseif $with_ties && $n < max_value_rows && $n == 1 + first(@arrange(sdf, ($column)), max_value_rows) + elseif !$with_ties && $n < max_value_rows + first(@arrange(sdf, ($column)), $n) + else + first(@arrange(sdf, ($column)), $n) + end + end + push!(result_dfs, selected_df) + end + temp_df = vcat(result_dfs...) 
+ temp_df = DataFrames.groupby(temp_df, grouping_cols) + else + max_value_rows = nrow(@filter(temp_df, $column == minimum(skipmissing($column)))) + temp_df = if haskey($expr_dict, :prop) + prop_val = $expr_dict[:prop] + if prop_val < 0.0 || prop_val > 1.0 + throw(ArgumentError("Prop value should be between 0 and 1")) + end + num_rows = floor(Int, nrow(temp_df) * prop_val) + if $with_ties && num_rows > max_value_rows + first(@arrange(temp_df, ($column)), num_rows) + elseif $with_ties && num_rows < max_value_rows + first(@arrange(temp_df, ($column)), max_value_rows) + else + first(@arrange(temp_df, ($column)), num_rows) + end + else + if $with_ties && $n > max_value_rows + first(@arrange(temp_df, ($column)), $n) + elseif $with_ties && $n < max_value_rows && $n != 1 + first(@arrange(temp_df, ($column)), max_value_rows) + elseif $with_ties && $n < max_value_rows && $n == 1 + first(@arrange(temp_df, ($column)), max_value_rows) + else # every remaining case (ties disabled, or n == max_value_rows) takes exactly n rows + first(@arrange(temp_df, ($column)), $n) + end + end + end + + temp_df + end - expr_dict = Dict() - column = nothing - missing_rm = true - with_ties = true - arranged = false - for expr in exprs - if @capture(expr, lhs_ = rhs_) - expr_dict[lhs] = rhs - if lhs == :missing_rm - missing_rm = rhs - elseif lhs == :prop - arranged = true - end - else - column = expr - end - end - if haskey(expr_dict, :with_ties) - with_ties = expr_dict[:with_ties] - end - if column === nothing - throw(ArgumentError("No column provided")) - end - return quote - grouping_cols = Symbol[] - if $(esc(df)) isa DataFrames.GroupedDataFrame - grouping_cols = DataFrames.groupcols($(esc(df))) - end - temp_df = if $arranged - if $missing_rm - @chain $(esc(df)) begin - @filter(!ismissing($column)) - @arrange($column) - end - else - @chain $(esc(df)) begin - @arrange($column) - end - end - else - @filter($(esc(df)), $column == minimum(skipmissing($column))) - end - if temp_df isa DataFrames.GroupedDataFrame - result_dfs = [] - for sdf in 
temp_df - local prop_val - if haskey($expr_dict, :prop) - prop_val = $expr_dict[:prop] - if prop_val < 0.0 || prop_val > 1.0 - throw(ArgumentError("Prop value should be between 0 and 1")) - end - num_rows = floor(Int, nrow(sdf) * prop_val) - push!(result_dfs, first(sdf, num_rows)) - elseif $with_ties - push!(result_dfs, sdf) - else - n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1 - push!(result_dfs, first(sdf, n)) - end - end - temp_df = vcat(result_dfs...) - temp_df = DataFrames.groupby(temp_df, grouping_cols) - else - local prop_val - if haskey($expr_dict, :prop) - prop_val = $expr_dict[:prop] - if prop_val < 0.0 || prop_val > 1.0 - throw(ArgumentError("Prop value should be between 0 and 1")) - end - num_rows = floor(Int, nrow(temp_df) * prop_val) - temp_df = first(temp_df, num_rows) - elseif !$with_ties - n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1 - temp_df = first(temp_df, n) - end - temp_df - end - end end """