Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adds join_by #128

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# TidierData.jl updates

## v0.16.3
- Bugfix: `@summary` no longer errors with non-numeric columns. Instead, it only reports non-numeric summary stats on non-numeric columns. Summary column names are now lowercase snake_case.

## v0.16.2 - 2024-09-03
- Bugfix: `@slice_min` and `@slice_max` respect the `n` argument
- Adds `@head`
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "TidierData"
uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
authors = ["Karandeep Singh"]
version = "0.16.2"
version = "0.16.3"

[deps]
Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"
Expand Down
126 changes: 114 additions & 12 deletions src/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1026,21 +1026,38 @@ julia> @chain df @pull(2)

const docstring_left_join =
"""
@left_join(df1, df2, [by])
@left_join(df1, df2, [join_by])

Perform a left join on `df1` and `df2` with an optional `by`.

# Arguments
- `df1`: A DataFrame.
- `df2`: A DataFrame.
- `join_by`: A function or expression used to specify the join condition. It should be provided in the form `join_by(column1 == column2)`. It can accept multiple column pairs, such as `join_by(a==b, c==d)`.
- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.

# Examples
```jldoctest
julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);


julia> @left_join(df1, df2, join_by(a == a))
2×3 DataFrame
Row │ a b c
│ String Int64 Int64?
─────┼────────────────────────
1 │ a 1 3
2 │ b 2 missing

julia> @left_join(df1, df2, join_by("a"=="a"))
2×3 DataFrame
Row │ a b c
│ String Int64 Int64?
─────┼────────────────────────
1 │ a 1 3
2 │ b 2 missing

julia> @left_join(df1, df2)
2×3 DataFrame
Row │ a b c
Expand Down Expand Up @@ -1085,21 +1102,38 @@ julia> @left_join(df1, df2, "a" = "a")

const docstring_right_join =
"""
@right_join(df1, df2, [by])
@right_join(df1, df2, [join_by])

Perform a right join on `df1` and `df2` with an optional `by`.

# Arguments
- `df1`: A DataFrame.
- `df2`: A DataFrame.
- `join_by`: A function or expression used to specify the join condition. It should be provided in the form `join_by(column1 == column2)`. It can accept multiple column pairs, such as `join_by(a==b, c==d)`.
- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.

# Examples
```jldoctest
julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);


julia> @right_join(df1, df2, join_by(a == a))
2×3 DataFrame
Row │ a b c
│ String Int64? Int64
─────┼────────────────────────
1 │ a 1 3
2 │ c missing 4

julia> @right_join(df1, df2, join_by("a"=="a"))
2×3 DataFrame
Row │ a b c
│ String Int64? Int64
─────┼────────────────────────
1 │ a 1 3
2 │ c missing 4

julia> @right_join(df1, df2)
2×3 DataFrame
Row │ a b c
Expand Down Expand Up @@ -1144,21 +1178,36 @@ julia> @right_join(df1, df2, "a" = "a")

const docstring_inner_join =
"""
@inner_join(df1, df2, [by])
@inner_join(df1, df2, [join_by])

Perform an inner join on `df1` and `df2` with an optional `by`.

# Arguments
- `df1`: A DataFrame.
- `df2`: A DataFrame.
- `join_by`: A function or expression used to specify the join condition. It should be provided in the form `join_by(column1 == column2)`. It can accept multiple column pairs, such as `join_by(a==b, c==d)`.
- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.

# Examples
```jldoctest
julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);


julia> @inner_join(df1, df2, join_by(a == a))
1×3 DataFrame
Row │ a b c
│ String Int64 Int64
─────┼──────────────────────
1 │ a 1 3

julia> @inner_join(df1, df2, join_by("a"=="a"))
1×3 DataFrame
Row │ a b c
│ String Int64 Int64
─────┼──────────────────────
1 │ a 1 3

julia> @inner_join(df1, df2)
1×3 DataFrame
Row │ a b c
Expand Down Expand Up @@ -1198,21 +1247,41 @@ julia> @inner_join(df1, df2, "a" = "a")

const docstring_full_join =
"""
@full_join(df1, df2, [by])
@full_join(df1, df2, [join_by])

Perform a full join on `df1` and `df2` with an optional `by`.

# Arguments
- `df1`: A DataFrame.
- `df2`: A DataFrame.
- `join_by`: A function or expression used to specify the join condition. It should be provided in the form `join_by(column1 == column2)`. It can accept multiple column pairs, such as `join_by(a==b, c==d)`.

- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.

# Examples
```jldoctest
julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);


julia> @full_join(df1, df2, join_by(a == a))
3×3 DataFrame
Row │ a b c
│ String Int64? Int64?
─────┼──────────────────────────
1 │ a 1 3
2 │ b 2 missing
3 │ c missing 4

julia> @full_join(df1, df2, join_by("a"=="a"))
3×3 DataFrame
Row │ a b c
│ String Int64? Int64?
─────┼──────────────────────────
1 │ a 1 3
2 │ b 2 missing
3 │ c missing 4

julia> @full_join(df1, df2)
3×3 DataFrame
Row │ a b c
Expand Down Expand Up @@ -1262,21 +1331,37 @@ julia> @full_join(df1, df2, "a" = "a")

const docstring_anti_join =
"""
@anti_join(df1, df2, [by])
@anti_join(df1, df2, [join_by])

Perform an anti-join on `df1` and `df2` with an optional `by`.

# Arguments
- `df1`: A DataFrame.
- `df2`: A DataFrame.
- `join_by`: A function or expression used to specify the join condition. It should be provided in the form `join_by(column1 == column2)`. It can accept multiple column pairs, such as `join_by(a==b, c==d)`.

- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.

# Examples
```jldoctest
julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);


julia> @anti_join(df1, df2, join_by(a == a))
1×2 DataFrame
Row │ a b
│ String Int64
─────┼───────────────
1 │ b 2

julia> @anti_join(df1, df2, join_by("a"=="a"))
1×2 DataFrame
Row │ a b
│ String Int64
─────┼───────────────
1 │ b 2

julia> @anti_join(df1, df2)
1×2 DataFrame
Row │ a b
Expand Down Expand Up @@ -1316,13 +1401,15 @@ julia> @anti_join(df1, df2, "a" = "a")

const docstring_semi_join =
"""
@semi_join(df1, df2, [by])
@semi_join(df1, df2, [join_by])

Perform a semi-join on `df1` and `df2` with an optional `by`.

# Arguments
- `df1`: A DataFrame.
- `df2`: A DataFrame.
- `join_by`: A function or expression used to specify the join condition. It should be provided in the form `join_by(column1 == column2)`. It can accept multiple column pairs, such as `join_by(a==b, c==d)`.

- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`.

# Examples
Expand All @@ -1331,6 +1418,20 @@ julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);

julia> @semi_join(df1, df2, join_by(a == a))
1×2 DataFrame
Row │ a b
│ String Int64
─────┼───────────────
1 │ a 1

julia> @semi_join(df1, df2, join_by("a"=="a"))
1×2 DataFrame
Row │ a b
│ String Int64
─────┼───────────────
1 │ a 1

julia> @semi_join(df1, df2)
1×2 DataFrame
Row │ a b
Expand Down Expand Up @@ -2415,7 +2516,8 @@ For numerical columns, returns a dataframe with the Q1,Q3, min, max, mean, media
julia> df = DataFrame(a = [1, 2, 3, 4, 5],
b = [missing, 7, 8, 9, 10],
c = [11, missing, 13, 14, missing],
d = [16, 17, 18, 19, 20]);
d = [16.1, 17.2, 18.3, 19.4, 20.5],
e = ["a", "a", "a", "a", "a"]);

julia> @summary(df);

Expand Down
14 changes: 13 additions & 1 deletion src/parsing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ function parse_join_by(tidy_expr::Union{Expr,Symbol,String})
tidy_expr, found_n, found_row_number = parse_interpolation(tidy_expr)

src = Union{Expr,QuoteNode}[] # type can be either a QuoteNode or a expression containing a selection helper function

if @capture(tidy_expr, expr_Symbol)
push!(src, QuoteNode(expr))
elseif @capture(tidy_expr, expr_String)
Expand All @@ -229,6 +229,18 @@ function parse_join_by(tidy_expr::Union{Expr,Symbol,String})
lhs = QuoteNode(Symbol(lhs))
rhs = QuoteNode(Symbol(rhs))
push!(src, :($lhs => $rhs))
elseif tidy_expr isa Expr && tidy_expr.head == :call && tidy_expr.args[1] == :join_by
for arg in tidy_expr.args[2:end]
if arg isa Expr && arg.head == :call && arg.args[1] == Symbol("==")
lhs_arg = arg.args[2]
rhs_arg = arg.args[3]
push!(src, :($(QuoteNode(lhs_arg)) => $(QuoteNode(rhs_arg))))
elseif isa(arg, Symbol)
push!(src, :($(QuoteNode(arg)) => $(QuoteNode(arg))))
else
push!(src, arg)
end
end
else
@capture(tidy_expr, (args__,))
for arg in args
Expand Down
4 changes: 2 additions & 2 deletions src/separate_unite.jl
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimite
for row in eachrow(temp_df)
value = row[column]
# Handle missing values and non-string types
if ismissing(value) || typeof(value) != String
push!(expanded_data[column], [value])
if ismissing(value) || !(value isa AbstractString)
push!(expanded_data[column], [value])
else
push!(expanded_data[column], split(value, delimiter))
end
Expand Down
43 changes: 31 additions & 12 deletions src/summary.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,37 @@ function summary_stats(df::DataFrame)
summary_data = []
for column in colnames
col = df[:, column]
col_nonmissing = collect(skipmissing(col))
push!(summary_data, (
Column = column,
Min = minimum(col_nonmissing),
Q1 = quantile(col_nonmissing, 0.25),
Median = median(col_nonmissing),
Mean = mean(col_nonmissing),
Q3 = quantile(col_nonmissing, 0.75),
Max = maximum(col_nonmissing),
Count = length(col_nonmissing),
Missing_Count = count(ismissing, col)
))
if eltype(col) <: Union{Number, Missing}
col_nonmissing = collect(skipmissing(col))
push!(summary_data, (
column = column,
min = minimum(col_nonmissing),
q1 = quantile(col_nonmissing, 0.25),
median = median(col_nonmissing),
mean = mean(col_nonmissing),
q3 = quantile(col_nonmissing, 0.75),
max = maximum(col_nonmissing),
non_missing_values = length(col_nonmissing),
missing_values = count(ismissing, col),
total_values = length(col),
unique_values = length(unique(col_nonmissing))
))
else
col_nonmissing = collect(skipmissing(col))
push!(summary_data, (
column = column,
min = nothing,
q1 = nothing,
median = nothing,
mean = nothing,
q3 = nothing,
max = nothing,
non_missing_values = length(col_nonmissing),
missing_values = count(ismissing, col),
total_values = length(col),
unique_values = length(unique(col_nonmissing))
))
end
end
return DataFrame(summary_data)
end
Expand Down
Loading