-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #18 from drizk1/main
- Loading branch information
Showing
10 changed files
with
257 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Following the tidyverse syntax, the `@separate()` macro in `TidierData.jl` separates a single column into multiple columns. This is particularly useful for splitting a column containing delimited values into individual columns. | ||
|
||
using TidierData | ||
|
||
df = DataFrame(a = ["1-1", "2-2", "3-3-3"]); | ||
|
||
# ## Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter | ||
|
||
@chain df begin | ||
@separate(a, (b, c, d), "-") | ||
end | ||
|
||
# The `@unite` macro brings together multiple columns into one, separating the values with a user-specified delimiter. | ||
|
||
# ## Here, the `@unite` macro combines the "b", "c", and "d" columns into a single new "new_col" column using the "/" delimiter | ||
|
||
df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]); | ||
|
||
@chain df begin | ||
@unite(new_col, (b, c, d), "/") | ||
end | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# The `@summary()` macro in `TidierData.jl` provides a concise way to compute summary statistics on data. Similar to its R counterpart, it will provide the mean, median, Q1, Q3, minimum, maximum, and number of missing values in a numerical column or columns. | ||
|
||
# ## Summary for the whole dataframe | ||
|
||
using TidierData | ||
|
||
df = DataFrame( A = [1, 2, 3, 4, 5], B = [missing, 7, 8, 9, 10], C = [11, missing, 13, 14, missing], D = [16, 17, 18, 19, 20]); | ||
|
||
@chain df begin | ||
@summary() | ||
end | ||
|
||
@summary(df) | ||
|
||
# ## You can specify columns for which you want to compute the summary. This is useful if the DataFrame has a large number of columns and you're interested in only a subset of them. | ||
|
||
@chain df begin | ||
@summary(B) | ||
end | ||
|
||
@summary(df, B) | ||
|
||
# ## or for a range of columns | ||
|
||
@chain df begin | ||
@select(B:D) | ||
@summary() # you can also write this @summary(2:4) | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Return `arr[index]` when `index` lies within the bounds of `arr`, otherwise
# return `default_value` (an empty string unless the caller supplies one).
#
# Both ends of the index range are checked, so a zero or negative index falls
# back to the default instead of raising a `BoundsError`.
function safe_getindex(arr, index, default_value="")
    if firstindex(arr) <= index <= lastindex(arr)
        return arr[index]
    end
    return default_value
end
|
||
# Split the string column `col` of `df` on `sep` and store the pieces in the
# columns named by `into`, returning a new data frame without `col`.
#
# - `df` is not modified; the work is done on a copy.
# - Every name in `into` becomes a column. Rows that yield fewer pieces than
#   there are names are padded with `missing`, so the output schema depends
#   only on `into` and not on the data.
# - A `missing` value in `col` produces `missing` in every new column.
# - Raises an error if any row splits into more pieces than `into` has names.
function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::String)
    new_df = df[:, :]

    # `split` has no method for `missing`; treat such cells as yielding no pieces.
    pieces = [ismissing(value) ? SubString{String}[] : split(value, sep)
              for value in new_df[:, col]]

    # Guard the reduction: `maximum` throws on an empty collection (zero-row input).
    max_cols = isempty(pieces) ? 0 : maximum(length, pieces)

    if length(into) < max_cols
        error("Not enough names provided in `into` for all split columns.")
    end

    for (i, name) in enumerate(into)
        # `get` on a vector returns the default when `i` is out of bounds.
        new_df[:, name] = [get(parts, i, missing) for parts in pieces]
    end

    return select(new_df, Not(col))
end
|
||
""" | ||
$docstring_separate | ||
""" | ||
macro separate(df, from, into, sep)
    # The target columns may be written as a tuple `(b, c)`, a vector literal
    # `[b, c]`, or a single bare name `b`. Anything else is rejected here with
    # a clear message rather than failing later on an undefined variable.
    targets = if Meta.isexpr(into, (:tuple, :vect))
        into.args
    elseif into isa Symbol
        [into]
    else
        throw(ArgumentError(
            "`into` must be a column name, a tuple, or a vector of column names"))
    end

    # Column names are passed to `separate` as quoted symbols.
    from_name = QuoteNode(from)
    target_names = QuoteNode.(targets)

    # `df` and `sep` are caller expressions, so both are escaped; without the
    # escape a delimiter held in a caller variable would be looked up in this
    # module instead of the caller's scope.
    return quote
        separate($(esc(df)), $from_name, [$(target_names...)], $(esc(sep)))
    end
end
|
||
|
||
# Build a copy of `df` with an extra column `new_col_name` whose entries are the
# values of `cols` in each row joined by `sep`; `missing` values are left out
# of the joined string. The source columns are kept.
function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_")
    source = df[:, cols]
    joined = [join(skipmissing(record), sep) for record in eachrow(source)]

    result = df[:, :]
    result[:, new_col_name] = joined
    return result
end
|
||
""" | ||
$docstring_unite | ||
""" | ||
macro unite(df, new_col, from_cols, sep)
    # The source columns may be written as a tuple `(b, c)`, a vector literal
    # `[b, c]`, or a single bare name `b`. Anything else is rejected here with
    # a clear message rather than failing later on an undefined variable.
    sources = if Meta.isexpr(from_cols, (:tuple, :vect))
        from_cols.args
    elseif from_cols isa Symbol
        [from_cols]
    else
        throw(ArgumentError(
            "`from_cols` must be a column name, a tuple, or a vector of column names"))
    end

    # Column names are passed to `unite` as quoted symbols.
    new_name = QuoteNode(new_col)
    source_names = QuoteNode.(sources)

    # `df` and `sep` are caller expressions, so both are escaped; without the
    # escape a delimiter held in a caller variable would be looked up in this
    # module instead of the caller's scope.
    return quote
        unite($(esc(df)), $new_name, [$(source_names...)], $(esc(sep)))
    end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Compute one row of summary statistics per column of `df` and return them as
# a new data frame with the columns `Column`, `Min`, `Q1`, `Median`, `Mean`,
# `Q3`, `Max`, `Count` (number of non-missing values) and `Missing_Count`.
#
# Missing values are excluded before the statistics are computed. A column with
# no non-missing values (including every column of a zero-row frame) reports
# `missing` for each statistic, because the underlying reductions throw on an
# empty collection.
function summary_stats(df::DataFrame)
    colnames = names(df)
    summary_data = []
    for column in colnames
        col = df[:, column]
        col_nonmissing = collect(skipmissing(col))
        has_values = !isempty(col_nonmissing)
        push!(summary_data, (
            Column = column,
            Min = has_values ? minimum(col_nonmissing) : missing,
            Q1 = has_values ? quantile(col_nonmissing, 0.25) : missing,
            Median = has_values ? median(col_nonmissing) : missing,
            Mean = has_values ? mean(col_nonmissing) : missing,
            Q3 = has_values ? quantile(col_nonmissing, 0.75) : missing,
            Max = has_values ? maximum(col_nonmissing) : missing,
            Count = length(col_nonmissing),
            Missing_Count = count(ismissing, col)
        ))
    end
    return DataFrame(summary_data)
end
|
||
""" | ||
$docstring_summary | ||
""" | ||
macro summary(df, cols...)
    # Without column selectors, summarise the whole data frame.
    if isempty(cols)
        return quote
            summary_stats($(esc(df)))
        end
    end

    # Otherwise translate each selector with `parse_tidy`, narrow the data frame
    # to the selected columns, and summarise only those.
    selectors = map(parse_tidy, collect(cols))
    return quote
        summary_stats(select($(esc(df)), $(selectors...)))
    end
end
|
66b8639
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
66b8639
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/89023
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via: