diff --git a/literate_notebooks/src-PT-BR/01_constructors.jl b/literate_notebooks/src-PT-BR/01_constructors.jl new file mode 100644 index 0000000..333a81e --- /dev/null +++ b/literate_notebooks/src-PT-BR/01_constructors.jl @@ -0,0 +1,143 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** +# +# Let's get started by loading the `DataFrames` package. + +using DataFrames + +# ## Constructors and conversion + +#- + +# ### Constructors +# +# In this section, you'll see many ways to create a `DataFrame` using the `DataFrame()` constructor. +# +# First, we could create an empty DataFrame, + +DataFrame() # empty DataFrame + +# Or we could call the constructor using keyword arguments to add columns to the `DataFrame`. + +DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3])) + +# We can create a `DataFrame` from a dictionary, in which case keys from the dictionary will be sorted to create the `DataFrame` columns. + +x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b']) +DataFrame(x) + +# Rather than explicitly creating a dictionary first, as above, we could pass `DataFrame` arguments with the syntax of dictionary key-value pairs. +# +# Note that in this case, we use symbols to denote the column names and arguments are not sorted. For example, `:A`, the symbol, produces `A`, the name of the first column here: + +DataFrame(:A => [1,2], :B => [true, false], :C => ['a', 'b']) + +# Here we create a `DataFrame` from a vector of vectors, and each vector becomes a column. + +DataFrame([rand(3) for i in 1:3]) + +# For now we can construct a single `DataFrame` from a `Vector` of atoms, creating a `DataFrame` with a single row. In future releases of DataFrames.jl, this will throw an error. + +DataFrame(rand(3)) + +# Instead use a transposed vector if you have a vector of atoms (in this way you effectively pass a two dimensional array to the constructor which is supported). 
+ +DataFrame(transpose([1, 2, 3])) + +# Pass a second argument to give the columns names. + +DataFrame([1:3, 4:6, 7:9], [:A, :B, :C]) + +# Here we create a `DataFrame` from a matrix, + +DataFrame(rand(3,4)) + +# and here we do the same but also pass column names. + +DataFrame(rand(3,4), Symbol.('a':'d')) + +# We can also construct an uninitialized DataFrame. +# +# Here we pass column types, names and number of rows; we get `missing` in column :C because `Any >: Missing`. + +DataFrame([Int, Float64, Any], [:A, :B, :C], 1) + +# Here we create a `DataFrame`, but column `:C` is #undef and Jupyter has problem with displaying it. (This works OK at the REPL.) +# +# This will be fixed in next release of DataFrames! + +DataFrame([Int, Float64, String], [:A, :B, :C], 1) + +# To initialize a `DataFrame` with column names, but no rows use + +DataFrame([Int, Float64, String], [:A, :B, :C], 0) + +# This syntax gives us a quick way to create homogenous `DataFrame`. + +DataFrame(Int, 3, 5) + +# This example is similar, but has nonhomogenous columns. + +DataFrame([Int, Float64], 4) + +# Finally, we can create a `DataFrame` by copying an existing `DataFrame`. +# +# Note that `copy` creates a shallow copy. + +y = DataFrame(x) +z = copy(x) +(x === y), (x === z), isequal(x, z) + +# ### Conversion to a matrix +# +# Let's start by creating a `DataFrame` with two rows and two columns. + +x = DataFrame(x=1:2, y=["A", "B"]) + +# We can create a matrix by passing this `DataFrame` to `Matrix`. + +Matrix(x) + +# This would work even if the `DataFrame` had some `missing`s: + +x = DataFrame(x=1:2, y=[missing,"B"]) + +#- + +Matrix(x) + +# In the two previous matrix examples, Julia created matrices with elements of type `Any`. 
We can see more clearly that the type of matrix is inferred when we pass, for example, a `DataFrame` of integers to `Matrix`, creating a 2D `Array` of `Int64`s: + +x = DataFrame(x=1:2, y=3:4) + +#- + +Matrix(x) + +# In this next example, Julia correctly identifies that `Union` is needed to express the type of the resulting `Matrix` (which contains `missing`s). + +x = DataFrame(x=1:2, y=[missing,4]) + +#- + +Matrix(x) + +# Note that we can't force a conversion of `missing` values to `Int`s! + +Matrix{Int}(x) + +# ### Handling of duplicate column names +# +# We can pass the `makeunique` keyword argument to allow passing duplicate names (they get deduplicated) + +df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true) + +# Otherwise, duplicates will not be allowed in the future. + +df = DataFrame(:a=>1, :a=>2, :a_1=>3) + +# A constructor that is passed column names as keyword arguments is a corner case. +# You cannot pass `makeunique` to allow duplicates here. + +df = DataFrame(a=1, a=2, makeunique=true) + diff --git a/literate_notebooks/src-PT-BR/02_basicinfo.jl b/literate_notebooks/src-PT-BR/02_basicinfo.jl new file mode 100644 index 0000000..6cde7c6 --- /dev/null +++ b/literate_notebooks/src-PT-BR/02_basicinfo.jl @@ -0,0 +1,76 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Getting basic information about a data frame +# +# Let's start by creating a `DataFrame` object, `x`, so that we can learn how to get information on that data frame. + +x = DataFrame(A = [1, 2], B = [1.0, missing], C = ["a", "b"]) + +# The standard `size` function works to get dimensions of the `DataFrame`, + +size(x), size(x, 1), size(x, 2) + +# as well as `nrow` and `ncol` from R; `length` gives number of columns. + +nrow(x), ncol(x), length(x) + +# `describe` gives basic summary statistics of data in your `DataFrame`. 
+ +describe(x) + +# Use `showcols` to get information about columns stored in a DataFrame. + +showcols(x) + +# `names` will return the names of all columns, + +names(x) + +# and `eltypes` returns their types. + +eltypes(x) + +# Here we create some large DataFrame + +y = DataFrame(rand(1:10, 1000, 10)); + +# and then we can use `head` to peek into its top rows + +head(y) + +# and `tail` to see its bottom rows. + +tail(y, 3) + +# ### Most elementary get and set operations +# +# Given the `DataFrame`, `x`, here are three ways to grab one of its columns as a `Vector`: + +x[1], x[:A], x[:, 1] + +# To grab one row as a DataFrame, we can index as follows. + +x[1, :] + +# We can grab a single cell or element with the same syntax used to grab an element of an array. + +x[1, 1] + +# Assignment can be done in ranges to a scalar, + +x[1:2, 1:2] = 1 +x + +# to a vector of length equal to the number of assigned rows, + +x[1:2, 1:2] = [1,2] +x + +# or to another data frame of matching size. + +x[1:2, 1:2] = DataFrame([5 6; 7 8]) +x + diff --git a/literate_notebooks/src-PT-BR/03_missingvalues.jl b/literate_notebooks/src-PT-BR/03_missingvalues.jl new file mode 100644 index 0000000..1e17d97 --- /dev/null +++ b/literate_notebooks/src-PT-BR/03_missingvalues.jl @@ -0,0 +1,112 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Handling missing values +# +# A singleton type `Missings.Missing` allows us to deal with missing values. + +missing, typeof(missing) + +# Arrays automatically create an appropriate union type. + +x = [1, 2, missing, 3] + +# `ismissing` checks if passed value is missing. + +ismissing(1), ismissing(missing), ismissing(x), ismissing.(x) + +# We can extract the type combined with Missing from a `Union` via +# +# (This is useful for arrays!) + +eltype(x), Missings.T(eltype(x)) + +# `missing` comparisons produce `missing`. 
+ +missing == missing, missing != missing, missing < missing + +# This is also true when `missing`s are compared with values of other types. + +1 == missing, 1 != missing, 1 < missing + +# `isequal`, `isless`, and `===` produce results of type `Bool`. + +isequal(missing, missing), missing === missing, isequal(1, missing), isless(1, missing) + +# In the next few examples, we see that many (not all) functions handle `missing`. + +map(x -> x(missing), [sin, cos, zero, sqrt]) # part 1 + +#- + +map(x -> x(missing, 1), [+, - , *, /, div]) # part 2 + +#- + +map(x -> x([1,2,missing]), [minimum, maximum, extrema, mean, any, float]) # part 3 + +# `skipmissing` returns iterator skipping missing values. We can use `collect` and `skipmissing` to create an array that excludes these missing values. + +collect(skipmissing([1, missing, 2, missing])) + +# Similarly, here we combine `collect` and `Missings.replace` to create an array that replaces all missing values with some value (`NaN` in this case). + +collect(Missings.replace([1.0, missing, 2.0, missing], NaN)) + +# Another way to do this: + +coalesce.([1.0, missing, 2.0, missing], NaN) + +# Caution: `nothing` would also be replaced here (for Julia 0.7 a more sophisticated behavior of `coalesce` that allows to avoid this problem is planned). + +coalesce.([1.0, missing, nothing, missing], NaN) + +# You can use `recode` if you have homogenous output types. + +recode([1.0, missing, 2.0, missing], missing=>NaN) + +# You can use `unique` or `levels` to get unique values with or without missings, respectively. + +unique([1, missing, 2, missing]), levels([1, missing, 2, missing]) + +# In this next example, we convert `x` to `y` with `allowmissing`, where `y` has a type that accepts missings. + +x = [1,2,3] +y = allowmissing(x) + +# Then, we convert back with `disallowmissing`. This would fail if `y` contained missing values! 
+ +z = disallowmissing(y) +x,y,z + +# In this next example, we show that the type of each column in `x` is initially `Int64`. After using `allowmissing!` to accept missing values in columns 1 and 3, the types of those columns become `Union`s of `Int64` and `Missings.Missing`. + +x = DataFrame(Int, 2, 3) +println("Before: ", eltypes(x)) +allowmissing!(x, 1) # make first column accept missings +allowmissing!(x, :x3) # make :x3 column accept missings +println("After: ", eltypes(x)) + +# In this next example, we'll use `completecases` to find all the rows of a `DataFrame` that have complete data. + +x = DataFrame(A=[1, missing, 3, 4], B=["A", "B", missing, "C"]) +println(x) +println("Complete cases:\n", completecases(x)) + +# We can use `dropmissing` or `dropmissing!` to remove the rows with incomplete data from a `DataFrame` and either create a new `DataFrame` or mutate the original in-place. + +y = dropmissing(x) +dropmissing!(x) +[x, y] + +# When we call `showcols` on a `DataFrame` with dropped missing values, the columns still allow missing values. + +showcols(x) + +# Since we've excluded missing values, we can safely use `disallowmissing!` so that the columns will no longer accept missing values. + +disallowmissing!(x) +showcols(x) + diff --git a/literate_notebooks/src-PT-BR/04_loadsave.jl b/literate_notebooks/src-PT-BR/04_loadsave.jl new file mode 100644 index 0000000..d166830 --- /dev/null +++ b/literate_notebooks/src-PT-BR/04_loadsave.jl @@ -0,0 +1,64 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Load and save DataFrames +# We do not cover all features of the packages. Please refer to their documentation to learn them. +# +# Here we'll load `CSV` to read and write CSV files and `JLD`, which allows us to work with a Julia native binary format. 
+ +using CSV +using JLD + +# Let's create a simple `DataFrame` for testing purposes, + +x = DataFrame(A=[true, false, true], B=[1, 2, missing], + C=[missing, "b", "c"], D=['a', missing, 'c']) + + +# and use `eltypes` to look at the columnwise types. + +eltypes(x) + +# Let's use `CSV` to save `x` to disk; make sure `x.csv` does not conflict with some file in your working directory. + +CSV.write("x.csv", x) + +# Now we can see how it was saved by reading `x.csv`. + +print(read("x.csv", String)) + +# We can also load it back. `use_mmap=false` disables memory mapping so that on Windows the file can be deleted in the same session. + +y = CSV.read("x.csv", use_mmap=false) + +# When loading in a `DataFrame` from a `CSV`, all columns allow `Missing` by default. Note that the column types have changed! + +eltypes(y) + +# Now let's save `x` to a file in a binary format; make sure that `x.jld` does not exist in your working directory. + +save("x.jld", "x", x) + +# After loading in `x.jld` as `y`, `y` is identical to `x`. + +y = load("x.jld", "x") + +# Note that the column types of `y` are the same as those of `x`! + +eltypes(y) + +# Next, we'll create the files `bigdf.csv` and `bigdf.jld`, so be careful that you don't already have these files on disk! +# +# In particular, we'll time how long it takes us to write a `DataFrame` with 10^3 rows and 10^2 columns to `.csv` and `.jld` files. *You can expect JLD to be faster!* Use `compress=true` to reduce file sizes. + +bigdf = DataFrame(Bool, 10^3, 10^2) +@time CSV.write("bigdf.csv", bigdf) +@time save("bigdf.jld", "bigdf", bigdf) +getfield.(stat.(["bigdf.csv", "bigdf.jld"]), :size) + +# Finally, let's clean up. Do not run the next cell unless you are sure that it will not erase your important files. 
+ +foreach(rm, ["x.csv", "x.jld", "bigdf.csv", "bigdf.jld"]) + diff --git a/literate_notebooks/src-PT-BR/05_columns.jl b/literate_notebooks/src-PT-BR/05_columns.jl new file mode 100644 index 0000000..f32e02a --- /dev/null +++ b/literate_notebooks/src-PT-BR/05_columns.jl @@ -0,0 +1,187 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Manipulating columns of DataFrame + +#- + +# ### Renaming columns +# +# Let's start with a `DataFrame` of `Bool`s that has default column names. + +x = DataFrame(Bool, 3, 4) + +# With `rename`, we create new `DataFrame`; here we rename the column `:x1` to `:A`. (`rename` also accepts collections of Pairs.) + +rename(x, :x1 => :A) + +# With `rename!` we do an in place transformation. +# +# This time we've applied a function to every column name. + +rename!(c -> Symbol(string(c)^2), x) + +# We can also change the name of a particular column without knowing the original. +# +# Here we change the name of the third column, creating a new `DataFrame`. + +rename(x, names(x)[3] => :third) + +# With `names!`, we can change the names of all variables. + +names!(x, [:a, :b, :c, :d]) + +# We get an error when we try to provide duplicate names + +names!(x, fill(:a, 4)) + +# unless we pass `makeunique=true`, which allows us to handle duplicates in passed names. + +names!(x, fill(:a, 4), makeunique=true) + +# ### Reordering columns + +#- + +# We can reorder the names(x) vector as needed, creating a new DataFrame. + +srand(1234) +x[shuffle(names(x))] + +# also `permutecols!` will be introduced in next release of DataFrames + +#- + +# ### Merging/adding columns + +x = DataFrame([(i,j) for i in 1:3, j in 1:4]) + +# With `hcat` we can merge two `DataFrame`s. Also [x y] syntax is supported but only when DataFrames have unique column names. 
+ +hcat(x, x, makeunique=true) + +# We can also use `hcat` to add a new column; a default name `:x1` will be used for this column, so `makeunique=true` is needed. + +y = hcat(x, [1,2,3], makeunique=true) + +# You can also prepend a vector with `hcat`. + +hcat([1,2,3], x, makeunique=true) + +# Alternatively you could append a vector with the following syntax. This is a bit more verbose but cleaner. + +y = [x DataFrame(A=[1,2,3])] + +# Here we do the same but add column `:A` to the front. + +y = [DataFrame(A=[1,2,3]) x] + +# A column can also be added in the middle. Here a brute-force method is used and a new DataFrame is created. + +using BenchmarkTools +@btime [$x[1:2] DataFrame(A=[1,2,3]) $x[3:4]] + +# We could also do this with a specialized in place method `insert!`. Let's add `:newcol` to the `DataFrame` `y`. + +insert!(y, 2, [1,2,3], :newcol) + +# If you want to insert the same column name several times `makeunique=true` is needed as usual. + +insert!(y, 2, [1,2,3], :newcol, makeunique=true) + +# We can see how much faster it is to insert a column with `insert!` than with `hcat` using `@btime`. + +@btime insert!(copy($x), 3, [1,2,3], :A) + +# Let's use `insert!` to append a column in place, + +insert!(x, ncol(x)+1, [1,2,3], :A) + +# and to in place prepend a column. + +insert!(x, 1, [1,2,3], :B) + +# With `merge!`, let's merge the second DataFrame into first, but overwriting duplicates. + +df1 = DataFrame(x=1:3, y=4:6) +df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) +df1, df2, merge!(df1, df2) + +# For comparison: merge two `DataFrames`s but renaming duplicate names via `hcat`. + +df1 = DataFrame(x=1:3, y=4:6) +df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) +hcat(df1, df2, makeunique=true) + +# ### Subsetting/removing columns +# +# Let's create a new `DataFrame` `x` and show a few ways to create DataFrames with a subset of `x`'s columns. 
+ +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# First we could do this by index + +x[[1,2,4,5]] + +# or by column name. + +x[[:x1, :x4]] + +# We can also choose to keep or exclude columns by `Bool`. (We need a vector whose length is the number of columns in the original `DataFrame`.) + +x[[true, false, true, false, true]] + +# Here we create a single column `DataFrame`, + +x[[:x1]] + +# and here we access the vector contained in column `:x1`. + +x[:x1] + +# We could grab the same vector by column number + +x[1] + +# and remove everything from a `DataFrame` with `empty!`. + +empty!(y) + +# Here we create a copy of `x` and delete the 3rd column from the copy with `delete!`. + +z = copy(x) +x, delete!(z, 3) + +# ### Modify column by name + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# With the following syntax, the existing column is modified without performing any copying. + +x[:x1] = x[:x2] +x + +# We can also use the following syntax to add a new column at the end of a `DataFrame`. + +x[:A] = [1,2,3] +x + +# A new column name will be added to our `DataFrame` with the following syntax as well (7 is equal to `ncol(x)+1`). 
+ +x[7] = 11:13 +x + +# ### Find column name + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# We can check if a column with a given name exists via + +:x1 in names(x) + +# and determine its index via + +findfirst(names(x), :x2) + diff --git a/literate_notebooks/src-PT-BR/06_rows.jl b/literate_notebooks/src-PT-BR/06_rows.jl new file mode 100644 index 0000000..3660e40 --- /dev/null +++ b/literate_notebooks/src-PT-BR/06_rows.jl @@ -0,0 +1,177 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package +srand(1); + +# ## Manipulating rows of DataFrame + +#- + +# ### Reordering rows + +x = DataFrame(id=1:10, x = rand(10), y = [zeros(5); ones(5)]) # and we hope that x[:x] is not sorted :) + +#- + +issorted(x), issorted(x, :x) # check if a DataFrame or a subset of its columns is sorted + +#- + +sort!(x, :x) # sort x in place + +#- + +y = sort(x, :id) # new DataFrame + +#- + +sort(x, (:y, :x), rev=(true, false)) # sort by two columns, first is decreasing, second is increasing + +#- + +sort(x, (order(:y, rev=true), :x)) # the same as above + +#- + +sort(x, (order(:y, rev=true), order(:x, by=v->-v))) # some more fancy sorting stuff + +#- + +x[shuffle(1:10), :] # reorder rows (here randomly) + +#- + +sort!(x, :id) +x[[1,10],:] = x[[10,1],:] # swap rows +x + +#- + +x[1,:], x[10,:] = x[10,:], x[1,:] # and swap again +x + +# ### Merging/adding rows + +x = DataFrame(rand(3, 5)) + +#- + +[x; x] # merge by rows - data frames must have the same column names; the same is vcat + +#- + +y = x[reverse(names(x))] # get y with other order of names + +#- + +vcat(x, y) # we get what we want as vcat does column name matching + +#- + +vcat(x, y[1:3]) # but column names must still match + +#- + +append!(x, x) # the same but modifies x + +#- + +append!(x, y) # here column names must match exactly + +#- + +push!(x, 1:5) # add one row to x at the end; must give correct number of values and correct types +x + 
+#- + +push!(x, Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # also works with dictionaries +x + +# ### Subsetting/removing rows + +x = DataFrame(id=1:10, val='a':'j') + +#- + +x[1:2, :] # by index + +#- + +view(x, 1:2) # the same but a view + +#- + +x[repmat([true, false], 5), :] # by Bool, exact length required + +#- + +view(x, repmat([true, false], 5), :) # view again + +#- + +deleterows!(x, 7) # delete one row + +#- + +deleterows!(x, 6:7) # delete a collection of rows + +#- + +x = DataFrame([1:4, 2:5, 3:6]) + +#- + +filter(r -> r[:x1] > 2.5, x) # create a new DataFrame where filtering function operates on DataFrameRow + +#- + +## in place modification of x, an example with do-block syntax +filter!(x) do r + if r[:x1] > 2.5 + return r[:x2] < 4.5 + end + r[:x3] < 3.5 +end + +# ### Deduplicating + +x = DataFrame(A=[1,2], B=["x","y"]) +append!(x, x) +x[:C] = 1:4 +x + +#- + +unique(x, [1,2]) # get first unique rows for given index + +#- + +unique(x) # now we look at whole rows + +#- + +nonunique(x, :A) # get indicators of non-unique rows + +#- + +unique!(x, :B) # modify x in place + +# ### Extracting one row from `DataFrame` into a vector + +x = DataFrame(x=[1,missing,2], y=["a", "b", missing], z=[true,false,true]) + +#- + +cols = [:x, :y] +[x[1, col] for col in cols] # subset of columns + +#- + +[[x[i, col] for col in names(x)] for i in 1:nrow(x)] # vector of vectors, each entry contains one full row of x + +#- + +Tuple(x[1, col] for col in cols) # similar construct for Tuples, when ported to Julia 0.7 NamedTuples will be added + diff --git a/literate_notebooks/src-PT-BR/07_factors.jl b/literate_notebooks/src-PT-BR/07_factors.jl new file mode 100644 index 0000000..a3ff03c --- /dev/null +++ b/literate_notebooks/src-PT-BR/07_factors.jl @@ -0,0 +1,231 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Working with CategoricalArrays + +#- + +# ### Constructor 
+ +x = categorical(["A", "B", "B", "C"]) # unordered + +#- + +y = categorical(["A", "B", "B", "C"], ordered=true) # ordered, by default order is sorting order + +#- + +z = categorical(["A","B","B","C", missing]) # unordered with missings + +#- + +c = cut(1:10, 5) # ordered, into equal counts, possible to rename labels and give custom breaks + +#- + +by(DataFrame(x=cut(randn(100000), 10)), :x, d -> DataFrame(n=nrow(d)), sort=true) # just to make sure it works right + +#- + +v = categorical([1,2,2,3,3]) # contains integers not strings + +#- + +Vector{Union{String, Missing}}(z) # sometimes you need to convert back to a standard vector + +# ### Managing levels + +arr = [x,y,z,c,v] + +#- + +isordered.(arr) # check if categorical array is ordered + +#- + +ordered!(x, true), isordered(x) # make x ordered + +#- + +ordered!(x, false), isordered(x) # and unordered again + +#- + +levels.(arr) # list levels + +#- + +unique.(arr) # missing will be included + +#- + +y[1] < y[2] # can compare as y is ordered + +#- + +v[1] < v[2] # not comparable, v is unordered although it contains integers + +#- + +levels!(y, ["C", "B", "A"]) # you can reorder levels, mostly useful for ordered CategoricalArrays + +#- + +y[1] < y[2] # observe that the order is changed + +#- + +levels!(z, ["A", "B"]) # you have to specify all levels that are present + +#- + +levels!(z, ["A", "B"], allow_missing=true) # unless the underlying array allows for missings and force removal of levels + +#- + +z[1] = "B" +z # now z has only "B" entries + +#- + +levels(z) # but it remembers the levels it had (the reason is mostly performance) + +#- + +droplevels!(z) # this way we can clean it up +levels(z) + +# ### Data manipulation + +x, levels(x) + +#- + +x[2] = "0" +x, levels(x) # new level added at the end (works only for unordered) + +#- + +v, levels(v) + +#- + +v[1] + v[2] # even though underlying data is Int, we cannot operate on it + +#- + +Vector{Int}(v) # you have either to retrieve the data by conversion (may be 
expensive) + +#- + +get(v[1]) + get(v[2]) # or get a single value + +#- + +get.(v) # this will work for arrays without missings + +#- + +get.(z) # but will fail on missing values + +#- + +Vector{Union{String, Missing}}(z) # you have to do the conversion + +#- + +z[1]*z[2], z.^2 # the only exception are CategoricalArrays based on String - you can operate on them normally + +#- + +recode([1,2,3,4,5,missing], 1=>10) # recode some values in an array; has also in place recode! equivalent + +#- + +recode([1,2,3,4,5,missing], "a", 1=>10, 2=>20) # here we provided a default value for not mapped recodings + +#- + +recode([1,2,3,4,5,missing], 1=>10, missing=>"missing") # to recode Missing you have to do it explicitly + +#- + +t = categorical([1:5; missing]) +t, levels(t) + +#- + +recode!(t, [1,3]=>2) +t, levels(t) # note that the levels are dropped after recode + +#- + +t = categorical([1,2,3], ordered=true) +levels(recode(t, 2=>0, 1=>-1)) # and if you introduce new levels they are added at the end in the order of appearance + +#- + +t = categorical([1,2,3,4,5], ordered=true) # when using default it becomes the last level +levels(recode(t, 300, [1,2]=>100, 3=>200)) + +# ### Comparisons + +x = categorical([1,2,3]) +xs = [x, categorical(x), categorical(x, ordered=true), categorical(x, ordered=true)] +levels!(xs[2], [3,2,1]) +levels!(xs[4], [2,3,1]) +[a == b for a in xs, b in xs] # all are equal - comparison only by contents + +#- + +signature(x::CategoricalArray) = (x, levels(x), isordered(x)) # this is actually the full signature of CategoricalArray +## all are different, notice that xs[1] and xs[2] are unordered but have a different order of levels +[signature(a) == signature(b) for a in xs, b in xs] + +#- + +x[1] < x[2] # you cannot compare elements of unordered CategoricalArray + +#- + +t[1] < t[2] # but you can do it for an ordered one + +#- + +isless(x[1], x[2]) # isless works within the same CategoricalArray even if it is not ordered + +#- + +y = deepcopy(x) # but not 
across categorical arrays +isless(x[1], y[2]) + +#- + +isless(get(x[1]), get(y[2])) # you can use get to make a comparison of the contents of CategoricalArray + +#- + +x[1] == y[2] # equality tests work OK across CategoricalArrays + +# ### Categorical columns in a DataFrame + +df = DataFrame(x = 1:3, y = 'a':'c', z = ["a","b","c"]) + +#- + +categorical!(df) # converts all eltype(AbstractString) columns to categorical + +#- + +showcols(df) + +#- + +categorical!(df, :x) # manually convert to categorical column :x + +#- + +showcols(df) + diff --git a/literate_notebooks/src-PT-BR/08_joins.jl b/literate_notebooks/src-PT-BR/08_joins.jl new file mode 100644 index 0000000..e52bc22 --- /dev/null +++ b/literate_notebooks/src-PT-BR/08_joins.jl @@ -0,0 +1,76 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2017** + +using DataFrames # load package + +# ## Joining DataFrames + +#- + +# ### Preparing DataFrames for a join + +x = DataFrame(ID=[1,2,3,4,missing], name = ["Alice", "Bob", "Conor", "Dave","Zed"]) +y = DataFrame(id=[1,2,5,6,missing], age = [21,22,23,24,99]) +x,y + +#- + +rename!(x, :ID=>:id) # names of columns on which we want to join must be the same + +# ### Standard joins: inner, left, right, outer, semi, anti + +join(x, y, on=:id) # :inner join by default, missing is joined + +#- + +join(x, y, on=:id, kind=:left) + +#- + +join(x, y, on=:id, kind=:right) + +#- + +join(x, y, on=:id, kind=:outer) + +#- + +join(x, y, on=:id, kind=:semi) + +#- + +join(x, y, on=:id, kind=:anti) + +# ### Cross join + +## cross-join does not require on argument +## it produces a Cartesian product of arguments +function expand_grid(;xs...) 
# a simple replacement for expand.grid in R + reduce((x,y) -> join(x, DataFrame(Pair(y...)), kind=:cross), + DataFrame(Pair(xs[1]...)), xs[2:end]) +end + +expand_grid(a=[1,2], b=["a","b","c"], c=[true,false]) + +# ### Complex cases of joins + +x = DataFrame(id1=[1,1,2,2,missing,missing], + id2=[1,11,2,21,missing,99], + name = ["Alice", "Bob", "Conor", "Dave","Zed", "Zoe"]) +y = DataFrame(id1=[1,1,3,3,missing,missing], + id2=[11,1,31,3,missing,999], + age = [21,22,23,24,99, 100]) +x,y + +#- + +join(x, y, on=[:id1, :id2]) # joining on two columns + +#- + +join(x, y, on=[:id1], makeunique=true) # with duplicates all combinations are produced (here :inner join) + +#- + +join(x, y, on=[:id1], kind=:semi) # but not by :semi join (as it would duplicate rows) + diff --git a/literate_notebooks/src-PT-BR/09_reshaping.jl b/literate_notebooks/src-PT-BR/09_reshaping.jl new file mode 100644 index 0000000..d6ec25b --- /dev/null +++ b/literate_notebooks/src-PT-BR/09_reshaping.jl @@ -0,0 +1,90 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Reshaping DataFrames + +#- + +# ### Wide to long + +x = DataFrame(id=[1,2,3,4], id2=[1,1,2,2], M1=[11,12,13,14], M2=[111,112,113,114]) + +#- + +melt(x, :id, [:M1, :M2]) # first pass id-variables and then measure variables; meltdf makes a view + +#- + +## optionally you can rename columns; melt and stack are identical but order of arguments is reversed +stack(x, [:M1, :M2], :id, variable_name=:key, value_name=:observed) # first measures and then id-s; stackdf creates view + +#- + +## if second argument is omitted in melt or stack , all other columns are assumed to be the second argument +## but measure variables are selected only if they are <: AbstractFloat +melt(x, [:id, :id2]) + +#- + +melt(x, [1, 2]) # you can use index instead of symbol + +#- + +bigx = DataFrame(rand(10^6, 10)) # a test comparing creation of new DataFrame and a view 
+bigx[:id] = 1:10^6 +@time melt(bigx, :id) +@time melt(bigx, :id) +@time meltdf(bigx, :id) +@time meltdf(bigx, :id); + +#- + +x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) + +#- + +melt(x) + +#- + +melt(DataFrame(rand(3,2))) # by default stack and melt treat floats as value columns + +#- + +df = DataFrame(rand(3,2)) +df[:key] = [1,1,1] +mdf = melt(df) # duplicates in key are silently accepted + +# ### Long to wide + +x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) + +#- + +y = melt(x, [1,2]) +display(x) +display(y) + +#- + +unstack(y, :id2, :variable, :value) # standard unstack with a unique key + +#- + +unstack(y, :variable, :value) # all other columns are treated as keys + +#- + +## by default :id, :variable and :value names are assumed; in this case it produces duplicate keys +unstack(y) + +#- + +df = stack(DataFrame(rand(3,2))) + +#- + +unstack(df, :variable, :value) # unable to unstack when no key column is present + diff --git a/literate_notebooks/src-PT-BR/10_transforms.jl b/literate_notebooks/src-PT-BR/10_transforms.jl new file mode 100644 index 0000000..3b5b4aa --- /dev/null +++ b/literate_notebooks/src-PT-BR/10_transforms.jl @@ -0,0 +1,80 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Split-apply-combine + +x = DataFrame(id=[1,2,3,4,1,2,3,4], id2=[1,2,1,2,1,2,1,2], v=rand(8)) + +#- + +gx1 = groupby(x, :id) + +#- + +gx2 = groupby(x, [:id, :id2]) + +#- + +vcat(gx2...) 
# back to the original DataFrame + +#- + +x = DataFrame(id = [missing, 5, 1, 3, missing], x = 1:5) + +#- + +showall(groupby(x, :id)) # by default groups include missing values and are not sorted + +#- + +showall(groupby(x, :id, sort=true, skipmissing=true)) # but we can change it :) + +#- + +x = DataFrame(id=rand('a':'d', 100), v=rand(100)); +by(x, :id, y->mean(y[:v])) # apply a function to each group of a data frame + +#- + +by(x, :id, y->mean(y[:v]), sort=true) # we can sort the output + +#- + +by(x, :id, y->DataFrame(res=mean(y[:v]))) # this way we can set a name for a column - DataFramesMeta @by is better + +#- + +x = DataFrame(id=rand('a':'d', 100), x1=rand(100), x2=rand(100)) +aggregate(x, :id, sum) # apply a function over all columns of a data frame in groups given by id + +#- + +aggregate(x, :id, sum, sort=true) # also can be sorted + +# *We omit the discussion of map/combine as I do not find them very useful (better to use by)* + +x = DataFrame(rand(3, 5)) + +#- + +map(mean, eachcol(x)) # map a function over each column and return a data frame + +#- + +foreach(c -> println(c[1], ": ", mean(c[2])), eachcol(x)) # a raw iteration returns a tuple with column name and values + +#- + +colwise(mean, x) # colwise is similar, but produces a vector + +#- + +x[:id] = [1,1,2] +colwise(mean,groupby(x, :id)) # and works on GroupedDataFrame + +#- + +map(r -> r[:x1]/r[:x2], eachrow(x)) # now the returned value is DataFrameRow which works similarly to a one-row DataFrame + diff --git a/literate_notebooks/src-PT-BR/11_performance.jl b/literate_notebooks/src-PT-BR/11_performance.jl new file mode 100644 index 0000000..005e877 --- /dev/null +++ b/literate_notebooks/src-PT-BR/11_performance.jl @@ -0,0 +1,135 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames +using BenchmarkTools + +# ## Performance tips + +#- + +# ### Access by column number is faster than by name + +x = DataFrame(rand(5, 1000)) 
+@btime x[500]; +@btime x[:x500]; + +# ### When working with data in a `DataFrame`, use barrier functions or type annotations + +function f_bad() # this function will be slow + srand(1); x = DataFrame(rand(1000000,2)) + y, z = x[1], x[2] # columns are extracted inside the function, so Julia does not know their types here + p = 0.0 + for i in 1:nrow(x) + p += y[i]*z[i] + end + p +end + +@btime f_bad(); + +#- + +@code_warntype f_bad() # the reason is that Julia does not know the types of columns in `DataFrame` + +#- + +## solution 1 is to use barrier function (it should be possible to use it in almost any code) +function f_inner(y,z) # the kernel: receives the two columns as arguments and accumulates their products + p = 0.0 + for i in 1:length(y) + p += y[i]*z[i] + end + p +end + +function f_barrier() # extract the work to an inner function + srand(1); x = DataFrame(rand(1000000,2)) + f_inner(x[1], x[2]) +end + +function f_inbuilt() # or use inbuilt function if possible + srand(1); x = DataFrame(rand(1000000,2)) + dot(x[1], x[2]) +end + +@btime f_barrier(); +@btime f_inbuilt(); + +#- + +## solution 2 is to provide the types of extracted columns +## it is simpler but there are cases in which you will not know these types +function f_typed() + srand(1); x = DataFrame(rand(1000000,2)) + y::Vector{Float64}, z::Vector{Float64} = x[1], x[2] # the annotations provide the column types + p = 0.0 + for i in 1:nrow(x) + p += y[i]*z[i] + end + p +end + +@btime f_typed(); + +# ### Consider using delayed `DataFrame` creation technique + +function f1() + x = DataFrame(Float64, 10^4, 100) # we work with DataFrame directly + for c in 1:ncol(x) + d = x[c] # column c is fetched from the DataFrame before it is filled + for r in 1:nrow(x) + d[r] = rand() + end + end + x +end + +function f2() + x = Vector{Any}(100) # plain vector that collects the 100 columns + for c in 1:length(x) + d = Vector{Float64}(10^4) # each column is filled as an ordinary vector first + for r in 1:length(d) + d[r] = rand() + end + x[c] = d + end + DataFrame(x) # we delay creation of DataFrame after we have our job done +end + +@btime f1(); +@btime f2(); + +# ### You can add rows to a `DataFrame` in place and it is fast + +x = DataFrame(rand(10^6, 5)) +y = DataFrame(transpose(1.0:5.0)) +z = [1.0:5.0;] + +@btime vcat($x, $y); # creates a new DataFrame - slow +@btime append!($x, $y);
# in place - fast + +x = DataFrame(rand(10^6, 5)) # reset to the same starting point +@btime push!($x, $z); # add a single row in place - fastest + +# ### Allowing `missing` as well as `categorical` slows down computations + +using StatsBase + +function test(data) # uses countmap function to test performance; benchmarks a raw sample of `data` and its categorical version + println(eltype(data)) + x = rand(data, 10^6) + y = categorical(x) + println(" raw:") + @btime countmap($x) + println(" categorical:") + @btime countmap($y) + nothing +end + +test(1:10) +test([randstring() for i in 1:10]) +test(allowmissing(1:10)) +test(allowmissing([randstring() for i in 1:10])) + + diff --git a/literate_notebooks/src-PT-BR/12_pitfalls.jl b/literate_notebooks/src-PT-BR/12_pitfalls.jl new file mode 100644 index 0000000..8eb5e79 --- /dev/null +++ b/literate_notebooks/src-PT-BR/12_pitfalls.jl @@ -0,0 +1,73 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames + +# ## Possible pitfalls + +#- + +# ### Know what is copied when creating a `DataFrame` + +x = DataFrame(rand(3, 5)) + +#- + +y = DataFrame(x) +x === y # no copying performed + +#- + +y = copy(x) +x === y # not the same object + +#- + +all(x[i] === y[i] for i in 1:ncol(x)) # but the columns are the same (every column is compared, not only the last one) + +#- + +x = 1:3; y = [1, 2, 3]; df = DataFrame(x=x,y=y) # the same when creating arrays or assigning columns, except ranges + +#- + +y === df[:y] # the same object + +#- + +typeof(x), typeof(df[:x]) # range is converted to a vector + +# ### Do not modify the parent of `GroupedDataFrame` + +x = DataFrame(id=repeat([1,2], outer=3), x=1:6) +g = groupby(x, :id) + +#- + +x[1:3, 1]=[2,2,2] +g # well - it is wrong now, g is only a view + +# ### Remember that you can filter columns of a `DataFrame` using booleans + +srand(1) +x = DataFrame(rand(5, 5)) + +#- + +x[x[:x1] .< 0.25] # well - we have filtered columns not rows by accident as you can select columns using booleans + +#- + +x[x[:x1] .< 0.25, :] # probably this is what
we wanted + +# ### Column selection for DataFrame creates aliases unless explicitly copied + +x = DataFrame(a=1:3) +x[:b] = x[1] # alias +x[:c] = x[:, 1] # also alias +x[:d] = x[1][:] # copy +x[:e] = copy(x[1]) # explicit copy +display(x) +x[1,1] = 100 +display(x) + diff --git a/literate_notebooks/src-PT-BR/13_extras.jl b/literate_notebooks/src-PT-BR/13_extras.jl new file mode 100644 index 0000000..5140a31 --- /dev/null +++ b/literate_notebooks/src-PT-BR/13_extras.jl @@ -0,0 +1,198 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 13, 2018** + +using DataFrames + +# ## Extras - selected functionalities of selected packages + +#- + +# ### FreqTables: creating cross tabulations + +using FreqTables +df = DataFrame(a=rand('a':'d', 1000), b=rand(["x", "y", "z"], 1000)) +ft = freqtable(df, :a, :b) # observe that dimensions are sorted if possible + +#- + +ft[1,1], ft['b', "z"] # you can index the result using numbers or names + +#- + +prop(ft, 1) # getting proportions - 1 means we want to calculate them in rows (first dimension) + +#- + +prop(ft, 2) # and columns are normalized to 1.0 now + +#- + +x = categorical(rand(1:3, 10)) +levels!(x, [3, 1, 2, 4]) # reordering levels and adding an extra level +freqtable(x) # order is preserved and not-used level is shown + +#- + +freqtable([1,1,2,3,missing]) # by default missings are listed + +#- + +freqtable([1,1,2,3,missing], skipmissing=true) # but we can skip them + +# ### DataFramesMeta - working on `DataFrame` + +using DataFramesMeta +df = DataFrame(x=1:8, y='a':'h', z=repeat([true,false], outer=4)) + +#- + +@with(df, :x+:z) # expressions with columns of DataFrame + +#- + +@with df begin # you can define code blocks + a = :x[:z] + b = :x[.!:z] + :y + [a; b] +end + +#- + +a # @with creates hard scope so variables do not leak out + +#- + +df2 = DataFrame(a = [:a, :b, :c]) +@with(df2, :a .== ^(:a)) # sometimes we want to work on raw Symbol, ^() escapes it + +#- + +df2 = 
DataFrame(x=1:3, y=4:6, z=7:9) +@with(df2, _I_(2:3)) # _I_(expression) is translated to df2[expression] + +#- + +@where(df, :x .< 4, :z .== true) # very useful macro for filtering + +#- + +@select(df, :x, y = 2*:x, z=:y) # create a new DataFrame based on the old one + +#- + +@transform(df, a=1, x = 2*:x, y=:x) # create a new DataFrame adding columns based on the old one + +#- + +@transform(df, a=1, b=:a) # old DataFrame is used and :a is not present there + +#- + +@orderby(df, :z, -:x) # sorting into a new data frame, less powerful than sort, but lightweight + +#- + +@linq df |> # chaining of operations on DataFrame + where(:x .< 5) |> + orderby(:z) |> + transform(x²=:x.^2) |> + select(:z, :x, :x²) + +#- + +f(df, col) = df[col] # you can define your own functions and put them in the chain +@linq df |> where(:x .<= 4) |> f(:x) + +# ### DataFramesMeta - working on grouped `DataFrame` + +df = DataFrame(a = 1:12, b = repeat('a':'d', outer=3)) +g = groupby(df, :b) + +#- + +@by(df, :b, first=first(:a), last=last(:a), mean=mean(:a)) # more convenient than by from DataFrames + +#- + +@based_on(g, first=first(:a), last=last(:a), mean=mean(:a)) # the same as by but on grouped DataFrame + +#- + +@where(g, mean(:a) > 6.5) # filter groups on aggregate conditions + +#- + +@orderby(g, -sum(:a)) # order groups on aggregate conditions + +#- + +@transform(g, center = mean(:a), centered = :a - mean(:a)) # perform operations within a group and return ungrouped DataFrame + +#- + +DataFrame(g) # a nice convenience function not defined in DataFrames + +#- + +@transform(g) # actually this is the same + +#- + +@linq df |> groupby(:b) |> where(mean(:a) > 6.5) |> DataFrame # you can do chaining on grouped DataFrames as well + +# ### DataFramesMeta - rowwise operations on `DataFrame` + +df = DataFrame(a = 1:12, b = repeat(1:4, outer=3)) + +#- + +## such conditions are often needed but are complex to write +@transform(df, x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no")) + +#- + +## one option
is to use a function that works on a single observation and broadcast it +myfun(a, b) = a > 6 && b == 4 ? "yes" : "no" +@transform(df, x = myfun.(:a, :b)) + +#- + +## or you can use @byrow! macro that allows you to process DataFrame rowwise +@byrow! df begin + @newcol x::Vector{String} + :x = :a > 6 && :b == 4 ? "yes" : "no" +end + +# ### Visualizing data with StatPlots + +using StatPlots # you might need to set up the Plots package and some plotting backend first + +#- + +## we present only a minimal functionality of the package + +#- + +srand(1) +df = DataFrame(x = sort(randn(1000)), y=randn(1000), z = [fill("b", 500); fill("a", 500)]) + +#- + +@df df plot(:x, :y, legend=:topleft, label="y(x)") # a most basic plot + +#- + +@df df density(:x, label="") # density plot + +#- + +@df df histogram(:y, label="y") # and a histogram + +#- + +@df df boxplot(:z, :x, label="x") + +#- + +@df df violin(:z, :y, label="y") + diff --git a/literate_notebooks/src-PT-BR/README.md b/literate_notebooks/src-PT-BR/README.md new file mode 100644 index 0000000..4733e5c --- /dev/null +++ b/literate_notebooks/src-PT-BR/README.md @@ -0,0 +1,147 @@ +# An Introduction to DataFrames + +[Bogumił Kamiński](http://bogumilkaminski.pl/about/), November 2020 + +**The tutorial is for DataFrames 0.22.1** + +A brief introduction to basic usage of [DataFrames](https://github.com/JuliaData/DataFrames.jl). + +The tutorial contains a specification of the project environment version under +which it should be run. In order to prepare this environment, before using the +tutorial notebooks, while in the project folder run the following command in the +command line: + +``` +julia -e 'using Pkg; Pkg.activate("."); Pkg.instantiate()' +``` + +Tested under Julia 1.5.3.
The project dependencies are the following: + +``` + [69666777] Arrow v1.0.1 + [6e4b80f9] BenchmarkTools v0.5.0 + [336ed68f] CSV v0.8.2 + [324d7699] CategoricalArrays v0.9.0 + [944b1d66] CodecZlib v0.7.0 + [a93c6f00] DataFrames v0.22.1 + [1313f7d8] DataFramesMeta v0.6.0 + [5789e2e9] FileIO v1.4.4 + [da1fdf0e] FreqTables v0.4.2 + [7073ff75] IJulia v1.23.0 + [babc3d20] JDF v0.2.20 + [9da8a3cd] JLSO v2.4.0 + [b9914132] JSONTables v1.0.0 + [86f7a689] NamedArrays v0.9.4 + [b98c9c47] Pipe v1.3.0 + [2dfb63ee] PooledArrays v0.5.3 + [f3b207a7] StatsPlots v0.14.17 + [bd369af6] Tables v1.2.1 + [a5390f91] ZipFile v0.9.3 + [9a3f8284] Random + [10745b16] Statistics +``` + +I will try to keep the material up to date as the packages evolve. + +This tutorial covers +[DataFrames](https://github.com/JuliaData/DataFrames.jl) +and [CategoricalArrays](https://github.com/JuliaData/CategoricalArrays.jl), +as they constitute the core of [DataFrames](https://github.com/JuliaData/DataFrames.jl) +along with selected file reading and writing packages. + +The last [extras](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/13_extras.ipynb) +part mentions *selected* functionalities of *selected* packages that I find useful for data manipulation; currently those are: +[FreqTables](https://github.com/nalimilan/FreqTables.jl), +[DataFramesMeta](https://github.com/JuliaStats/DataFramesMeta.jl) (pending its update to support DataFrames.jl 0.22 release), +[StatsPlots](https://github.com/JuliaPlots/StatsPlots.jl). + +# Setting up Jupyter Notebook for work with DataFrames.jl + +By default Jupyter Notebook will limit the number of rows and columns when +displaying a data frame to roughly fit the screen size (like in the REPL). + +You can override this behavior by setting `ENV["COLUMNS"]` or `ENV["LINES"]` +variables to hold the maximum width and height of output in characters +respectively when running a notebook.
Alternatively you can add the following +entry `"COLUMNS": "1000", "LINES": "100"` to `"env"` variable in your Jupyter +kernel file. See +[here](https://jupyter-client.readthedocs.io/en/stable/kernels.html) for +information about location and specification of Jupyter kernels. + +# TOC + +| File | Topic | +|-------------------------------------------------------------------------------------------------------------------|-----------------------------------| +| [01_constructors.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/01_constructors.ipynb) | Creating DataFrame and conversion | +| [02_basicinfo.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/02_basicinfo.ipynb) | Getting summary information | +| [03_missingvalues.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/03_missingvalues.ipynb) | Handling missing values | +| [04_loadsave.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/04_loadsave.ipynb) | Loading and saving DataFrames | +| [05_columns.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/05_columns.ipynb) | Working with columns of DataFrame | +| [06_rows.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/06_rows.ipynb) | Working with row of DataFrame | +| [07_factors.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/07_factors.ipynb) | Working with categorical data | +| [08_joins.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/08_joins.ipynb) | Joining DataFrames | +| [09_reshaping.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/09_reshaping.ipynb) | Reshaping DataFrames | +| [10_transforms.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/10_transforms.ipynb) | Transforming DataFrames | +| [11_performance.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/11_performance.ipynb) | Performance tips | +| 
[12_pitfalls.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/12_pitfalls.ipynb) | Possible pitfalls | +| [13_extras.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/13_extras.ipynb) | Additional interesting packages | + +Changelog: + +| Date | Changes | +| ---------- | ------------------------------------------------------------ | +| 2017-12-05 | Initial release | +| 2017-12-06 | Added description of `insert!`, `merge!`, `empty!`, `categorical!`, `delete!`, `DataFrames.index` | +| 2017-12-09 | Added performance tips | +| 2017-12-10 | Added pitfalls | +| 2017-12-18 | Added additional worthwhile packages: *FreqTables* and *DataFramesMeta* | +| 2017-12-29 | Added description of `filter` and `filter!` | +| 2017-12-31 | Added description of conversion to `Matrix` | +| 2018-04-06 | Added example of extracting a row from a `DataFrame` | +| 2018-04-21 | Major update of whole tutorial | +| 2018-05-01 | Added `byrow!` example | +| 2018-05-13 | Added `StatPlots` package to extras | +| 2018-05-23 | Improved comments in sections 1 do 5 by [Jane Herriman](https://github.com/xorJane) | +| 2018-07-25 | Update to 0.11.7 release | +| 2018-08-25 | Update to Julia 1.0 release: sections 1 to 10 | +| 2018-08-29 | Update to Julia 1.0 release: sections 11, 12 and 13 | +| 2018-09-05 | Update to Julia 1.0 release: FreqTables section | +| 2018-09-10 | Added CSVFiles section to chapter on load/save | +| 2018-09-26 | Updated to DataFrames 0.14.0 | +| 2018-10-04 | Updated to DataFrames 0.14.1, added `haskey` and `repeat` | +| 2018-12-08 | Updated to DataFrames 0.15.2 | +| 2019-01-03 | Updated to DataFrames 0.16.0, added serialization instructions | +| 2019-01-18 | Updated to DataFrames 0.17.0, added `passmissing` | +| 2019-01-27 | Added Feather.jl file read/write | +| 2019-01-30 | Renamed StatPlots.jl to StatsPlots.jl and added Tables.jl| +| 2019-02-08 | Added `groupvars` and `groupindices` functions| +| 2019-04-27 | Updated to DataFrames 0.18.0, 
dropped JLD2.jl | +| 2019-04-30 | Updated handling of missing values description | +| 2019-07-16 | Updated to DataFrames 0.19.0 | +| 2019-08-14 | Added JSONTables.jl and `Tables.columnindex` | +| 2019-08-16 | Added Project.toml and Manifest.toml | +| 2019-08-26 | Update to Julia 1.2 and DataFrames 0.19.3 | +| 2019-08-29 | Add example how to compress/decompress CSV file using CodecZlib | +| 2019-08-30 | Add examples of JLSO.jl and ZipFile.jl by [xiaodaigh](https://github.com/xiaodaigh) | +| 2019-11-03 | Add examples of JDF.jl by [xiaodaigh](https://github.com/xiaodaigh) | +| 2019-12-08 | Updated to DataFrames 0.20.0 | +| 2020-05-06 | Updated to DataFrames 0.21.0 (except load/save and extras) | +| 2020-11-20 | Updated to DataFrames 0.22.0 (except DataFramesMeta.jl which does not work yet) | +| 2020-11-26 | Updated to DataFramesMeta.jl 0.6; update by @pdeffebach | + +# Core functions summary + +1. Constructors: `DataFrame`, `DataFrame!`, `Tables.rowtable`, `Tables.columntable`, `Matrix`, `eachcol`, `eachrow`, `Tables.namedtupleiterator`, `empty`, `empty!` +2. Getting summary: `size`, `nrow`, `ncol`, `describe`, `names`, `eltypes`, `first`, `last`, `getindex`, `setindex!`, `@view`, `isapprox` +3. Handling missing: `missing` (singleton instance of `Missing`), `ismissing`, `nonmissingtype`, `skipmissing`, `replace`, `replace!`, `coalesce`, `allowmissing`, `disallowmissing`, `allowmissing!`, `completecases`, `dropmissing`, `dropmissing!`, `disallowmissing`, `disallowmissing!`, `passmissing` +4. 
Loading and saving: `CSV` (package), `CSVFiles` (package), `Serialization` (module), `CSV.read`, `CSV.write`, `save`, `load`, `serialize`, `deserialize`, `Arrow.write`, `Arrow.Table` (from Arrow.jl package), `JSONTables` (package), `arraytable`, `objecttable`, `jsontable`, `CodecZlib` (module), `GzipCompressorStream`, `GzipDecompressorStream`, `JDF.jl` (package), `JDF.savejdf`, `JDF.loadjdf`, `JLSO.jl` (package), `JLSO.save`, `JLSO.load`, `ZipFile.jl` (package), `ZipFile.reader`, `ZipFile.writer`, `ZipFile.addfile` +5. Working with columns: `rename`, `rename!`, `hcat`, `insertcols!`, `categorical!`, `columnindex`, `hasproperty`, `select`, `select!`, `transform`, `transform!`, `combine`, `Not`, `All`, `Between`, `ByRow`, `AsTable` +6. Working with rows: `sort!`, `sort`, `issorted`, `append!`, `vcat`, `push!`, `view`, `filter`, `filter!`, `delete!`, `unique`, `nonunique`, `unique!`, `repeat`, `parent`, `parentindices`, `flatten`, `@pipe` (from `Pipe` package), `only` +7. Working with categorical: `categorical`, `cut`, `isordered`, `ordered!`, `levels`, `unique`, `levels!`, `droplevels!`, `get`, `recode`, `recode!` +8. Joining: `innerjoin`, `leftjoin`, `rightjoin`, `outerjoin`, `semijoin`, `antijoin`, `crossjoin` +9. Reshaping: `stack`, `unstack` +10. Transforming: `groupby`, `mapcols`, `parent`, `groupcols`, `valuecols`, `groupindices`, `keys` (for `GroupedDataFrame`), `combine`, `select`, `select!`, `transform`, `transform!`, `@pipe` (from `Pipe` package) +11. 
Extras: + * [FreqTables](https://github.com/nalimilan/FreqTables.jl): `freqtable`, `prop`, `Name` + * [DataFramesMeta](https://github.com/JuliaStats/DataFramesMeta.jl): `@with`, `@where`, `@select`, `@transform`, `@orderby`, `@linq`, `@by`, `@combine`, `@eachrow`, `@newcol`, `^`, `cols` + * [StatsPlots](https://github.com/JuliaPlots/StatsPlots.jl): `@df`, `plot`, `density`, `histogram`,`boxplot`, `violin` diff --git a/literate_notebooks/src/01_constructors.jl b/literate_notebooks/src/01_constructors.jl new file mode 100644 index 0000000..333a81e --- /dev/null +++ b/literate_notebooks/src/01_constructors.jl @@ -0,0 +1,143 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** +# +# Let's get started by loading the `DataFrames` package. + +using DataFrames + +# ## Constructors and conversion + +#- + +# ### Constructors +# +# In this section, you'll see many ways to create a `DataFrame` using the `DataFrame()` constructor. +# +# First, we could create an empty DataFrame, + +DataFrame() # empty DataFrame + +# Or we could call the constructor using keyword arguments to add columns to the `DataFrame`. + +DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3])) + +# We can create a `DataFrame` from a dictionary, in which case keys from the dictionary will be sorted to create the `DataFrame` columns. + +x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b']) +DataFrame(x) + +# Rather than explicitly creating a dictionary first, as above, we could pass `DataFrame` arguments with the syntax of dictionary key-value pairs. +# +# Note that in this case, we use symbols to denote the column names and arguments are not sorted. For example, `:A`, the symbol, produces `A`, the name of the first column here: + +DataFrame(:A => [1,2], :B => [true, false], :C => ['a', 'b']) + +# Here we create a `DataFrame` from a vector of vectors, and each vector becomes a column. 
+ +DataFrame([rand(3) for i in 1:3]) + +# For now we can construct a single `DataFrame` from a `Vector` of atoms, creating a `DataFrame` with a single row. In future releases of DataFrames.jl, this will throw an error. + +DataFrame(rand(3)) + +# Instead use a transposed vector if you have a vector of atoms (in this way you effectively pass a two dimensional array to the constructor which is supported). + +DataFrame(transpose([1, 2, 3])) + +# Pass a second argument to give the columns names. + +DataFrame([1:3, 4:6, 7:9], [:A, :B, :C]) + +# Here we create a `DataFrame` from a matrix, + +DataFrame(rand(3,4)) + +# and here we do the same but also pass column names. + +DataFrame(rand(3,4), Symbol.('a':'d')) + +# We can also construct an uninitialized DataFrame. +# +# Here we pass column types, names and number of rows; we get `missing` in column :C because `Any >: Missing`. + +DataFrame([Int, Float64, Any], [:A, :B, :C], 1) + +# Here we create a `DataFrame`, but column `:C` is #undef and Jupyter has problem with displaying it. (This works OK at the REPL.) +# +# This will be fixed in next release of DataFrames! + +DataFrame([Int, Float64, String], [:A, :B, :C], 1) + +# To initialize a `DataFrame` with column names, but no rows use + +DataFrame([Int, Float64, String], [:A, :B, :C], 0) + +# This syntax gives us a quick way to create homogenous `DataFrame`. + +DataFrame(Int, 3, 5) + +# This example is similar, but has nonhomogenous columns. + +DataFrame([Int, Float64], 4) + +# Finally, we can create a `DataFrame` by copying an existing `DataFrame`. +# +# Note that `copy` creates a shallow copy. + +y = DataFrame(x) +z = copy(x) +(x === y), (x === z), isequal(x, z) + +# ### Conversion to a matrix +# +# Let's start by creating a `DataFrame` with two rows and two columns. + +x = DataFrame(x=1:2, y=["A", "B"]) + +# We can create a matrix by passing this `DataFrame` to `Matrix`. 
+ +Matrix(x) + +# This would work even if the `DataFrame` had some `missing`s: + +x = DataFrame(x=1:2, y=[missing,"B"]) + +#- + +Matrix(x) + +# In the two previous matrix examples, Julia created matrices with elements of type `Any`. We can see more clearly that the type of matrix is inferred when we pass, for example, a `DataFrame` of integers to `Matrix`, creating a 2D `Array` of `Int64`s: + +x = DataFrame(x=1:2, y=3:4) + +#- + +Matrix(x) + +# In this next example, Julia correctly identifies that `Union` is needed to express the type of the resulting `Matrix` (which contains `missing`s). + +x = DataFrame(x=1:2, y=[missing,4]) + +#- + +Matrix(x) + +# Note that we can't force a conversion of `missing` values to `Int`s! + +Matrix{Int}(x) + +# ### Handling of duplicate column names +# +# We can pass the `makeunique` keyword argument to allow passing duplicate names (they get deduplicated) + +df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true) + +# Otherwise, duplicates will not be allowed in the future. + +df = DataFrame(:a=>1, :a=>2, :a_1=>3) + +# A constructor that is passed column names as keyword arguments is a corner case. +# You cannot pass `makeunique` to allow duplicates here. + +df = DataFrame(a=1, a=2, makeunique=true) + diff --git a/literate_notebooks/src/02_basicinfo.jl b/literate_notebooks/src/02_basicinfo.jl new file mode 100644 index 0000000..6cde7c6 --- /dev/null +++ b/literate_notebooks/src/02_basicinfo.jl @@ -0,0 +1,76 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Getting basic information about a data frame +# +# Let's start by creating a `DataFrame` object, `x`, so that we can learn how to get information on that data frame. 
+ +x = DataFrame(A = [1, 2], B = [1.0, missing], C = ["a", "b"]) + +# The standard `size` function works to get dimensions of the `DataFrame`, + +size(x), size(x, 1), size(x, 2) + +# as well as `nrow` and `ncol` from R; `length` gives number of columns. + +nrow(x), ncol(x), length(x) + +# `describe` gives basic summary statistics of data in your `DataFrame`. + +describe(x) + +# Use `showcols` to get information about columns stored in a DataFrame. + +showcols(x) + +# `names` will return the names of all columns, + +names(x) + +# and `eltypes` returns their types. + +eltypes(x) + +# Here we create some large DataFrame + +y = DataFrame(rand(1:10, 1000, 10)); + +# and then we can use `head` to peek into its top rows + +head(y) + +# and `tail` to see its bottom rows. + +tail(y, 3) + +# ### Most elementary get and set operations +# +# Given the `DataFrame`, `x`, here are three ways to grab one of its columns as a `Vector`: + +x[1], x[:A], x[:, 1] + +# To grab one row as a DataFrame, we can index as follows. + +x[1, :] + +# We can grab a single cell or element with the same syntax used to grab an element of an array. + +x[1, 1] + +# Assignment can be done in ranges to a scalar, + +x[1:2, 1:2] = 1 +x + +# to a vector of length equal to the number of assigned rows, + +x[1:2, 1:2] = [1,2] +x + +# or to another data frame of matching size. + +x[1:2, 1:2] = DataFrame([5 6; 7 8]) +x + diff --git a/literate_notebooks/src/03_missingvalues.jl b/literate_notebooks/src/03_missingvalues.jl new file mode 100644 index 0000000..1e17d97 --- /dev/null +++ b/literate_notebooks/src/03_missingvalues.jl @@ -0,0 +1,112 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Handling missing values +# +# A singleton type `Missings.Missing` allows us to deal with missing values. + +missing, typeof(missing) + +# Arrays automatically create an appropriate union type.
+ +x = [1, 2, missing, 3] + +# `ismissing` checks if passed value is missing. + +ismissing(1), ismissing(missing), ismissing(x), ismissing.(x) + +# We can extract the type combined with Missing from a `Union` via +# +# (This is useful for arrays!) + +eltype(x), Missings.T(eltype(x)) + +# `missing` comparisons produce `missing`. + +missing == missing, missing != missing, missing < missing + +# This is also true when `missing`s are compared with values of other types. + +1 == missing, 1 != missing, 1 < missing + +# `isequal`, `isless`, and `===` produce results of type `Bool`. + +isequal(missing, missing), missing === missing, isequal(1, missing), isless(1, missing) + +# In the next few examples, we see that many (not all) functions handle `missing`. + +map(x -> x(missing), [sin, cos, zero, sqrt]) # part 1 + +#- + +map(x -> x(missing, 1), [+, - , *, /, div]) # part 2 + +#- + +map(x -> x([1,2,missing]), [minimum, maximum, extrema, mean, any, float]) # part 3 + +# `skipmissing` returns iterator skipping missing values. We can use `collect` and `skipmissing` to create an array that excludes these missing values. + +collect(skipmissing([1, missing, 2, missing])) + +# Similarly, here we combine `collect` and `Missings.replace` to create an array that replaces all missing values with some value (`NaN` in this case). + +collect(Missings.replace([1.0, missing, 2.0, missing], NaN)) + +# Another way to do this: + +coalesce.([1.0, missing, 2.0, missing], NaN) + +# Caution: `nothing` would also be replaced here (for Julia 0.7 a more sophisticated behavior of `coalesce` that allows to avoid this problem is planned). + +coalesce.([1.0, missing, nothing, missing], NaN) + +# You can use `recode` if you have homogenous output types. + +recode([1.0, missing, 2.0, missing], missing=>NaN) + +# You can use `unique` or `levels` to get unique values with or without missings, respectively. 
+ +unique([1, missing, 2, missing]), levels([1, missing, 2, missing]) + +# In this next example, we convert `x` to `y` with `allowmissing`, where `y` has a type that accepts missings. + +x = [1,2,3] +y = allowmissing(x) + +# Then, we convert back with `disallowmissing`. This would fail if `y` contained missing values! + +z = disallowmissing(y) +x,y,z + +# In this next example, we show that the type of each column in `x` is initially `Int64`. After using `allowmissing!` to accept missing values in columns 1 and 3, the types of those columns become `Union`s of `Int64` and `Missings.Missing`. + +x = DataFrame(Int, 2, 3) +println("Before: ", eltypes(x)) +allowmissing!(x, 1) # make first column accept missings +allowmissing!(x, :x3) # make :x3 column accept missings +println("After: ", eltypes(x)) + +# In this next example, we'll use `completecases` to find all the rows of a `DataFrame` that have complete data. + +x = DataFrame(A=[1, missing, 3, 4], B=["A", "B", missing, "C"]) +println(x) +println("Complete cases:\n", completecases(x)) + +# We can use `dropmissing` or `dropmissing!` to remove the rows with incomplete data from a `DataFrame` and either create a new `DataFrame` or mutate the original in-place. + +y = dropmissing(x) +dropmissing!(x) +[x, y] + +# When we call `showcols` on a `DataFrame` with dropped missing values, the columns still allow missing values. + +showcols(x) + +# Since we've excluded missing values, we can safely use `disallowmissing!` so that the columns will no longer accept missing values. 
+ +disallowmissing!(x) +showcols(x) + diff --git a/literate_notebooks/src/04_loadsave.jl b/literate_notebooks/src/04_loadsave.jl new file mode 100644 index 0000000..d166830 --- /dev/null +++ b/literate_notebooks/src/04_loadsave.jl @@ -0,0 +1,64 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Load and save DataFrames +# We do not cover all features of the packages. Please refer to their documentation to learn them. +# +# Here we'll load `CSV` to read and write CSV files and `JLD`, which allows us to work with a Julia native binary format. + +using CSV +using JLD + +# Let's create a simple `DataFrame` for testing purposes, + +x = DataFrame(A=[true, false, true], B=[1, 2, missing], + C=[missing, "b", "c"], D=['a', missing, 'c']) + + +# and use `eltypes` to look at the columnwise types. + +eltypes(x) + +# Let's use `CSV` to save `x` to disk; make sure `x.csv` does not conflict with some file in your working directory. + +CSV.write("x.csv", x) + +# Now we can see how it was saved by reading `x.csv`. + +print(read("x.csv", String)) + +# We can also load it back. `use_mmap=false` disables memory mapping so that on Windows the file can be deleted in the same session. + +y = CSV.read("x.csv", use_mmap=false) + +# When loading in a `DataFrame` from a `CSV`, all columns allow `Missing` by default. Note that the column types have changed! + +eltypes(y) + +# Now let's save `x` to a file in a binary format; make sure that `x.jld` does not exist in your working directory. + +save("x.jld", "x", x) + +# After loading in `x.jld` as `y`, `y` is identical to `x`. + +y = load("x.jld", "x") + +# Note that the column types of `y` are the same as those of `x`! + +eltypes(y) + +# Next, we'll create the files `bigdf.csv` and `bigdf.jld`, so be careful that you don't already have these files on disc! 
+# +# In particular, we'll time how long it takes us to write a `DataFrame` with 10^3 rows and 10^2 columns to `.csv` and `.jld` files. *You can expect JLD to be faster!* Use `compress=true` to reduce file sizes. + +bigdf = DataFrame(Bool, 10^3, 10^2) +@time CSV.write("bigdf.csv", bigdf) +@time save("bigdf.jld", "bigdf", bigdf) +getfield.(stat.(["bigdf.csv", "bigdf.jld"]), :size) + +# Finally, let's clean up. Do not run the next cell unless you are sure that it will not erase your important files. + +foreach(rm, ["x.csv", "x.jld", "bigdf.csv", "bigdf.jld"]) + diff --git a/literate_notebooks/src/05_columns.jl b/literate_notebooks/src/05_columns.jl new file mode 100644 index 0000000..f32e02a --- /dev/null +++ b/literate_notebooks/src/05_columns.jl @@ -0,0 +1,187 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Manipulating columns of DataFrame + +#- + +# ### Renaming columns +# +# Let's start with a `DataFrame` of `Bool`s that has default column names. + +x = DataFrame(Bool, 3, 4) + +# With `rename`, we create a new `DataFrame`; here we rename the column `:x1` to `:A`. (`rename` also accepts collections of Pairs.) + +rename(x, :x1 => :A) + +# With `rename!` we do an in place transformation. +# +# This time we've applied a function to every column name. + +rename!(c -> Symbol(string(c)^2), x) + +# We can also change the name of a particular column without knowing the original. +# +# Here we change the name of the third column, creating a new `DataFrame`. + +rename(x, names(x)[3] => :third) + +# With `names!`, we can change the names of all variables. + +names!(x, [:a, :b, :c, :d]) + +# We get an error when we try to provide duplicate names + +names!(x, fill(:a, 4)) + +# unless we pass `makeunique=true`, which allows us to handle duplicates in passed names.
+ +names!(x, fill(:a, 4), makeunique=true) + +# ### Reordering columns + +#- + +# We can reorder the names(x) vector as needed, creating a new DataFrame. + +srand(1234) +x[shuffle(names(x))] + +# also `permutecols!` will be introduced in next release of DataFrames + +#- + +# ### Merging/adding columns + +x = DataFrame([(i,j) for i in 1:3, j in 1:4]) + +# With `hcat` we can merge two `DataFrame`s. Also [x y] syntax is supported but only when DataFrames have unique column names. + +hcat(x, x, makeunique=true) + +# We can also use `hcat` to add a new column; a default name `:x1` will be used for this column, so `makeunique=true` is needed. + +y = hcat(x, [1,2,3], makeunique=true) + +# You can also prepend a vector with `hcat`. + +hcat([1,2,3], x, makeunique=true) + +# Alternatively you could append a vector with the following syntax. This is a bit more verbose but cleaner. + +y = [x DataFrame(A=[1,2,3])] + +# Here we do the same but add column `:A` to the front. + +y = [DataFrame(A=[1,2,3]) x] + +# A column can also be added in the middle. Here a brute-force method is used and a new DataFrame is created. + +using BenchmarkTools +@btime [$x[1:2] DataFrame(A=[1,2,3]) $x[3:4]] + +# We could also do this with a specialized in place method `insert!`. Let's add `:newcol` to the `DataFrame` `y`. + +insert!(y, 2, [1,2,3], :newcol) + +# If you want to insert the same column name several times `makeunique=true` is needed as usual. + +insert!(y, 2, [1,2,3], :newcol, makeunique=true) + +# We can see how much faster it is to insert a column with `insert!` than with `hcat` using `@btime`. + +@btime insert!(copy($x), 3, [1,2,3], :A) + +# Let's use `insert!` to append a column in place, + +insert!(x, ncol(x)+1, [1,2,3], :A) + +# and to in place prepend a column. + +insert!(x, 1, [1,2,3], :B) + +# With `merge!`, let's merge the second DataFrame into first, but overwriting duplicates. 
+ +df1 = DataFrame(x=1:3, y=4:6) +df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) +df1, df2, merge!(df1, df2) + +# For comparison: merge two `DataFrame`s but renaming duplicate names via `hcat`. + +df1 = DataFrame(x=1:3, y=4:6) +df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) +hcat(df1, df2, makeunique=true) + +# ### Subsetting/removing columns +# +# Let's create a new `DataFrame` `x` and show a few ways to create DataFrames with a subset of `x`'s columns. + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# First we could do this by index + +x[[1,2,4,5]] + +# or by column name. + +x[[:x1, :x4]] + +# We can also choose to keep or exclude columns by `Bool`. (We need a vector whose length is the number of columns in the original `DataFrame`.) + +x[[true, false, true, false, true]] + +# Here we create a single column `DataFrame`, + +x[[:x1]] + +# and here we access the vector contained in column `:x1`. + +x[:x1] + +# We could grab the same vector by column number + +x[1] + +# and remove everything from a `DataFrame` with `empty!`. + +empty!(y) + +# Here we create a copy of `x` and delete the 3rd column from the copy with `delete!`. + +z = copy(x) +x, delete!(z, 3) + +# ### Modify column by name + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# With the following syntax, the existing column is modified without performing any copying. + +x[:x1] = x[:x2] +x + +# We can also use the following syntax to add a new column at the end of a `DataFrame`. + +x[:A] = [1,2,3] +x + +# A new column name will be added to our `DataFrame` with the following syntax as well (7 is equal to `ncol(x)+1`). 
+ +x[7] = 11:13 +x + +# ### Find column name + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# We can check if a column with a given name exists via + +:x1 in names(x) + +# and determine its index via + +findfirst(names(x), :x2) + diff --git a/literate_notebooks/src/06_rows.jl b/literate_notebooks/src/06_rows.jl new file mode 100644 index 0000000..3660e40 --- /dev/null +++ b/literate_notebooks/src/06_rows.jl @@ -0,0 +1,177 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package +srand(1); + +# ## Manipulating rows of DataFrame + +#- + +# ### Reordering rows + +x = DataFrame(id=1:10, x = rand(10), y = [zeros(5); ones(5)]) # and we hope that x[:x] is not sorted :) + +#- + +issorted(x), issorted(x, :x) # check if a DataFrame or a subset of its columns is sorted + +#- + +sort!(x, :x) # sort x in place + +#- + +y = sort(x, :id) # new DataFrame + +#- + +sort(x, (:y, :x), rev=(true, false)) # sort by two columns, first is decreasing, second is increasing + +#- + +sort(x, (order(:y, rev=true), :x)) # the same as above + +#- + +sort(x, (order(:y, rev=true), order(:x, by=v->-v))) # some more fancy sorting stuff + +#- + +x[shuffle(1:10), :] # reorder rows (here randomly) + +#- + +sort!(x, :id) +x[[1,10],:] = x[[10,1],:] # swap rows +x + +#- + +x[1,:], x[10,:] = x[10,:], x[1,:] # and swap again +x + +# ### Merging/adding rows + +x = DataFrame(rand(3, 5)) + +#- + +[x; x] # merge by rows - data frames must have the same column names; the same is vcat + +#- + +y = x[reverse(names(x))] # get y with other order of names + +#- + +vcat(x, y) # we get what we want as vcat does column name matching + +#- + +vcat(x, y[1:3]) # but column names must still match + +#- + +append!(x, x) # the same but modifies x + +#- + +append!(x, y) # here column names must match exactly + +#- + +push!(x, 1:5) # add one row to x at the end; must give correct number of values and correct types +x + +#- + +push!(x, 
Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # also works with dictionaries +x + +# ### Subsetting/removing rows + +x = DataFrame(id=1:10, val='a':'j') + +#- + +x[1:2, :] # by index + +#- + +view(x, 1:2) # the same but a view + +#- + +x[repmat([true, false], 5), :] # by Bool, exact length required + +#- + +view(x, repmat([true, false], 5), :) # view again + +#- + +deleterows!(x, 7) # delete one row + +#- + +deleterows!(x, 6:7) # delete a collection of rows + +#- + +x = DataFrame([1:4, 2:5, 3:6]) + +#- + +filter(r -> r[:x1] > 2.5, x) # create a new DataFrame where filtering function operates on DataFrameRow + +#- + +## in place modification of x, an example with do-block syntax +filter!(x) do r + if r[:x1] > 2.5 + return r[:x2] < 4.5 + end + r[:x3] < 3.5 +end + +# ### Deduplicating + +x = DataFrame(A=[1,2], B=["x","y"]) +append!(x, x) +x[:C] = 1:4 +x + +#- + +unique(x, [1,2]) # get first unique rows for given index + +#- + +unique(x) # now we look at whole rows + +#- + +nonunique(x, :A) # get indicators of non-unique rows + +#- + +unique!(x, :B) # modify x in place + +# ### Extracting one row from `DataFrame` into a vector + +x = DataFrame(x=[1,missing,2], y=["a", "b", missing], z=[true,false,true]) + +#- + +cols = [:x, :y] +[x[1, col] for col in cols] # subset of columns + +#- + +[[x[i, col] for col in names(x)] for i in 1:nrow(x)] # vector of vectors, each entry contains one full row of x + +#- + +Tuple(x[1, col] for col in cols) # similar construct for Tuples, when ported to Julia 0.7 NamedTuples will be added + diff --git a/literate_notebooks/src/07_factors.jl b/literate_notebooks/src/07_factors.jl new file mode 100644 index 0000000..a3ff03c --- /dev/null +++ b/literate_notebooks/src/07_factors.jl @@ -0,0 +1,231 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Working with CategoricalArrays + +#- + +# ### Constructor + +x = categorical(["A", "B", 
"B", "C"]) # unordered + +#- + +y = categorical(["A", "B", "B", "C"], ordered=true) # ordered, by default order is sorting order + +#- + +z = categorical(["A","B","B","C", missing]) # unordered with missings + +#- + +c = cut(1:10, 5) # ordered, into equal counts, possible to rename labels and give custom breaks + +#- + +by(DataFrame(x=cut(randn(100000), 10)), :x, d -> DataFrame(n=nrow(d)), sort=true) # just to make sure it works right + +#- + +v = categorical([1,2,2,3,3]) # contains integers not strings + +#- + +Vector{Union{String, Missing}}(z) # sometimes you need to convert back to a standard vector + +# ### Managing levels + +arr = [x,y,z,c,v] + +#- + +isordered.(arr) # check if categorical array is ordered + +#- + +ordered!(x, true), isordered(x) # make x ordered + +#- + +ordered!(x, false), isordered(x) # and unordered again + +#- + +levels.(arr) # list levels + +#- + +unique.(arr) # missing will be included + +#- + +y[1] < y[2] # can compare as y is ordered + +#- + +v[1] < v[2] # not comparable, v is unordered although it contains integers + +#- + +levels!(y, ["C", "B", "A"]) # you can reorder levels, mostly useful for ordered CategoricalArrays + +#- + +y[1] < y[2] # observe that the order is changed + +#- + +levels!(z, ["A", "B"]) # you have to specify all levels that are present + +#- + +levels!(z, ["A", "B"], allow_missing=true) # unless the underlying array allows for missings and force removal of levels + +#- + +z[1] = "B" +z # now z has only "B" entries + +#- + +levels(z) # but it remembers the levels it had (the reason is mostly performance) + +#- + +droplevels!(z) # this way we can clean it up +levels(z) + +# ### Data manipulation + +x, levels(x) + +#- + +x[2] = "0" +x, levels(x) # new level added at the end (works only for unordered) + +#- + +v, levels(v) + +#- + +v[1] + v[2] # even though underlying data is Int, we cannot operate on it + +#- + +Vector{Int}(v) # you have either to retrieve the data by conversion (may be expensive) + +#- + +get(v[1]) 
+ get(v[2]) # or get a single value + +#- + +get.(v) # this will work for arrays without missings + +#- + +get.(z) # but will fail on missing values + +#- + +Vector{Union{String, Missing}}(z) # you have to do the conversion + +#- + +z[1]*z[2], z.^2 # the only exception are CategoricalArrays based on String - you can operate on them normally + +#- + +recode([1,2,3,4,5,missing], 1=>10) # recode some values in an array; has also in place recode! equivalent + +#- + +recode([1,2,3,4,5,missing], "a", 1=>10, 2=>20) # here we provided a default value for not mapped recodings + +#- + +recode([1,2,3,4,5,missing], 1=>10, missing=>"missing") # to recode Missing you have to do it explicitly + +#- + +t = categorical([1:5; missing]) +t, levels(t) + +#- + +recode!(t, [1,3]=>2) +t, levels(t) # note that the levels are dropped after recode + +#- + +t = categorical([1,2,3], ordered=true) +levels(recode(t, 2=>0, 1=>-1)) # and if you introduce new levels they are added at the end in the order of appearance + +#- + +t = categorical([1,2,3,4,5], ordered=true) # when using default it becomes the last level +levels(recode(t, 300, [1,2]=>100, 3=>200)) + +# ### Comparisons + +x = categorical([1,2,3]) +xs = [x, categorical(x), categorical(x, ordered=true), categorical(x, ordered=true)] +levels!(xs[2], [3,2,1]) +levels!(xs[4], [2,3,1]) +[a == b for a in xs, b in xs] # all are equal - comparison only by contents + +#- + +signature(x::CategoricalArray) = (x, levels(x), isordered(x)) # this is actually the full signature of CategoricalArray +## all are different, notice that xs[1] and xs[2] are unordered but have a different order of levels +[signature(a) == signature(b) for a in xs, b in xs] + +#- + +x[1] < x[2] # you cannot compare elements of unordered CategoricalArray + +#- + +t[1] < t[2] # but you can do it for an ordered one + +#- + +isless(x[1], x[2]) # isless works within the same CategoricalArray even if it is not ordered + +#- + +y = deepcopy(x) # but not across categorical arrays 
+isless(x[1], y[2]) + +#- + +isless(get(x[1]), get(y[2])) # you can use get to make a comparison of the contents of CategoricalArray + +#- + +x[1] == y[2] # equality testing works OK across CategoricalArrays + +# ### Categorical columns in a DataFrame + +df = DataFrame(x = 1:3, y = 'a':'c', z = ["a","b","c"]) + +#- + +categorical!(df) # converts all eltype(AbstractString) columns to categorical + +#- + +showcols(df) + +#- + +categorical!(df, :x) # manually convert to categorical column :x + +#- + +showcols(df) + diff --git a/literate_notebooks/src/08_joins.jl b/literate_notebooks/src/08_joins.jl new file mode 100644 index 0000000..e52bc22 --- /dev/null +++ b/literate_notebooks/src/08_joins.jl @@ -0,0 +1,76 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2017** + +using DataFrames # load package + +# ## Joining DataFrames + +#- + +# ### Preparing DataFrames for a join + +x = DataFrame(ID=[1,2,3,4,missing], name = ["Alice", "Bob", "Conor", "Dave","Zed"]) +y = DataFrame(id=[1,2,5,6,missing], age = [21,22,23,24,99]) +x,y + +#- + +rename!(x, :ID=>:id) # names of columns on which we want to join must be the same + +# ### Standard joins: inner, left, right, outer, semi, anti + +join(x, y, on=:id) # :inner join by default, missing is joined + +#- + +join(x, y, on=:id, kind=:left) + +#- + +join(x, y, on=:id, kind=:right) + +#- + +join(x, y, on=:id, kind=:outer) + +#- + +join(x, y, on=:id, kind=:semi) + +#- + +join(x, y, on=:id, kind=:anti) + +# ### Cross join + +## cross-join does not require on argument +## it produces a Cartesian product of arguments +function expand_grid(;xs...) 
# a simple replacement for expand.grid in R + reduce((x,y) -> join(x, DataFrame(Pair(y...)), kind=:cross), + DataFrame(Pair(xs[1]...)), xs[2:end]) +end + +expand_grid(a=[1,2], b=["a","b","c"], c=[true,false]) + +# ### Complex cases of joins + +x = DataFrame(id1=[1,1,2,2,missing,missing], + id2=[1,11,2,21,missing,99], + name = ["Alice", "Bob", "Conor", "Dave","Zed", "Zoe"]) +y = DataFrame(id1=[1,1,3,3,missing,missing], + id2=[11,1,31,3,missing,999], + age = [21,22,23,24,99, 100]) +x,y + +#- + +join(x, y, on=[:id1, :id2]) # joining on two columns + +#- + +join(x, y, on=[:id1], makeunique=true) # with duplicates all combinations are produced (here :inner join) + +#- + +join(x, y, on=[:id1], kind=:semi) # but not by :semi join (as it would duplicate rows) + diff --git a/literate_notebooks/src/09_reshaping.jl b/literate_notebooks/src/09_reshaping.jl new file mode 100644 index 0000000..d6ec25b --- /dev/null +++ b/literate_notebooks/src/09_reshaping.jl @@ -0,0 +1,90 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Reshaping DataFrames + +#- + +# ### Wide to long + +x = DataFrame(id=[1,2,3,4], id2=[1,1,2,2], M1=[11,12,13,14], M2=[111,112,113,114]) + +#- + +melt(x, :id, [:M1, :M2]) # first pass id-variables and then measure variables; meltdf makes a view + +#- + +## optionally you can rename columns; melt and stack are identical but order of arguments is reversed +stack(x, [:M1, :M2], :id, variable_name=:key, value_name=:observed) # first measures and then id-s; stackdf creates view + +#- + +## if second argument is omitted in melt or stack , all other columns are assumed to be the second argument +## but measure variables are selected only if they are <: AbstractFloat +melt(x, [:id, :id2]) + +#- + +melt(x, [1, 2]) # you can use index instead of symbol + +#- + +bigx = DataFrame(rand(10^6, 10)) # a test comparing creation of new DataFrame and a view +bigx[:id] = 1:10^6 
+@time melt(bigx, :id) +@time melt(bigx, :id) +@time meltdf(bigx, :id) +@time meltdf(bigx, :id); + +#- + +x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) + +#- + +melt(x) + +#- + +melt(DataFrame(rand(3,2))) # by default stack and melt treat floats as value columns + +#- + +df = DataFrame(rand(3,2)) +df[:key] = [1,1,1] +mdf = melt(df) # duplicates in key are silently accepted + +# ### Long to wide + +x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) + +#- + +y = melt(x, [1,2]) +display(x) +display(y) + +#- + +unstack(y, :id2, :variable, :value) # standard unstack with a unique key + +#- + +unstack(y, :variable, :value) # all other columns are treated as keys + +#- + +## by default :id, :variable and :value names are assumed; in this case it produces duplicate keys +unstack(y) + +#- + +df = stack(DataFrame(rand(3,2))) + +#- + +unstack(df, :variable, :value) # unable to unstack when no key column is present + diff --git a/literate_notebooks/src/10_transforms.jl b/literate_notebooks/src/10_transforms.jl new file mode 100644 index 0000000..3b5b4aa --- /dev/null +++ b/literate_notebooks/src/10_transforms.jl @@ -0,0 +1,80 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Split-apply-combine + +x = DataFrame(id=[1,2,3,4,1,2,3,4], id2=[1,2,1,2,1,2,1,2], v=rand(8)) + +#- + +gx1 = groupby(x, :id) + +#- + +gx2 = groupby(x, [:id, :id2]) + +#- + +vcat(gx2...) 
# back to the original DataFrame + +#- + +x = DataFrame(id = [missing, 5, 1, 3, missing], x = 1:5) + +#- + +showall(groupby(x, :id)) # by default groups include missing values and are not sorted + +#- + +showall(groupby(x, :id, sort=true, skipmissing=true)) # but we can change it :) + +#- + +x = DataFrame(id=rand('a':'d', 100), v=rand(100)); +by(x, :id, y->mean(y[:v])) # apply a function to each group of a data frame + +#- + +by(x, :id, y->mean(y[:v]), sort=true) # we can sort the output + +#- + +by(x, :id, y->DataFrame(res=mean(y[:v]))) # this way we can set a name for a column - DataFramesMeta @by is better + +#- + +x = DataFrame(id=rand('a':'d', 100), x1=rand(100), x2=rand(100)) +aggregate(x, :id, sum) # apply a function over all columns of a data frame in groups given by id + +#- + +aggregate(x, :id, sum, sort=true) # also can be sorted + +# *We omit the discussion of map/combine as I do not find them very useful (better to use by)* + +x = DataFrame(rand(3, 5)) + +#- + +map(mean, eachcol(x)) # map a function over each column and return a data frame + +#- + +foreach(c -> println(c[1], ": ", mean(c[2])), eachcol(x)) # a raw iteration returns a tuple with column name and values + +#- + +colwise(mean, x) # colwise is similar, but produces a vector + +#- + +x[:id] = [1,1,2] +colwise(mean,groupby(x, :id)) # and works on GroupedDataFrame + +#- + +map(r -> r[:x1]/r[:x2], eachrow(x)) # now the returned value is DataFrameRow which works similarly to a one-row DataFrame + diff --git a/literate_notebooks/src/11_performance.jl b/literate_notebooks/src/11_performance.jl new file mode 100644 index 0000000..005e877 --- /dev/null +++ b/literate_notebooks/src/11_performance.jl @@ -0,0 +1,135 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames +using BenchmarkTools + +# ## Performance tips + +#- + +# ### Access by column number is faster than by name + +x = DataFrame(rand(5, 1000)) +@btime x[500]; 
+@btime x[:x500]; + +# ### When working with data `DataFrame` use barrier functions or type annotation + +function f_bad() # this function will be slow + srand(1); x = DataFrame(rand(1000000,2)) + y, z = x[1], x[2] + p = 0.0 + for i in 1:nrow(x) + p += y[i]*z[i] + end + p +end + +@btime f_bad(); + +#- + +@code_warntype f_bad() # the reason is that Julia does not know the types of columns in `DataFrame` + +#- + +## solution 1 is to use barrier function (it should be possible to use it in almost any code) +function f_inner(y,z) + p = 0.0 + for i in 1:length(y) + p += y[i]*z[i] + end + p +end + +function f_barrier() # extract the work to an inner function + srand(1); x = DataFrame(rand(1000000,2)) + f_inner(x[1], x[2]) +end + +function f_inbuilt() # or use inbuilt function if possible + srand(1); x = DataFrame(rand(1000000,2)) + dot(x[1], x[2]) +end + +@btime f_barrier(); +@btime f_inbuilt(); + +#- + +## solution 2 is to provide the types of extracted columns +## it is simpler but there are cases in which you will not know these types +function f_typed() + srand(1); x = DataFrame(rand(1000000,2)) + y::Vector{Float64}, z::Vector{Float64} = x[1], x[2] + p = 0.0 + for i in 1:nrow(x) + p += y[i]*z[i] + end + p +end + +@btime f_typed(); + +# ### Consider using delayed `DataFrame` creation technique + +function f1() + x = DataFrame(Float64, 10^4, 100) # we work with DataFrame directly + for c in 1:ncol(x) + d = x[c] + for r in 1:nrow(x) + d[r] = rand() + end + end + x +end + +function f2() + x = Vector{Any}(100) + for c in 1:length(x) + d = Vector{Float64}(10^4) + for r in 1:length(d) + d[r] = rand() + end + x[c] = d + end + DataFrame(x) # we delay creation of DataFrame after we have our job done +end + +@btime f1(); +@btime f2(); + +# ### You can add rows to a `DataFrame` in place and it is fast + +x = DataFrame(rand(10^6, 5)) +y = DataFrame(transpose(1.0:5.0)) +z = [1.0:5.0;] + +@btime vcat($x, $y); # creates a new DataFrame - slow +@btime append!($x, $y); # in place - 
fast + +x = DataFrame(rand(10^6, 5)) # reset to the same starting point +@btime push!($x, $z); # add a single row in place - fastest + +# ### Allowing `missing` as well as `categorical` slows down computations + +using StatsBase + +function test(data) # uses countmap function to test performance + println(eltype(data)) + x = rand(data, 10^6) + y = categorical(x) + println(" raw:") + @btime countmap($x) + println(" categorical:") + @btime countmap($y) + nothing +end + +test(1:10) +test([randstring() for i in 1:10]) +test(allowmissing(1:10)) +test(allowmissing([randstring() for i in 1:10])) + + diff --git a/literate_notebooks/src/12_pitfalls.jl b/literate_notebooks/src/12_pitfalls.jl new file mode 100644 index 0000000..8eb5e79 --- /dev/null +++ b/literate_notebooks/src/12_pitfalls.jl @@ -0,0 +1,73 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames + +# ## Possible pitfalls + +#- + +# ### Know what is copied when creating a `DataFrame` + +x = DataFrame(rand(3, 5)) + +#- + +y = DataFrame(x) +x === y # no copying performed + +#- + +y = copy(x) +x === y # not the same object + +#- + +all(x[i] === y[i] for i in 1:ncol(x)) # but the columns are the same (checked for every column index) + +#- + +x = 1:3; y = [1, 2, 3]; df = DataFrame(x=x,y=y) # the same when creating arrays or assigning columns, except ranges + +#- + +y === df[:y] # the same object + +#- + +typeof(x), typeof(df[:x]) # range is converted to a vector + +# ### Do not modify the parent of `GroupedDataFrame` + +x = DataFrame(id=repeat([1,2], outer=3), x=1:6) +g = groupby(x, :id) + +#- + +x[1:3, 1]=[2,2,2] +g # well - it is wrong now, g is only a view + +# ### Remember that you can filter columns of a `DataFrame` using booleans + +srand(1) +x = DataFrame(rand(5, 5)) + +#- + +x[x[:x1] .< 0.25] # well - we have filtered columns not rows by accident as you can select columns using booleans + +#- + +x[x[:x1] .< 0.25, :] # probably this is what we wanted + +# ### Column 
selection for DataFrame creates aliases unless explicitly copied + +x = DataFrame(a=1:3) +x[:b] = x[1] # alias +x[:c] = x[:, 1] # also alias +x[:d] = x[1][:] # copy +x[:e] = copy(x[1]) # explicit copy +display(x) +x[1,1] = 100 +display(x) + diff --git a/literate_notebooks/src/13_extras.jl b/literate_notebooks/src/13_extras.jl new file mode 100644 index 0000000..5140a31 --- /dev/null +++ b/literate_notebooks/src/13_extras.jl @@ -0,0 +1,198 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 13, 2018** + +using DataFrames + +# ## Extras - selected functionalities of selected packages + +#- + +# ### FreqTables: creating cross tabulations + +using FreqTables +df = DataFrame(a=rand('a':'d', 1000), b=rand(["x", "y", "z"], 1000)) +ft = freqtable(df, :a, :b) # observe that dimensions are sorted if possible + +#- + +ft[1,1], ft['b', "z"] # you can index the result using numbers or names + +#- + +prop(ft, 1) # getting proportions - 1 means we want to calculate them in rows (first dimension) + +#- + +prop(ft, 2) # and columns are normalized to 1.0 now + +#- + +x = categorical(rand(1:3, 10)) +levels!(x, [3, 1, 2, 4]) # reordering levels and adding an extra level +freqtable(x) # order is preserved and not-used level is shown + +#- + +freqtable([1,1,2,3,missing]) # by default missings are listed + +#- + +freqtable([1,1,2,3,missing], skipmissing=true) # but we can skip them + +# ### DataFramesMeta - working on `DataFrame` + +using DataFramesMeta +df = DataFrame(x=1:8, y='a':'h', z=repeat([true,false], outer=4)) + +#- + +@with(df, :x+:z) # expressions with columns of DataFrame + +#- + +@with df begin # you can define code blocks + a = :x[:z] + b = :x[.!:z] + :y + [a; b] +end + +#- + +a # @with creates hard scope so variables do not leak out + +#- + +df2 = DataFrame(a = [:a, :b, :c]) +@with(df2, :a .== ^(:a)) # sometimes we want to work on raw Symbol, ^() escapes it + +#- + +df2 = DataFrame(x=1:3, y=4:6, z=7:9) +@with(df2, _I_(2:3)) # 
_I_(expression) is translated to df2[expression] + +#- + +@where(df, :x .< 4, :z .== true) # very useful macro for filtering + +#- + +@select(df, :x, y = 2*:x, z=:y) # create a new DataFrame based on the old one + +#- + +@transform(df, a=1, x = 2*:x, y=:x) # create a new DataFrame adding columns based on the old one + +#- + +@transform(df, a=1, b=:a) # old DataFrame is used and :a is not present there + +#- + +@orderby(df, :z, -:x) # sorting into a new data frame, less powerful than sort, but lightweight + +#- + +@linq df |> # chaining of operations on DataFrame + where(:x .< 5) |> + orderby(:z) |> + transform(x²=:x.^2) |> + select(:z, :x, :x²) + +#- + +f(df, col) = df[col] # you can define your own functions and put them in the chain +@linq df |> where(:x .<= 4) |> f(:x) + +# ### DataFramesMeta - working on grouped `DataFrame` + +df = DataFrame(a = 1:12, b = repeat('a':'d', outer=3)) +g = groupby(df, :b) + +#- + +@by(df, :b, first=first(:a), last=last(:a), mean=mean(:a)) # more convenient than by from DataFrames + +#- + +@based_on(g, first=first(:a), last=last(:a), mean=mean(:a)) # the same as by but on grouped DataFrame + +#- + +@where(g, mean(:a) > 6.5) # filter groups on aggregate conditions + +#- + +@orderby(g, -sum(:a)) # order groups on aggregate conditions + +#- + +@transform(g, center = mean(:a), centered = :a - mean(:a)) # perform operations within a group and return ungrouped DataFrame + +#- + +DataFrame(g) # a nice convenience function not defined in DataFrames + +#- + +@transform(g) # actually this is the same + +#- + +@linq df |> groupby(:b) |> where(mean(:a) > 6.5) |> DataFrame # you can do chaining on grouped DataFrames as well + +# ### DataFramesMeta - rowwise operations on `DataFrame` + +df = DataFrame(a = 1:12, b = repeat(1:4, outer=3)) + +#- + +## such conditions are often needed but are complex to write +@transform(df, x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no")) + +#- + +## one option is to use a function that works on a single 
observation and broadcast it +myfun(a, b) = a > 6 && b == 4 ? "yes" : "no" +@transform(df, x = myfun.(:a, :b)) + +#- + +## or you can use @byrow! macro that allows you to process DataFrame rowwise +@byrow! df begin + @newcol x::Vector{String} + :x = :a > 6 && :b == 4 ? "yes" : "no" +end + +# ### Visualizing data with StatPlots + +using StatPlots # you might need to setup Plots package and some plotting backend first + +#- + +## we present only a minimal functionality of the package + +#- + +srand(1) +df = DataFrame(x = sort(randn(1000)), y=randn(1000), z = [fill("b", 500); fill("a", 500)]) + +#- + +@df df plot(:x, :y, legend=:topleft, label="y(x)") # a most basic plot + +#- + +@df df density(:x, label="") # density plot + +#- + +@df df histogram(:y, label="y") # and a histogram + +#- + +@df df boxplot(:z, :x, label="x") + +#- + +@df df violin(:z, :y, label="y") +